
Chapter 3: Building a Net

Fetching training images

Python
from torchvision import datasets
import matplotlib.pyplot as plt
import torch

data_folder = "~/data"
fmnist = datasets.FashionMNIST(data_folder, download=True, train=True)
targets = fmnist.targets
data = fmnist.data

# `classes` was not defined in the original snippet; the unique target values work here
classes = targets.unique()

# Plot one sample image per class on a 2x5 grid
fig = plt.figure(figsize=(10, 10))
for idx, target in enumerate(classes, 1):
    target_idx = torch.where(targets == target)[0][0]
    sample = data[target_idx, :, :]
    ax = fig.add_subplot(2, 5, idx)
    ax.imshow(sample, cmap="gray")
    ax.set_title(f"class: {int(target)}")
    ax.axis("off")
plt.tight_layout()
plt.show()

Defining dataloader and model

Python
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import SGD, Adam

# Run on GPU when available
device = "cuda" if torch.cuda.is_available() else "cpu"

class FMnistDataset(Dataset):
    def __init__(self, x, y):
        x = x.float().view(-1, 28 * 28)
        self.x, self.y = x, y

    def __getitem__(self, idx):
        x, y = self.x[idx], self.y[idx]
        return x.to(device), y.to(device)

    def __len__(self):
        return len(self.x)

def get_data(X_tr, y_tr, X_val, y_val):
    dataset_train = FMnistDataset(x=X_tr, y=y_tr)
    loader_tr = DataLoader(dataset_train, batch_size=16, shuffle=True)
    dataset_val = FMnistDataset(x=X_val, y=y_val)
    loader_val = DataLoader(dataset_val, batch_size=len(X_val))
    return loader_tr, loader_val

def get_model():
    model = nn.Sequential(
        nn.Linear(28 * 28, 1000),
        nn.ReLU(),
        nn.Linear(1000, 10),
    ).to(device)
    loss_fn = nn.CrossEntropyLoss()
    optim = Adam(model.parameters(), lr=0.01)
    return model, loss_fn, optim
🔑
CrossEntropyLoss expects raw, unconstrained logits; they only become probabilities through the Softmax applied internally.
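A quick check of this behaviour (the tensors below are made-up examples): CrossEntropyLoss on raw logits matches NLLLoss applied to log-softmax outputs.
Python
import torch
import torch.nn as nn

# Made-up logits for a batch of 3 samples and 10 classes
logits = torch.randn(3, 10)
labels = torch.tensor([2, 7, 0])

ce = nn.CrossEntropyLoss()(logits, labels)
nll = nn.NLLLoss()(torch.log_softmax(logits, dim=1), labels)
print(torch.isclose(ce, nll))  # tensor(True): the softmax happens inside CrossEntropyLoss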

Defining training and accuracy methods

Python
def train_batch(x, y, model, loss_fn, optim):
    model.train()
    y_hat = model(x)
    batch_loss = loss_fn(y_hat, y)
    batch_loss.backward()
    optim.step()
    optim.zero_grad()
    return batch_loss.item()

def get_accuracy(x, y, model):
    model.eval()
    with torch.no_grad():
        y_hat = model(x)
    max_values, argmax = y_hat.max(-1)
    is_correct = argmax == y
    return is_correct.cpu().numpy().tolist()

@torch.no_grad()
def get_val_loss(x, y, model, loss_fn):
    model.eval()
    y_hat = model(x)
    loss_val = loss_fn(y_hat, y)
    return loss_val.item()

Run script

Python
import numpy as np

# X_tr, y_tr and X_val, y_val are assumed to come from the train split above and
# from datasets.FashionMNIST(data_folder, download=True, train=False) respectively
loader_tr, loader_val = get_data(X_tr, y_tr, X_val, y_val)
model, loss_fn, optim = get_model()
n_epochs = 10

losses_tr, losses_val = [], []
accuracies_tr, accuracies_val = [], []
for idx in range(n_epochs):
    epoch_losses_tr, epoch_accuracies_tr = [], []
    for x, y in loader_tr:
        batch_loss_tr = train_batch(x, y, model, loss_fn, optim)
        epoch_losses_tr.append(batch_loss_tr)
    for x, y in loader_tr:
        batch_accuracy_tr = get_accuracy(x, y, model)
        epoch_accuracies_tr.extend(batch_accuracy_tr)
    for x, y in loader_val:
        epoch_loss_val = get_val_loss(x, y, model, loss_fn)
        epoch_accuracies_val = get_accuracy(x, y, model)
    losses_tr.append(np.mean(epoch_losses_tr))
    accuracies_tr.append(np.mean(epoch_accuracies_tr))
    losses_val.append(epoch_loss_val)
    accuracies_val.append(np.mean(epoch_accuracies_val))
    print(
        f"epoch: {idx} -- mean train accuracy: {np.mean(epoch_accuracies_tr)}"
        f" -- val accuracy: {np.mean(epoch_accuracies_val)}"
    )
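To inspect the collected metrics, a minimal plotting sketch (assuming matplotlib, as in the image-fetching snippet):
Python
import matplotlib.pyplot as plt

epochs = range(1, n_epochs + 1)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
ax1.plot(epochs, losses_tr, label="train loss")
ax1.plot(epochs, losses_val, label="val loss")
ax1.set_xlabel("epoch")
ax1.legend()
ax2.plot(epochs, accuracies_tr, label="train accuracy")
ax2.plot(epochs, accuracies_val, label="val accuracy")
ax2.set_xlabel("epoch")
ax2.legend()
plt.show()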

Tuning

Normalizing the input yields a large accuracy boost:
Python
class FMnistDataset(Dataset):
    def __init__(self, x, y):
        x = x.float() / 255  # scale pixel values from [0, 255] to [0, 1]
        x = x.view(-1, 28 * 28)
        self.x, self.y = x, y
It’s because the sigmoid saturates quickly when the inputs aren’t normalized: with large raw pixel values, $input \cdot weight$ is large and the output gets stuck near 0 or 1.
$sigmoid = \frac{1}{1 + e^{-input \cdot weight}}$
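A small numeric illustration (the weight value is arbitrary): an unscaled pixel saturates the sigmoid, a scaled one does not.
Python
import torch

weight = 0.1
raw_pixel = torch.tensor(200.0)   # unscaled input in [0, 255]
scaled_pixel = raw_pixel / 255    # normalized input in [0, 1]

print(torch.sigmoid(raw_pixel * weight))     # ~1.0: saturated, near-zero gradient
print(torch.sigmoid(scaled_pixel * weight))  # ~0.52: still in the responsive range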
Batch size
A higher batch size means fewer weight updates per epoch, so computation is faster but accuracy drops (see the parameterized sketch after this list).
Optimizer
Here Adam performs slightly worse than SGD.
Learning rate
A high learning rate (0.1) leads to underfitting and poor performance.
A low learning rate (1e-5) needs around 100 epochs instead of 10 to gain even a slight boost in performance.
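A hedged sketch of how these comparisons could be wired up, by parameterizing the earlier helpers (the extra batch_size, lr and use_sgd arguments are additions for illustration):
Python
def get_data(X_tr, y_tr, X_val, y_val, batch_size=16):
    # Larger batch_size -> fewer weight updates per epoch
    dataset_train = FMnistDataset(x=X_tr, y=y_tr)
    loader_tr = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
    dataset_val = FMnistDataset(x=X_val, y=y_val)
    loader_val = DataLoader(dataset_val, batch_size=len(X_val))
    return loader_tr, loader_val

def get_model(lr=0.01, use_sgd=False):
    model = nn.Sequential(
        nn.Linear(28 * 28, 1000),
        nn.ReLU(),
        nn.Linear(1000, 10),
    ).to(device)
    loss_fn = nn.CrossEntropyLoss()
    # Swap the optimizer or the learning rate to reproduce the comparisons above
    params = model.parameters()
    optim = SGD(params, lr=lr) if use_sgd else Adam(params, lr=lr)
    return model, loss_fn, optim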

Learning rate

Parameters distribution

So far we only have 4 parameter groups, 2 per layer (1 weight tensor and 1 bias); see the histogram sketch below.
A lower learning rate leads to a larger parameter distribution.
A large parameter distribution leads to overfitting.
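A minimal sketch for inspecting those distributions (the 2x2 layout assumes the 4 parameter groups of the current model):
Python
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(10, 6))
for idx, (name, param) in enumerate(model.named_parameters(), 1):
    ax = fig.add_subplot(2, 2, idx)
    ax.hist(param.detach().cpu().numpy().flatten(), bins=50)
    ax.set_title(name)
plt.tight_layout()
plt.show()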

Non-scaled data

High and medium learning rates can't learn at all on unscaled data here.
However, the low learning rate overfits, yet its validation accuracy matches the scaled run with lr = 1e-3. Why is that?
Because this time the weights stay much smaller, since each update is tiny with a low learning rate (one way to check this is sketched below).
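A one-liner sketch to verify that claim by looking at the largest absolute parameter value after training:
Python
max_abs_weight = max(p.abs().max().item() for p in model.parameters())
print(f"largest absolute parameter value: {max_abs_weight:.3f}")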

Learning rate annealing

Best is to reduce the learning rate dynamically when the model starts to overfit.
If the validation loss doesn't decrease for x epochs in a row, the learning rate is reduced.
Python
from torch.optim import lr_scheduler

scheduler = lr_scheduler.ReduceLROnPlateau(
    optim,
    factor=0.5,
    patience=0,
    threshold=0.001,
    threshold_mode="abs",
    min_lr=1e-5,
    verbose=True,
)
...
scheduler.step(validation_loss)
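Where the scheduler step fits in the run script above (a sketch; the metric bookkeeping is omitted for brevity):
Python
for idx in range(n_epochs):
    for x, y in loader_tr:
        train_batch(x, y, model, loss_fn, optim)
    for x, y in loader_val:
        epoch_loss_val = get_val_loss(x, y, model, loss_fn)
    # Halve the learning rate when the validation loss stops improving
    scheduler.step(epoch_loss_val)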

Layers

We try removing the hidden layer (the model becomes a logistic regression) and, separately, adding a second hidden layer (see the sketches below).
The model is unable to learn well with a single layer.
According to the validation loss, the model overfits much more with 2 hidden layers.
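Hedged sketches of the two variants (layer widths follow the original get_model):
Python
# No hidden layer: essentially a multinomial logistic regression
model_single = nn.Sequential(
    nn.Linear(28 * 28, 10),
).to(device)

# Two hidden layers
model_two_hidden = nn.Sequential(
    nn.Linear(28 * 28, 1000),
    nn.ReLU(),
    nn.Linear(1000, 1000),
    nn.ReLU(),
    nn.Linear(1000, 10),
).to(device)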

Batch normalization

Deep nets can produce saturating values in sigmoid and other non-linearities, because activations get repeatedly multiplied by weights across hidden layers.
When the input to a node is very small, vanishing gradients prevent optimal training.
Batch norm normalizes the values at each node
$\mu = \frac{1}{m}\sum_{i=0}^{m} x_i$
$\sigma^2 = \frac{1}{m}\sum_i^m (x_i - \mu)^2$
$\bar{x}_i = \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}}$
$x_i^{BN} = \alpha \bar{x}_i + \beta$
The network identifies the best $\alpha$ and $\beta$ parameters
Specified after the linear layer:
Python
nn.Linear(28 * 28, 1000),
nn.BatchNorm1d(1000),
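In context, a sketch of get_model with batch norm inserted (layer sizes and optimizer are the ones used earlier):
Python
def get_model():
    model = nn.Sequential(
        nn.Linear(28 * 28, 1000),
        nn.BatchNorm1d(1000),
        nn.ReLU(),
        nn.Linear(1000, 10),
    ).to(device)
    loss_fn = nn.CrossEntropyLoss()
    optim = Adam(model.parameters(), lr=0.01)
    return model, loss_fn, optim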

Avoiding overfitting

Dropout

Randomly set some node outputs to 0 at each training iteration; this way the network has limited opportunity to overfit.
The remaining values are rescaled automatically to compensate for the dropped nodes (PyTorch does this scaling during training, so predictions need no adjustment).
Specified before the linear layer:
Python
nn.Dropout(0.25),
nn.Linear(28 * 28, 1000),
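In context, a sketch with dropout before each linear layer (the 0.25 rate is the one quoted above); the existing model.train() / model.eval() calls already switch dropout on and off:
Python
def get_model():
    model = nn.Sequential(
        nn.Dropout(0.25),
        nn.Linear(28 * 28, 1000),
        nn.ReLU(),
        nn.Dropout(0.25),
        nn.Linear(1000, 10),
    ).to(device)
    loss_fn = nn.CrossEntropyLoss()
    optim = Adam(model.parameters(), lr=0.01)
    return model, loss_fn, optim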

Regularization

Penalize the model for weights that grow too large, since large weights lead to overfitting.
The resulting parameter distribution is narrower.
L1 Loss
$L_1\;loss = -\frac{1}{n}\sum_i^n \big(y_i \log(p_i) + (1 - y_i)\log(1 - p_i)\big) + A \sum_j^m |w_j|$
Python
def train_batch(x, y, model, loss_fn, optim):
    model.train()
    y_hat = model(x)
    # Sum of absolute parameter values, added to the loss with a small coefficient
    l1_regularization = 0
    for param in model.parameters():
        l1_regularization += torch.norm(param, 1)
    batch_loss = loss_fn(y_hat, y) + 1e-4 * l1_regularization
    ...
L2 Loss
$B \sum_j^m w_j^2$
Python
l2_regularization += torch.norm(param, 2)
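By analogy with the L1 version, a full L2-regularized train_batch could look like the sketch below (the 1e-4 coefficient mirrors the L1 example and is just an assumption); PyTorch optimizers also expose an equivalent weight_decay argument.
Python
def train_batch(x, y, model, loss_fn, optim):
    model.train()
    y_hat = model(x)
    # Sum of squared-norm penalties over all parameters
    l2_regularization = 0
    for param in model.parameters():
        l2_regularization += torch.norm(param, 2)
    batch_loss = loss_fn(y_hat, y) + 1e-4 * l2_regularization
    batch_loss.backward()
    optim.step()
    optim.zero_grad()
    return batch_loss.item()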