Fetching training images
Python
Copy
from torchvision import datasets

# NOTE: `torch` and `matplotlib.pyplot as plt` are assumed to be imported
# earlier in the session; this snippet does not import them itself.

# Download (if needed) the Fashion-MNIST training split.
data_folder = "~/data"
fmnist = datasets.FashionMNIST(data_folder, download=True, train=True)
targets = fmnist.targets
data = fmnist.data

# Distinct label values: one subplot per class.
classes = targets.unique()

# Show the first training image of every class on a 2 x 5 grid.
fig = plt.figure(figsize=(10, 10))
for idx, target in enumerate(classes, 1):
    target_idx = torch.where(targets == target)[0][0]
    sample = data[target_idx, :, :]
    ax = fig.add_subplot(2, 5, idx)
    ax.imshow(sample, cmap="gray")
    ax.set_title(f"class: {int(target)}")
    # Hide the ticks of every subplot: a single plt.axis("off") after the
    # loop would only affect the last axes.
    ax.axis("off")
plt.tight_layout()
plt.show()
Defining dataloader and model
Python
Copy
import numpy as np
import torch
import torch.nn as nn
from torch.optim import SGD, Adam
from torch.utils.data import DataLoader, Dataset
class FMnistDataset(Dataset):
    """Dataset of flattened images and their labels.

    Images are cast to float and reshaped into rows of 28 * 28 values when
    the dataset is built. Each sample is moved to ``device`` when accessed.
    """

    def __init__(self, x, y):
        self.x = x.float().view(-1, 28 * 28)
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        # `device` is expected to be defined at module level (not shown here).
        return self.x[idx].to(device), self.y[idx].to(device)
def get_data(X_tr, y_tr, X_val, y_val):
    """Build the training and validation data loaders.

    The training loader shuffles its samples and yields batches of 16; the
    validation loader yields the whole validation set as one batch.

    Returns:
        A ``(training_loader, validation_loader)`` tuple.
    """
    train_loader = DataLoader(
        FMnistDataset(x=X_tr, y=y_tr), batch_size=16, shuffle=True
    )
    val_loader = DataLoader(
        FMnistDataset(x=X_val, y=y_val), batch_size=len(X_val)
    )
    return train_loader, val_loader
def get_model():
    """Create the classifier, its loss function and its optimizer.

    Returns:
        A ``(model, loss_fn, optimizer)`` tuple: a network with one hidden
        layer (28 * 28 -> 1000 -> 10) moved to ``device``, a cross-entropy
        loss, and an Adam optimizer (lr=0.01) over the model parameters.
    """
    model = nn.Sequential(
        nn.Linear(28 * 28, 1000),
        nn.ReLU(),
        # No final Softmax: CrossEntropyLoss is fed the raw scores.
        nn.Linear(1000, 10),
    ).to(device)  # `device` is expected to be defined at module level
    loss_fn = nn.CrossEntropyLoss()
    # Named `optimizer` locally so it is not mistaken for the torch.optim module.
    optimizer = Adam(model.parameters(), lr=0.01)
    return model, loss_fn, optimizer
CrossEntropyLoss expects raw, unnormalized scores (logits): it applies a log-softmax internally to turn them into probabilities, so the model has no final Softmax layer
Defining training and accuracy methods
Python
Copy
def train_batch(x, y, model, loss_func, optim):
    """Run one optimization step on a single batch and return its loss.

    Args:
        x: Batch of inputs fed to ``model``.
        y: Targets passed to ``loss_func`` together with the predictions.
        model: Network to train; it is switched to training mode.
        loss_func: Callable computing the loss from ``(predictions, y)``.
        optim: Optimizer used to update the parameters.

    Returns:
        The batch loss as a Python float.
    """
    model.train()
    loss = loss_func(model(x), y)
    loss.backward()
    optim.step()
    # Clear the gradients so they do not accumulate into the next batch.
    optim.zero_grad()
    return loss.item()
def get_accuracy(x, y, model):
    """Tell, for every sample of the batch, whether it is classified correctly.

    The model is switched to evaluation mode and run without gradient
    tracking; the predicted class is the index of the highest output along
    the last dimension.

    Returns:
        A list of booleans, one per sample (``True`` when prediction == ``y``).
    """
    model.eval()
    with torch.no_grad():
        predicted = model(x).argmax(dim=-1)
    return (predicted == y).cpu().numpy().tolist()
def get_val_loss(x, y, model, loss_fn):
    """Compute the loss of ``model`` on one batch without tracking gradients.

    The model is switched to evaluation mode first.

    Returns:
        The loss as a Python float.
    """
    model.eval()
    with torch.no_grad():
        loss = loss_fn(model(x), y)
    return loss.item()
Run script
Python
Copy
# Build the loaders, the model, the loss and the optimizer.
# X_tr, y_tr, X_val and y_val are expected to be defined beforehand.
loader_tr, loader_val = get_data(X_tr, y_tr, X_val, y_val)
model, loss_fn, optim = get_model()

epoch = 10  # number of passes over the training set
losses_tr, losses_val = [], []
accuracies_tr, accuracies_val = [], []

for idx in range(epoch):
    epoch_losses_tr, epoch_accuracies_tr = [], []

    # Training pass: one optimizer step per batch.
    for x, y in loader_tr:
        batch_loss_tr = train_batch(x, y, model, loss_fn, optim)
        epoch_losses_tr.append(batch_loss_tr)

    # Second pass: training accuracy measured with the end-of-epoch weights.
    for x, y in loader_tr:
        batch_accuracy_tr = get_accuracy(x, y, model)
        epoch_accuracies_tr.extend(batch_accuracy_tr)

    # Validation metrics; the values of the last validation batch are kept.
    for x, y in loader_val:
        epoch_loss_val = get_val_loss(x, y, model, loss_fn)
        epoch_accuracies_val = get_accuracy(x, y, model)

    mean_accuracy_tr = np.mean(epoch_accuracies_tr)
    mean_accuracy_val = np.mean(epoch_accuracies_val)

    losses_tr.append(np.mean(epoch_losses_tr))
    accuracies_tr.append(mean_accuracy_tr)
    losses_val.append(epoch_loss_val)
    accuracies_val.append(mean_accuracy_val)
    print(
        f"epoch: {idx} -- mean train accuracy: {mean_accuracy_tr}"
        f" -- val accuracy: {mean_accuracy_val}"
    )
Tuning
Normalizing input yields high accuracy boost:
Python
Copy
class FMnistDataset(Dataset):
    """Dataset variant that rescales the images (only ``__init__`` is shown)."""

    def __init__(self, x, y):
        # Divide by 255 before flattening; assumes raw pixel values in 0..255.
        self.x = (x.float() / 255).view(-1, 28 * 28)
        self.y = y
This is because the sigmoid saturates quickly when the inputs are not normalized, which leaves almost no gradient to learn from
​
Batchsize
A larger batch size reduces the number of weight updates per epoch, so computation is faster but accuracy drops.
Optimizer
Here Adam performs slightly worse than SGD
Learning rate
A high learning rate (0.1) leads to underfitting and poor performance
A low learning rate (1e-5) needs about 100 epochs instead of 10 to yield a slight performance boost
Learning rate
Parameters distribution
So far we only have 4 parameter tensors, 2 per layer (1 weight and 1 bias)
A lower learning rate leads to a wider parameter distribution
A wide parameter distribution leads to overfitting
Non-scaled data
High and medium learning rates fail to learn on unscaled data here
However, the lower learning rate overfits, yet its validation performance is the same as on scaled data with a learning rate of 1e-3. Why is that?
Because this time the weights are much smaller, since a low learning rate only makes small increments
Learning rate annealing
It is best to reduce the learning rate dynamically when the model being trained starts to overfit
If the validation loss doesn’t decrease for x epochs in a row, the learning rate is decreased
Python
Copy
# Halve the learning rate as soon as the validation loss stops improving.
# `optim` is the optimizer instance (as returned by get_model()), so the
# scheduler class is reached through the `torch.optim` module.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optim,
    factor=0.5,  # new_lr = lr * factor
    patience=0,  # reduce after a single epoch without improvement
    threshold=0.001,
    threshold_mode="abs",  # improvement is measured as an absolute difference
    min_lr=1e-5,  # the learning rate is never reduced below this value
    # NOTE(review): `verbose` is deprecated in recent PyTorch releases; drop
    # it if your version warns about it or rejects it.
    verbose=True,
)
...
# Once per epoch, after the validation loss has been computed:
scheduler.step(validation_loss)
Layers
We try two variants: removing the hidden layer (the model becomes a logistic regression) and adding a second hidden layer
Unable to learn with a single layer
According to the validation loss, the model overfits much more with 2 hidden layers
Batch normalization
Deep nets can produce saturating values for the sigmoid and other non-linearities, because of the repeated weight multiplications between hidden layers
When the input is very small, vanishing gradients prevent optimal training
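Batch norm normalizes the values at each node, using the statistics of the current batch $B = \{x_1, \dots, x_m\}$:
$$\mu_B = \frac{1}{m} \sum_{i=1}^{m} x_i$$
$$\sigma_B^2 = \frac{1}{m} \sum_{i=1}^{m} (x_i - \mu_B)^2$$
$$\hat{x}_i = \frac{x_i - \mu_B}{\sqrt{\sigma_B^2 + \epsilon}}$$
$$y_i = \gamma \hat{x}_i + \beta$$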
The network learns the best $\gamma$ (scale) and $\beta$ (shift) parameters during training
Placed right after the linear layer:
Python
Copy
# Excerpt from the nn.Sequential layer list: the batch-norm layer comes
# right after the linear layer and is sized to that layer's output (1000).
nn.Linear(28**2, 1000)
nn.BatchNorm1d(1000)
Avoiding overfitting
Dropout
Randomly sets some activations to 0 at each training iteration, so the network has limited opportunities to overfit
The values are rescaled automatically to account for the dropped units: PyTorch scales the kept activations by 1/(1-p) during training, so nothing needs to be rescaled at prediction time (dropout is inactive in eval mode)
Placed right before the linear layer:
Python
Copy
# Excerpt from the nn.Sequential layer list: dropout with probability 0.25
# is applied to the inputs of the linear layer that follows it.
nn.Dropout(0.25)
nn.Linear(28**2, 1000)
Regularization
Penalizes the model for having large weight values, which lead to overfitting
Resulting parameters distribution is less wide
L1 Loss
$$\mathcal{L}_{L1} = \mathcal{L}(\hat{y}, y) + \lambda \sum_i |w_i|$$
Python
Copy
def train_batch(x, y, model, loss_func, optim):
    """Run one optimization step on a batch, with an L1 penalty on the loss.

    Args:
        x: Batch of inputs fed to ``model``.
        y: Targets passed to ``loss_func`` together with the predictions.
        model: Network to train; it is switched to training mode.
        loss_func: Callable computing the loss from ``(predictions, y)``.
        optim: Optimizer used to update the parameters.

    Returns:
        The penalized batch loss as a Python float.
    """
    model.train()
    y_hat = model(x)
    # Sum of the absolute values of every parameter (weights and biases).
    l1_regularization = 0
    for param in model.parameters():
        l1_regularization += torch.norm(param, 1)
    # 1e-4 weights the penalty against the data loss.
    batch_loss = loss_func(y_hat, y) + 1e-4 * l1_regularization
    batch_loss.backward()
    optim.step()
    # Clear the gradients so they do not accumulate into the next batch.
    optim.zero_grad()
    return batch_loss.item()
L2 Loss
$$\mathcal{L}_{L2} = \mathcal{L}(\hat{y}, y) + \lambda \sum_i w_i^2$$
Python
Copy
# Accumulation line for an L2 penalty, meant to run inside a loop over
# model.parameters(); assumes l2_regularization was initialised to 0 first.
# NOTE: torch.norm(param, 2) is the Euclidean norm of the tensor, not its square.
l2_regularization += torch.norm(param, 2)