
Chapter 2: PyTorch fundamentals

API Overview

torch.tensor behaves like numpy.array
type promotion: all elements are cast to the most general dtype
Python
>>> torch.tensor([False, 1, 2.0])
tensor([0., 1., 2.])
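A quick way to see the promotion rule at work (a small sketch; the dtypes shown are PyTorch's defaults):
Python
torch.tensor([1, 2]).dtype         # torch.int64
torch.tensor([1, 2.0]).dtype       # torch.float32
torch.tensor([False, True]).dtype  # torch.bool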
random tensors
Python
torch.randint(low=0, high=10, size=(3, 4))  # integers in [0, 10)
torch.rand((3, 4))                          # uniform in [0, 1)
torch.randn((3, 4))                         # standard normal
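For reproducible experiments you can seed the generator first (a minimal sketch; the seed value is arbitrary):
Python
torch.manual_seed(42)       # fix the RNG so the calls below are reproducible
a = torch.rand((3, 4))
torch.manual_seed(42)
b = torch.rand((3, 4))
assert torch.equal(a, b)    # same seed, same numbers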
adding and multiplying by a scalar
Python
x = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8]])
print(x * 10)     # elementwise multiplication
y = x.add(10)     # elementwise addition, returns a new tensor
print(y)
reshaping
Python
y = torch.tensor([2, 3, 1, 0])
y = y.view(4, 1)
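view can also infer one dimension when you pass -1 (a small sketch; the shapes are arbitrary):
Python
x = torch.arange(24)
print(x.view(2, -1).shape)   # torch.Size([2, 12]); -1 is inferred
print(x.view(-1, 6).shape)   # torch.Size([4, 6])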
squeeze can also be used for reshaping when the dimension to remove has size 1
Python
x = torch.randn(10, 1, 10)
z1 = torch.squeeze(x, 1)
# or
z2 = x.squeeze(1)
the opposite of squeeze is unsqueeze
Python
x = torch.randn(10, 10)
z1 = x.unsqueeze(0)
print(z1.shape)   # torch.Size([1, 10, 10])
the same can be obtained by indexing with None to add a fake dimension
Python
z2, z3, z4 = x[None], x[:, None], x[:, :, None]
print(z2.shape, z3.shape, z4.shape)
# torch.Size([1, 10, 10])
# torch.Size([10, 1, 10])
# torch.Size([10, 10, 1])
Matrix multiply
Python
torch.matmul(x, y)   # or x @ y
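As in NumPy, the inner dimensions must match: multiplying (m, n) by (n, p) gives (m, p). A quick sketch with arbitrary shapes:
Python
a = torch.randn(2, 3)
b = torch.randn(3, 5)
print((a @ b).shape)   # torch.Size([2, 5])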
Concat
Python
x = torch.randn(10, 10, 10)
z = torch.cat([x, x], axis=0)
print('Cat axis 0:', x.shape, z.shape)
# Cat axis 0:
# torch.Size([10, 10, 10])
# torch.Size([20, 10, 10])
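Concatenating along a different axis grows that axis instead (same sketch, axis 1):
Python
z1 = torch.cat([x, x], axis=1)
print('Cat axis 1:', x.shape, z1.shape)
# Cat axis 1:
# torch.Size([10, 10, 10])
# torch.Size([10, 20, 10])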
Max
Python
x = torch.arange(25).reshape(5, 5)
print(x.max())   # tensor(24)
x.max(dim=0)
# torch.return_types.max(values=tensor([20, 21, 22, 23, 24]),
#                        indices=tensor([4, 4, 4, 4, 4]))
x.max(dim=1)
# values:  tensor([ 4,  9, 14, 19, 24])
# indices: tensor([4, 4, 4, 4, 4])
Permute
⚠️
always use permute, never view, to swap dimensions
Python
x = torch.randn(10, 20, 30)
z = x.permute(2, 0, 1)
print('Permute dimensions:', x.shape, z.shape)
# Permute dimensions:
# torch.Size([10, 20, 30])
# torch.Size([30, 10, 20])
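To see why the warning matters, compare the two on a small tensor: view only reinterprets the same storage order, while permute actually moves the axes (a minimal sketch):
Python
x = torch.arange(6).reshape(2, 3)
print(x.permute(1, 0))   # tensor([[0, 3], [1, 4], [2, 5]]) - a real transpose
print(x.view(3, 2))      # tensor([[0, 1], [2, 3], [4, 5]]) - storage reinterpreted, not a transpose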

Auto gradient of tensor objects

The requires_grad parameter specifies that the gradient is to be calculated for the tensor object
Python
x = torch.tensor([[2., -1.], [1., 1.]], requires_grad=True)
Define our function, the sum of squares of x; its gradient with respect to x is 2 * x
Python
out = x.pow(2).sum()
compute the gradient
Python
out.backward()
We are now in a position to obtain the gradient of out with respect to x
Python
x.grad
# tensor([[ 4., -2.],
#         [ 2.,  2.]])
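A quick sanity check that the computed gradient matches the analytical one, 2 * x (small sketch):
Python
print(torch.allclose(x.grad, 2 * x.detach()))   # True: gradient of sum(x**2) is 2x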

Torch vs NumPy

Register tensor objects on a device, i.e. store the data in GPU memory when one is available
Python
x = torch.rand(1, 6400)
y = torch.rand(6400, 5000)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
x, y = x.to(device), y.to(device)
GPU matmul: fast
Python
%timeit z = (x @ y)
# It takes 0.515 milliseconds on average to
# perform matrix multiplication
CPU matmul: ~20x slower
Python
x, y = x.cpu(), y.cpu()
%timeit z = (x @ y)
# It takes 9 milliseconds on average to
# perform matrix multiplication
NumPy CPU matmul: ~40x slower
Python
import numpy as np

x = np.random.random((1, 6400))
y = np.random.random((6400, 5000))
%timeit z = np.matmul(x, y)
# It takes 19 milliseconds on average to
# perform matrix multiplication

Building a neural net

Define input and output as
Python
x = [[1, 2], [3, 4], [5, 6], [7, 8]]
y = [[3], [7], [11], [15]]
device = "cuda" if torch.cuda.is_available() else "cpu"
X = torch.tensor(x).float().to(device)
Y = torch.tensor(y).float().to(device)
Define the module as
you must call super().__init__() in __init__ and override forward
Python
import torch.nn as nn

class MyModule(nn.Module):
    def __init__(self):
        super().__init__()
        self.input_to_hidden = nn.Linear(2, 8)
        self.activation = nn.ReLU()
        self.hidden_to_class = nn.Linear(8, 1)

    def forward(self, x):
        x = self.input_to_hidden(x)
        x = self.activation(x)
        x = self.hidden_to_class(x)
        return x
Instantiate and check weights
Python
mymodule = MyModule().to(device)
mymodule.input_to_hidden.weight
# Parameter containing:
# tensor([[-0.1474,  0.0559],
#         [ 0.0454,  0.3088],
#         [-0.1240, -0.5073],
#         [-0.3796,  0.6139],
#         [ 0.0052, -0.6587],
#         [ 0.0177,  0.5601],
#         [-0.0376, -0.6641],
#         [-0.6778,  0.6188]], requires_grad=True)
mymodule.parameters()   # generator over all parameters
nn.Parameter can wrap any tensor created in __init__, registering it as a learnable parameter
Python
self.input_to_hidden = nn.Parameter(torch.rand(2, 8))
# and in forward
x = x @ self.input_to_hidden
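Putting it together, a minimal sketch of the same two-layer network written with raw nn.Parameter tensors instead of nn.Linear (the class name, bias names, and initialization are illustrative only):
Python
class MyParamModule(nn.Module):
    def __init__(self):
        super().__init__()
        # weights and biases registered explicitly as learnable parameters
        self.input_to_hidden = nn.Parameter(torch.rand(2, 8))
        self.hidden_bias = nn.Parameter(torch.zeros(8))
        self.hidden_to_class = nn.Parameter(torch.rand(8, 1))
        self.class_bias = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        x = x @ self.input_to_hidden + self.hidden_bias
        x = torch.relu(x)
        x = x @ self.hidden_to_class + self.class_bias
        return x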
Define the loss function that we optimize for
⚠️
PyTorch convention: in the loss function, always pass the prediction first and the ground truth second
Python
loss_func = nn.MSELoss()
# also popular:
# nn.CrossEntropyLoss, for categorical targets
# nn.BCELoss, for binary targets
Y_hat = mymodule(X)
print(loss_func(Y_hat, Y))
A custom loss can also be defined
Python
def my_MAELoss(y_hat, y):
    loss = (y_hat - y).abs()
    loss = loss.mean()
    return loss
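It is used exactly like the built-in losses (a one-line sketch):
Python
print(my_MAELoss(mymodule(X), Y))   # mean absolute error between predictions and targets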
Define optimizer
Python
from torch.optim import SGD
opt = SGD(mymodule.parameters(), lr=0.001)
Wrap-up
reset the gradients at each step with zero_grad
Python
mymodule = MyModule().to(device)
opt = SGD(mymodule.parameters(), lr=0.001)
loss_history = []
for idx in range(n_epoch):          # n_epoch: number of training epochs
    opt.zero_grad()                 # reset gradients accumulated at the previous step
    Y_hat = mymodule(X)
    loss_value = loss_func(Y_hat, Y)
    loss_value.backward()           # compute gradients
    opt.step()                      # update the parameters
    loss_history.append(loss_value.item())

Dataset, DataLoader, and batch size

Batch size is the number of data points used to compute the loss (and hence the gradient) at each step: it should be large enough to be representative of the dataset, yet small enough to stay memory efficient.
Implement the dataset class
Python
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    def __init__(self, x, y):
        self.X = torch.tensor(x).float()
        self.Y = torch.tensor(y).float()

    def __getitem__(self, idx):
        return self.X[idx], self.Y[idx]

    def __len__(self):
        return len(self.X)
Use it with a DataLoader
Python
ds = MyDataset(x, y)
dl = DataLoader(ds, batch_size=2, shuffle=True)
for _ in range(n_epoch):
    for x_batch, y_batch in dl:
        ...
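A minimal sketch of the full batched training loop (assumes mymodule, loss_func, and opt from the previous section, and keeps the placeholder n_epoch name):
Python
loss_history = []
for _ in range(n_epoch):
    for x_batch, y_batch in dl:
        opt.zero_grad()
        y_hat = mymodule(x_batch.to(device))
        loss_value = loss_func(y_hat, y_batch.to(device))
        loss_value.backward()
        opt.step()
        loss_history.append(loss_value.item())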

Inference

Simple as
Python
x_val = [[12, 13]]
X_val = torch.tensor(x_val).float().to(device)
mymodule(X_val)
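For pure inference it is common to switch the module to eval mode and disable gradient tracking, which saves memory and compute (a small sketch of the same call):
Python
mymodule.eval()                  # switch layers such as dropout/batchnorm to eval behaviour
with torch.no_grad():            # no gradient bookkeeping during inference
    prediction = mymodule(X_val)
print(prediction)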

Sequential model

Define the whole model in a single shot
Python
model = nn.Sequential(
    nn.Linear(2, 8),
    nn.ReLU(),
    nn.Linear(8, 1)
).to(device)
Summary of the model, along with its input size
Python
!pip install torch_summary
from torchsummary import summary
summary(model, torch.zeros(1, 2))

Saving and loading model

Model descriptor
Python
model.state_dict()
# OrderedDict([
#     ('input_to_hidden.weight', tensor(...)),
#     ('input_to_hidden.bias', tensor(...)),
#     ...
# ])
Save from the CPU, even if CUDA is available, so the checkpoint can later be loaded on machines without a GPU
Python
torch.save(model.to("cpu").state_dict(), "mymodel.pth")
Load
Create an empty model with the same structure
Python
model = nn.Sequential(
    nn.Linear(2, 8),
    nn.ReLU(),
    nn.Linear(8, 1)
).to(device)
Load the model from disk
Python
state_dict = torch.load("mymodel.pth")
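If the checkpoint was saved from the CPU but you want the weights on another device, torch.load accepts a map_location argument (a small alternative sketch):
Python
state_dict = torch.load("mymodel.pth", map_location=device)   # remap tensors to the target device on load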
Load the state_dict onto the model
Python
model.load_state_dict(state_dict)
model.to(device)
model(torch.tensor(x_val).float().to(device))   # x_val defined in the Inference section