In this post, we will build a multi-layer perceptron (MLP) from scratch, step by step: first with plain NumPy, then again with PyTorch.
# numpy for the array math, matplotlib for plotting
import numpy as np
import matplotlib.pyplot as plt
# Plot the dataset
def plot_data(ax, X, Y):
    ax.axis('off')
    ax.scatter(X[:, 0], X[:, 1], s=1, c=Y, cmap='bone')
from sklearn.datasets import make_moons
X, Y = make_moons(n_samples=2000, noise=0.1)
# bounds of the plotting grid, with a small margin around the data
x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
# Define the grid on which we will evaluate our classifier
xx, yy = np.meshgrid(np.arange(x_min, x_max, .1),
                     np.arange(y_min, y_max, .1))
to_forward = np.array(list(zip(xx.ravel(), yy.ravel())))

# plot the decision boundary of our classifier
def plot_decision_boundary(ax, X, Y, classifier):
    # forward pass on the grid, then convert to numpy for plotting
    Z = classifier.forward(to_forward)
    Z = Z.reshape(xx.shape)
    # plot contour lines of the values of our classifier on the grid
    ax.contourf(xx, yy, Z > 0.5, cmap='Blues')
    # then plot the dataset
    plot_data(ax, X, Y)
class MyReLU(object):
    def forward(self, x):
        # the relu is y_i = max(0, x_i)
        # YOUR CODE HERE
        raise NotImplementedError()

    def backward(self, grad_output):
        # the gradient is 1 for the inputs that were above 0, 0 elsewhere
        # YOUR CODE HERE
        raise NotImplementedError()

    def step(self, learning_rate):
        # no need to do anything here, since ReLU has no parameters
        # YOUR CODE HERE
        raise NotImplementedError()
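For reference, here is one possible implementation of MyReLU (a sketch; your own solution may differ):

class MyReLU(object):
    def forward(self, x):
        # cache the input: backward needs to know where x was positive
        self.x = x
        return np.maximum(0, x)

    def backward(self, grad_output):
        # let the gradient through only where the input was above 0
        return grad_output * (self.x > 0)

    def step(self, learning_rate):
        # ReLU has no parameters, so there is nothing to update
        pass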
class MySigmoid(object):
    def forward(self, x):
        # the sigmoid is y_i = 1./(1+exp(-x_i))
        # YOUR CODE HERE
        raise NotImplementedError()

    def backward(self, grad_output):
        # the partial derivative is e^-x / (e^-x + 1)^2
        # YOUR CODE HERE
        raise NotImplementedError()

    def step(self, learning_rate):
        # no need to do anything here since Sigmoid has no parameters
        # YOUR CODE HERE
        raise NotImplementedError()
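One possible MySigmoid, again only a sketch. Caching the output is convenient because the derivative e^-x / (1 + e^-x)^2 simplifies to y * (1 - y):

class MySigmoid(object):
    def forward(self, x):
        # cache the output for the backward pass
        self.y = 1. / (1. + np.exp(-x))
        return self.y

    def backward(self, grad_output):
        # e^-x / (1 + e^-x)^2 == y * (1 - y)
        return grad_output * self.y * (1. - self.y)

    def step(self, learning_rate):
        # no parameters to update
        pass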
class MyLinear(object):
    def __init__(self, n_input, n_output):
        # initialize two random matrices for W and b (use np.random.randn)
        # YOUR CODE HERE
        raise NotImplementedError()

    def forward(self, x):
        # save a copy of x, you'll need it for the backward
        # return xW + b
        # YOUR CODE HERE
        raise NotImplementedError()

    def backward(self, grad_output):
        # y_i = \sum_j x_j W_{j,i} + b_i
        # d y_i / d W_{j,i} = x_j
        # d loss / d y_i = grad_output[i]
        # so d loss / d W_{j,i} = x_j * grad_output[i] (by the chain rule)
        # YOUR CODE HERE
        raise NotImplementedError()
        # d y_i / d b_i = 1
        # d loss / d y_i = grad_output[i]
        # YOUR CODE HERE
        raise NotImplementedError()
        # now we need to compute the gradient with respect to x
        # to continue the back propagation: d y_i / d x_j = W_{j,i}
        # to compute the gradient of the loss, we have to sum over all
        # possible y_i in the chain rule:
        # d loss / d x_j = \sum_i (d loss / d y_i) (d y_i / d x_j)
        # YOUR CODE HERE
        raise NotImplementedError()

    def step(self, learning_rate):
        # update self.W and self.b in the opposite direction of the
        # stored gradients, for learning_rate
        # YOUR CODE HERE
        raise NotImplementedError()
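A possible MyLinear, assuming the rows of x are examples, so the chain rule above turns into the matrix products below (a sketch, not the only valid layout):

class MyLinear(object):
    def __init__(self, n_input, n_output):
        self.W = np.random.randn(n_input, n_output)
        self.b = np.random.randn(n_output)

    def forward(self, x):
        # save a copy of x for the backward pass
        self.x = x
        return x @ self.W + self.b

    def backward(self, grad_output):
        # d loss / d W_{j,i} = x_j * grad_output[i], as an outer product
        self.grad_W = self.x.T @ grad_output
        # d loss / d b_i = grad_output[i]
        self.grad_b = grad_output.sum(axis=0)
        # d loss / d x_j = \sum_i grad_output[i] * W_{j,i}
        return grad_output @ self.W.T

    def step(self, learning_rate):
        self.W -= learning_rate * self.grad_W
        self.b -= learning_rate * self.grad_b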
class Sequential(object):
    def __init__(self, layers):
        # YOUR CODE HERE
        raise NotImplementedError()

    def forward(self, x):
        # YOUR CODE HERE
        raise NotImplementedError()

    def compute_loss(self, out, label):
        # use the BCE loss:
        # -(label * log(output) + (1-label) * log(1-output))
        # save the gradient, and return the loss
        # beware of dividing by zero in the gradient:
        # split the computation in two cases, one where the label is 0
        # and another one where the label is 1,
        # and add a small value (1e-10) to the denominator
        # YOUR CODE HERE
        raise NotImplementedError()

    def backward(self):
        # apply backprop sequentially, starting from the gradient of the loss
        # YOUR CODE HERE
        raise NotImplementedError()

    def step(self, learning_rate):
        # take a gradient step for each layer
        # YOUR CODE HERE
        raise NotImplementedError()
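One way to implement Sequential. The compute_loss below stores d loss / d output in self.grad_loss, an attribute name chosen here for illustration, so that backward can start from it:

class Sequential(object):
    def __init__(self, layers):
        self.layers = layers

    def forward(self, x):
        for layer in self.layers:
            x = layer.forward(x)
        return x

    def compute_loss(self, out, label):
        # BCE: -(label * log(out) + (1-label) * log(1-out)),
        # split on the label, with 1e-10 to avoid dividing by zero
        if label == 0:
            self.grad_loss = 1. / (1. - out + 1e-10)
            return float(-np.log(1. - out + 1e-10))
        else:
            self.grad_loss = -1. / (out + 1e-10)
            return float(-np.log(out + 1e-10))

    def backward(self):
        # apply backprop sequentially, starting from the loss gradient
        grad = self.grad_loss
        for layer in reversed(self.layers):
            grad = layer.backward(grad)

    def step(self, learning_rate):
        for layer in self.layers:
            layer.step(learning_rate)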
h = 50
# define your network with your Sequential:
# it should be a linear layer with 2 inputs and h outputs, followed by a ReLU,
# then a linear layer with h inputs and 1 output, followed by a sigmoid
# feel free to try other architectures
# YOUR CODE HERE
raise NotImplementedError()
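With the classes sketched above, the suggested 2 → h → 1 architecture could be written as:

net = Sequential([
    MyLinear(2, h),
    MyReLU(),
    MyLinear(h, 1),
    MySigmoid(),
])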
# unfortunately the animation does not work on Colab;
# comment out the following line if you are on Colab
%matplotlib notebook
fig, ax = plt.subplots(1, 1, facecolor='#4B6EA9')
ax.set_xlim(x_min, x_max)
ax.set_ylim(y_min, y_max)
losses = []
learning_rate = 1e-2
for it in range(10000):
    # pick a random example id (starting from 0, so every example can be drawn)
    j = np.random.randint(0, len(X))
    # select the corresponding example and label
    example = X[j:j+1]
    label = Y[j]
    # do a forward pass on the example
    # YOUR CODE HERE
    raise NotImplementedError()
    # compute the loss according to your output and the label
    # YOUR CODE HERE
    raise NotImplementedError()
    # backward pass
    # YOUR CODE HERE
    raise NotImplementedError()
    # gradient step
    # YOUR CODE HERE
    raise NotImplementedError()
    # draw the current decision boundary every 250 examples seen
    if it % 250 == 0:
        plot_decision_boundary(ax, X, Y, net)
        fig.canvas.draw()
plot_decision_boundary(ax, X, Y, net)
fig.canvas.draw()
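For reference, the four placeholders inside the loop could be filled in like this, using the Sequential API sketched above and appending to losses so the loss curve below has data:

out = net.forward(example)            # forward pass
loss = net.compute_loss(out, label)   # BCE loss; also stores its gradient
losses.append(loss)
net.backward()                        # backprop through every layer
net.step(learning_rate)               # SGD update on W and b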
# switch back to inline plotting and show the training loss curve
%matplotlib inline
plt.plot(losses)
import torch
import torch.nn as nn

# y = xW + b
class MyLinear_mod(nn.Module):
    def __init__(self, n_input, n_output):
        super(MyLinear_mod, self).__init__()
        # define self.W and self.b, the weights and biases;
        # initialize them with a normal distribution,
        # and use nn.Parameter so the optimizer can find them
        # YOUR CODE HERE
        raise NotImplementedError()

    def forward(self, x):
        # YOUR CODE HERE
        raise NotImplementedError()
class MyReLU_mod(nn.Module):
    def __init__(self):
        super(MyReLU_mod, self).__init__()

    def forward(self, x):
        # YOUR CODE HERE
        raise NotImplementedError()
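Possible implementations of the two modules (sketches only). Wrapping the tensors in nn.Parameter is what registers them, so net.parameters() can hand them to the optimizer:

class MyLinear_mod(nn.Module):
    def __init__(self, n_input, n_output):
        super(MyLinear_mod, self).__init__()
        # nn.Parameter registers W and b with the module
        self.W = nn.Parameter(torch.randn(n_input, n_output))
        self.b = nn.Parameter(torch.randn(n_output))

    def forward(self, x):
        return x @ self.W + self.b

class MyReLU_mod(nn.Module):
    def __init__(self):
        super(MyReLU_mod, self).__init__()

    def forward(self, x):
        # clamp everything below 0 to 0
        return torch.clamp(x, min=0)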
The next section defines the network using MyLinear_mod, MyReLU_mod, and nn.Sigmoid.
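That section is not reproduced here, but assuming the same 2 → h → 1 architecture as the NumPy version, it could look like:

net = nn.Sequential(
    MyLinear_mod(2, h),
    MyReLU_mod(),
    MyLinear_mod(h, 1),
    nn.Sigmoid(),
)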
from torch import optim
optimizer = optim.SGD(net.parameters(), lr=1e-2)
X_torch = torch.from_numpy(X).float()
Y_torch = torch.from_numpy(Y).float()
# comment out the following line if you are on Colab
%matplotlib notebook
fig, ax = plt.subplots(1, 1, facecolor='#4B6EA9')
ax.set_xlim(x_min, x_max)
ax.set_ylim(y_min, y_max)
losses = []
criterion = nn.BCELoss()
for it in range(10000):
    # pick a random example id (starting from 0, so every example can be drawn)
    j = np.random.randint(0, len(X))
    # select the corresponding example and label
    example = X_torch[j:j+1]
    label = Y_torch[j:j+1].unsqueeze(1)
    # do a forward pass on the example
    # YOUR CODE HERE
    raise NotImplementedError()
    # compute the loss according to your output and the label
    # YOUR CODE HERE
    raise NotImplementedError()
    # zero the gradients
    # YOUR CODE HERE
    raise NotImplementedError()
    # backward pass
    # YOUR CODE HERE
    raise NotImplementedError()
    # gradient step
    # YOUR CODE HERE
    raise NotImplementedError()
    # draw the current decision boundary every 250 examples seen
    if it % 250 == 0:
        plot_decision_boundary(ax, X, Y, net)
        fig.canvas.draw()
plot_decision_boundary(ax, X, Y, net)
fig.canvas.draw()
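For reference, the five placeholders inside the loop could be filled in like this:

out = net(example)             # forward pass
loss = criterion(out, label)   # BCE loss
losses.append(loss.item())
optimizer.zero_grad()          # zero the gradients
loss.backward()                # backward pass
optimizer.step()               # gradient step

Note that plot_decision_boundary calls classifier.forward on a NumPy array; for the torch network you would need to pass a tensor and detach the output before plotting, e.g. net(torch.from_numpy(to_forward).float()).detach().numpy().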
# switch back to inline plotting and show the training loss curve
%matplotlib inline
plt.plot(losses)