How to Train a Multilayer Perceptron in PyTorch

Multilayer Perceptrons (MLPs) are fundamental neural network architectures that can solve complex problems through their ability to learn non-linear relationships. In this guide, we will walk through the complete process of implementing and training an MLP using PyTorch, one of the most popular deep learning frameworks.

Understanding Multilayer Perceptrons

An MLP consists of at least three layers: an input layer, one or more hidden layers, and an output layer. Each layer contains neurons (or nodes) that are fully connected to the next layer's neurons. This architecture allows MLPs to learn complex patterns in data.

First, let's ensure we have the necessary libraries:

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
import numpy as np

Preparing the Data

For demonstration, let's create a synthetic classification dataset:

# Create synthetic data
def generate_data(n_samples=1000, seed=42, train_fraction=0.8):
    """Build a synthetic 2-D binary classification dataset and split it.

    Points are drawn from a standard normal distribution. A point is labelled
    1.0 when ``x0**2 + x1**2 > 1.5`` and 0.0 otherwise, so the two classes are
    separated by a circle and cannot be split by a straight line.

    Args:
        n_samples: Total number of points to generate.
        seed: Seed for a private random generator. The default (42) yields the
            same samples as calling ``np.random.seed(42)`` followed by
            ``np.random.randn``, but leaves NumPy's global RNG untouched.
        train_fraction: Fraction of the samples, taken from the front in
            generation order, that forms the training split.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)`` of float32 tensors.
        Feature tensors have shape ``(n, 2)``; label tensors ``(n, 1)``.

    Raises:
        ValueError: If ``train_fraction`` is not within ``[0, 1]``.
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError(
            f"train_fraction must be within [0, 1], got {train_fraction!r}"
        )

    # A local generator keeps the data reproducible without reseeding the
    # process-wide NumPy RNG that unrelated code may rely on.
    rng = np.random.RandomState(seed)
    X = rng.randn(n_samples, 2)

    # Non-linear (circular) decision boundary
    y = (X[:, 0] ** 2 + X[:, 1] ** 2 > 1.5).astype(np.float32)

    # Convert to float32 tensors; labels become a column vector so they line
    # up with a single-unit model output.
    X_tensor = torch.tensor(X, dtype=torch.float32)
    y_tensor = torch.tensor(y, dtype=torch.float32).reshape(-1, 1)

    # Split into train and test
    train_size = int(train_fraction * n_samples)
    X_train, X_test = X_tensor[:train_size], X_tensor[train_size:]
    y_train, y_test = y_tensor[:train_size], y_tensor[train_size:]

    return X_train, y_train, X_test, y_test

# Build the four splits once; everything below reuses these tensors.
X_train, y_train, X_test, y_test = generate_data(n_samples=1000)

# Wrap each split in a dataset and serve it in mini-batches of 32.
# Only the training batches are reshuffled each epoch; test order stays fixed.
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

Defining the MLP Model

Now, let's define our MLP architecture:

class MLP(nn.Module):
    """Fully connected feed-forward network with a sigmoid output.

    The network is a stack of ``Linear -> ReLU -> Dropout`` blocks, one per
    entry of ``hidden_sizes``, followed by a final ``Linear -> Sigmoid``.
    With an empty ``hidden_sizes`` it reduces to a single ``Linear -> Sigmoid``
    (logistic regression).

    Args:
        input_size: Number of input features.
        hidden_sizes: Sequence with the width of each hidden layer, in order.
        output_size: Number of output units.
        dropout_rate: Dropout probability applied after every hidden ReLU.
    """

    def __init__(self, input_size, hidden_sizes, output_size, dropout_rate=0.2):
        super().__init__()

        layers = []
        in_features = input_size

        # One Linear/ReLU/Dropout block per hidden width. Tracking the running
        # input width lets the same loop handle the first layer, any number of
        # further hidden layers, and the empty case.
        for width in hidden_sizes:
            layers.extend(
                [
                    nn.Linear(in_features, width),
                    nn.ReLU(),
                    nn.Dropout(dropout_rate),
                ]
            )
            in_features = width

        # Output layer; sigmoid squashes to (0, 1) for binary classification
        layers.append(nn.Linear(in_features, output_size))
        layers.append(nn.Sigmoid())

        # Combine all layers into a sequential model
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the sigmoid outputs."""
        return self.model(x)

Model Initialization

Let's instantiate our model with specific parameters:

# Model hyperparameters
input_size = 2            # each sample has two features
hidden_sizes = [64, 32]   # widths of the two hidden layers
output_size = 1           # a single sigmoid unit for the binary label

# Build the network and print its layer structure
model = MLP(
    input_size=input_size,
    hidden_sizes=hidden_sizes,
    output_size=output_size,
)
print(model)

# Binary cross-entropy on the sigmoid outputs, optimised with Adam
criterion = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

Training the Model

Now for the actual training process:

def train_model(model, train_loader, criterion, optimizer, num_epochs=100):
    """Optimise ``model`` on ``train_loader`` for ``num_epochs`` epochs.

    Args:
        model: Network to train; it is switched to training mode every epoch.
        train_loader: Iterable of ``(inputs, targets)`` batches that also
            exposes ``.dataset`` (its length normalises the epoch loss).
        criterion: Callable returning the mean loss of a batch.
        optimizer: Optimizer bound to the parameters of ``model``.
        num_epochs: Number of full passes over ``train_loader``.

    Returns:
        List with one sample-weighted average training loss per epoch.
    """

    def optimisation_step(batch_inputs, batch_targets):
        # One gradient update; returns the batch loss scaled by batch size so
        # that a smaller final batch is weighted correctly in the epoch mean.
        optimizer.zero_grad()
        batch_loss = criterion(model(batch_inputs), batch_targets)
        batch_loss.backward()
        optimizer.step()
        return batch_loss.item() * batch_inputs.size(0)

    train_losses = []
    for epoch in range(num_epochs):
        model.train()  # enable dropout for the training pass
        summed_loss = sum(
            optimisation_step(batch_inputs, batch_targets)
            for batch_inputs, batch_targets in train_loader
        )
        epoch_loss = summed_loss / len(train_loader.dataset)
        train_losses.append(epoch_loss)

        # Report every tenth epoch
        if (epoch + 1) % 10 == 0:
            print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')

    return train_losses

# Run 100 epochs of training and keep the per-epoch loss history
train_losses = train_model(
    model=model,
    train_loader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=100,
)

This is what we get when we train the model and plot the training and validation loss:

Evaluating the Model

After training, we need to evaluate our model's performance:

def evaluate_model(model, test_loader, criterion):
    """Compute the average loss and accuracy of ``model`` on ``test_loader``.

    The model is put into evaluation mode and left there; gradients are not
    tracked during the pass.

    Args:
        model: Network whose outputs are compared against a 0.5 threshold.
        test_loader: Iterable of ``(inputs, targets)`` batches that also
            exposes ``.dataset`` (its length normalises the loss).
        criterion: Callable returning the mean loss of a batch.

    Returns:
        Tuple ``(avg_loss, accuracy)``: the sample-weighted mean loss and the
        fraction of predictions equal to their target.
    """
    model.eval()  # disables dropout

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch_inputs, batch_targets in test_loader:
            probabilities = model(batch_inputs)

            # Undo the per-batch mean so a short last batch is weighted fairly
            batch_loss = criterion(probabilities, batch_targets)
            loss_sum += batch_loss.item() * batch_inputs.size(0)

            # An output above 0.5 counts as class 1
            hits = (probabilities > 0.5).float() == batch_targets
            n_correct += hits.sum().item()
            n_seen += batch_targets.size(0)

    return loss_sum / len(test_loader.dataset), n_correct / n_seen

# Score the trained model on the held-out split and report both metrics
test_loss, test_accuracy = evaluate_model(
    model=model,
    test_loader=test_loader,
    criterion=criterion,
)
print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

Visualizing Results

Let's create visualizations to better understand our model:

# Plot the per-epoch training loss on a single 10x5 inch axes
_, loss_ax = plt.subplots(figsize=(10, 5))
loss_ax.plot(train_losses, label='Training Loss')
loss_ax.set_title('Training Loss Over Epochs')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.legend()
loss_ax.grid(True)
plt.show()

# Visualize decision boundary
def plot_decision_boundary(model, X, y, h=0.01):
    """Plot the model's predicted probability surface with the data on top.

    The model is evaluated on a regular grid spanning the data range plus a
    0.5 margin on every side. The grid prediction runs in evaluation mode and
    without gradient tracking; the model's previous training/eval mode is
    restored afterwards.

    Args:
        model: Network mapping a ``(n, 2)`` float tensor to ``(n, 1)`` outputs.
        X: NumPy array of shape ``(n, 2)`` with the points to scatter.
        y: NumPy array of length ``n`` used to colour the points.
        h: Grid step size. Smaller values give a smoother plot at the cost of
            more model evaluations.
    """
    # Set min and max values with some margin
    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5

    # Create a mesh grid and flatten it into one (n_points, 2) batch
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    grid = torch.tensor(np.c_[xx.ravel(), yy.ravel()], dtype=torch.float32)

    # Dropout must be off, otherwise the plotted surface is randomly perturbed
    # whenever the model is still in training mode. no_grad avoids building an
    # autograd graph over the (potentially very large) grid.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            Z = model(grid).numpy().reshape(xx.shape)
    finally:
        model.train(was_training)

    # Plot the contour and data points
    plt.figure(figsize=(10, 8))
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral, alpha=0.8)
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Spectral)
    plt.title('Decision Boundary')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.show()

# Stack the train and test splits back together as NumPy arrays;
# the labels are flattened to 1-D so they can be used as scatter colours.
X_all = torch.cat([X_train, X_test], dim=0).numpy()
y_all = torch.cat([y_train, y_test], dim=0).numpy().flatten()

# Draw the learned boundary over every sample
plot_decision_boundary(model, X=X_all, y=y_all)

Saving and Loading the Model

To use your trained model later, save its state dict to disk and load it back into a freshly constructed model:

# Save the model: only the learned parameters (the state dict) are written
torch.save(model.state_dict(), 'mlp_model.pth')

# Load the model (for later use): rebuild the same architecture first, then
# restore the saved parameters into it.
loaded_model = MLP(input_size, hidden_sizes, output_size)
# torch.load unpickles the file. weights_only=True (PyTorch >= 1.13) restricts
# unpickling to tensors and plain containers, so a tampered checkpoint cannot
# execute arbitrary code; a state dict needs nothing more than that.
loaded_model.load_state_dict(torch.load('mlp_model.pth', weights_only=True))
loaded_model.eval()  # Set to evaluation mode (disables dropout)

Advanced Techniques for Improving MLP Performance

Learning Rate Scheduling

# Learning rate scheduler: halve the learning rate once the monitored value
# (to be minimised) has gone more than 5 epochs without improving
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=5,
)

# Modify training loop to use scheduler
def train_with_scheduler(model, train_loader, criterion, optimizer, scheduler, num_epochs=100):
    """Train ``model`` and step ``scheduler`` with each epoch's training loss.

    Args:
        model: Network to train; it is switched to training mode every epoch.
        train_loader: Iterable of ``(inputs, targets)`` batches that also
            exposes ``.dataset`` (its length normalises the epoch loss).
        criterion: Callable returning the mean loss of a batch.
        optimizer: Optimizer bound to the parameters of ``model``.
        scheduler: Object whose ``step(metric)`` is called once per epoch with
            that epoch's average training loss (e.g. ``ReduceLROnPlateau``).
        num_epochs: Number of full passes over ``train_loader``.

    Returns:
        List with one sample-weighted average training loss per epoch.
    """
    train_losses = []

    for epoch in range(num_epochs):
        model.train()  # Set model to training mode
        running_loss = 0.0

        for inputs, targets in train_loader:
            optimizer.zero_grad()
            loss = criterion(model(inputs), targets)
            loss.backward()
            optimizer.step()

            # Weight by batch size so a short last batch is averaged fairly
            running_loss += loss.item() * inputs.size(0)

        epoch_loss = running_loss / len(train_loader.dataset)
        train_losses.append(epoch_loss)

        # Step once per epoch, after the optimizer updates, with the value
        # the scheduler is supposed to monitor.
        scheduler.step(epoch_loss)

        if (epoch + 1) % 10 == 0:
            print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')

    return train_losses

Weight Initialization

# Custom weight initialization
def init_weights(m):
    """Initialise a linear layer in place; leave every other module untouched.

    Intended for ``Module.apply``, which calls it once per submodule.
    Weights get Xavier/Glorot uniform initialisation and the bias, when the
    layer has one, is set to the constant 0.01.

    Args:
        m: Any ``nn.Module``; only ``nn.Linear`` instances are modified.
    """
    # isinstance also covers subclasses of nn.Linear, which an exact type
    # comparison would silently skip.
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        # Layers built with bias=False have m.bias set to None
        if m.bias is not None:
            nn.init.constant_(m.bias, 0.01)

# Apply to model: Module.apply calls init_weights on every submodule.
# Do this BEFORE training -- init_weights overwrites the weight and bias of
# each Linear layer, so running it on an already-trained model discards the
# learned parameters.
model.apply(init_weights)

Early Stopping

def train_with_early_stopping(model, train_loader, val_loader, criterion, optimizer, patience=10, num_epochs=100, checkpoint_path='best_model.pth'):
    """Train ``model`` and stop once the validation loss stops improving.

    After every training epoch the model is scored on ``val_loader`` with
    ``evaluate_model``. Whenever the validation loss reaches a new minimum the
    current parameters are written to ``checkpoint_path``. Training ends early
    after ``patience`` consecutive epochs without a new minimum.

    The model is left with the parameters of the last epoch run; load
    ``checkpoint_path`` to recover the best ones.

    Args:
        model: Network to train.
        train_loader: Iterable of ``(inputs, targets)`` training batches that
            also exposes ``.dataset``.
        val_loader: Loader passed to ``evaluate_model`` for validation.
        criterion: Callable returning the mean loss of a batch.
        optimizer: Optimizer bound to the parameters of ``model``.
        patience: Number of consecutive non-improving epochs tolerated.
        num_epochs: Upper bound on the number of epochs.
        checkpoint_path: File that receives the best state dict.

    Returns:
        Tuple ``(train_losses, val_losses)`` with one entry per epoch run.
    """
    best_val_loss = float('inf')
    counter = 0
    train_losses = []
    val_losses = []

    for epoch in range(num_epochs):
        # evaluate_model leaves the model in eval mode, so training mode has
        # to be re-enabled at the start of every epoch.
        model.train()
        running_loss = 0.0

        for inputs, targets in train_loader:
            optimizer.zero_grad()
            loss = criterion(model(inputs), targets)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)

        train_losses.append(running_loss / len(train_loader.dataset))

        # Validation after each epoch
        val_loss, _ = evaluate_model(model, val_loader, criterion)
        val_losses.append(val_loss)

        # Check if validation loss improved
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            counter = 0
            # Save best model
            torch.save(model.state_dict(), checkpoint_path)
        else:
            counter += 1

        # Check early stopping condition
        if counter >= patience:
            print(f'Early stopping at epoch {epoch+1}')
            break

    return train_losses, val_losses

Multilayer Perceptrons provide a powerful foundation for understanding neural networks. Through this guide, we've covered setting up, training, evaluating, and improving MLPs using PyTorch. These concepts form the basis for more complex deep learning architectures.