%%capture
!uv pip install pandas safetensors torch scipy tqdm torchvision torch torchinfo timm tensorboard

import os, inspect

from datetime import datetime
from typing import Literal

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm
from PIL import Image
from torchinfo import summary
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader, Dataset, random_split, Subset

from safetensors.torch import save_model, load_model

%load_ext autoreload
%matplotlib inline
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

csv_path = 'test.csv'
df = pd.read_csv(csv_path)

print(f'Total samples in CSV: {len(df)}')


# Function to reverse the normalization
def unnormalize(tensor, mean, std):
  for t, m, s in zip(tensor, mean, std):
    t.mul_(s).add_(m)
  return tensor


CIFAR100_MEAN = (0.5071, 0.4867, 0.4408)
CIFAR100_STD = (0.2675, 0.2565, 0.2761)

# Select a few samples to display (e.g., first 5)
num_samples = 5
samples = df.sample(n=num_samples)  # Randomly select samples for diversity

# Iterate through the selected samples and display images
for index, row in samples.iterrows():
  # Extract pixel data and convert to numpy array
  pixel_data = row[[f'pixel_{i}' for i in range(1, 3073)]].values.astype(np.float32)

  # Reshape to (3, 32, 32)
  image = pixel_data.reshape(3, 32, 32)

  # Convert to torch tensor
  image_tensor = torch.tensor(image)

  # Reverse the normalization
  image_tensor = unnormalize(image_tensor, CIFAR100_MEAN, CIFAR100_STD)

  # Clip the values to [0, 1] range
  image_tensor = torch.clamp(image_tensor, 0, 1)

  # Convert to numpy array and transpose to (32, 32, 3) for plotting
  image_np = image_tensor.numpy().transpose(1, 2, 0)

  # Convert to PIL Image for better handling (optional)
  image_pil = Image.fromarray((image_np * 255).astype(np.uint8))

  # Display the image using matplotlib
  plt.figure(figsize=(2, 2))
  plt.imshow(image_pil)
  plt.axis('off')

Total samples in CSV: 10000

# Define CIFAR-100 mean and std
CIFAR100_MEAN = (0.5071, 0.4867, 0.4408)
CIFAR100_STD = (0.2675, 0.2565, 0.2761)

# Hyperparameters
num_epochs = 30
lr = 0.001
weight_decay = 1e-4
batch_size = 128
model_prefix = f'efficientnet_v2_{lr}_{num_epochs}'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

ncols = 100

# CIFAR-100 dataset (download and create DataLoader)
def get_dataloaders(batch_size):
  transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(CIFAR100_MEAN, CIFAR100_STD),
  ])

  transform_test = transforms.Compose([transforms.ToTensor(), transforms.Normalize(CIFAR100_MEAN, CIFAR100_STD)])

  train_dataset = torchvision.datasets.CIFAR100(root='./data', train=True, download=True, transform=transform_train)
  train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)

  test_dataset = torchvision.datasets.CIFAR100(root='./data', train=False, download=True, transform=transform_test)
  test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

  return train_loader, test_loader


def _holistic_patch(model, num_features=100):
  model.classifier[1] = nn.Linear(model.classifier[1].in_features, num_features)


# Load EfficientNetV2 model
def init_model(variants: Literal['S', 'M', 'L'] = 'S', patch=_holistic_patch):
  if variants == 'S':
    model = models.efficientnet_v2_s(weights=models.EfficientNet_V2_S_Weights.DEFAULT)
  elif variants == 'M':
    model = models.efficientnet_v2_m(weights=models.EfficientNet_V2_M_Weights.DEFAULT)
  elif variants == 'L':
    model = models.efficientnet_v2_l(weights=models.EfficientNet_V2_L_Weights.DEFAULT)
  patch(model)
  model.variants = variants
  model = model.to(device)
  return model


# Load model if exists
def load_checkpoint(filepath, model=None, variants='S'):
  if model is None:
    model = init_model(variants)
  load_model(model, filepath)
  model.eval()
  return model


# Save model to safetensors
def save_checkpoint(model, accuracy, model_prefix, basedir='./model'):
  timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
  os.makedirs(basedir, exist_ok=True)

  variants = 'default'
  if hasattr(model, 'variants'):
    variants = model.variants

  filepath = os.path.join(basedir, f'{model_prefix}_{variants}_{accuracy:.2f}_{timestamp}.safetensors')
  save_model(model, filepath)
  print(f'Model checkpoint saved to {filepath}.')

# Train the model
def train(model, train_loader, criterion, optimizer, scheduler, num_epochs, *, ncols=100):
  best_accuracy = 0.0
  train_losses = []
  train_accuracies = []

  for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    with tqdm(enumerate(train_loader), total=len(train_loader), ncols=ncols) as bar:
      for i, (images, labels) in bar:
        images, labels = images.to(device), labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        bar.set_description(f'Epoch [{epoch + 1}/{num_epochs}]')
        bar.set_postfix(loss=loss.item())

    scheduler.step()
    epoch_loss = running_loss / len(train_loader)
    epoch_acc = 100 * correct / total
    train_losses.append(epoch_loss)
    train_accuracies.append(epoch_acc)
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.2f}%')

    # Evaluate the model on test set after each epoch
    test_acc = evaluate(model, test_loader)
    if test_acc > best_accuracy:
      best_accuracy = test_acc
      save_checkpoint(model, best_accuracy, model_prefix)

  # Plotting training history
  plot_training_history(train_losses, train_accuracies)


# Evaluate the model
def evaluate(model, test_loader):
  model.eval()
  correct = 0
  total = 0
  with torch.no_grad():
    for images, labels in test_loader:
      images, labels = images.to(device), labels.to(device)
      outputs = model(images)
      _, predicted = torch.max(outputs.data, 1)
      total += labels.size(0)
      correct += (predicted == labels).sum().item()

  accuracy = 100 * correct / total
  print(f'Test Accuracy: {accuracy:.2f}%')
  return accuracy


# Plot training history
def plot_training_history(train_losses, train_accuracies):
  plt.figure(figsize=(12, 5))

  # Plot training loss
  plt.subplot(1, 2, 1)
  plt.plot(train_losses, label='Training Loss')
  plt.xlabel('Epoch')
  plt.ylabel('Loss')
  plt.title('Training Loss over Epochs')
  plt.legend()

  # Plot training accuracy
  plt.subplot(1, 2, 2)
  plt.plot(train_accuracies, label='Training Accuracy')
  plt.xlabel('Epoch')
  plt.ylabel('Accuracy (%)')
  plt.title('Training Accuracy over Epochs')
  plt.legend()

  plt.show()


def validations(model, test_loader, classes, num_examples=16):
  model.eval()
  SAMPLES, PREDS, LABELS = [], [], []

  with torch.no_grad():
    for _ in range(num_examples):
      idx = np.random.randint(len(test_loader.dataset))
      sample_image, actual_label = test_loader.dataset[idx]
      sample_image = sample_image.unsqueeze(0).to(device)
      SAMPLES.append(sample_image.squeeze(0))
      LABELS.append(actual_label)

      output = F.softmax(model(sample_image), dim=-1)
      pred_values, pred_labels = output.max(-1)
      PREDS.append(round(float(pred_values), 4))
      LABELS.append(int(pred_labels))

  fig, ax = plt.subplots(nrows=4, ncols=4, figsize=(21, 19))
  i = 0
  for R in range(4):
    for C in range(4):
      image_np = SAMPLES[i].cpu().numpy().transpose(1, 2, 0)
      image_np = image_np * np.array((0.2675, 0.2565, 0.2761)) + np.array((0.5071, 0.4867, 0.4408))  # Unnormalize
      image_np = np.clip(image_np, 0, 1)
      ax[R, C].imshow(image_np)
      ax[R, C].set_title('Actual: ' + classes[LABELS[i]], fontsize=16).set_color('k')
      ax[R, C].set_ylabel(PREDS[i], fontsize=16, rotation=0, labelpad=25).set_color('m')
      if LABELS[i] == LABELS[i]:
        ax[R, C].set_xlabel('Predicted: ' + classes[LABELS[i]], fontsize=16).set_color('b')
      else:
        ax[R, C].set_xlabel('Predicted: ' + classes[LABELS[i]], fontsize=16).set_color('r')
      ax[R, C].set_xticks([])
      ax[R, C].set_yticks([])
      i += 1

  plt.show()

# Training parameters
train_loader, test_loader = get_dataloaders(batch_size)
model = init_model()

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=lr)
scheduler.step(epoch + i / len(train_loader))

Files already downloaded and verified
Files already downloaded and verified

summary(model, input_size=(32, 3, 224, 224))

=========================================================================================================
Layer (type:depth-idx)                                  Output Shape              Param #
=========================================================================================================
EfficientNet                                            [32, 100]                 --
├─Sequential: 1-1                                       [32, 1280, 7, 7]          --
│    └─Conv2dNormActivation: 2-1                        [32, 24, 112, 112]        --
│    │    └─Conv2d: 3-1                                 [32, 24, 112, 112]        648
│    │    └─BatchNorm2d: 3-2                            [32, 24, 112, 112]        48
│    │    └─SiLU: 3-3                                   [32, 24, 112, 112]        --
│    └─Sequential: 2-2                                  [32, 24, 112, 112]        --
│    │    └─FusedMBConv: 3-4                            [32, 24, 112, 112]        5,232
│    │    └─FusedMBConv: 3-5                            [32, 24, 112, 112]        5,232
│    └─Sequential: 2-3                                  [32, 48, 56, 56]          --
│    │    └─FusedMBConv: 3-6                            [32, 48, 56, 56]          25,632
│    │    └─FusedMBConv: 3-7                            [32, 48, 56, 56]          92,640
│    │    └─FusedMBConv: 3-8                            [32, 48, 56, 56]          92,640
│    │    └─FusedMBConv: 3-9                            [32, 48, 56, 56]          92,640
│    └─Sequential: 2-4                                  [32, 64, 28, 28]          --
│    │    └─FusedMBConv: 3-10                           [32, 64, 28, 28]          95,744
│    │    └─FusedMBConv: 3-11                           [32, 64, 28, 28]          164,480
│    │    └─FusedMBConv: 3-12                           [32, 64, 28, 28]          164,480
│    │    └─FusedMBConv: 3-13                           [32, 64, 28, 28]          164,480
│    └─Sequential: 2-5                                  [32, 128, 14, 14]         --
│    │    └─MBConv: 3-14                                [32, 128, 14, 14]         61,200
│    │    └─MBConv: 3-15                                [32, 128, 14, 14]         171,296
│    │    └─MBConv: 3-16                                [32, 128, 14, 14]         171,296
│    │    └─MBConv: 3-17                                [32, 128, 14, 14]         171,296
│    │    └─MBConv: 3-18                                [32, 128, 14, 14]         171,296
│    │    └─MBConv: 3-19                                [32, 128, 14, 14]         171,296
│    └─Sequential: 2-6                                  [32, 160, 14, 14]         --
│    │    └─MBConv: 3-20                                [32, 160, 14, 14]         281,440
│    │    └─MBConv: 3-21                                [32, 160, 14, 14]         397,800
│    │    └─MBConv: 3-22                                [32, 160, 14, 14]         397,800
│    │    └─MBConv: 3-23                                [32, 160, 14, 14]         397,800
│    │    └─MBConv: 3-24                                [32, 160, 14, 14]         397,800
│    │    └─MBConv: 3-25                                [32, 160, 14, 14]         397,800
│    │    └─MBConv: 3-26                                [32, 160, 14, 14]         397,800
│    │    └─MBConv: 3-27                                [32, 160, 14, 14]         397,800
│    │    └─MBConv: 3-28                                [32, 160, 14, 14]         397,800
│    └─Sequential: 2-7                                  [32, 256, 7, 7]           --
│    │    └─MBConv: 3-29                                [32, 256, 7, 7]           490,152
│    │    └─MBConv: 3-30                                [32, 256, 7, 7]           1,005,120
│    │    └─MBConv: 3-31                                [32, 256, 7, 7]           1,005,120
│    │    └─MBConv: 3-32                                [32, 256, 7, 7]           1,005,120
│    │    └─MBConv: 3-33                                [32, 256, 7, 7]           1,005,120
│    │    └─MBConv: 3-34                                [32, 256, 7, 7]           1,005,120
│    │    └─MBConv: 3-35                                [32, 256, 7, 7]           1,005,120
│    │    └─MBConv: 3-36                                [32, 256, 7, 7]           1,005,120
│    │    └─MBConv: 3-37                                [32, 256, 7, 7]           1,005,120
│    │    └─MBConv: 3-38                                [32, 256, 7, 7]           1,005,120
│    │    └─MBConv: 3-39                                [32, 256, 7, 7]           1,005,120
│    │    └─MBConv: 3-40                                [32, 256, 7, 7]           1,005,120
│    │    └─MBConv: 3-41                                [32, 256, 7, 7]           1,005,120
│    │    └─MBConv: 3-42                                [32, 256, 7, 7]           1,005,120
│    │    └─MBConv: 3-43                                [32, 256, 7, 7]           1,005,120
│    └─Conv2dNormActivation: 2-8                        [32, 1280, 7, 7]          --
│    │    └─Conv2d: 3-44                                [32, 1280, 7, 7]          327,680
│    │    └─BatchNorm2d: 3-45                           [32, 1280, 7, 7]          2,560
│    │    └─SiLU: 3-46                                  [32, 1280, 7, 7]          --
├─AdaptiveAvgPool2d: 1-2                                [32, 1280, 1, 1]          --
├─Sequential: 1-3                                       [32, 100]                 --
│    └─Dropout: 2-9                                     [32, 1280]                --
│    └─Linear: 2-10                                     [32, 100]                 128,100
=========================================================================================================
Total params: 20,305,588
Trainable params: 20,305,588
Non-trainable params: 0
Total mult-adds (Units.GIGABYTES): 91.17
=========================================================================================================
Input size (MB): 19.27
Forward/backward pass size (MB): 6234.07
Params size (MB): 81.22
Estimated Total Size (MB): 6334.56
=========================================================================================================

# first pass
train(model, train_loader, criterion, optimizer, scheduler, num_epochs, ncols=ncols)
evaluate(model, test_loader)

Epoch [1/30]: 100%|████████████████████████████████████| 391/391 [00:20<00:00, 18.87it/s, loss=2.13]

Epoch [1/30], Loss: 2.8834, Accuracy: 28.73%

Test Accuracy: 46.47%
Model checkpoint saved to ./model/efficientnet_v2_0.001_30_46.47_20241204_055710.safetensors.

Epoch [2/30]: 100%|████████████████████████████████████| 391/391 [00:20<00:00, 18.89it/s, loss=1.88]

Epoch [2/30], Loss: 1.8471, Accuracy: 49.24%

Test Accuracy: 54.34%
Model checkpoint saved to ./model/efficientnet_v2_0.001_30_54.34_20241204_055733.safetensors.

Epoch [3/30]: 100%|████████████████████████████████████| 391/391 [00:20<00:00, 19.33it/s, loss=1.54]

Epoch [3/30], Loss: 1.5800, Accuracy: 55.52%

%%capture
load_checkpoint('./model/efficientnet_v2_0.001_30_65.64_20241204_060546.safetensors', model)

validations(model, test_loader, classes=test_loader.dataset.classes, num_examples=20)

## hmm, we might need to patch efficientnet such that it works with CIFAR100, let's just use a bigger variants for now
model = init_model(variants='L')
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=1e-4)
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)  # Add label smoothing
scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=5, T_mult=2, eta_min=1e-6)

summary(model, input_size=(32, 3, 224, 224))

=========================================================================================================
Layer (type:depth-idx)                                  Output Shape              Param #
=========================================================================================================
EfficientNet                                            [32, 100]                 --
├─Sequential: 1-1                                       [32, 1280, 7, 7]          --
│    └─Conv2dNormActivation: 2-1                        [32, 32, 112, 112]        --
│    │    └─Conv2d: 3-1                                 [32, 32, 112, 112]        864
│    │    └─BatchNorm2d: 3-2                            [32, 32, 112, 112]        64
│    │    └─SiLU: 3-3                                   [32, 32, 112, 112]        --
│    └─Sequential: 2-2                                  [32, 32, 112, 112]        --
│    │    └─FusedMBConv: 3-4                            [32, 32, 112, 112]        9,280
│    │    └─FusedMBConv: 3-5                            [32, 32, 112, 112]        9,280
│    │    └─FusedMBConv: 3-6                            [32, 32, 112, 112]        9,280
│    │    └─FusedMBConv: 3-7                            [32, 32, 112, 112]        9,280
│    └─Sequential: 2-3                                  [32, 64, 56, 56]          --
│    │    └─FusedMBConv: 3-8                            [32, 64, 56, 56]          45,440
│    │    └─FusedMBConv: 3-9                            [32, 64, 56, 56]          164,480
│    │    └─FusedMBConv: 3-10                           [32, 64, 56, 56]          164,480
│    │    └─FusedMBConv: 3-11                           [32, 64, 56, 56]          164,480
│    │    └─FusedMBConv: 3-12                           [32, 64, 56, 56]          164,480
│    │    └─FusedMBConv: 3-13                           [32, 64, 56, 56]          164,480
│    │    └─FusedMBConv: 3-14                           [32, 64, 56, 56]          164,480
│    └─Sequential: 2-4                                  [32, 96, 28, 28]          --
│    │    └─FusedMBConv: 3-15                           [32, 96, 28, 28]          172,736
│    │    └─FusedMBConv: 3-16                           [32, 96, 28, 28]          369,600
│    │    └─FusedMBConv: 3-17                           [32, 96, 28, 28]          369,600
│    │    └─FusedMBConv: 3-18                           [32, 96, 28, 28]          369,600
│    │    └─FusedMBConv: 3-19                           [32, 96, 28, 28]          369,600
│    │    └─FusedMBConv: 3-20                           [32, 96, 28, 28]          369,600
│    │    └─FusedMBConv: 3-21                           [32, 96, 28, 28]          369,600
│    └─Sequential: 2-5                                  [32, 192, 14, 14]         --
│    │    └─MBConv: 3-22                                [32, 192, 14, 14]         134,808
│    │    └─MBConv: 3-23                                [32, 192, 14, 14]         379,824
│    │    └─MBConv: 3-24                                [32, 192, 14, 14]         379,824
│    │    └─MBConv: 3-25                                [32, 192, 14, 14]         379,824
│    │    └─MBConv: 3-26                                [32, 192, 14, 14]         379,824
│    │    └─MBConv: 3-27                                [32, 192, 14, 14]         379,824
│    │    └─MBConv: 3-28                                [32, 192, 14, 14]         379,824
│    │    └─MBConv: 3-29                                [32, 192, 14, 14]         379,824
│    │    └─MBConv: 3-30                                [32, 192, 14, 14]         379,824
│    │    └─MBConv: 3-31                                [32, 192, 14, 14]         379,824
│    └─Sequential: 2-6                                  [32, 224, 14, 14]         --
│    │    └─MBConv: 3-32                                [32, 224, 14, 14]         606,448
│    │    └─MBConv: 3-33                                [32, 224, 14, 14]         771,960
│    │    └─MBConv: 3-34                                [32, 224, 14, 14]         771,960
│    │    └─MBConv: 3-35                                [32, 224, 14, 14]         771,960
│    │    └─MBConv: 3-36                                [32, 224, 14, 14]         771,960
│    │    └─MBConv: 3-37                                [32, 224, 14, 14]         771,960
│    │    └─MBConv: 3-38                                [32, 224, 14, 14]         771,960
│    │    └─MBConv: 3-39                                [32, 224, 14, 14]         771,960
│    │    └─MBConv: 3-40                                [32, 224, 14, 14]         771,960
│    │    └─MBConv: 3-41                                [32, 224, 14, 14]         771,960
│    │    └─MBConv: 3-42                                [32, 224, 14, 14]         771,960
│    │    └─MBConv: 3-43                                [32, 224, 14, 14]         771,960
│    │    └─MBConv: 3-44                                [32, 224, 14, 14]         771,960
│    │    └─MBConv: 3-45                                [32, 224, 14, 14]         771,960
│    │    └─MBConv: 3-46                                [32, 224, 14, 14]         771,960
│    │    └─MBConv: 3-47                                [32, 224, 14, 14]         771,960
│    │    └─MBConv: 3-48                                [32, 224, 14, 14]         771,960
│    │    └─MBConv: 3-49                                [32, 224, 14, 14]         771,960
│    │    └─MBConv: 3-50                                [32, 224, 14, 14]         771,960
│    └─Sequential: 2-7                                  [32, 384, 7, 7]           --
│    │    └─MBConv: 3-51                                [32, 384, 7, 7]           987,320
│    │    └─MBConv: 3-52                                [32, 384, 7, 7]           2,244,960
│    │    └─MBConv: 3-53                                [32, 384, 7, 7]           2,244,960
│    │    └─MBConv: 3-54                                [32, 384, 7, 7]           2,244,960
│    │    └─MBConv: 3-55                                [32, 384, 7, 7]           2,244,960
│    │    └─MBConv: 3-56                                [32, 384, 7, 7]           2,244,960
│    │    └─MBConv: 3-57                                [32, 384, 7, 7]           2,244,960
│    │    └─MBConv: 3-58                                [32, 384, 7, 7]           2,244,960
│    │    └─MBConv: 3-59                                [32, 384, 7, 7]           2,244,960
│    │    └─MBConv: 3-60                                [32, 384, 7, 7]           2,244,960
│    │    └─MBConv: 3-61                                [32, 384, 7, 7]           2,244,960
│    │    └─MBConv: 3-62                                [32, 384, 7, 7]           2,244,960
│    │    └─MBConv: 3-63                                [32, 384, 7, 7]           2,244,960
│    │    └─MBConv: 3-64                                [32, 384, 7, 7]           2,244,960
│    │    └─MBConv: 3-65                                [32, 384, 7, 7]           2,244,960
│    │    └─MBConv: 3-66                                [32, 384, 7, 7]           2,244,960
│    │    └─MBConv: 3-67                                [32, 384, 7, 7]           2,244,960
│    │    └─MBConv: 3-68                                [32, 384, 7, 7]           2,244,960
│    │    └─MBConv: 3-69                                [32, 384, 7, 7]           2,244,960
│    │    └─MBConv: 3-70                                [32, 384, 7, 7]           2,244,960
│    │    └─MBConv: 3-71                                [32, 384, 7, 7]           2,244,960
│    │    └─MBConv: 3-72                                [32, 384, 7, 7]           2,244,960
│    │    └─MBConv: 3-73                                [32, 384, 7, 7]           2,244,960
│    │    └─MBConv: 3-74                                [32, 384, 7, 7]           2,244,960
│    │    └─MBConv: 3-75                                [32, 384, 7, 7]           2,244,960
│    └─Sequential: 2-8                                  [32, 640, 7, 7]           --
│    │    └─MBConv: 3-76                                [32, 640, 7, 7]           2,835,296
│    │    └─MBConv: 3-77                                [32, 640, 7, 7]           6,199,200
│    │    └─MBConv: 3-78                                [32, 640, 7, 7]           6,199,200
│    │    └─MBConv: 3-79                                [32, 640, 7, 7]           6,199,200
│    │    └─MBConv: 3-80                                [32, 640, 7, 7]           6,199,200
│    │    └─MBConv: 3-81                                [32, 640, 7, 7]           6,199,200
│    │    └─MBConv: 3-82                                [32, 640, 7, 7]           6,199,200
│    └─Conv2dNormActivation: 2-9                        [32, 1280, 7, 7]          --
│    │    └─Conv2d: 3-83                                [32, 1280, 7, 7]          819,200
│    │    └─BatchNorm2d: 3-84                           [32, 1280, 7, 7]          2,560
│    │    └─SiLU: 3-85                                  [32, 1280, 7, 7]          --
├─AdaptiveAvgPool2d: 1-2                                [32, 1280, 1, 1]          --
├─Sequential: 1-3                                       [32, 100]                 --
│    └─Dropout: 2-10                                    [32, 1280]                --
│    └─Linear: 2-11                                     [32, 100]                 128,100
=========================================================================================================
Total params: 117,362,372
Trainable params: 117,362,372
Non-trainable params: 0
Total mult-adds (Units.GIGABYTES): 391.38
=========================================================================================================
Input size (MB): 19.27
Forward/backward pass size (MB): 17633.17
Params size (MB): 469.45
Estimated Total Size (MB): 18121.89
=========================================================================================================

# second pass with patched
train(model, train_loader, criterion, optimizer, scheduler, num_epochs, ncols=ncols)
evaluate(model, test_loader)

Epoch [1/30]: 100%|████████████████████████████████████| 391/391 [00:39<00:00,  9.81it/s, loss=3.45]

Epoch [1/30], Loss: 4.0097, Accuracy: 11.31%

Test Accuracy: 30.47%
Model checkpoint saved to ./model/efficientnet_v2_0.01_30_L_30.47_20241204_070203.safetensors.

Epoch [2/30]: 100%|████████████████████████████████████| 391/391 [00:39<00:00,  9.80it/s, loss=2.85]

Epoch [2/30], Loss: 2.9738, Accuracy: 32.93%

Test Accuracy: 46.06%
Model checkpoint saved to ./model/efficientnet_v2_0.01_30_L_46.06_20241204_070247.safetensors.

Epoch [3/30]: 100%|████████████████████████████████████| 391/391 [00:40<00:00,  9.61it/s, loss=2.66]

Epoch [3/30], Loss: 2.5264, Accuracy: 45.25%

csv_path = 'test.csv'
df = pd.read_csv(csv_path)

test_images = []
for _, row in df.iterrows():
  pixel_data = row[[f'pixel_{i}' for i in range(1, 3073)]].values.astype(np.float32)
  image = pixel_data.reshape(3, 32, 32)
  image_tensor = torch.tensor(image)
  test_images.append(image_tensor)

test_images = torch.stack(test_images)

# Predictions on custom test set
model.eval()
custom_preds = []
with torch.no_grad():
  test_images = test_images.to(device)
  for i in range(0, len(test_images), batch_size):
    batch_images = test_images[i : i + batch_size]
    outputs = model(batch_images)
    _, predicted = torch.max(outputs.data, 1)
    custom_preds.extend(predicted.cpu().numpy())

# Formatting predictions as required
submission = pd.DataFrame({'ID': df['ID'], 'LABEL': custom_preds})
submission.to_csv('submission.csv', index=False)
print('Predictions saved to submission.csv')

Predictions saved to submission.csv

test set exploration¶

implementation¶

hyperparameters¶

helpers¶

training loop¶

finals for submission¶