Summary
PyTorch is an open-source deep learning library developed by Facebook's AI Research lab and widely used for creating and training neural networks. This notebook provides a brief overview of its applications. It begins with image classification using a Convolutional Neural Network (CNN), followed by fine-tuning a pretrained ResNet-18 model for the same task. Next, it introduces object detection, demonstrating R-CNN-style and Faster R-CNN implementations in PyTorch. Finally, it covers Intersection over Union (IoU) for evaluating predicted boxes and Non-Maximum Suppression (NMS) for filtering overlapping detections, including their implementation.
Python functions and data files needed to run this notebook are available via this link.
Introduction¶
PyTorch is an open-source deep learning library developed by Facebook's AI Research lab. It's widely used for building and training neural networks. Here's a brief overview:
Dynamic Computation Graphs: PyTorch uses dynamic (or "eager") computation graphs, meaning the graph is built on-the-fly as operations are executed. This makes debugging and experimentation easier.
Tensors: The core data structure in PyTorch is the tensor, similar to NumPy arrays but with GPU support for faster computation.
Autograd: PyTorch has built-in automatic differentiation via autograd, which tracks operations on tensors and computes gradients for backpropagation.
Modules and Layers: Models in PyTorch are built using the nn.Module class, making it easy to define complex architectures by stacking layers.
Optimizers: PyTorch provides optimization algorithms such as SGD and Adam via torch.optim to update model weights.
GPU Support: It supports CUDA for running models on NVIDIA GPUs, allowing efficient training of large models.
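A minimal sketch (not part of the original notebook) illustrating tensors, autograd, and device selection:
import torch
# Tensors are like NumPy arrays, optionally with gradient tracking
x = torch.tensor([2.0, 3.0], requires_grad=True)
# Autograd records operations as they run ("eager" execution)
y = (x ** 2).sum()
# Backpropagation computes dy/dx = 2x
y.backward()
print(x.grad)  # tensor([4., 6.])
# Move data to a GPU if one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
x = x.detach().to(device)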
What is a CNN?¶
A Convolutional Neural Network (CNN) is a deep learning architecture specialized for processing grid-like data, such as images. It is highly effective for tasks like image classification, object detection, and image segmentation.
Typical CNN architectures stack a few convolutional layers (each followed by a ReLU), then a pooling layer, then another few convolutional layers (+ReLU), followed by another pooling layer; this pattern can be repeated. See the figure below:
As the image progresses through the network, its spatial dimensions shrink. At the top of the stack, a regular feedforward neural network is added, composed of a few fully connected layers (+ReLUs), and the final layer outputs the prediction. The figure below shows how to flatten a 3x3 image matrix into a 9x1 vector:
Typically, the network also gets deeper (with more feature maps) as you progress through it.
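As a quick illustration of the flattening step mentioned above (a minimal sketch, not from the notebook):
import torch
m = torch.arange(1, 10).reshape(3, 3)   # a 3x3 matrix
v = m.reshape(9, 1)                     # flattened into a 9x1 column vector
print(m.shape, v.shape)                 # torch.Size([3, 3]) torch.Size([9, 1])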
🔷 Key Components of a CNN:
Input Layer
- Takes in an image.
Convolutional Layers
- Applies filters (kernels) to scan the image and detect low-level features like edges, corners, textures.
- Produces feature maps.
- Each filter learns a different pattern.
Image retrieved from https://towardsdatascience.com/convolutional-neural-networks-explained-how-to-successfully-classify-images-in-python-df829d4ba761/
Activation Function (ReLU)
- Introduces non-linearity after convolution.
- Replaces negative values with zero: ReLU(x) = max(0, x).
Pooling Layers (e.g., Max Pooling)
- Reduces the spatial size of the feature maps.
- Helps reduce computation and prevent overfitting.
- Example: 2×2 max pooling selects the largest value in each 2×2 block.
Image retrieved from https://towardsdatascience.com/convolutional-neural-networks-explained-how-to-successfully-classify-images-in-python-df829d4ba761/
Fully Connected Layers (FC)
- Flatten the feature maps and feed them into standard dense layers.
- These layers learn to make decisions based on the extracted features.
Output Layer
- Binary classification: 1 neuron + sigmoid activation.
- Multiclass classification: N neurons + softmax activation.
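A minimal sketch (illustrative only, assuming a 224×224 RGB input) that traces tensor shapes through the components listed above:
import torch
import torch.nn as nn

x = torch.randn(1, 3, 224, 224)                    # batch of one RGB image

conv = nn.Conv2d(3, 16, kernel_size=3, padding=1)  # convolution: 16 feature maps
relu = nn.ReLU()                                   # non-linearity
pool = nn.MaxPool2d(kernel_size=2, stride=2)       # 2x2 max pooling halves H and W
flat = nn.Flatten()                                # feature maps -> 1D vector per image
fc = nn.Linear(16 * 112 * 112, 1)                  # fully connected output (binary case)

x = pool(relu(conv(x)))                            # shape: [1, 16, 112, 112]
x = flat(x)                                        # shape: [1, 200704]
out = torch.sigmoid(fc(x))                         # shape: [1, 1], a probability
print(out.shape)
The same layer sizes reappear in the ImageBinaryClassifier defined later in this notebook.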
Classification with CNN¶
Binary Classification
- Goal: Classify input into one of two classes (e.g., cat vs not-cat).
- Workflow:
- Input: An image.
- CNN Layers: Apply convolution, ReLU activation, and pooling to extract features.
- Flatten the feature maps.
- Fully Connected Layer to combine features.
- Output Layer: Single neuron with sigmoid activation.
- Output: Probability between 0 and 1.
Multiclass Classification
- Goal: Classify input into one of multiple categories (e.g., digits 0-9).
- Workflow:
- Input: A color image (e.g., 28×28 RGB image of a handwritten digit).
- CNN Layers: Stack of convolution, ReLU, and pooling layers.
- Flatten the final feature maps.
- Fully Connected Layers to learn high-level patterns.
- Output Layer: Multiple neurons (one per class) with softmax activation.
- Output: Probability distribution across all classes.
- Prediction: Class with the highest probability.
Main Differences:

| Aspect | Binary Classification | Multiclass Classification |
|---------------------|---------------------------|----------------------------------|
| Output Activation | Sigmoid | Softmax |
| Output Neurons | 1 | Equal to number of classes |
| Loss Function | Binary Cross-Entropy | Categorical Cross-Entropy |
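A minimal sketch (not from the notebook) of the two output/loss pairings in the table. Note that nn.BCEWithLogitsLoss and nn.CrossEntropyLoss fold the sigmoid/softmax into the loss, whereas the model built below keeps an explicit Sigmoid and uses nn.BCELoss:
import torch
import torch.nn as nn

# Binary: one output neuron per sample, binary cross-entropy
logits_bin = torch.randn(8, 1)
labels_bin = torch.randint(0, 2, (8, 1)).float()
print(nn.BCEWithLogitsLoss()(logits_bin, labels_bin))

# Multiclass: one output neuron per class (10 here), categorical cross-entropy
logits_mc = torch.randn(8, 10)
labels_mc = torch.randint(0, 10, (8,))
print(nn.CrossEntropyLoss()(logits_mc, labels_mc))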
import os
base_dir = './Data' # Directory dataset
#os.mkdir(base_dir)
# Directories for the training, validation, and test splits
train_dir = os.path.join(base_dir, 'train')
#os.mkdir(train_dir)
from torchvision.datasets import ImageFolder
import torchvision.transforms as transforms
# Note: this passes the torchvision.transforms module itself rather than a composed transform;
# it is only used here to inspect the dataset. A proper transform pipeline is defined below.
train_dataset = ImageFolder(root=train_dir, transform=transforms)
train_dataset
Dataset ImageFolder Number of datapoints: 2000 Root location: ./Data\train StandardTransform Transform: <module 'torchvision.transforms' from 'C:\\Users\\mrezv\\anaconda3\\envs\\faiss-env\\lib\\site-packages\\torchvision\\transforms\\__init__.py'>
classes = train_dataset.classes
print(classes)
['cats', 'dogs']
print(train_dataset.class_to_idx)
{'cats': 0, 'dogs': 1}
import random
from pathlib import Path
from PIL import Image
import glob
# Set seed
random.seed(42)
# 1. Get all image paths (* means "any combination")
image_path_list= glob.glob(f"./Data/train/dogs/*.jpg")
# 2. Get random image path
random_image_path = random.choice(image_path_list)
# 3. Get image class from path name (the image class is the name of the directory where the image is stored)
image_class = Path(random_image_path).parent.stem
# 4. Open image
img = Image.open(random_image_path)
# 5. Print metadata
print(f"Random image path: {random_image_path}")
print(f"Image class: {image_class}")
print(f"Image height: {img.height}")
print(f"Image width: {img.width}")
img
Random image path: ./Data/train/dogs\dog.688.jpg Image class: dogs Image height: 166 Image width: 240
# ✅ Define transform first
from torchvision import transforms
transform = transforms.Compose([
transforms.Resize((224, 224)), # Resize images here!
transforms.ToTensor(),
transforms.Normalize(mean=[0.5]*3, std=[0.5]*3)
])
# ✅ Then apply to ImageFolder
from torchvision.datasets import ImageFolder
train_dataset = ImageFolder(root=train_dir, transform=transform)
# ✅ Load with DataLoader
from torch.utils.data import DataLoader
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# Check a single batch to make sure it works
for images, labels in train_loader:
print(f"Batch image shape: {images.shape}") # Should be [32, 3, 224, 224]
print(f"Batch label shape: {labels.shape}") # Should be [32]
break
Batch image shape: torch.Size([32, 3, 224, 224]) Batch label shape: torch.Size([32])
CNN Binary Classifier¶
import torch
import torch.nn as nn
class ImageBinaryClassifier(nn.Module):
def __init__(self):
super(ImageBinaryClassifier, self).__init__()
# Input: 3 RGB channels (red, green, blue)
# Output: 16 channels
# Kernel: 3 x 3 matrix
# Stride = 1: the kernel moves 1 step
# Padding = 1: 1 pixel around the border
self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1)
# ReLU activation introduces non-linearity to help the model learn complex patterns
self.relu = nn.ReLU()
# Max pooling layer:
# using a 2x2 window and a stride of 2
self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
# Flatten layer:
# Converts 3D feature maps (batch_size x channels x height x width) into 1D vectors
# Required before feeding into a fully connected (dense) layer
self.flatten = nn.Flatten()
# Fully connected (linear) layer:
# Input features: 16 channels × 112 × 112 pixels (after pooling)
# Output: 1 neuron, used for binary classification (output in range 0-1)
self.fc1 = nn.Linear(16 * 112 * 112, 1)
# Sigmoid activation:
self.sigmoid = nn.Sigmoid()
def forward(self, x):
# Apply 1st conv layer, followed by ReLU activation
x = self.relu(self.conv1(x))
# Apply max pooling to reduce spatial dimensions
x = self.pool(x)
# Flatten the output to prepare for the fully connected layer
x = self.flatten(x)
# Apply fully connected layer
x = self.fc1(x)
# Apply sigmoid to get a probability output between 0 and 1
x = self.sigmoid(x)
return x
# Load model
model = ImageBinaryClassifier()
#model.load_state_dict(torch.load("model.pth", map_location=torch.device('cpu')))
model.eval()
ImageBinaryClassifier( (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (relu): ReLU() (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) (flatten): Flatten(start_dim=1, end_dim=-1) (fc1): Linear(in_features=200704, out_features=1, bias=True) (sigmoid): Sigmoid() )
import torch.optim as optim
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ImageBinaryClassifier().to(device)
# Binary classification — use BCELoss
criterion = nn.BCELoss()
# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.0001)
num_epochs = 5
for epoch in range(num_epochs):
model.train()
running_loss = 0.0
for images, labels in train_loader:
images = images.to(device)
labels = labels.float().unsqueeze(1).to(device) # shape: [batch, 1]
# Forward pass
outputs = model(images)
loss = criterion(outputs, labels)
# Backward pass and optimization
optimizer.zero_grad()
loss.backward()
optimizer.step()
running_loss += loss.item()
print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss:.4f}")
Epoch [1/5], Loss: 45.0474 Epoch [2/5], Loss: 39.3734 Epoch [3/5], Loss: 36.7471 Epoch [4/5], Loss: 34.7636 Epoch [5/5], Loss: 32.4146
- Prediction on test set
torch.save(model.state_dict(), './models/binary_model.pth')
test_dir = os.path.join(base_dir, 'test')
#os.mkdir(test_dir)
test_dataset = ImageFolder(root=test_dir, transform=transform)
classes = test_dataset.classes
print(classes)
print(test_dataset.class_to_idx)
['cats', 'dogs'] {'cats': 0, 'dogs': 1}
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
# Check a single batch to make sure it works
for images, labels in test_loader:
print(f"Batch image shape: {images.shape}") # Should be [32, 3, 224, 224]
print(f"Batch label shape: {labels.shape}") # Should be [32]
break
Batch image shape: torch.Size([32, 3, 224, 224]) Batch label shape: torch.Size([32])
all_preds = []
all_labels = []
with torch.no_grad(): # Disable gradients for faster inference
for images, labels in test_loader:
images = images.to(device)
labels = labels.float().unsqueeze(1).to(device)  # shape: [batch, 1]
outputs = model(images) # Raw output from sigmoid
preds = (outputs > 0.5).int() # Convert to 0 (cat) or 1 (dog)
all_preds.extend(preds.cpu().numpy())
all_labels.extend(labels.cpu().numpy())
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy:.2%}")
Test Accuracy: 63.20%
import matplotlib.pyplot as plt
model.eval()
class_names = train_dataset.classes  # ['cats', 'dogs']
with torch.no_grad():
images, labels = next(iter(test_loader))
outputs = model(images.to(device))
preds = (outputs > 0.5).int().cpu().numpy()
# Plot the first 5 images with predictions
for i in range(5):
img = images[i].permute(1, 2, 0).numpy() * 0.5 + 0.5 # unnormalize
plt.imshow(img)
plt.title(f"True: {class_names[labels[i]]}, Pred: {class_names[preds[i][0]]}")
plt.axis('off')
plt.show()
Use a Pre-trained Model¶
The current CNN is very basic. Instead, we can use a pretrained model to boost the performance:
🧠 1. Choose a Pre-trained Model
Pick a model that’s already trained on a large dataset (e.g., ImageNet). Popular choices include:
- ResNet18 / ResNet50
- VGG16
- EfficientNet
- MobileNet
These models already know how to extract useful image features (like edges, textures, shapes).
🧊 2. Freeze the Pre-trained Layers
To avoid retraining everything from scratch, we freeze the early layers. These layers contain general-purpose filters (e.g., edge detectors), which are useful for almost any image task.
Freezing means:
- Their weights won’t change.
- It speeds up training and avoids overfitting.
🧩 3. Replace the Final Layer (Classifier)
Pre-trained models are trained for 1000 classes. But your task might be:
- Binary classification (e.g., cat vs dog)
- Multiclass (e.g., apple, banana, orange)
So we remove the original final layer and add a new one that fits your number of output classes.
🧪 4. Train Only the New Layer
You then train just the final layer:
- It learns how to map pre-learned features to your new task.
- This requires much less data and compute than training the whole model.
The rest of the model still extracts features from the images as before.
💡 5. (Optional) Fine-Tune Later
Once the new layer is trained, you can optionally unfreeze some of the deeper layers and fine-tune them with a low learning rate. This gives even better performance, especially if your dataset is large or different from ImageNet.
💾 6. Save and Load the Model
After training, you can save your model locally so you can reuse it later without retraining.
Later, you load the model to:
- Make predictions on new data.
- Resume training if needed.
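A minimal sketch (assuming the modified ResNet-18 model built in the cells below, and a hypothetical path ./models/resnet18_catsdogs.pth) showing the optional fine-tuning and save/load steps, which are not coded elsewhere in this notebook:
# 5. (Optional) Unfreeze the deepest block and fine-tune it with a small learning rate
for param in model.layer4.parameters():
    param.requires_grad = True
optimizer = torch.optim.Adam((p for p in model.parameters() if p.requires_grad), lr=1e-5)

# 6. Save the trained weights ...
torch.save(model.state_dict(), './models/resnet18_catsdogs.pth')

# ... and load them back later for inference or to resume training
model.load_state_dict(torch.load('./models/resnet18_catsdogs.pth', map_location='cpu'))
model.eval()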
from torchvision import models
# 'pretrained=True' is deprecated in recent torchvision releases in favor of
# 'weights=ResNet18_Weights.DEFAULT' (see the warning below); it still loads the ImageNet weights.
model = models.resnet18(pretrained=True)
# Freeze All Layers (except fc)
for param in model.parameters():
param.requires_grad = False # Freeze feature extractor
# Replace final layer for binary classification
model.fc = nn.Sequential(
nn.Linear(model.fc.in_features, 1),
nn.Sigmoid()
)
C:\Users\mrezv\anaconda3\envs\faiss-env\lib\site-packages\torchvision\models\_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead. warnings.warn( C:\Users\mrezv\anaconda3\envs\faiss-env\lib\site-packages\torchvision\models\_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=ResNet18_Weights.IMAGENET1K_V1`. You can also use `weights=ResNet18_Weights.DEFAULT` to get the most up-to-date weights. warnings.warn(msg)
# Train Only model.fc
optimizer = torch.optim.Adam(model.fc.parameters(), lr=1e-4)
loss_fn = nn.BCELoss() # Use BCE since we have sigmoid in final layer
# Move Model to Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
ResNet( (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False) (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False) (layer1): Sequential( (0): BasicBlock( (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (1): BasicBlock( (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (layer2): Sequential( (0): BasicBlock( (conv1): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False) (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (downsample): Sequential( (0): Conv2d(64, 128, kernel_size=(1, 1), stride=(2, 2), bias=False) (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (1): BasicBlock( (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (layer3): Sequential( (0): BasicBlock( (conv1): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False) (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (downsample): Sequential( (0): Conv2d(128, 256, kernel_size=(1, 1), stride=(2, 2), bias=False) (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (1): BasicBlock( (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (layer4): Sequential( (0): BasicBlock( (conv1): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False) (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (downsample): 
Sequential( (0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False) (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (1): BasicBlock( (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (avgpool): AdaptiveAvgPool2d(output_size=(1, 1)) (fc): Sequential( (0): Linear(in_features=512, out_features=1, bias=True) (1): Sigmoid() ) )
for epoch in range(num_epochs):
model.train()
running_loss = 0.0
for images, labels in train_loader:
images = images.to(device)
labels = labels.float().unsqueeze(1).to(device) # [batch_size, 1]
outputs = model(images)
loss = loss_fn(outputs, labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()
running_loss += loss.item()
print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss:.4f}")
Epoch [1/5], Loss: 42.4384 Epoch [2/5], Loss: 34.8734 Epoch [3/5], Loss: 29.4266 Epoch [4/5], Loss: 25.1113 Epoch [5/5], Loss: 22.2969
all_preds = []
all_labels = []
with torch.no_grad(): # Disable gradients for faster inference
for images, labels in test_loader:
images = images.to(device)
labels = labels.float().unsqueeze(1).to(device)  # shape: [batch, 1]
outputs = model(images) # Raw output from sigmoid
preds = (outputs > 0.5).int() # Convert to 0 (cat) or 1 (dog)
all_preds.extend(preds.cpu().numpy())
all_labels.extend(labels.cpu().numpy())
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy:.2%}")
Test Accuracy: 52.20%
all_labels[:20]
[array([0.], dtype=float32), array([0.], dtype=float32), array([0.], dtype=float32), array([0.], dtype=float32), array([0.], dtype=float32), array([0.], dtype=float32), array([0.], dtype=float32), array([0.], dtype=float32), array([0.], dtype=float32), array([0.], dtype=float32), array([0.], dtype=float32), array([0.], dtype=float32), array([0.], dtype=float32), array([0.], dtype=float32), array([0.], dtype=float32), array([0.], dtype=float32), array([0.], dtype=float32), array([0.], dtype=float32), array([0.], dtype=float32), array([0.], dtype=float32)]
import matplotlib.pyplot as plt
model.eval()
class_names = train_dataset.classes  # ['cats', 'dogs']
with torch.no_grad():
images, labels = next(iter(test_loader))
outputs = model(images.to(device))
preds = (outputs > 0.5).int().cpu().numpy()
# Plot the first 5 images with predictions
for i in range(5):
img = images[i].permute(1, 2, 0).numpy() * 0.5 + 0.5 # unnormalize
plt.imshow(img)
plt.title(f"True: {class_names[labels[i]]}, Pred: {class_names[preds[i][0]]}")
plt.axis('off')
plt.show()
Object Recognition¶
Image retrieved from https://www.aimtechnologies.co/image-recognition-technology-the-power-of-visual-intelligence/
- Identifies objects within images
- Provides:
- The location of each object (via bounding boxes)
- The class label for each object (e.g., cat, car, person)
- Common applications include:
- Surveillance and security
- Medical imaging and diagnosis
- Traffic monitoring and management
- Sports analytics and performance tracking
Bounding Box Representation
- A rectangular box used to indicate an object’s position within an image
- Describes the object’s spatial location (usually as coordinates)
- Used in both:
- Training data annotations (manually labeled boxes)
- Model outputs (predicted boxes)
- The ground truth bounding box represents the accurate, human-labeled location of the object
- Bounding boxes are defined by their corner coordinates:
- Top-left corner: (x₁, y₁)
- Bottom-right corner: (x₂, y₂)
- So the format is:
Bounding Box = (x₁, y₁, x₂, y₂)
where:
- x₁ = minimum x (left edge)
- y₁ = minimum y (top edge)
- x₂ = maximum x (right edge)
- y₂ = maximum y (bottom edge)
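A tiny example of this corner format with made-up coordinates:
import torch
box = torch.tensor([50., 15., 310., 210.])  # (x1, y1, x2, y2)
width, height = box[2] - box[0], box[3] - box[1]
print(width.item(), height.item())          # 260.0 195.0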
Converting Pixels to Tensors¶
Object detection models work with tensors, not raw image pixels. To convert images into tensors, we use transforms:
1️⃣ transforms.ToTensor()
- Converts a PIL Image or NumPy array to a normalized FloatTensor
- Scales pixel values from [0, 255] → [0.0, 1.0]
- Output shape: [C, H, W] (channels, height, width)
- Commonly used in training pipelines
img = Image.open(random_image_path)
img
import torchvision.transforms as transforms
transform = transforms.Compose([transforms.Resize(224), transforms.ToTensor()])
image_tensor = transform(img)
print(image_tensor.shape)
image_tensor
torch.Size([3, 224, 323])
tensor([[[0.2510, 0.2510, 0.2510, ..., 0.3176, 0.3176, 0.3216], [0.2471, 0.2471, 0.2471, ..., 0.3176, 0.3176, 0.3176], [0.2392, 0.2392, 0.2392, ..., 0.3176, 0.3176, 0.3176], ..., [0.4941, 0.5216, 0.5451, ..., 0.5882, 0.5804, 0.5765], [0.4784, 0.5059, 0.5451, ..., 0.6039, 0.5882, 0.5765], [0.4824, 0.5098, 0.5569, ..., 0.6275, 0.6000, 0.5804]], [[0.3412, 0.3412, 0.3412, ..., 0.4353, 0.4353, 0.4392], [0.3373, 0.3373, 0.3373, ..., 0.4353, 0.4353, 0.4353], [0.3294, 0.3294, 0.3294, ..., 0.4353, 0.4353, 0.4353], ..., [0.4784, 0.5059, 0.5294, ..., 0.6000, 0.5922, 0.5882], [0.4627, 0.4902, 0.5294, ..., 0.6157, 0.6000, 0.5882], [0.4667, 0.4941, 0.5412, ..., 0.6392, 0.6118, 0.5922]], [[0.3961, 0.3961, 0.3961, ..., 0.5294, 0.5294, 0.5333], [0.3922, 0.3922, 0.3922, ..., 0.5294, 0.5294, 0.5294], [0.3843, 0.3843, 0.3843, ..., 0.5294, 0.5294, 0.5294], ..., [0.4667, 0.4941, 0.5176, ..., 0.6196, 0.6118, 0.6078], [0.4510, 0.4784, 0.5176, ..., 0.6353, 0.6196, 0.6078], [0.4549, 0.4824, 0.5294, ..., 0.6588, 0.6314, 0.6118]]])
2️⃣ transforms.PILToTensor()
- Converts a PIL Image to a raw integer tensor (without scaling)
- Pixel values remain in [0, 255]
- Output shape: [C, H, W]
- Useful when you want to keep original pixel intensity (e.g., for visualization or certain custom transformations)
import torchvision.transforms as transforms
transform = transforms.Compose([transforms.Resize(224), transforms.PILToTensor()])
image_tensor = transform(img)
print(image_tensor.shape)
image_tensor
torch.Size([3, 224, 323])
tensor([[[ 64, 64, 64, ..., 81, 81, 82], [ 63, 63, 63, ..., 81, 81, 81], [ 61, 61, 61, ..., 81, 81, 81], ..., [126, 133, 139, ..., 150, 148, 147], [122, 129, 139, ..., 154, 150, 147], [123, 130, 142, ..., 160, 153, 148]], [[ 87, 87, 87, ..., 111, 111, 112], [ 86, 86, 86, ..., 111, 111, 111], [ 84, 84, 84, ..., 111, 111, 111], ..., [122, 129, 135, ..., 153, 151, 150], [118, 125, 135, ..., 157, 153, 150], [119, 126, 138, ..., 163, 156, 151]], [[101, 101, 101, ..., 135, 135, 136], [100, 100, 100, ..., 135, 135, 135], [ 98, 98, 98, ..., 135, 135, 135], ..., [119, 126, 132, ..., 158, 156, 155], [115, 122, 132, ..., 162, 158, 155], [116, 123, 135, ..., 168, 161, 156]]], dtype=torch.uint8)
Drawing the bounding box¶
To visualize object locations, you can draw bounding boxes using draw_bounding_boxes from torchvision.utils.
🧩 Steps to Draw Bounding Boxes:
1. Import the utility
- Use draw_bounding_boxes from torchvision.utils.
2. Collect bounding box coordinates
- Store them as a tensor with shape [num_boxes, 4].
- Each box is in the format (x1, y1, x2, y2).
3. Unsqueeze if needed
- If you have a single box as a 1D tensor of shape [4], add a box dimension with .unsqueeze(0) so the shape becomes [1, 4].
4. Convert the image (if needed)
- Make sure the image is a torch.Tensor in the [0, 255] range and of type uint8.
- If you used ToTensor() (which scales to [0, 1]), multiply by 255 and convert to uint8.
5. Draw and visualize
- Use draw_bounding_boxes() to overlay the boxes on the image.
- Plot the result using matplotlib or PIL.Image.
from torchvision.utils import draw_bounding_boxes
bbox = torch.tensor([50, 15, 310, 210])
bbox = bbox.unsqueeze(0)
bbox_image = draw_bounding_boxes(image_tensor, bbox, width=3, colors="red")
transform = transforms.Compose([transforms.ToPILImage()])
pil_image = transform(bbox_image)
import matplotlib.pyplot as plt
plt.imshow(pil_image)
<matplotlib.image.AxesImage at 0x27fc8993b80>
Two-Stage Object Detection¶
Image retrieved from https://namrata-thakur893.medium.com/a-detailed-introduction-to-two-stage-object-detectors-d4ba0c06b14e
Two-stage detectors work in two main steps:
- Region proposal: identify potential areas in the image where objects might exist (in Faster R-CNN this is done by a learned Region Proposal Network, RPN).
- Classification + Bounding Box Refinement: Analyze each proposed region to classify the object and refine the bounding box.
This approach focuses on accuracy, often at the cost of speed.
Common Two-Stage Models are:
R-CNN (Region-based Convolutional Neural Network)
- First model to use CNNs for object detection
- Steps:
- Uses an external algorithm (like Selective Search) to generate ~2,000 region proposals
- Extracts features from each region using a CNN
- Classifies each region and adjusts bounding boxes
- Drawback: Very slow — processes each region separately
Fast R-CNN
- Improved version of R-CNN
- Processes the entire image just once with a CNN
- Region proposals are applied to the feature map (not raw image)
- Much faster because it avoids repeating CNN passes for every region
- Still relies on external region proposal algorithm (e.g., Selective Search)
Faster R-CNN
- Most efficient and widely used two-stage detector
- Introduces a Region Proposal Network (RPN):
- Learns to propose regions directly within the model
- Fully integrated with the detection pipeline
- Achieves high accuracy and speed
- Forms the backbone for many modern detection systems
| Model | Region Proposals | Speed | Accuracy | Notes |
|---|---|---|---|---|
| R-CNN | External (Selective Search) | Slow | High | First CNN-based detector |
| Fast R-CNN | External (Selective Search) | Faster | High | Single CNN pass per image |
| Faster R-CNN | Learned (RPN) | Much faster | Very high | Fully end-to-end model |
The backbone in R-CNN (and other detection models) is the convolutional neural network (CNN) used to extract features from the input image.
It processes the raw image and produces a feature map that represents textures, edges, shapes, and patterns in the image.
These feature maps are then used for region proposal and object classification.
In R-CNN specifically, the backbone is a pretrained CNN (often pretrained on ImageNet).
Common backbones include:
- AlexNet (used in the original R-CNN paper)
- VGG16
- ResNet-50 or ResNet-101 (used in modern variants)
Example Flow in R-CNN
- Input image → Region proposals generated by Selective Search
- Each region (cropped) → Passed through the CNN backbone (e.g., AlexNet)
- Features → Passed to a classifier (e.g., SVM) and bounding box regressor
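A rough sketch of this flow (not runnable end-to-end: selective_search is a hypothetical placeholder for the external proposal algorithm, and a ResNet-50 backbone is used here instead of AlexNet for convenience):
import torch
import torchvision.transforms as T
from torchvision.models import resnet50, ResNet50_Weights

backbone = resnet50(weights=ResNet50_Weights.DEFAULT)
backbone.fc = torch.nn.Identity()   # use the CNN purely as a feature extractor
backbone.eval()

to_tensor = T.Compose([T.Resize((224, 224)), T.ToTensor()])

def rcnn_features(image, proposals):
    # Crop every proposed region and push it through the backbone one by one;
    # this per-region forward pass is exactly why the original R-CNN is so slow.
    feats = []
    with torch.no_grad():
        for (x1, y1, x2, y2) in proposals:
            crop = image.crop((x1, y1, x2, y2))            # PIL crop of the region proposal
            feats.append(backbone(to_tensor(crop).unsqueeze(0)))
    return torch.cat(feats)                                # [num_proposals, 2048] -> SVM + box regressor

# proposals = selective_search(image)   # hypothetical: ~2,000 (x1, y1, x2, y2) boxes
# features  = rcnn_features(image, proposals)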
Image retrieved from https://miro.medium.com/v2/resize:fit:1100/format:webp/0*2DfHy1gJrjh5yj7F
Image retrieved from https://viso.ai/deep-learning/faster-r-cnn-2/
Implement R-CNN¶
# R-CNN: backbone with PyTorch
import torch.nn as nn
from torchvision.models import vgg16, VGG16_Weights
vgg = vgg16(weights=VGG16_Weights.DEFAULT)
backbone = nn.Sequential(*list(vgg.features.children()))
- nn.Sequential(*list(...)): places all the sub-layers into a sequential block
- *: unpacks the elements from the list
- .features: only the convolutional part of VGG16
- .children(): all layers from that block
- Extract the backbone's output size (the in_features of VGG16's first fully connected layer)
input_dimension = nn.Sequential(*list(
vgg.classifier.children())
)[0].in_features
- Box regressor layer
- Sits on top of the backbone
- 4 outputs for the 4 box coordinates
box_regressor = nn.Sequential(
nn.Linear(input_dimension, 32),
nn.ReLU(),
nn.Linear(32, 4),
)
- Create a new classifier
classifier = nn.Sequential(
nn.Linear(input_dimension, 512),
nn.ReLU(),
nn.Linear(512, num_classes),
)
Here is the object detection class, putting all the parts together:
class ObjectDetectorCNN(nn.Module):
def __init__(self):
super(ObjectDetectorCNN, self).__init__()
vgg = vgg16(weights=VGG16_Weights.DEFAULT)
self.backbone = nn.Sequential(*list(vgg.features.children()))
input_features = nn.Sequential(*list(vgg.classifier.children()))[0].in_features
self.classifier = nn.Sequential(
nn.Linear(input_features, 512),
nn.ReLU(),
nn.Linear(512, 2),
)
self.box_regressor = nn.Sequential(
nn.Linear(input_features, 32),
nn.ReLU(),
nn.Linear(32, 4),
)
def forward(self, x):
features = self.backbone(x)
features = torch.flatten(features, 1)  # flatten the feature maps before the linear heads
bboxes = self.box_regressor(features)
classes = self.classifier(features)
return bboxes, classes
By putting together all parts, this class defines a Convolutional Neural Network (CNN) model for object detection, built on top of a pre-trained VGG16 model from torchvision. It outputs:
- Class predictions (e.g., object or background)
- Bounding box predictions (coordinates of detected object)
import torch.nn as nn
from torchvision.models import vgg16, VGG16_Weights
class ObjectDetectorCNN(nn.Module):
def __init__(self):
super(ObjectDetectorCNN, self).__init__()
# Load a pre-trained VGG16 model with default ImageNet weights
vgg = vgg16(weights=VGG16_Weights.DEFAULT)
# Use the convolutional feature extractor part of VGG16 as the backbone
self.backbone = nn.Sequential(*list(vgg.features.children()))
# Get the number of input features for the first fully connected layer in VGG16
input_features = nn.Sequential(*list(vgg.classifier.children()))[0].in_features
# Define a new classifier head for binary classification (2 classes)
self.classifier = nn.Sequential(
nn.Linear(input_features, 512),
nn.ReLU(),
nn.Linear(512, 2), # Output: class scores (e.g., object vs. background)
)
# Define a bounding box regressor head to predict [x1, y1, x2, y2]
self.box_regressor = nn.Sequential(
nn.Linear(input_features, 32),
nn.ReLU(),
nn.Linear(32, 4), # Output: bounding box coordinates
)
def forward(self, x):
# Extract features from the input image using the CNN backbone
features = self.backbone(x)
# 'features' will be a feature map (4D tensor)
features = torch.flatten(features, 1)
# Apply the classifier and regressor heads
bboxes = self.box_regressor(features)
classes = self.classifier(features)
return bboxes, classes # Return both bounding boxes and class scores
- Backbone (self.backbone): VGG16's convolutional layers, used to extract features from the image.
- Classifier (self.classifier): a small feedforward network that predicts the class of the object (binary in this case).
- Box Regressor (self.box_regressor): a network that predicts the bounding box coordinates for the detected object.
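As a quick sanity check (a sketch, not part of the original notebook), a dummy batch can be pushed through the class above to confirm the output shapes; VGG16 maps a 224×224 input to a 512×7×7 feature map, which flattens to the 25088 features expected by the heads:
detector = ObjectDetectorCNN()
detector.eval()

x = torch.randn(2, 3, 224, 224)       # dummy batch of two RGB images
with torch.no_grad():
    bboxes, class_scores = detector(x)

print(bboxes.shape)        # torch.Size([2, 4]) -> (x1, y1, x2, y2) per image
print(class_scores.shape)  # torch.Size([2, 2]) -> scores for the two classes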
Dogs vs Cats (Bounding Boxes Added)¶
A modified version of Kaggle's classic "Dogs vs Cats" dataset with bounding box annotations; the data is retrieved from Kaggle.
import os
import xml.etree.ElementTree as ET
from PIL import Image
import torch
from torch.utils.data import Dataset
import torchvision.transforms as T
class DogsCatsDataset(Dataset):
def __init__(self, image_dir, annot_dir, transform=None, target_size=(224, 224)):
self.image_dir = image_dir
self.annot_dir = annot_dir
self.transform = transform
self.image_files = [f for f in os.listdir(image_dir) if f.endswith(".png")]
self.label_map = {"cat": 0, "dog": 1}
self.target_size = target_size # for resizing boxes
def __len__(self):
return len(self.image_files)
def __getitem__(self, idx):
img_filename = self.image_files[idx]
img_path = os.path.join(self.image_dir, img_filename)
annot_path = os.path.join(self.annot_dir, img_filename.replace(".png", ".xml"))
image = Image.open(img_path).convert("RGB")
original_width, original_height = image.size
# Parse XML
tree = ET.parse(annot_path)
root = tree.getroot()
label_name = root.find("object").find("name").text
label = self.label_map[label_name]
bbox_xml = root.find("object").find("bndbox")
xmin = float(bbox_xml.find("xmin").text)
ymin = float(bbox_xml.find("ymin").text)
xmax = float(bbox_xml.find("xmax").text)
ymax = float(bbox_xml.find("ymax").text)
# Scale bbox to match resized image size
target_w, target_h = self.target_size
x_scale = target_w / original_width
y_scale = target_h / original_height
xmin_scaled = xmin * x_scale
xmax_scaled = xmax * x_scale
ymin_scaled = ymin * y_scale
ymax_scaled = ymax * y_scale
bbox = torch.tensor([xmin_scaled, ymin_scaled, xmax_scaled, ymax_scaled], dtype=torch.float32)
# Resize image
if self.transform:
image = self.transform(image)
return image, bbox, torch.tensor(label)
from torch.utils.data import random_split
transform = T.Compose([
T.Resize((224, 224)),
T.ToTensor(),
])
dataset = DogsCatsDataset(
image_dir="./Data/dog-and-cat-detection/images",
annot_dir="./Data/dog-and-cat-detection/annotations",
transform=transform
)
# Split sizes (e.g., 80% train / 20% test)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
# Random split
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
from torch.utils.data import DataLoader
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
Look at the Images¶
for images, bboxes, labels in train_loader:
print(f"Batch image shape: {images.shape}") # Should be [32, 3, 224, 224]
print(f"Batch label shape: {bboxes.shape}") # Should be [32, 4]
print(f"Batch label shape: {labels.shape}") # Should be [32]
break
Batch image shape: torch.Size([32, 3, 224, 224]) Batch label shape: torch.Size([32, 4]) Batch label shape: torch.Size([32])
import torchvision.transforms.functional as F
# Look at the data
def draw_bbox(image, bbox, label, color='red'):
image = F.to_pil_image(image.cpu())
draw = ImageDraw.Draw(image)
draw.rectangle(bbox.tolist(), outline=color, width=3)
draw.text((bbox[0].item(), bbox[1].item()), label, fill=color)
return image
# Display a few predictions
from PIL import ImageDraw
classes = {0: "cat", 1: "dog"}
sample_images = images[:3]
sample_bboxes = bboxes[:3]
sample_labels = labels[:3]
for i in range(3):
img = draw_bbox(sample_images[i], sample_bboxes[i], classes[sample_labels[i].item()])
plt.imshow(img)
plt.axis("off")
plt.title(f"Actual: {classes[sample_labels[i].item()]}")
plt.show()
Prediction¶
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ObjectDetectorCNN().to(device)
# Define loss functions
criterion_cls = nn.CrossEntropyLoss() # for classification
criterion_bbox = nn.MSELoss() # for bounding box regression
optimizer = optim.Adam(model.parameters(), lr=1e-4)
from tqdm import tqdm
num_epochs = 4
for epoch in range(num_epochs):
model.train()
total_loss = 0.0
for images, bboxes, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
images = images.to(device)
bboxes = bboxes.to(device)
labels = labels.to(device)
optimizer.zero_grad()
pred_bboxes, class_logits = model(images)
# Compute losses
loss_cls = criterion_cls(class_logits, labels)
loss_bbox = criterion_bbox(pred_bboxes, bboxes)
loss = loss_cls + loss_bbox
loss.backward()
optimizer.step()
total_loss += loss.item()
avg_loss = total_loss / len(train_loader)
print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")
Epoch 1/4: 100%|███████████████████████████████████████████████████████████████████████| 25/25 [09:32<00:00, 22.89s/it]
Epoch [1/4], Loss: 4456.7773
Epoch 2/4: 100%|███████████████████████████████████████████████████████████████████████| 25/25 [09:33<00:00, 22.94s/it]
Epoch [2/4], Loss: 842.4922
Epoch 3/4: 100%|███████████████████████████████████████████████████████████████████████| 25/25 [09:34<00:00, 23.00s/it]
Epoch [3/4], Loss: 402.2148
Epoch 4/4: 100%|███████████████████████████████████████████████████████████████████████| 25/25 [09:35<00:00, 23.02s/it]
Epoch [4/4], Loss: 274.2585
Look at Predictions¶
import matplotlib.pyplot as plt
model.eval()
test_loss = 0.0
all_preds = []
all_pred_bboxes = []
all_targets_bboxes = []
all_targets = []
with torch.no_grad():
for images, bboxes, labels in test_loader:
images = images.to(device)
bboxes = bboxes.to(device)
labels = labels.to(device)
# Forward pass
pred_bboxes, class_logits = model(images)
all_pred_bboxes.extend(pred_bboxes.cpu().numpy())
# Compute losses
loss_cls = criterion_cls(class_logits, labels)
loss_bbox = criterion_bbox(pred_bboxes, bboxes)
loss = loss_cls + loss_bbox
test_loss += loss.item()
# Store predictions and targets (optional, for metrics or visualization)
preds = torch.argmax(class_logits, dim=1)
all_preds.extend(preds.cpu().numpy())
all_targets_bboxes.extend(bboxes.cpu().numpy())
all_targets.extend(labels.cpu().numpy())
# Print average test loss
avg_test_loss = test_loss / len(test_loader)
print(f"\n✅ Test Loss: {avg_test_loss:.4f}")
✅ Test Loss: 371.3380
font = {'size' : 6}
plt.rc('font', **font)
n_fig = 5
sample_images = images[:n_fig]
pred_bboxes = pred_bboxes[:n_fig]
pred_labels = torch.argmax(class_logits, dim=1)[:n_fig]
#
act_bboxes = bboxes[:n_fig]
act_labels = labels[:n_fig]
for i in range(n_fig):
plt.figure(figsize=(10, 5))
ax1=plt.subplot(1,2,1)
img = draw_bbox(sample_images[i], act_bboxes[i], classes[act_labels[i].item()])
plt.imshow(img)
plt.axis("off")
plt.title(f"Actual: {classes[act_labels[i].item()]}", fontsize=16)
ax1=plt.subplot(1,2,2)
img = draw_bbox(sample_images[i], pred_bboxes[i], classes[pred_labels[i].item()])
plt.imshow(img)
plt.axis("off")
plt.title(f"Predicted: {classes[pred_labels[i].item()]}", fontsize=16)
plt.show()
Implement Faster R-CNN¶
from torchvision.models.detection import fasterrcnn_resnet50_fpn
# Load a pre-trained Faster R-CNN model with ResNet50-FPN backbone
model_fast_rcnn = fasterrcnn_resnet50_fpn(pretrained=True)
# Define the number of classes and the classifier's input size
num_classes = 2
in_features = model_fast_rcnn.roi_heads.box_predictor.cls_score.in_features
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
# Replace model_fast_rcnn's box predictor with one that has the desired number of classes
model_fast_rcnn.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
C:\Users\mrezv\anaconda3\envs\faiss-env\lib\site-packages\torchvision\models\_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead. warnings.warn( C:\Users\mrezv\anaconda3\envs\faiss-env\lib\site-packages\torchvision\models\_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=FasterRCNN_ResNet50_FPN_Weights.COCO_V1`. You can also use `weights=FasterRCNN_ResNet50_FPN_Weights.DEFAULT` to get the most up-to-date weights. warnings.warn(msg)
class FasterRcnnDogsCatsDataset(torch.utils.data.Dataset):
def __init__(self, image_dir, annot_dir, transform=None, target_size=(224, 224)):
self.image_dir = image_dir
self.annot_dir = annot_dir
self.transform = transform
self.image_files = [f for f in os.listdir(image_dir) if f.endswith(".png")]
self.label_map = {"cat": 0, "dog": 1}  # note: torchvision's Faster R-CNN reserves label 0 for background
self.target_size = target_size # for resizing boxes
def __len__(self):
return len(self.image_files)
def __getitem__(self, idx):
img_filename = self.image_files[idx]
img_path = os.path.join(self.image_dir, img_filename)
annot_path = os.path.join(self.annot_dir, img_filename.replace(".png", ".xml"))
image = Image.open(img_path).convert("RGB")
original_width, original_height = image.size
# Parse XML
tree = ET.parse(annot_path)
root = tree.getroot()
obj = root.find("object")
label = self.label_map[obj.find("name").text]
bbox = obj.find("bndbox")
xmin = float(bbox.find("xmin").text)
ymin = float(bbox.find("ymin").text)
xmax = float(bbox.find("xmax").text)
ymax = float(bbox.find("ymax").text)
# Scale bbox to match resized image size
target_w, target_h = self.target_size
x_scale = target_w / original_width
y_scale = target_h / original_height
xmin_scaled = xmin * x_scale
xmax_scaled = xmax * x_scale
ymin_scaled = ymin * y_scale
ymax_scaled = ymax * y_scale
boxes = torch.tensor([[xmin_scaled, ymin_scaled, xmax_scaled, ymax_scaled]], dtype=torch.float32)
labels = torch.tensor([label], dtype=torch.int64)
target = {"boxes": boxes, "labels": labels}
if self.transform:
image = self.transform(image)
return image, target
from torchvision.transforms import ToTensor
transform = transforms.Compose([
transforms.Resize((224, 224)),
transforms.ToTensor()
])
image_dir="./Data/dog-and-cat-detection/images"
annot_dir="./Data/dog-and-cat-detection/annotations"
dataset = FasterRcnnDogsCatsDataset(image_dir, annot_dir, transform=transform, target_size=(224, 224))
dataloader = torch.utils.data.DataLoader(dataset, batch_size=5, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))
# Split sizes (e.g., 80% train / 20% test)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
# Random split
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
from torch.utils.data import DataLoader
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=5, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=5, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))
for images, targets in train_loader:
images = list(img.to(device) for img in images)
targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
break
targets
[{'boxes': tensor([[128.1280, 63.9039, 169.7920, 135.2072]]), 'labels': tensor([1])}, {'boxes': tensor([[34.4960, 0.5973, 97.2160, 94.9760]]), 'labels': tensor([1])}, {'boxes': tensor([[ 61.3760, 54.5340, 127.2320, 142.4921]]), 'labels': tensor([0])}, {'boxes': tensor([[ 87.2308, 17.9200, 208.3846, 136.6400]]), 'labels': tensor([1])}, {'boxes': tensor([[ 29.6092, 57.6000, 96.5517, 139.5200]]), 'labels': tensor([0])}]
%time
classes = {0: "cat", 1: "dog"}
for i in range(3):
sample_images = images[i]
sample_bboxes = targets[i]['boxes']
sample_labels = targets[i]['labels']
img = draw_bbox(sample_images, sample_bboxes[0], classes[sample_labels.item()])
plt.imshow(img)
plt.axis("off")
plt.title(f"Actual: {classes[sample_labels.item()]}")
plt.show()
CPU times: total: 0 ns Wall time: 0 ns
Prediction¶
%time
model_fast_rcnn.to(device)
model_fast_rcnn.train()
optimizer = torch.optim.Adam(model_fast_rcnn.parameters(), lr=1e-4)
num_epochs = 2
for epoch in range(num_epochs):
for images, targets in train_loader:
images = list(img.to(device) for img in images)
targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
loss_dict = model_fast_rcnn(images, targets)
losses = sum(loss for loss in loss_dict.values())
optimizer.zero_grad()
losses.backward()
optimizer.step()
print(f"Epoch {epoch+1} - Loss: {losses.item():.4f}")
CPU times: total: 0 ns Wall time: 0 ns Epoch 1 - Loss: 0.1005 Epoch 2 - Loss: 0.0754
model_fast_rcnn.eval() # set model to eval mode
all_preds = []
all_actual = []
with torch.no_grad():
for images, targets in test_loader:
images = [img.to(device) for img in images]
# Run inference
outputs = model_fast_rcnn(images)
# Move outputs to CPU and store
for i in range(len(images)):
pred_boxes = outputs[i]['boxes'].cpu()
pred_labels = outputs[i]['labels'].cpu()
pred_scores = outputs[i]['scores'].cpu()
all_preds.append({
'boxes': pred_boxes,
'labels': pred_labels,
'scores': pred_scores,
'image_id': i # Optional: track which image
})
all_actual.append({
'image': images[i],
'bboxes': targets[i]
})
pred_scores
tensor([0.0825, 0.0727])
Look at Prediction¶
# Draw one or more bounding boxes (optionally with confidence scores) on an image
def draw_multiple_bbox(image, bbox, label, color='red', width=1, pred_scores=None):
image = F.to_pil_image(image.cpu())
draw = ImageDraw.Draw(image)
if len(bbox) >= 1:
for ir in range(len(bbox)):
draw.rectangle(bbox[ir].tolist(), outline=color, width=width)
if pred_scores is not None:
text = f'{label}, Score={int(round(float(pred_scores[ir]) * 100))}%'
else:
text = f'{label}'
draw.text((bbox[ir][0].item(), bbox[ir][1].item()), text, fill=color)
return image
font = {'size' : 6}
plt.rc('font', **font)
n_fig = 10
for i in range(n_fig):
# Sample input
sample_images = all_actual[i]['image']
act_bboxes = all_actual[i]['bboxes']['boxes']
act_label = all_actual[i]['bboxes']['labels']
#
prediction = all_preds[i]
# Extract data
pred_boxes = prediction['boxes']
pred_labels = prediction['labels']
pred_scores = prediction['scores']
# Apply a confidence threshold
threshold = 0.15
keep = pred_scores > threshold
pred_boxes = pred_boxes[keep]
pred_labels = [classes[l.item()] for l in pred_labels[keep]]
plt.figure(figsize=(10, 5))
ax1=plt.subplot(1,2,1)
img = draw_bbox(sample_images, act_bboxes[0], classes[act_label.item()])
plt.imshow(img)
plt.axis("off")
plt.title(f"Actual: {classes[act_label.item()]}", fontsize=16)
ax1=plt.subplot(1,2,2)
if len(pred_labels)>=1:
img = draw_multiple_bbox(sample_images, pred_boxes, pred_labels[0], pred_scores=None)
else:
img = draw_multiple_bbox(sample_images, pred_boxes, None, pred_scores=None)
plt.imshow(img)
plt.axis("off")
if len(pred_labels)>=1: plt.title(f"Predicted: {pred_labels[0]}", fontsize=16)
plt.show()
Intersection over Union (IoU)¶
Intersection over Union (IoU) is a standard evaluation metric used in object detection tasks to measure how well the predicted bounding box matches the ground truth bounding box.
Here’s what each term means:
- Object of interest: The item in the image we want to detect (e.g., a dog).
- Ground truth box: The accurately labeled box that perfectly surrounds the object of interest (usually drawn by a human).
- Predicted box: The box output by the detection model.
IoU is calculated as:
$IoU = \frac{\text{Area of Overlap (Intersection)}}{\text{Area of Union}}$
- Area of Intersection: The area where the predicted box and the ground truth box overlap.
- Area of Union: The total area covered by both boxes combined (without double-counting the overlap).
Interpretation:
- IoU = 0: No overlap between the predicted and ground truth boxes.
- IoU = 1: Perfect overlap — the predicted box exactly matches the ground truth.
- IoU > 0.5: Generally considered a "good" prediction (threshold can vary based on task).
The figure below is a diagrammatic representation of the formula used to calculate IoU:
- Small Example
# Two sets of boxes (x1, y1, x2, y2)
bbox_1 = [63, 63, 162, 162]
bbox_2 = [71, 71, 178, 178]
bbox_1 = torch.tensor(bbox_1).unsqueeze(0)
bbox_2 = torch.tensor(bbox_2).unsqueeze(0)
from torchvision.ops import box_iou
iou = box_iou(bbox_1, bbox_2)
print(iou)
tensor([[0.6385]])
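The same number can be reproduced by hand, which makes the formula concrete (a minimal sketch for axis-aligned boxes, equivalent to torchvision.ops.box_iou here):
def compute_iou(box_a, box_b):
    # Intersection rectangle
    ix1, iy1 = max(box_a[0], box_b[0]), max(box_a[1], box_b[1])
    ix2, iy2 = min(box_a[2], box_b[2]), min(box_a[3], box_b[3])
    inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
    # Union = area_a + area_b - intersection
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    return inter / (area_a + area_b - inter)

print(compute_iou([63, 63, 162, 162], [71, 71, 178, 178]))   # ~0.6385, matching box_iou above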
model_fast_rcnn.eval() # set model to eval mode
with torch.no_grad():
for images, targets in test_loader:
images = [img.to(device) for img in images]
# Run inference
outputs = model_fast_rcnn(images)
break
print(outputs)
[{'boxes': tensor([[ 59.1766, 35.7878, 160.2520, 102.8107], [ 48.4511, 18.7178, 175.4347, 129.9121], [ 29.1291, 29.2868, 156.6243, 188.5912]]), 'labels': tensor([1, 1, 1]), 'scores': tensor([0.9580, 0.4953, 0.0535])}, {'boxes': tensor([[ 65.2527, 25.3158, 167.1676, 100.0974], [126.5162, 24.3283, 165.3551, 59.8299], [141.5926, 67.2581, 208.0837, 138.8255], [ 50.2338, 16.2549, 191.9981, 135.2037], [ 94.6246, 19.3349, 147.2737, 104.8754], [114.5969, 24.8429, 169.7292, 81.4180], [ 10.2869, 118.3336, 84.3468, 183.6370]]), 'labels': tensor([1, 1, 1, 1, 1, 1, 1]), 'scores': tensor([0.9674, 0.7978, 0.5249, 0.3867, 0.2823, 0.2576, 0.0500])}, {'boxes': tensor([[ 15.2346, 38.0855, 86.5071, 149.0240], [ 63.7119, 48.9329, 86.0664, 87.5123], [ 33.2000, 75.4455, 78.4895, 153.8040], [ 19.4210, 46.1877, 84.5977, 91.5905]]), 'labels': tensor([1, 1, 1, 1]), 'scores': tensor([0.3836, 0.3575, 0.1281, 0.0848])}, {'boxes': tensor([[132.1436, 6.1657, 153.0520, 62.8487], [131.2393, 41.7011, 195.3923, 168.2761], [139.3058, 48.7482, 194.3261, 116.2357], [129.5448, 3.8996, 166.7290, 68.5764], [ 14.0309, 80.1299, 141.8325, 207.4180], [148.1124, 76.1309, 194.3829, 115.8646], [153.0909, 73.1803, 193.6847, 171.0162], [122.3342, 3.2140, 154.2494, 80.4919], [ 30.8971, 88.1470, 112.4306, 178.5684], [124.3426, 6.4704, 192.4521, 119.8814]]), 'labels': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 'scores': tensor([0.7861, 0.6424, 0.4685, 0.2317, 0.2105, 0.1900, 0.1517, 0.1080, 0.0639, 0.0583])}, {'boxes': tensor([[ 61.5898, 24.0701, 187.2649, 154.6089], [151.2988, 33.1828, 182.9017, 62.8595]]), 'labels': tensor([1, 1]), 'scores': tensor([0.4711, 0.0856])}]
boxes = outputs[0]["boxes"]
scores = outputs[0]["scores"]
print('------boxes------')
print(boxes)
print('\n')
print('------scores------')
print(scores)
------boxes------ tensor([[ 59.1766, 35.7878, 160.2520, 102.8107], [ 48.4511, 18.7178, 175.4347, 129.9121], [ 29.1291, 29.2868, 156.6243, 188.5912]]) ------scores------ tensor([0.9580, 0.4953, 0.0535])
all_preds[0]['boxes']
tensor([[ 39.0378, 19.4280, 174.5359, 136.4559], [ 67.7851, 84.0135, 143.2345, 169.7757], [ 40.7384, 21.7338, 119.9496, 112.0267], [ 85.3279, 23.2526, 165.4138, 113.2446], [ 52.8781, 55.6788, 194.7459, 171.7679], [125.6090, 28.8947, 164.5665, 55.3016]])
Non-Max Suppression (NMS)¶
font = {'size' : 6}
plt.rc('font', **font)
plt.figure(figsize=(15, 10))
i = 1
# Sample input
sample_images = all_actual[i]['image']
prediction = all_preds[i]
# Extract data
pred_boxes = prediction['boxes']
pred_labels = prediction['labels']
pred_scores = prediction['scores']
# Apply a confidence threshold
threshold = 0.2
keep = pred_scores > threshold
pred_boxes = pred_boxes[keep]
pred_labels = [classes[l.item()] for l in pred_labels[keep]]
pred_scores = pred_scores[keep]  # keep the scores aligned with the filtered boxes
ax1=plt.subplot(1,2,1)
#(image, bbox, label, color='red', width=1, pred_scores=None)
img = draw_multiple_bbox(sample_images, pred_boxes, pred_labels[0], color='red', pred_scores=pred_scores, width=2)
plt.imshow(img)
plt.axis("off")
plt.title(f"Predicted: {pred_labels[0]}", fontsize=16)
plt.show()
Non-Max Suppression (NMS) is a popular technique used in object detection to filter and select the most relevant bounding boxes from many overlapping predictions.
How it works:
Non-max (confidence filtering):
- Start by keeping only boxes that have a high confidence score (i.e., the model is confident they contain an object).
Suppression (overlap filtering):
- From the remaining boxes, keep the box with the highest confidence score.
- Suppress (discard) all other boxes that have a high IoU (i.e., large overlap) with this box — because they are likely duplicate detections.
- Repeat the process for the next highest confidence box, and so on.
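A minimal, illustrative implementation of this greedy procedure in plain PyTorch (torchvision.ops.nms, used in the next section, is the optimized equivalent):
import torch
from torchvision.ops import box_iou

def simple_nms(boxes, scores, iou_threshold=0.5):
    # boxes: [N, 4] in (x1, y1, x2, y2); scores: [N]. Returns indices of the boxes to keep.
    order = scores.argsort(descending=True)   # most confident boxes first
    keep = []
    while order.numel() > 0:
        best = order[0]
        keep.append(best.item())              # keep the highest-scoring remaining box
        if order.numel() == 1:
            break
        # Suppress the remaining boxes that overlap the kept box too much
        ious = box_iou(boxes[best].unsqueeze(0), boxes[order[1:]]).squeeze(0)
        order = order[1:][ious <= iou_threshold]
    return torch.tensor(keep)

# e.g. simple_nms(pred_boxes, pred_scores, iou_threshold=0.1) should match torchvision's nms below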
Non-Max Suppression in PyTorch¶
In PyTorch, NMS is used to filter out redundant bounding boxes based on their confidence scores and overlap (IoU).
- Boxes: a tensor of shape [N, 4], where each box is represented by 4 coordinates — typically [x1, y1, x2, y2] (top-left and bottom-right corners).
- Scores: a tensor of shape [N], containing the confidence score for each bounding box (higher means more confident).
- iou_threshold: a float between 0.0 and 1.0 specifying how much overlap is allowed; boxes with IoU greater than this threshold are suppressed.
Important: the boxes must be in the [x1, y1, x2, y2] format for PyTorch's nms to work correctly.
from torchvision.ops import nms
pred_boxes = prediction['boxes']
pred_labels = prediction['labels']
pred_scores = prediction['scores']
box_indices = nms(
boxes=pred_boxes,
scores=pred_scores,
iou_threshold=0.1,
)
print(box_indices)
tensor([0])
The returned indices select the bounding boxes that survive suppression:
filtered_boxes = pred_boxes[box_indices]
pred_scores[box_indices]
tensor([0.9896])
kept_labels = [classes[l.item()] for l in pred_labels[box_indices]]
font = {'size' : 6}
plt.rc('font', **font)
plt.figure(figsize=(15, 10))
ax1=plt.subplot(1,2,1)
#(image, bbox, label, color='red', width=1, pred_scores=None)
img = draw_multiple_bbox(sample_images, pred_boxes[box_indices],
kept_labels[0], color='red', pred_scores=pred_scores[box_indices], width=2)
plt.imshow(img)
plt.axis("off")
plt.title(f"Predicted: {kept_labels[0]}", fontsize=16)
plt.show()