Bird Call Audio Classification with a Convolutional Neural Network P2
Import needed packages¶
This block loads the needed packages and sets up the “device” object to move computations to the GPU.
import matplotlib.pyplot as plt
import numpy as np
import os
%matplotlib inline
from PIL import Image
import torch
from torchvision import datasets, models, transforms
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
Helper function¶
This helper function gets bird call labels from the training file folder structure. The files are organized into folders by bird call labels.
def find_classes(dir):
    """Map the sub-folder names of *dir* to integer class labels.

    The training files are organized one sub-folder per bird-call class,
    mirroring the layout torchvision's ImageFolder expects.

    Parameters
    ----------
    dir : str
        Path to the training root folder.

    Returns
    -------
    tuple
        ``(classes, class_to_idx, idx_to_class)`` where ``classes`` is the
        sorted list of class names, ``class_to_idx`` maps name -> index,
        and ``idx_to_class`` is the inverse mapping.
    """
    # Only sub-directories are classes; ignore stray files so the indices
    # stay consistent with torchvision.datasets.ImageFolder (the original
    # listed every entry, so a stray file would become a bogus class).
    classes = sorted(
        entry for entry in os.listdir(dir)
        if os.path.isdir(os.path.join(dir, entry))
    )
    class_to_idx = {name: idx for idx, name in enumerate(classes)}
    idx_to_class = {idx: name for name, idx in class_to_idx.items()}
    return classes, class_to_idx, idx_to_class
classes_list, _, labels = find_classes('trn/')
Load data¶
This loads the image data using the torchvision package. It transforms the image to tensor for training. It then creates a training and validation data loader.
# Preprocessing pipeline: resize every spectrogram image to the 224x224
# input size the ResNet backbone expects, convert it to a tensor, and
# normalize it. The scalar mean/std broadcast across all channels.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    # transforms.Grayscale(),
    transforms.ToTensor(),
    transforms.Normalize((0.5), (0.25)),
])

# One sub-folder per bird-call class, as ImageFolder expects.
data_path = 'trn/'
dataset = datasets.ImageFolder(root=data_path, transform=transform)

# Hold out roughly 10% of the images for validation; the fixed generator
# seed makes the split reproducible across runs.
val_size = round(len(dataset) * .1)
train_size = len(dataset) - val_size
training_dataset, validation_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

training_loader = torch.utils.data.DataLoader(training_dataset, batch_size=300, shuffle=True)
validation_loader = torch.utils.data.DataLoader(validation_dataset, batch_size=300, shuffle=False)
Tensor to image¶
The function below converts tensors back to images for inspection. The following block displays 20 sample images and their labels.
def im_convert(tensor):
    """Undo the training normalization and return an HWC numpy image.

    Parameters
    ----------
    tensor : torch.Tensor
        A (C, H, W) image tensor normalized with mean 0.5 and std 0.25
        (matching the training transform above).

    Returns
    -------
    numpy.ndarray
        An (H, W, C) float image clipped to [0, 1], ready for imshow.
    """
    image = tensor.cpu().clone().detach().numpy()
    # Channels-first (C, H, W) -> channels-last (H, W, C) for matplotlib.
    image = image.transpose(1, 2, 0)
    # Invert Normalize(mean=0.5, std=0.25): x = t * std + mean.
    # (The original multiplied by 0.5 here, which did not match the
    # std=0.25 used by the dataset transform.)
    image = image * 0.25 + 0.5
    image = image.clip(0, 1)
    return image
# Display 20 sample training images together with their bird-call labels.
dataiter = iter(training_loader)
# `.next()` was removed from iterators in Python 3 / recent PyTorch;
# the builtin next() is the supported form.
images, label = next(dataiter)
fig = plt.figure(figsize=(25, 4))
for idx in np.arange(20):
    ax = fig.add_subplot(2, 10, idx + 1, xticks=[], yticks=[])
    plt.imshow(im_convert(images[idx]))
    # Look up the class name of THIS image via its label index. The
    # original indexed `labels` by subplot position, so the titles did
    # not correspond to the images shown.
    ax.set_title([labels[label[idx].item()]])
Define model¶
The following block defines the model. It uses a pretrained ResNet-50 model. I modified the parameters of the model to fit my training set and class labels. I used cross entropy loss and an adaptive gradient-based optimization with a learning rate of 0.001.
I did not expect this pretrained model to do very well on my set because it was trained on color images of objects. My set has grayscale images of signals. This took 6.5 hours of wall time with a lower validation accuracy than my custom trained model shown in Part 3, which took 2.25 hours.
# Load an ImageNet-pretrained ResNet-50 and freeze every backbone weight so
# only the replacement classification head is trained.
model = models.resnet50(pretrained=True)
model = model.to(device)
for frozen_param in model.parameters():
    frozen_param.requires_grad = False

# Swap the final fully-connected layer for a small two-layer head sized for
# the 265 bird-call classes in this dataset.
classifier_head = nn.Sequential(
    nn.Linear(2048, 500),
    nn.ReLU(inplace=True),
    nn.Linear(500, 265),
)
model.fc = classifier_head.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
%%time
# Train for `epochs` epochs, recording per-epoch loss and accuracy for both
# the training split and the held-out validation split.
epochs = 15
running_loss_history = []
running_corrects_history = []
val_running_loss_history = []
val_running_corrects_history = []
batch_size = 300  # NOTE(review): unused here — the DataLoaders fix their own batch size

for e in range(epochs):
    running_loss = 0.0
    running_corrects = 0.0
    val_running_loss = 0.0
    val_running_corrects = 0.0

    # ---- training pass ----
    # Put BatchNorm/Dropout layers into training mode. The original never
    # toggled modes, so validation batches also updated BatchNorm running
    # statistics, contaminating the frozen backbone.
    model.train()
    for inputs, labels in training_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        _, preds = torch.max(outputs, 1)
        # criterion returns the batch MEAN loss; weight it by the batch size
        # so dividing by the dataset length below yields a true per-sample
        # average (the original summed batch means and divided by the
        # dataset size, underestimating the loss by roughly batch_size).
        running_loss += loss.item() * inputs.size(0)
        running_corrects += torch.sum(preds == labels)

    # ---- validation pass ----
    model.eval()  # use frozen BatchNorm statistics, disable dropout
    with torch.no_grad():
        for val_inputs, val_labels in validation_loader:
            val_inputs = val_inputs.to(device)
            val_labels = val_labels.to(device)
            val_outputs = model(val_inputs)
            val_loss = criterion(val_outputs, val_labels)
            _, val_preds = torch.max(val_outputs, 1)
            val_running_loss += val_loss.item() * val_inputs.size(0)
            val_running_corrects += torch.sum(val_preds == val_labels)

    # ---- epoch metrics ----
    epoch_loss = running_loss / len(training_dataset)
    epoch_acc = running_corrects.float() / len(training_dataset)
    running_loss_history.append(epoch_loss)
    running_corrects_history.append(epoch_acc)

    val_epoch_loss = val_running_loss / len(validation_dataset)
    val_epoch_acc = val_running_corrects.float() / len(validation_dataset)
    val_running_loss_history.append(val_epoch_loss)
    val_running_corrects_history.append(val_epoch_acc)

    print('epoch :', (e + 1))
    print('training loss: {:.4f}, acc {:.4f} '.format(epoch_loss, epoch_acc.item()))
    print('validation loss: {:.4f}, validation acc {:.4f} '.format(val_epoch_loss, val_epoch_acc.item()))

# Persist the fine-tuned weights and release cached GPU memory.
torch.save(model.state_dict(), "resnet_50_birdcall")
torch.cuda.empty_cache()