Bird Call Audio Classification with a Convolutional Neural Network P3
Import packages¶
Here I import the packages needed for the analysis and create a device object so the model can run on a GPU when one is available.
#pip install torch torchvision
import torch
import matplotlib.pyplot as plt
import numpy as np
import torch.nn.functional as F
from torch import nn
from torchvision import datasets, transforms
import os
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
Get labels¶
This function looks through the training image directory and gets labels based on the directory structure. Images are organized within folders which are named after the bird call label.
def find_classes(dir):
    """Derive class labels from the subdirectory names of a training folder.

    Images are organized in folders named after the bird-call label, so the
    folder names double as the class names.

    Parameters
    ----------
    dir : str
        Path to the training image directory; each subfolder is one class.
        (Parameter name kept for backward compatibility even though it
        shadows the ``dir`` builtin.)

    Returns
    -------
    tuple
        ``(classes, class_to_idx, idx_to_class)``: the sorted label list plus
        both directions of the label <-> integer-index mapping.
    """
    # NOTE(review): os.listdir also returns plain files; this assumes the
    # directory contains only class subfolders (ImageFolder assumes the same).
    classes = sorted(os.listdir(dir))
    class_to_idx = {name: i for i, name in enumerate(classes)}
    idx_to_class = {i: name for i, name in enumerate(classes)}
    return classes, class_to_idx, idx_to_class
# Build the class list and the index -> class-name mapping from the training
# directory; `labels` is used later to turn predicted indices into bird names.
classes_list, _, labels = find_classes('trn/')
Load data¶
This block loads the data with transforms (conversion to tensors) using the torchvision library and splits it into training and validation sets, using 42 as the random seed.
# Preprocessing pipeline: resize every spectrogram image to 128x256, collapse
# it to a single grayscale channel, convert to a tensor, and normalize with
# mean 0.5 / std 0.25.
transform = transforms.Compose([
    transforms.Resize((128, 256)),
    transforms.Grayscale(),
    transforms.ToTensor(),
    transforms.Normalize((0.5), (0.25)),
])

data_path = 'trn/'
dataset = datasets.ImageFolder(root=data_path, transform=transform)

# Hold out ~10% of the images for validation; the fixed seed makes the
# split reproducible across runs.
val_size = round(len(dataset) * .1)
train_size = len(dataset) - val_size
training_dataset, validation_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

training_loader = torch.utils.data.DataLoader(
    training_dataset, batch_size=300, shuffle=True)
validation_loader = torch.utils.data.DataLoader(
    validation_dataset, batch_size=300, shuffle=False)
Helper function¶
This helper function will transform a tensor back to an image for inspection. The block after this function displays a sample of the images from the training set with their labels.
def im_convert(tensor):
    """Convert a normalized (C, H, W) image tensor back to a displayable array.

    Inverts the ``transforms.Normalize((0.5), (0.25))`` applied at load time
    and returns an array suitable for ``plt.imshow``.

    Parameters
    ----------
    tensor : torch.Tensor
        A (C, H, W) image tensor (grayscale here, so C == 1).

    Returns
    -------
    numpy.ndarray
        (H, W) array for single-channel input, (H, W, C) otherwise, clipped
        to [0, 1].
    """
    image = tensor.cpu().clone().detach().numpy()
    image = image.transpose(1, 2, 0)  # (C, H, W) -> (H, W, C)
    # Bug fix: the loader normalizes with std 0.25, but the original
    # de-normalized with std 0.5 (and 3-element mean/std arrays that
    # accidentally broadcast the 1-channel image to 3 channels).
    image = image * 0.25 + 0.5
    image = image.clip(0, 1)
    # Drop the trailing channel axis for grayscale so imshow renders it
    # directly as a 2-D image.
    if image.shape[2] == 1:
        image = image[:, :, 0]
    return image
# Display a sample of 20 training images with their class names as titles.
dataiter = iter(training_loader)
# Bug fix: DataLoader iterators no longer expose a .next() method in current
# PyTorch; the builtin next() is the supported way to pull a batch.
images, label = next(dataiter)
fig = plt.figure(figsize=(25, 4))
for idx in range(20):
    ax = fig.add_subplot(2, 10, idx + 1, xticks=[], yticks=[])
    plt.imshow(im_convert(images[idx]))
    # Bug fix: title each subplot with the class of THIS image. The original
    # used labels[idx], which looks up the class named after the subplot
    # position (0-19), not the image's actual label.
    ax.set_title(labels[label[idx].item()])
Defining the model¶
This defines the CNN model I used. It contains two convolutional layers and two fully connected layers with dropout for regularization. The following two blocks in this notebook move the model to GPU memory and set the loss criterion and optimizer. I used cross-entropy loss and Adam, an adaptive gradient-based optimizer, with a learning rate of 0.0001.
class Net(nn.Module):
    """CNN for classifying 128x256 grayscale spectrogram images.

    Two conv+tanh+maxpool stages followed by two fully connected layers with
    dropout for regularization.

    Parameters
    ----------
    num_classes : int, optional
        Number of output classes (default 265, the number of bird-call
        labels in this dataset). Generalized from the original hard-coded
        constant; the default preserves existing behavior.
    """

    def __init__(self, num_classes=265):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=5, padding=2)
        self.act1 = nn.Tanh()
        self.pool1 = nn.MaxPool2d(2)
        self.conv2 = nn.Conv2d(16, 8, kernel_size=3, padding=1)
        self.act2 = nn.Tanh()
        self.pool2 = nn.MaxPool2d(2)
        # After two 2x2 max-pools, a 128x256 input becomes 32x64 with 8
        # channels, hence the flattened size below.
        self.fc1 = nn.Linear(8 * 32 * 64, 1000)
        self.dropout1 = nn.Dropout(0.5)
        self.act3 = nn.Tanh()
        self.fc2 = nn.Linear(1000, num_classes)

    def forward(self, x):
        """Return raw class logits of shape (batch, num_classes)."""
        x = self.pool1(self.act1(self.conv1(x)))
        x = self.pool2(self.act2(self.conv2(x)))
        # flatten(x, 1) keeps the batch dimension intact; the original
        # view(-1, 8*32*64) could silently mis-shape the batch on any
        # unexpected input size.
        x = torch.flatten(x, 1)
        x = self.act3(self.fc1(x))
        x = self.dropout1(x)
        x = self.fc2(x)
        return x
# Instantiate the CNN and move its parameters to the GPU when available.
model = Net().to(device)
model
# Cross-entropy loss (expects raw logits from the model) and the Adam
# optimizer with a small learning rate of 1e-4.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)
Training the model¶
The next block runs the actual training. It keeps track of accuracy and loss for both training and validation. I ran this for 20 epochs with a batch size of 300 images. It overtrained: the training accuracy continued to increase while the validation accuracy stopped improving at about 15 epochs. A few blocks down I ran the code again with 15 epochs, which decreased the wall time from about 3 hours to 2.25 hours while maintaining the same accuracy. The 15-epoch model is the one I retained.
The final accuracy of the model is 0.46 on the validation set, which is fairly decent given that the model takes a five-second segment of audio and classifies it into one of 265 class labels. I plan to experiment with different methods of combining the predictions. A recording of a bird call is often longer than 5 seconds; for instance, a 30-second clip may generate 6 predicted labels. Hopefully, combining them with a simple vote-counting method, or adding another model to do the combining, will increase the predictive power of the base CNN, similar to boosting.
%%time
epochs = 20
running_loss_history = []
running_corrects_history = []
val_running_loss_history = []
val_running_corrects_history = []
batch_size = 300
for e in range(epochs):
running_loss = 0.0
running_corrects = 0.0
val_running_loss = 0.0
val_running_corrects = 0.0
for inputs, labels in training_loader:
inputs = inputs.to(device)
labels = labels.to(device)
outputs = model(inputs)
loss = criterion(outputs, labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()
_, preds = torch.max(outputs, 1)
running_loss += loss.item()
running_corrects += torch.sum(preds == labels.data)
else:
with torch.no_grad():
for val_inputs, val_labels in validation_loader:
val_inputs = val_inputs.to(device)
val_labels = val_labels.to(device)
val_outputs = model(val_inputs)
val_loss = criterion(val_outputs, val_labels)
_, val_preds = torch.max(val_outputs, 1)
val_running_loss += val_loss.item()
val_running_corrects += torch.sum(val_preds == val_labels.data)
epoch_loss = running_loss/len(training_loader)
epoch_acc = running_corrects.float()/ len(training_dataset)
running_loss_history.append(epoch_loss)
running_corrects_history.append(epoch_acc)
val_epoch_loss = val_running_loss / len(validation_loader)
val_epoch_acc = val_running_corrects.float()/ len(validation_dataset)
val_running_loss_history.append(val_epoch_loss)
val_running_corrects_history.append(val_epoch_acc)
print('epoch :', (e+1))
print('training loss: {:.4f}, acc {:.4f} '.format(epoch_loss, epoch_acc.item()))
print('validation loss: {:.4f}, validation acc {:.4f} '.format(val_epoch_loss, val_epoch_acc.item()))
torch.cuda.empty_cache()
%%time
epochs = 12
running_loss_history = []
running_corrects_history = []
val_running_loss_history = []
val_running_corrects_history = []
batch_size = 300
for e in range(epochs):
running_loss = 0.0
running_corrects = 0.0
val_running_loss = 0.0
val_running_corrects = 0.0
for inputs, labels in training_loader:
inputs = inputs.to(device)
labels = labels.to(device)
outputs = model(inputs)
loss = criterion(outputs, labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()
_, preds = torch.max(outputs, 1)
running_loss += loss.item()
running_corrects += torch.sum(preds == labels.data)
else:
with torch.no_grad():
for val_inputs, val_labels in validation_loader:
val_inputs = val_inputs.to(device)
val_labels = val_labels.to(device)
val_outputs = model(val_inputs)
val_loss = criterion(val_outputs, val_labels)
_, val_preds = torch.max(val_outputs, 1)
val_running_loss += val_loss.item()
val_running_corrects += torch.sum(val_preds == val_labels.data)
epoch_loss = running_loss/len(training_loader)
epoch_acc = running_corrects.float()/ len(training_dataset)
running_loss_history.append(epoch_loss)
running_corrects_history.append(epoch_acc)
val_epoch_loss = val_running_loss / len(validation_loader)
val_epoch_acc = val_running_corrects.float()/ len(validation_dataset)
val_running_loss_history.append(val_epoch_loss)
val_running_corrects_history.append(val_epoch_acc)
print('epoch :', (e+1))
print('training loss: {:.4f}, acc {:.4f} '.format(epoch_loss, epoch_acc.item()))
print('validation loss: {:.4f}, validation acc {:.4f} '.format(val_epoch_loss, val_epoch_acc.item()))
torch.cuda.empty_cache()
20 epochs take 2 h 51 min; 15 epochs take 2 h. Accuracy is essentially the same.
# Persist only the trained weights (state_dict) for later inference;
# reload with model.load_state_dict(torch.load("my_model_birdcall")).
torch.save(model.state_dict(), "my_model_birdcall")