Bird Call Audio Classification with a Convolutional Neural Network P1
Part 1 – Processing Audio
Load necessary libraries.
from tqdm.notebook import tqdm
import argparse
import warnings
import librosa
import pandas as pd
import numpy as np
from pathlib import Path
from joblib import delayed, Parallel
from tqdm.notebook import tqdm
import os
import gc
from pathlib import Path
from matplotlib import pyplot as plt
from scipy.ndimage import maximum_filter1d
import math
warnings.filterwarnings('ignore')
import skimage.io
Create a dictionary from the folder structure. The dictionary will be used to iterate through the files.
# Map each audio filename to its relative path, e.g.
# 'XC2628.mp3' -> 'train_audio/<species>/XC2628.mp3'.
# The original ran the inner loop twice: the first pass stored paths with a
# bogus trailing '/' that the second pass immediately overwrote, so a single
# pass produces the identical final dictionary.
PATH_DICT = {}
for folder_path in tqdm(os.listdir('train_audio')):
    for img_path in os.listdir('train_audio/' + folder_path + '/'):
        PATH_DICT[img_path] = 'train_audio/' + folder_path + '/' + img_path
This helper function linearly rescales pixel values into a user-defined minimum and maximum (min–max normalization).
def scale_minmax(X, min=0.0, max=1.0):
    """Linearly rescale ``X`` into the range [min, max] and cast to uint8.

    Parameters
    ----------
    X : np.ndarray
        Numeric array to rescale (here: a dB-scaled mel spectrogram).
    min, max : float
        Target range bounds. The parameter names shadow builtins, but are
        kept unchanged for backward compatibility with existing callers.

    Returns
    -------
    np.ndarray
        uint8 array with X's values mapped from [X.min(), X.max()]
        onto [min, max].
    """
    lo, hi = X.min(), X.max()
    if hi == lo:
        # Constant input: the plain formula divides by zero and (with
        # warnings suppressed) yields NaNs cast to arbitrary uint8 values.
        # Map everything to the lower bound instead.
        return np.full(X.shape, min, dtype=np.uint8)
    X_std = (X - lo) / (hi - lo)
    X_scaled = X_std * (max - min) + min
    return X_scaled.astype(np.uint8)
The function below first converts the raw audio file into an image (mel spectrogram; the x axis is time, the y axis is frequency, and the values are decibels). It rotates the image and flips the pixel values so that darker grays represent louder values. This code also quantizes the picture matrix into roughly 5 second segments with about a 1–2 second overlap. The function takes audio clips under 5 seconds and tiles them to assemble a five second clip from the sample. The function also checks each image for variability in the column and row means. If there was little variability in the means, the clip had a high likelihood of being noise and containing no call. I experimented with different standard-deviation thresholds to get optimal detection. Clips without signal are then saved into a "no call" folder.
def audio_to_specs(audio, sr, file, path):
    """Slice one audio clip into 128x256 mel-spectrogram PNGs.

    Converts ``audio`` to a dB-scaled mel spectrogram, inverts it so louder
    energy is darker, cuts it into 256-frame (~5 s) windows with a 102-frame
    (~1-2 s) overlap, and saves each window as a PNG. Windows whose row/column
    means show too little variance are assumed to be call-free noise and are
    saved under ``trn/no_call/`` instead of ``path``.

    Parameters
    ----------
    audio : np.ndarray    raw mono waveform
    sr : int              sample rate of ``audio``
    file : str            source filename, used in the output PNG names
    path : str            destination folder for windows that contain signal

    Returns
    -------
    str  the literal "completed"
    """
    # Rows = 128 mel bands, columns = time frames.
    mel = librosa.power_to_db(
        librosa.feature.melspectrogram(audio, sr=sr, fmin=0, fmax=11000, n_mels=128, power=2)
    )
    # Min-max scale to fit inside the 0-255 pixel range.
    img = scale_minmax(mel, 0, 255)
    img = np.flip(img, axis=0)  # put low frequencies at the bottom of the image
    img = 255 - img             # invert so more energy == darker pixels

    width = img.shape[1]
    if width < 256:
        # Clip shorter than one window: tile it along the time axis until it
        # is at least 256 frames wide, then emit a single window.
        # (Was `< 255`, an off-by-one that let a width-255 image fall through
        # to the slicing path and produce a bogus [-1, 255] window.)
        reps = math.ceil(256 / width)
        tiled = img
        for _ in range(reps):
            tiled = np.append(tiled, img, axis=1)
        img = tiled
        sli_ind = [[0, 256]]
    else:
        # 256-frame windows starting at 0, 153, 307, ... (step 154, i.e. a
        # 102-frame overlap), plus a final window flush with the right edge.
        # The original bounded this loop with `(width % 256) * 2` iterations,
        # which is zero whenever width is an exact multiple of 256 — silently
        # dropping every window after the first. A while-loop covers all widths.
        sli_ind = [[0, 256]]
        start = 153
        while start + 256 < width:
            sli_ind.append([start, start + 256])
            start += 154
        sli_ind.append([width - 256, width])

    for i, (start, end) in enumerate(sli_ind):
        window = img[:, start:end]
        # Low variance in both the column means and the row means indicates a
        # flat spectrogram, i.e. noise with no bird call. Thresholds were
        # tuned empirically.
        col_means = np.mean(window, axis=0)
        row_means = np.mean(window, axis=1)
        if np.std(row_means) <= 1.8 or np.std(col_means) <= 1.2:
            skimage.io.imsave(os.path.join("trn/no_call/", f'img{file}{str(i)}.png'), window, check_contrast=False)
        else:
            skimage.io.imsave(os.path.join(path, f'img{file}{str(i)}.png'), window, check_contrast=False)
    return "completed"
This code runs the function above and creates a new folder structure to save the generated images, which will be used to train the convolutional network. The cell was also timed to see how long the processing took. I executed this code on a laptop with a Ryzen 7 processor and 16 GB of RAM, and it took about 10 hours.
%%time
if not os.path.exists('trn/no_call/'):
os.makedirs('trn/no_call/')
for key in PATH_DICT:
try:
x, sr = librosa.load(PATH_DICT[key], mono=True)
except:
continue
gpath = PATH_DICT[key]
path = gpath[gpath.find("/") + 1 : gpath.find("/", gpath.find("/") + 1, len(gpath))]
if not os.path.exists('trn/' + path + "/"):
os.makedirs('trn/' + path + "/")
spcs = audio_to_specs(x, sr, key, "trn/" + path + "/")
Below I generated an image of the whole “XC2628.mp3” file as an example.
# Render the spectrogram of one whole file as a sanity check.
x, sr = librosa.load(PATH_DICT['XC2628.mp3'], mono=True)
mel = librosa.power_to_db(
    librosa.feature.melspectrogram(x, sr=sr, fmin=0, fmax=11000, n_mels=128, power=2)
)
# Min-max scale to fit inside the 0-255 range. scale_minmax already returns
# uint8, so the original `img_as_ubyte(img)` call was both redundant and a
# NameError (the function was never imported) — it is removed here.
img = scale_minmax(mel, 0, 255)
img = np.flip(img, axis=0)  # put low frequencies at the bottom in image
img = 255 - img             # invert. make black == more energy
plt.imshow(img[:, :255], cmap='gray', vmin=0, vmax=255)