Bird Call Audio Classification with a Convolutional Neural Network, Part 1

Part 1 – Processing Audio

Load the necessary libraries.

In [1]:
import argparse
import gc
import math
import os
import warnings
from pathlib import Path

import librosa
import numpy as np
import pandas as pd
import skimage.io
from joblib import delayed, Parallel
from matplotlib import pyplot as plt
from scipy.ndimage import maximum_filter1d
from tqdm.notebook import tqdm

warnings.filterwarnings('ignore')

Create a dictionary from the folder structure; it will be used to iterate through the audio files.

In [2]:
PATH_DICT = {}
for folder_path in tqdm(os.listdir('train_audio')):
    for audio_file in os.listdir('train_audio/' + folder_path + '/'):
        PATH_DICT[audio_file] = 'train_audio/' + folder_path + '/' + audio_file

This helper function linearly rescales pixel values to a user-defined minimum and maximum (min-max normalization) and casts the result to 8-bit integers.

In [3]:
def scale_minmax(X, min=0.0, max=1.0):
    # linearly map X into [min, max]
    X_std = (X - X.min()) / (X.max() - X.min())
    X_scaled = X_std * (max - min) + min
    # cast to 8-bit; assumes the target range fits in 0-255
    return X_scaled.astype(np.uint8)
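As a quick sanity check, here is `scale_minmax` applied to a small illustrative array (the definition is repeated so the snippet runs standalone; the values are made up for the example):

```python
import numpy as np

def scale_minmax(X, min=0.0, max=1.0):
    # linearly map X into [min, max], then cast to 8-bit
    X_std = (X - X.min()) / (X.max() - X.min())
    X_scaled = X_std * (max - min) + min
    return X_scaled.astype(np.uint8)

X = np.array([[0.0, 50.0],
              [100.0, 25.0]])
print(scale_minmax(X, 0, 255))
# [[  0 127]
#  [255  63]]
```

The smallest value maps to 0 and the largest to 255, with everything else interpolated linearly before the integer cast truncates the fractions.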

The function below first converts the raw audio into an image (a mel spectrogram; the x-axis is time, the y-axis is frequency, and the values are decibels). It flips the image vertically so low frequencies sit at the bottom and inverts the pixel values so that louder values appear as darker grays. The code then slices the spectrogram into roughly 5-second segments with about a 1-2 second overlap. Clips shorter than 5 seconds are tiled end-to-end to build a full-length segment from the sample. The function also checks each segment for variability in its column and row means: if there is little variability, the clip has a high likelihood of being pure noise with no call. I experimented with different standard-deviation thresholds to get optimal detection. Segments without a detected signal are saved into a "no_call" folder.

In [4]:
def audio_to_specs(audio, sr, file, path):
    # mel spectrogram in decibels: 128 mel bands up to 11 kHz
    mel = librosa.power_to_db(
        librosa.feature.melspectrogram(y=audio, sr=sr, fmin=0, fmax=11000,
                                       n_mels=128, power=2)
    )

    # min-max scale into 0-255
    img = scale_minmax(mel, 0, 255)
    img = np.flip(img, axis=0)  # put low frequencies at the bottom of the image
    img = 255 - img             # invert so that more energy == darker gray

    sli_ind = [[0, 256]]
    if img.shape[1] < 256:
        # clip is shorter than one 256-frame window: tile it until it is wide enough
        reps = math.ceil(256 / img.shape[1]) + 1
        img = np.tile(img, (1, reps))
    elif img.shape[1] > 256:
        # ~256-frame (~6 s) windows with ~102 frames (~2 s) of overlap at each boundary;
        # the loop bound is an upper limit, the break exits once the end is reached
        start = 255
        for _ in range((img.shape[1] // 256) * 2):
            start -= 102
            end = start + 256
            if end < img.shape[1]:
                sli_ind.append([start, end])
                start = end
            else:
                sli_ind.append([img.shape[1] - 256, img.shape[1]])
                break

    for i, (start, end) in enumerate(sli_ind):
        window = img[:, start:end]
        # segments with little variability in row/column means are likely noise
        col_means = np.mean(window, axis=0)
        row_means = np.mean(window, axis=1)
        if np.std(row_means) <= 1.8 or np.std(col_means) <= 1.2:
            out_path = 'trn/no_call/'
        else:
            out_path = path
        # save as PNG
        skimage.io.imsave(os.path.join(out_path, f'img{file}{i}.png'),
                          window, check_contrast=False)
    return "completed"
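To see which windows the slicing logic produces, it can be run on its own as a parameterized helper (`slice_indices` is a name introduced here for illustration, and the 700-frame width is made up; frame-to-seconds figures assume librosa's default hop length of 512 at sr = 22050, so 256 frames ≈ 5.9 s):

```python
def slice_indices(width, win=256, shift=102):
    # ~6 s windows with ~2 s of overlap at each boundary;
    # the last window is anchored to the right edge of the spectrogram
    ind = [[0, win]]
    start = win - 1
    for _ in range((width // win) * 2):  # upper bound; the break exits early
        start -= shift
        end = start + win
        if end < width:
            ind.append([start, end])
            start = end
        else:
            ind.append([width - win, width])
            break
    return ind

print(slice_indices(700))
# [[0, 256], [153, 409], [307, 563], [444, 700]]
```

Each consecutive pair of windows shares roughly 102 frames, and the final window is clamped so it never runs past the end of the image.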

This code runs the function above and creates a new folder structure to save the generated images, which will be used to train the convolutional network. The cell is timed to see how long the processing takes. I executed this code on a laptop with a Ryzen 7 processor and 16 GB of RAM, and it took about 10 hours.

In [5]:
%%time
if not os.path.exists('trn/no_call/'):
    os.makedirs('trn/no_call/')

for key in PATH_DICT:
    try:
        x, sr = librosa.load(PATH_DICT[key], mono=True)
    except Exception:
        # skip files that fail to decode
        continue
    # species folder name: 'train_audio/<species>/<file>.mp3' -> '<species>'
    species = PATH_DICT[key].split('/')[1]
    if not os.path.exists('trn/' + species + '/'):
        os.makedirs('trn/' + species + '/')
    audio_to_specs(x, sr, key, 'trn/' + species + '/')
Wall time: 10h 2min
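Since each file is processed independently, joblib (imported above but otherwise unused) could parallelize this loop and cut the wall time considerably. A minimal sketch with a stub worker: `process_clip` and `sample_paths` are names introduced here, and the stub body stands in for the `librosa.load` + `audio_to_specs` calls shown above.

```python
from joblib import Parallel, delayed

def process_clip(key, audio_path):
    # stand-in for:
    #   x, sr = librosa.load(audio_path, mono=True)
    #   audio_to_specs(x, sr, key, 'trn/' + audio_path.split('/')[1] + '/')
    return key

# illustrative paths; in the notebook this would be PATH_DICT.items()
sample_paths = {'XC2628.mp3': 'train_audio/species_a/XC2628.mp3'}

results = Parallel(n_jobs=4)(
    delayed(process_clip)(k, p) for k, p in sample_paths.items()
)
print(results)  # one entry per processed file
```

One caveat: the directory-creation calls would need to happen before the parallel run (or be guarded with `os.makedirs(..., exist_ok=True)`) so workers do not race on folder creation.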

Below I generate a spectrogram of the "XC2628.mp3" file as an example (the plot shows the first 255 frames, roughly the first six seconds).

In [16]:
x, sr = librosa.load(PATH_DICT['XC2628.mp3'], mono=True)
mel = librosa.power_to_db(
    librosa.feature.melspectrogram(y=x, sr=sr, fmin=0, fmax=11000, n_mels=128, power=2)
)

# min-max scale into 0-255 (scale_minmax already returns uint8)
img = scale_minmax(mel, 0, 255)
img = np.flip(img, axis=0)  # put low frequencies at the bottom of the image
img = 255 - img             # invert: black == more energy
plt.imshow(img[:, :255], cmap='gray', vmin=0, vmax=255)
Out[16]:
<matplotlib.image.AxesImage at 0x2025edee2c8>
