poissonisfish

2021-05-11T07:45:43+00:00

Great post! Do you know if wavelets can be used for this type of analysis?

2021-05-14T08:08:04+00:00

Hi Victor, thanks for the kind words. I suppose yes, the STFT used here could be adjusted / replaced accordingly. Best, Francisco

LikeLike

Reply

Pingback: Object detection and tracking in Python – poissonisfish

Pingback: Intelligent Voice

2022-10-21T13:03:41+00:00

Thanks a lot for sharing great knowledge. I am working on detecting insect pest sounds in stored grains to prevent losses, this is indeed useful.

LikeLike

Reply

2022-10-21T18:34:42+00:00

Hi Carlito, glad you found it helpful! Good luck with your project, it sounds interesting

LikeLike

Reply

	# Tue Feb 4 19:43:33 2020 ——————————
	setwd("~/Documents/Tutorials/birdsong")
	library(parallel)
	library(tidyverse)
	library(abind)
	library(caret)
	library(tuneR)
	library(warbleR)
	source("funs.R")

	# Create mp3/ if necessary
	if (!dir.exists("mp3/")) {
	  dir.create("mp3/")
	}

	#### Download HQ male song recordings > 30s long from Europe ####
	# First call does not request a download; its result is used as a metadata table
	query <- querxc("type:song type:male len_gt:30 q_gt:C area:europe")
	query$Species <- with(query, paste(Genus, Specific_epithet))
	# Select top 50 most abundant bird species
	# head() keeps topSpecies free of NA when fewer than 50 species are returned
	speciesCount <- sort(table(query$Species), decreasing = TRUE)
	topSpecies <- head(names(speciesCount), 50)
	query <- query[query$Species %in% topSpecies, ]
	# Downsample to min size among the 50 classes
	# The smallest class size is the same for every species, so compute it once
	minClassSize <- min(table(query$Species))
	balancedClasses <- lapply(topSpecies, function(x) {
	  # Seed is reset for every species so each draw is reproducible on its own
	  set.seed(100)
	  rows <- which(query$Species == x)
	  # Draw positions with sample.int(): sample(rows, n) would sample from
	  # 1:rows whenever a species has exactly one recording
	  rows[sample.int(length(rows), minClassSize)]
	}) %>% unlist()
	# Subset accordingly
	query <- query[balancedClasses, ]
	# Download using updated query
	querxc(X = query, download = TRUE, path = "mp3/", parallel = 8)

	#### Pre-processing ####
	# Read files
	# `pattern` is a regular expression, not a glob: escape the dot and anchor
	# the extension at the end of the file name
	fnames <- list.files("mp3/", full.names = TRUE, pattern = "\\.mp3$")

	# Write metadata for Kaggle dataset
	# The recording ID is taken as the first run of 4+ digits in each path and
	# matched against query$Recording_ID; unmatched recordings get an NA Path
	ids <- str_extract(fnames, pattern = "[0-9]{4,}")
	query$Path <- fnames[match(query$Recording_ID, ids)]
	write.csv(query, "metadata.csv")

	# Play random file - setWavPlayer in macOS if "permission denied"
	setWavPlayer('/usr/bin/afplay')
	play(sample(fnames, 1)) # esc to skip

	# Read one MP3, clip it to a time window, compute its Mel spectrogram,
	# remove noise and normalize
	#   x     : path to an MP3 file
	#   start : start of the window, in seconds
	#   end   : end of the window, in seconds
	# Returns the matrix stored in melfcc()'s $aspectrum after noise removal,
	# scaled so that its largest value is 1 (all zeros for a flat clip)
	melspec <- function(x, start, end){
	  mp3 <- extractWave(readMP3(filename = x),
	                     xunit = "time",
	                     from = start, to = end)

	  # Spectrogram with 256 Mel bands and compression (usecmp); the hop time
	  # splits the window into 256 steps
	  sp <- melfcc(mp3, nbands = 256, usecmp = TRUE,
	               spec_out = TRUE,
	               hoptime = (end-start) / 256)$aspectrum

	  # Median-based noise reduction: subtract each row's median, clip at zero
	  noise <- apply(sp, 1, median)
	  sp <- sweep(sp, 1, noise)
	  sp[sp < 0] <- 0

	  # Normalize to max; skip the division when nothing is left above the
	  # noise floor, otherwise 0 / 0 would fill the spectrogram with NaN
	  peak <- max(sp)
	  if (peak > 0) {
	    sp <- sp / peak
	  }

	  return(sp)
	}

	# Run melspec() on every file for one time window and stack the results
	#   x    : vector of MP3 paths
	#   from : window start, passed to melspec() as `start`
	#   to   : window end, passed to melspec() as `end`
	# Returns the per-file outputs combined by simplify2array()
	melslice <- function(x, from, to){
	  perFile <- lapply(x, function(f) melspec(f, start = from, end = to))
	  simplify2array(perFile)
	}

	# Iterate melslice over all different time windows
	#   files  : vector of MP3 paths
	#   limit  : last window start time
	#   ws     : window size (each window spans w to w + ws)
	#   stride : step between consecutive window starts
	#   ncores : number of cores handed to mclapply()
	# Returns a 4-d array: first dimension runs over files within each window
	# (window after window), followed by the two spectrogram dimensions and a
	# single channel
	audioProcess <- function(files, limit = 10, ws = 10, stride = 2,
	                         ncores = 8){
	  # Window start times: 0, stride, 2 * stride, ... up to limit
	  windowSize <- seq(0, limit, by = stride)
	  # iterate and parallelise
	  batches <- mclapply(windowSize, function(w){
	    # execute
	    melslice(files, from = w, to = w+ws)
	  }, mc.cores = ncores)
	  # A failed worker comes back as a "try-error" element rather than an
	  # error; stop here with a clear message instead of failing inside abind()
	  failed <- vapply(batches, inherits, logical(1), what = "try-error")
	  if (any(failed)) {
	    stop("melslice() failed for window start(s): ",
	         paste(windowSize[failed], collapse = ", "), call. = FALSE)
	  }
	  # combine output into single array
	  out <- abind(batches, along = 3)
	  # reorder dimensions after adding single-channel as 4th
	  dim(out) <- c(dim(out), 1)
	  out <- aperm(out, c(3,1,2,4))
	  return(out)
	}

	# Species label per file: take the "Genus-epithet" part of the file name
	# and turn the hyphen into a space
	species <- factor(gsub(pattern = "-", replacement = " ",
	                       x = str_extract(fnames, pattern = "[A-Za-z]+-[a-z]+")))

	# Stratified sampling: train (80%), val (10%) and test (10%)
	# Ten folds are created; one is held out for validation, one for testing
	set.seed(100)
	idx <- createFolds(species, k = 10)
	valIdx <- idx[["Fold01"]]
	testIdx <- idx[["Fold02"]]
	# Define samples for train, val and test
	fnamesVal <- fnames[valIdx]
	fnamesTest <- fnames[testIdx]
	fnamesTrain <- fnames[-c(valIdx, testIdx)]

	# Take multiple readings per sample (same windows for every split)
	Xtrain <- audioProcess(fnamesTrain, limit = 20, ws = 10,
	                       stride = 5, ncores = 5)
	Xval <- audioProcess(fnamesVal, limit = 20, ws = 10,
	                     stride = 5, ncores = 5)
	Xtest <- audioProcess(fnamesTest, limit = 20, ws = 10,
	                      stride = 5, ncores = 5)

poissonisfish

Audio classification in R

Introduction

What is sound?

Fourier Transform

Dataset

Let’s get started with R

1. Download and pre-processing of bird songs

Query and download

Pre-processing

2. Bird species classification

Model architecture

Model training

Model evaluation

Wrap-up

Please support

References

Citation

6 thoughts on “Audio classification in R”

Leave a comment Cancel reply

	# Define targets and augment data
	# One indicator column per species level, one row per file in fnames
	target <- model.matrix(~0+species)

	# Each X array holds several windows per file, so the label rows of a split
	# are repeated once per window. seq_len() avoids the 1:0 trap and
	# drop = FALSE keeps a one-file split as a matrix.
	targetTrain <- do.call("rbind", lapply(seq_len(dim(Xtrain)[1]/length(fnamesTrain)),
	  function(x) target[-c(valIdx, testIdx), , drop = FALSE]))
	targetVal <- do.call("rbind", lapply(seq_len(dim(Xval)[1]/length(fnamesVal)),
	  function(x) target[valIdx, , drop = FALSE]))
	targetTest <- do.call("rbind", lapply(seq_len(dim(Xtest)[1]/length(fnamesTest)),
	  function(x) target[testIdx, , drop = FALSE]))
	# Every sample must have exactly one label row
	stopifnot(nrow(targetTrain) == dim(Xtrain)[1],
	          nrow(targetVal) == dim(Xval)[1],
	          nrow(targetTest) == dim(Xtest)[1])
	# Assemble Xs and Ys
	train <- list(X = Xtrain, Y = targetTrain)
	val <- list(X = Xval, Y = targetVal)
	test <- list(X = Xtest, Y = targetTest)

	# Plot spectrogram from random training sample - range is 0-22.05 kHz
	image(train$X[sample(dim(train$X)[1], 1),,,],
	      xlab = "Time (s)",
	      ylab = "Frequency (kHz)",
	      axes = FALSE)
	# Generate mel sequence from Hz points, standardize to plot
	# Tick positions are the kHz values below converted to mel and rescaled to
	# the 0-1 range used by image()
	freqs <- c(0, 1, 5, 15, 22.05)
	mels <- 2595 * log10(1 + (freqs*1e3) / 700) # https://en.wikipedia.org/wiki/Mel_scale
	# Plain ASCII minus: the typographic dash used here before does not parse
	mels <- mels - min(mels)
	mels <- mels / max(mels)

	# Bottom axis is labelled 0-10, left axis carries the kHz labels; top and
	# right axes are drawn without labels
	axis(1, at = seq(0, 1, by = .2), labels = seq(0, 10, by = 2))
	axis(2, at = mels, las = 2,
	     labels = round(freqs, 2))
	axis(3, labels = FALSE); axis(4, labels = FALSE)

	#### Save ####
	# Store the three splits together in a single .RData file
	save(list = c("train", "val", "test"), file = "prepAudio.RData")

	# Fri Feb 7 15:49:46 2020 ——————————
	# Model training and evaluation; reads the arrays saved in prepAudio.RData
	setwd("~/Documents/Tutorials/birdsong")
	library(keras)
	# Select the conda environment and the keras backend, both named "plaidml"
	use_condaenv("plaidml")
	use_backend("plaidml")
	# Show the active backend as a check; the author's run reported:
	k_backend() # plaidml
	library(tidyverse)
	library(caret)
	library(e1071)
	library(pheatmap)
	library(RColorBrewer)

	# Read processed data
	# The code below uses train, val and test, each a list with X and Y
	load("prepAudio.RData")

	# Build model
	# Layers are added stage by stage to one sequential model
	model <- keras_model_sequential()

	# Stage 1: 16 filters; the input shape is taken from the training array
	model <- model %>%
	  layer_conv_2d(filters = 16, kernel_size = c(3, 3),
	                activation = "relu",
	                input_shape = dim(train$X)[2:4]) %>%
	  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
	  layer_dropout(rate = .2)

	# Stage 2: 32 filters
	model <- model %>%
	  layer_conv_2d(filters = 32, kernel_size = c(3, 3),
	                activation = "relu") %>%
	  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
	  layer_dropout(rate = .2)

	# Stage 3: 64 filters
	model <- model %>%
	  layer_conv_2d(filters = 64, kernel_size = c(3, 3),
	                activation = "relu") %>%
	  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
	  layer_dropout(rate = .2)

	# Stage 4: 128 filters, pooled with a c(28, 2) window instead of c(2, 2)
	model <- model %>%
	  layer_conv_2d(filters = 128, kernel_size = c(3, 3),
	                activation = "relu") %>%
	  layer_max_pooling_2d(pool_size = c(28, 2)) %>%
	  layer_dropout(rate = .2)

	# Classifier head: flatten, one hidden dense layer, then a softmax with one
	# unit per column of the target matrix
	model <- model %>%
	  layer_flatten() %>%
	  layer_dense(units = 128, activation = "relu") %>%
	  layer_dropout(rate = .5) %>%
	  layer_dense(units = ncol(train$Y), activation = "softmax")

	# Print summary
	summary(model)

	# Adam optimizer with decay, cross-entropy loss, accuracy as metric
	compile(model,
	        optimizer = optimizer_adam(decay = 1e-5),
	        loss = "categorical_crossentropy",
	        metrics = "accuracy")

	# Train for 50 epochs in batches of 16, monitoring the validation split
	history <- model %>%
	  fit(x = train$X, y = train$Y,
	      validation_data = list(val$X, val$Y),
	      epochs = 50, batch_size = 16)

	# Training curves
	plot(history)

	# Save model
	# model %>% save_model_hdf5("model.h5")

	# Class names: target column names with the text "species" removed;
	# `cols` is a palette function built from the reversed "RdGy" colours
	speciesClass <- gsub(pattern = "species", replacement = "",
	                     x = colnames(train$Y))
	cols <- brewer.pal(n = 7, name = "RdGy") %>%
	  rev() %>%
	  colorRampPalette()

	# Validation predictions: most probable class vs. labelled class per row
	predProb <- model %>% predict(val$X)
	predClass <- speciesClass[apply(predProb, MARGIN = 1, FUN = which.max)]
	trueClass <- speciesClass[apply(val$Y, MARGIN = 1, FUN = which.max)]

	# Plot confusion matrix
	confMat <- confusionMatrix(
	  data = factor(predClass, levels = speciesClass),
	  reference = factor(trueClass, levels = speciesClass)
	)

	# One colour per possible count, from 0 up to the largest cell
	pheatmap(confMat$table,
	         color = cols(max(confMat$table)+1),
	         labels_row = speciesClass, show_colnames = FALSE,
	         cluster_rows = FALSE, cluster_cols = FALSE,
	         border_color = NA)

	# Accuracy in validation set
	mean(predClass == trueClass) # 0.7541

	# Test set prediction: most probable class vs. labelled class per row
	predXProb <- model %>% predict(test$X)
	predXClass <- speciesClass[apply(predXProb, MARGIN = 1, FUN = which.max)]
	trueXClass <- speciesClass[apply(test$Y, MARGIN = 1, FUN = which.max)]

	# Plot confusion matrix
	confMatTest <- confusionMatrix(
	  data = factor(predXClass, levels = speciesClass),
	  reference = factor(trueXClass, levels = speciesClass)
	)

	# One colour per possible count, from 0 up to the largest cell
	pheatmap(confMatTest$table,
	         color = cols(max(confMatTest$table)+1),
	         labels_row = speciesClass, show_colnames = FALSE,
	         cluster_rows = FALSE, cluster_cols = FALSE,
	         border_color = NA)

	# Accuracy in test set
	mean(predXClass == trueXClass) # 0.7364

	# Write sessioninfo
	writeLines(text = capture.output(sessionInfo()), con = "sessionInfo")

Introduction

What is sound?

Fourier Transform

Dataset

Let’s get started with R

1. Download and pre-processing of bird songs

Query and download

Pre-processing

2. Bird species classification

Model architecture

Model training

Model evaluation

Wrap-up

Please support

References

Citation

Share this:

6 thoughts on “Audio classification in R”

Leave a comment Cancel reply