1010
1111"""
1212
13- # Uncomment to run in Google Colab
1413# !pip install torch
1514# !pip install torchaudio
1615
@@ -66,14 +65,14 @@ def __init__(self, subset: str = None):
6665 filepath = os .path .join (self ._path , "validation_list.txt" )
6766 with open (filepath ) as f :
6867 validation_list = [
69- os .path .join (self ._path , l .strip ()) for l in f .readlines ()
68+ os .path .join (self ._path , line .strip ()) for line in f .readlines ()
7069 ]
7170
7271 if subset in ["training" , "testing" ]:
7372 filepath = os .path .join (self ._path , "testing_list.txt" )
7473 with open (filepath ) as f :
7574 testing_list = [
76- os .path .join (self ._path , l .strip ()) for l in f .readlines ()
75+ os .path .join (self ._path , line .strip ()) for line in f .readlines ()
7776 ]
7877
7978 if subset == "validation" :
@@ -216,15 +215,16 @@ def collate_fn(batch):
216215 return tensors , targets
217216
218217
218+ batch_size = 128
219+
219220kwargs = (
220221 {"num_workers" : 1 , "pin_memory" : True } if device == "cuda" else {}
221- ) # needed to run on gpu
222-
222+ ) # needed for using datasets on gpu
223223train_loader = torch .utils .data .DataLoader (
224- train_set , batch_size = 128 , shuffle = True , collate_fn = collate_fn , ** kwargs
224+ train_set , batch_size = batch_size , shuffle = True , collate_fn = collate_fn , ** kwargs
225225)
226226test_loader = torch .utils .data .DataLoader (
227- test_set , batch_size = 128 , shuffle = False , collate_fn = collate_fn , ** kwargs
227+ test_set , batch_size = batch_size , shuffle = False , collate_fn = collate_fn , ** kwargs
228228)
229229
230230
@@ -236,31 +236,32 @@ def collate_fn(batch):
236236# the raw audio data. Usually more advanced transforms are applied to the
237237# audio data, however CNNs can be used to accurately process the raw data.
238238# The specific architecture is modeled after the M5 network architecture
239- # described in https://arxiv.org/pdf/1610.00087.pdf. An important aspect
240- # of models processing raw audio data is the receptive field of their
241- # first layer’s filters. Our model’s first filter is length 80 so when
242- # processing audio sampled at 8kHz the receptive field is around 10ms.
243- # This size is similar to speech processing applications that often use
244- # receptive fields ranging from 20ms to 40ms.
239+ # described in `this paper <https://arxiv.org/pdf/1610.00087.pdf>`__.
240+ # An important aspect of models processing raw audio data is the receptive
241+ # field of their first layer’s filters. Our model’s first filter is length
242+ # 80 so when processing audio sampled at 8kHz the receptive field is
243+ # around 10ms (and at 4kHz, around 20ms). This size is similar to speech
244+ # processing applications that often use receptive fields ranging from
245+ # 20ms to 40ms.
245246#
246247
247248
248- class Net (nn .Module ):
249- def __init__ (self , n_output = 10 ):
250- super (Net , self ).__init__ ()
251- self .conv1 = nn .Conv1d (1 , 128 , 80 , 4 )
252- self .bn1 = nn .BatchNorm1d (128 )
249+ class M5 (nn .Module ):
250+ def __init__ (self , stride = 16 , n_channel = 32 , n_output = 35 ):
251+ super ().__init__ ()
252+ self .conv1 = nn .Conv1d (1 , n_channel , 80 , stride = stride )
253+ self .bn1 = nn .BatchNorm1d (n_channel )
253254 self .pool1 = nn .MaxPool1d (4 )
254- self .conv2 = nn .Conv1d (128 , 128 , 3 )
255- self .bn2 = nn .BatchNorm1d (128 )
255+ self .conv2 = nn .Conv1d (n_channel , n_channel , 3 )
256+ self .bn2 = nn .BatchNorm1d (n_channel )
256257 self .pool2 = nn .MaxPool1d (4 )
257- self .conv3 = nn .Conv1d (128 , 256 , 3 )
258- self .bn3 = nn .BatchNorm1d (256 )
258+ self .conv3 = nn .Conv1d (n_channel , 2 * n_channel , 3 )
259+ self .bn3 = nn .BatchNorm1d (2 * n_channel )
259260 self .pool3 = nn .MaxPool1d (4 )
260- self .conv4 = nn .Conv1d (256 , 512 , 3 )
261- self .bn4 = nn .BatchNorm1d (512 )
261+ self .conv4 = nn .Conv1d (2 * n_channel , 2 * n_channel , 3 )
262+ self .bn4 = nn .BatchNorm1d (2 * n_channel )
262263 self .pool4 = nn .MaxPool1d (4 )
263- self .fc1 = nn .Linear (512 , n_output )
264+ self .fc1 = nn .Linear (2 * n_channel , n_output )
264265
265266 def forward (self , x ):
266267 x = self .conv1 (x )
@@ -275,15 +276,13 @@ def forward(self, x):
275276 x = self .conv4 (x )
276277 x = F .relu (self .bn4 (x ))
277278 x = self .pool4 (x )
278- x = F .avg_pool1d (
279- x , x .shape [- 1 ]
280- ) # input should be 512x14 so this outputs a 512x1
281- x = x .permute (0 , 2 , 1 ) # change the 512x1 to 1x512
279+ x = F .avg_pool1d (x , x .shape [- 1 ])
280+ x = x .permute (0 , 2 , 1 )
282281 x = self .fc1 (x )
283282 return F .log_softmax (x , dim = 2 )
284283
285284
286- model = Net (n_output = len (labels ))
285+ model = M5 (n_output = len (labels ))
287286model .to (device )
288287print (model )
289288
@@ -304,7 +303,9 @@ def count_parameters(model):
304303#
305304
306305optimizer = optim .Adam (model .parameters (), lr = 0.01 , weight_decay = 0.0001 )
307- scheduler = optim .lr_scheduler .StepLR (optimizer , step_size = 20 , gamma = 0.1 )
306+ scheduler = optim .lr_scheduler .StepLR (
307+ optimizer , step_size = 20 , gamma = 0.1
308+ ) # reduce the learning rate after 20 epochs by a factor of 10
308309
309310
310311######################################################################
@@ -321,11 +322,6 @@ def count_parameters(model):
321322#
322323
323324
324- def nll_loss (tensor , target ):
325- # negative log-likelihood for a tensor of size (batch x 1 x n_output)
326- return F .nll_loss (tensor .squeeze (), target )
327-
328-
329325def train (model , epoch , log_interval ):
330326 model .train ()
331327 for batch_idx , (data , target ) in enumerate (train_loader ):
@@ -334,7 +330,9 @@ def train(model, epoch, log_interval):
334330 target = target .to (device )
335331
336332 output = model (data )
337- loss = nll_loss (output , target )
333+
334+ # negative log-likelihood for a tensor of size (batch x 1 x n_output)
335+ loss = F .nll_loss (output .squeeze (), target )
338336
339337 optimizer .zero_grad ()
340338 loss .backward ()
@@ -385,7 +383,7 @@ def test(model, epoch):
385383 pbar .update ()
386384
387385 print (
388- f"\n Test set: Accuracy : { correct } /{ len (test_loader .dataset )} ({ 100. * correct / len (test_loader .dataset ):.0f} %)\n "
386+ f"\n Test Epoch: { epoch } \t Accuracy : { correct } /{ len (test_loader .dataset )} ({ 100. * correct / len (test_loader .dataset ):.0f} %)\n "
389387 )
390388
391389
@@ -412,12 +410,16 @@ def test(model, epoch):
412410
413411waveform , sample_rate , utterance , * _ = train_set [- 1 ]
414412ipd .Audio (waveform .numpy (), rate = sample_rate )
413+
414+ waveform = transform (waveform )
415415output = model (waveform .unsqueeze (0 ))
416416output = argmax (output ).squeeze ()
417417print (f"Expected: { utterance } . Predicted: { labels [output ]} ." )
418418
419419waveform , sample_rate , utterance , * _ = test_set [- 1 ]
420420ipd .Audio (waveform .numpy (), rate = sample_rate )
421+
422+ waveform = transform (waveform )
421423output = model (waveform .unsqueeze (0 ))
422424output = argmax (output ).squeeze ()
423425print (f"Expected: { utterance } . Predicted: { labels [output ]} ." )
@@ -427,7 +429,7 @@ def test(model, epoch):
427429# Conclusion
428430# ----------
429431#
430- # After one epoch , the network should be more than 65 % accurate.
432+ # After two epochs, the network should be more than 70% accurate.
431433#
432434# In this tutorial, we used torchaudio to load a dataset and resample the
433435# signal. We have then defined a neural network that we trained to
0 commit comments