examples/pipeline_wav2letter/README.md (21 changes: 11 additions & 10 deletions)
@@ -3,7 +3,7 @@ This is an example pipeline for speech recognition using a greedy or Viterbi CTC
 ### Usage
 
 More information about each command-line parameter is available with the `--help` option. An example can be invoked as follows.
-```
+```bash
 DATASET_ROOT=<Top>/<level>/<folder>
 DATASET_FOLDER_IN_ARCHIVE='LibriSpeech'
 
@@ -25,19 +25,20 @@ python main.py \
     --normalize \
     --optimizer adadelta \
     --scheduler reduceonplateau \
-    --epochs 30
+    --epochs 40
 ```
-With these default parameters, we get a character error rate of 13.8% on dev-clean after 30 epochs.
 
 ### Output
+With these default parameters, we get a character error rate (CER) of 13.3% and a word error rate (WER) of 41.9% on dev-clean after 40 epochs when training on train-clean. The tail of the output is the following.
 
-The information reported at each iteration and epoch (e.g. loss, character error rate, word error rate) is printed to standard output in the form of one json per line, e.g.
-```python
-{"name": "train", "epoch": 0, "cer over target length": 1.0, "cumulative cer": 23317.0, "total chars": 23317.0, "cer": 0.0, "cumulative cer over target length": 0.0, "wer over target length": 1.0, "cumulative wer": 4446.0, "total words": 4446.0, "wer": 0.0, "cumulative wer over target length": 0.0, "lr": 0.6, "batch size": 128, "n_channel": 13, "n_time": 2453, "dataset length": 128.0, "iteration": 1.0, "loss": 8.712121963500977, "cumulative loss": 8.712121963500977, "average loss": 8.712121963500977, "iteration time": 41.46276903152466, "epoch time": 41.46276903152466}
-{"name": "train", "epoch": 0, "cer over target length": 1.0, "cumulative cer": 46005.0, "total chars": 46005.0, "cer": 0.0, "cumulative cer over target length": 0.0, "wer over target length": 1.0, "cumulative wer": 8762.0, "total words": 8762.0, "wer": 0.0, "cumulative wer over target length": 0.0, "lr": 0.6, "batch size": 128, "n_channel": 13, "n_time": 1703, "dataset length": 256.0, "iteration": 2.0, "loss": 8.918599128723145, "cumulative loss": 17.63072109222412, "average loss": 8.81536054611206, "iteration time": 1.2905676364898682, "epoch time": 42.753336668014526}
-{"name": "train", "epoch": 0, "cer over target length": 1.0, "cumulative cer": 70030.0, "total chars": 70030.0, "cer": 0.0, "cumulative cer over target length": 0.0, "wer over target length": 1.0, "cumulative wer": 13348.0, "total words": 13348.0, "wer": 0.0, "cumulative wer over target length": 0.0, "lr": 0.6, "batch size": 128, "n_channel": 13, "n_time": 1713, "dataset length": 384.0, "iteration": 3.0, "loss": 8.550191879272461, "cumulative loss": 26.180912971496582, "average loss": 8.726970990498861, "iteration time": 1.2109291553497314, "epoch time": 43.96426582336426}
+```json
+...
+{"name": "train", "epoch": 40, "batch char error": 925, "batch char total": 22563, "batch char error rate": 0.040996321411159865, "epoch char error": 1135098.0, "epoch char total": 23857713.0, "epoch char error rate": 0.047577821059378154, "batch word error": 791, "batch word total": 4308, "batch word error rate": 0.18361188486536675, "epoch word error": 942906.0, "epoch word total": 4569507.0, "epoch word error rate": 0.20634742435015418, "lr": 0.06, "batch size": 128, "n_channel": 13, "n_time": 1685, "dataset length": 132096.0, "iteration": 1032.0, "loss": 0.07428030669689178, "cumulative loss": 90.47326805442572, "average loss": 0.08766789540157531, "iteration time": 1.9895553588867188, "epoch time": 2036.8874564170837}
+{"name": "train", "epoch": 40, "batch char error": 1131, "batch char total": 24260, "batch char error rate": 0.0466199505358615, "epoch char error": 1136229.0, "epoch char total": 23881973.0, "epoch char error rate": 0.04757684802675223, "batch word error": 957, "batch word total": 4657, "batch word error rate": 0.2054971011380717, "epoch word error": 943863.0, "epoch word total": 4574164.0, "epoch word error rate": 0.20634655862798099, "lr": 0.06, "batch size": 128, "n_channel": 13, "n_time": 1641, "dataset length": 132224.0, "iteration": 1033.0, "loss": 0.08775319904088974, "cumulative loss": 90.5610212534666, "average loss": 0.08766797798012256, "iteration time": 2.108018159866333, "epoch time": 2038.99547457695}
+{"name": "train", "epoch": 40, "batch char error": 1099, "batch char total": 23526, "batch char error rate": 0.0467142735696676, "epoch char error": 1137328.0, "epoch char total": 23905499.0, "epoch char error rate": 0.04757599914563591, "batch word error": 936, "batch word total": 4544, "batch word error rate": 0.20598591549295775, "epoch word error": 944799.0, "epoch word total": 4578708.0, "epoch word error rate": 0.20634620071863066, "lr": 0.06, "batch size": 128, "n_channel": 13, "n_time": 1682, "dataset length": 132352.0, "iteration": 1034.0, "loss": 0.0791337713599205, "cumulative loss": 90.64015502482653, "average loss": 0.08765972439538348, "iteration time": 2.0329701900482178, "epoch time": 2041.0284447669983}
+{"name": "train", "epoch": 40, "batch char error": 1023, "batch char total": 22399, "batch char error rate": 0.045671681771507655, "epoch char error": 1138351.0, "epoch char total": 23927898.0, "epoch char error rate": 0.04757421650660664, "batch word error": 863, "batch word total": 4318, "batch word error rate": 0.1998610467809171, "epoch word error": 945662.0, "epoch word total": 4583026.0, "epoch word error rate": 0.20634009058643787, "lr": 0.06, "batch size": 128, "n_channel": 13, "n_time": 1644, "dataset length": 132480.0, "iteration": 1035.0, "loss": 0.07874362915754318, "cumulative loss": 90.71889865398407, "average loss": 0.08765110981061262, "iteration time": 1.9106628894805908, "epoch time": 2042.9391076564789}
+{"name": "validation", "epoch": 40, "cumulative loss": 12.095281183719635, "dataset length": 2688.0, "iteration": 21.0, "batch char error": 1867, "batch char total": 14792, "batch char error rate": 0.12621687398593834, "epoch char error": 37119.0, "epoch char total": 280923.0, "epoch char error rate": 0.13213229247872194, "batch word error": 1155, "batch word total": 2841, "batch word error rate": 0.4065469904963041, "epoch word error": 22601.0, "epoch word total": 54008.0, "epoch word error rate": 0.418475040734706, "average loss": 0.575965770653316, "validation time": 24.185853481292725}
 ```
-One way to import the output in python with pandas is by saving the standard output to a file, and then using `pandas.read_json(filename, lines=True)`.
+As can be seen in the output above, the information reported at each iteration and epoch (e.g. loss, character error rate, word error rate) is printed to standard output as one JSON object per line. One way to import the output into Python with pandas is to save the standard output to a file and then use `pandas.read_json(filename, lines=True)`.
 
 ## Structure of pipeline
 
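The last change above suggests loading the per-line JSON output with pandas. A minimal sketch of that workflow, assuming the standard output of a run was redirected to a file named `train.log` (a hypothetical filename) and using the renamed metric keys introduced by this diff:

```python
import pandas as pd

# Each line of the training output is a standalone JSON object,
# so lines=True parses the whole log into one row per record.
log = pd.read_json("train.log", lines=True)  # "train.log" is a hypothetical name

# Keep only the training records and track how the epoch-level
# character error rate evolves over the run.
train = log[log["name"] == "train"]
print(train.groupby("epoch")["epoch char error rate"].last())
```

The same `groupby` works for the word error rate, or for the validation records by filtering on `"validation"` instead of `"train"`.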
examples/pipeline_wav2letter/main.py (20 changes: 12 additions & 8 deletions)
@@ -220,10 +220,12 @@ def compute_error_rates(outputs, targets, decoder, language_model, metric):
     cers = [levenshtein_distance(t, o) for t, o in zip(target, output)]
     cers = sum(cers)
     n = sum(len(t) for t in target)
-    metric["cer over target length"] = cers / n
-    metric["cumulative cer"] += cers
-    metric["total chars"] += n
-    metric["cumulative cer over target length"] = metric["cer"] / metric["total chars"]
+    metric["batch char error"] = cers
+    metric["batch char total"] = n
+    metric["batch char error rate"] = cers / n
+    metric["epoch char error"] += cers
+    metric["epoch char total"] += n
+    metric["epoch char error rate"] = metric["epoch char error"] / metric["epoch char total"]
 
     # Compute WER
 
@@ -233,10 +235,12 @@ def compute_error_rates(outputs, targets, decoder, language_model, metric):
     wers = [levenshtein_distance(t, o) for t, o in zip(target, output)]
     wers = sum(wers)
     n = sum(len(t) for t in target)
-    metric["wer over target length"] = wers / n
-    metric["cumulative wer"] += wers
-    metric["total words"] += n
-    metric["cumulative wer over target length"] = metric["wer"] / metric["total words"]
+    metric["batch word error"] = wers
+    metric["batch word total"] = n
+    metric["batch word error rate"] = wers / n
+    metric["epoch word error"] += wers
+    metric["epoch word total"] += n
+    metric["epoch word error rate"] = metric["epoch word error"] / metric["epoch word total"]
 
 
 def train_one_epoch(
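The replacement code above makes the bookkeeping convention explicit: `batch ...` entries are overwritten on every call, while `epoch ...` counters accumulate, so the epoch-level rate is always total edit distance divided by total reference length rather than an average of batch rates. A minimal self-contained sketch of the same pattern, with a toy `levenshtein_distance` and a plain `defaultdict` standing in for the pipeline's `metric` object (both are assumptions made for illustration):

```python
from collections import defaultdict


def levenshtein_distance(ref, hyp):
    # Classic dynamic-programming edit distance between two sequences.
    prev = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, 1):
        cur = [i]
        for j, h in enumerate(hyp, 1):
            cur.append(min(prev[j] + 1,               # deletion
                           cur[j - 1] + 1,            # insertion
                           prev[j - 1] + (r != h)))   # substitution
        prev = cur
    return prev[-1]


def update_char_metrics(targets, outputs, metric):
    # Mirrors compute_error_rates: batch values are overwritten,
    # epoch counters accumulate across calls.
    errors = sum(levenshtein_distance(t, o) for t, o in zip(targets, outputs))
    total = sum(len(t) for t in targets)
    metric["batch char error"] = errors
    metric["batch char total"] = total
    metric["batch char error rate"] = errors / total
    metric["epoch char error"] += errors
    metric["epoch char total"] += total
    metric["epoch char error rate"] = metric["epoch char error"] / metric["epoch char total"]


metric = defaultdict(float)  # missing epoch counters start at 0.0
update_char_metrics(["hello"], ["helo"], metric)   # 1 error over 5 chars
update_char_metrics(["world"], ["world"], metric)  # 0 errors over 5 chars
print(metric["batch char error rate"])  # 0.0  (last batch only)
print(metric["epoch char error rate"])  # 0.1  (1 error / 10 chars)
```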