examples/pipeline_wav2letter/README.md (21 changes: 11 additions & 10 deletions)
@@ -3,7 +3,7 @@ This is an example pipeline for speech recognition using a greedy or Viterbi CTC
 ### Usage
 
 More information about each command-line parameter is available with the `--help` option. An example can be invoked as follows.
-```
+```bash
 DATASET_ROOT=<Top>/<level>/<folder>
 DATASET_FOLDER_IN_ARCHIVE='LibriSpeech'
 
@@ -25,19 +25,20 @@ python main.py \
     --normalize \
     --optimizer adadelta \
     --scheduler reduceonplateau \
-    --epochs 30
+    --epochs 40
 ```
-With these default parameters, we get a character error rate of 13.8% on dev-clean after 30 epochs.
 
 ### Output
+With these default parameters, we get a character error rate (CER) of 13.3% and a word error rate (WER) of 41.9% on dev-clean after 40 epochs when training on train-clean. The tail of the output is the following.
 
-The information reported at each iteration and epoch (e.g. loss, character error rate, word error rate) is printed to standard output in the form of one json per line, e.g.
-```python
-{"name": "train", "epoch": 0, "cer over target length": 1.0, "cumulative cer": 23317.0, "total chars": 23317.0, "cer": 0.0, "cumulative cer over target length": 0.0, "wer over target length": 1.0, "cumulative wer": 4446.0, "total words": 4446.0, "wer": 0.0, "cumulative wer over target length": 0.0, "lr": 0.6, "batch size": 128, "n_channel": 13, "n_time": 2453, "dataset length": 128.0, "iteration": 1.0, "loss": 8.712121963500977, "cumulative loss": 8.712121963500977, "average loss": 8.712121963500977, "iteration time": 41.46276903152466, "epoch time": 41.46276903152466}
-{"name": "train", "epoch": 0, "cer over target length": 1.0, "cumulative cer": 46005.0, "total chars": 46005.0, "cer": 0.0, "cumulative cer over target length": 0.0, "wer over target length": 1.0, "cumulative wer": 8762.0, "total words": 8762.0, "wer": 0.0, "cumulative wer over target length": 0.0, "lr": 0.6, "batch size": 128, "n_channel": 13, "n_time": 1703, "dataset length": 256.0, "iteration": 2.0, "loss": 8.918599128723145, "cumulative loss": 17.63072109222412, "average loss": 8.81536054611206, "iteration time": 1.2905676364898682, "epoch time": 42.753336668014526}
-{"name": "train", "epoch": 0, "cer over target length": 1.0, "cumulative cer": 70030.0, "total chars": 70030.0, "cer": 0.0, "cumulative cer over target length": 0.0, "wer over target length": 1.0, "cumulative wer": 13348.0, "total words": 13348.0, "wer": 0.0, "cumulative wer over target length": 0.0, "lr": 0.6, "batch size": 128, "n_channel": 13, "n_time": 1713, "dataset length": 384.0, "iteration": 3.0, "loss": 8.550191879272461, "cumulative loss": 26.180912971496582, "average loss": 8.726970990498861, "iteration time": 1.2109291553497314, "epoch time": 43.96426582336426}
+```json
+...
+{"name": "train", "epoch": 40, "batch char error": 925, "batch char total": 22563, "batch char error rate": 0.040996321411159865, "epoch char error": 1135098.0, "epoch char total": 23857713.0, "epoch char error rate": 0.047577821059378154, "batch word error": 791, "batch word total": 4308, "batch word error rate": 0.18361188486536675, "epoch word error": 942906.0, "epoch word total": 4569507.0, "epoch word error rate": 0.20634742435015418, "lr": 0.06, "batch size": 128, "n_channel": 13, "n_time": 1685, "dataset length": 132096.0, "iteration": 1032.0, "loss": 0.07428030669689178, "cumulative loss": 90.47326805442572, "average loss": 0.08766789540157531, "iteration time": 1.9895553588867188, "epoch time": 2036.8874564170837}
+{"name": "train", "epoch": 40, "batch char error": 1131, "batch char total": 24260, "batch char error rate": 0.0466199505358615, "epoch char error": 1136229.0, "epoch char total": 23881973.0, "epoch char error rate": 0.04757684802675223, "batch word error": 957, "batch word total": 4657, "batch word error rate": 0.2054971011380717, "epoch word error": 943863.0, "epoch word total": 4574164.0, "epoch word error rate": 0.20634655862798099, "lr": 0.06, "batch size": 128, "n_channel": 13, "n_time": 1641, "dataset length": 132224.0, "iteration": 1033.0, "loss": 0.08775319904088974, "cumulative loss": 90.5610212534666, "average loss": 0.08766797798012256, "iteration time": 2.108018159866333, "epoch time": 2038.99547457695}
+{"name": "train", "epoch": 40, "batch char error": 1099, "batch char total": 23526, "batch char error rate": 0.0467142735696676, "epoch char error": 1137328.0, "epoch char total": 23905499.0, "epoch char error rate": 0.04757599914563591, "batch word error": 936, "batch word total": 4544, "batch word error rate": 0.20598591549295775, "epoch word error": 944799.0, "epoch word total": 4578708.0, "epoch word error rate": 0.20634620071863066, "lr": 0.06, "batch size": 128, "n_channel": 13, "n_time": 1682, "dataset length": 132352.0, "iteration": 1034.0, "loss": 0.0791337713599205, "cumulative loss": 90.64015502482653, "average loss": 0.08765972439538348, "iteration time": 2.0329701900482178, "epoch time": 2041.0284447669983}
+{"name": "train", "epoch": 40, "batch char error": 1023, "batch char total": 22399, "batch char error rate": 0.045671681771507655, "epoch char error": 1138351.0, "epoch char total": 23927898.0, "epoch char error rate": 0.04757421650660664, "batch word error": 863, "batch word total": 4318, "batch word error rate": 0.1998610467809171, "epoch word error": 945662.0, "epoch word total": 4583026.0, "epoch word error rate": 0.20634009058643787, "lr": 0.06, "batch size": 128, "n_channel": 13, "n_time": 1644, "dataset length": 132480.0, "iteration": 1035.0, "loss": 0.07874362915754318, "cumulative loss": 90.71889865398407, "average loss": 0.08765110981061262, "iteration time": 1.9106628894805908, "epoch time": 2042.9391076564789}
+{"name": "validation", "epoch": 40, "cumulative loss": 12.095281183719635, "dataset length": 2688.0, "iteration": 21.0, "batch char error": 1867, "batch char total": 14792, "batch char error rate": 0.12621687398593834, "epoch char error": 37119.0, "epoch char total": 280923.0, "epoch char error rate": 0.13213229247872194, "batch word error": 1155, "batch word total": 2841, "batch word error rate": 0.4065469904963041, "epoch word error": 22601.0, "epoch word total": 54008.0, "epoch word error rate": 0.418475040734706, "average loss": 0.575965770653316, "validation time": 24.185853481292725}
 ```
-One way to import the output in python with pandas is by saving the standard output to a file, and then using `pandas.read_json(filename, lines=True)`.
+As can be seen in the output above, the information reported at each iteration and epoch (e.g. loss, character error rate, word error rate) is printed to standard output as one JSON object per line. One way to import the output into Python with pandas is to save the standard output to a file and then use `pandas.read_json(filename, lines=True)`.
 
 ## Structure of pipeline
 
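The last change above suggests loading the per-line JSON output with pandas. A minimal sketch of that workflow, assuming the standard output of a run was redirected to a file named `train.log` (a hypothetical filename) and using the renamed metric keys introduced by this diff:

```python
import pandas as pd

# Each line of the training output is a standalone JSON object,
# so lines=True parses the whole log into one row per record.
log = pd.read_json("train.log", lines=True)  # "train.log" is a hypothetical name

# Keep only the training records and track how the epoch-level
# character error rate evolves over the run.
train = log[log["name"] == "train"]
print(train.groupby("epoch")["epoch char error rate"].last())
```

The same `groupby` works for the word error rate, or for the validation records by filtering on `"validation"` instead of `"train"`.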
examples/pipeline_wav2letter/main.py (20 changes: 12 additions & 8 deletions)
@@ -220,10 +220,12 @@ def compute_error_rates(outputs, targets, decoder, language_model, metric):
     cers = [levenshtein_distance(t, o) for t, o in zip(target, output)]
     cers = sum(cers)
     n = sum(len(t) for t in target)
-    metric["cer over target length"] = cers / n
-    metric["cumulative cer"] += cers
-    metric["total chars"] += n
-    metric["cumulative cer over target length"] = metric["cer"] / metric["total chars"]
+    metric["batch char error"] = cers
+    metric["batch char total"] = n
+    metric["batch char error rate"] = cers / n
+    metric["epoch char error"] += cers
+    metric["epoch char total"] += n
+    metric["epoch char error rate"] = metric["epoch char error"] / metric["epoch char total"]
 
     # Compute WER
 
@@ -233,10 +235,12 @@ def compute_error_rates(outputs, targets, decoder, language_model, metric):
     wers = [levenshtein_distance(t, o) for t, o in zip(target, output)]
     wers = sum(wers)
     n = sum(len(t) for t in target)
-    metric["wer over target length"] = wers / n
-    metric["cumulative wer"] += wers
-    metric["total words"] += n
-    metric["cumulative wer over target length"] = metric["wer"] / metric["total words"]
+    metric["batch word error"] = wers
+    metric["batch word total"] = n
+    metric["batch word error rate"] = wers / n
+    metric["epoch word error"] += wers
+    metric["epoch word total"] += n
+    metric["epoch word error rate"] = metric["epoch word error"] / metric["epoch word total"]
 
 
 def train_one_epoch(
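The replacement code above makes the bookkeeping convention explicit: `batch ...` entries are overwritten on every call, while `epoch ...` counters accumulate, so the epoch-level rate is always total edit distance divided by total reference length rather than an average of batch rates. A minimal self-contained sketch of the same pattern, with a toy `levenshtein_distance` and a plain `defaultdict` standing in for the pipeline's `metric` object (both are assumptions made for illustration):

```python
from collections import defaultdict


def levenshtein_distance(ref, hyp):
    # Classic dynamic-programming edit distance between two sequences.
    prev = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, 1):
        cur = [i]
        for j, h in enumerate(hyp, 1):
            cur.append(min(prev[j] + 1,               # deletion
                           cur[j - 1] + 1,            # insertion
                           prev[j - 1] + (r != h)))   # substitution
        prev = cur
    return prev[-1]


def update_char_metrics(targets, outputs, metric):
    # Mirrors compute_error_rates: batch values are overwritten,
    # epoch counters accumulate across calls.
    errors = sum(levenshtein_distance(t, o) for t, o in zip(targets, outputs))
    total = sum(len(t) for t in targets)
    metric["batch char error"] = errors
    metric["batch char total"] = total
    metric["batch char error rate"] = errors / total
    metric["epoch char error"] += errors
    metric["epoch char total"] += total
    metric["epoch char error rate"] = metric["epoch char error"] / metric["epoch char total"]


metric = defaultdict(float)  # missing epoch counters start at 0.0
update_char_metrics(["hello"], ["helo"], metric)   # 1 error over 5 chars
update_char_metrics(["world"], ["world"], metric)  # 0 errors over 5 chars
print(metric["batch char error rate"])  # 0.0  (last batch only)
print(metric["epoch char error rate"])  # 0.1  (1 error / 10 chars)
```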