From 68c1e7daa9e0b4f74f1e3634c10b7b9a97eb8c16 Mon Sep 17 00:00:00 2001
From: Qkd004
Date: Fri, 7 Nov 2025 17:10:28 +0800
Subject: [PATCH 1/3] fix:'lengths' and 'likely'

---
 _typos.toml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/_typos.toml b/_typos.toml
index 4a6fb17b022..ec74897c8b6 100644
--- a/_typos.toml
+++ b/_typos.toml
@@ -69,11 +69,9 @@ intput = "intput"
 lable = "lable"
 learing = "learing"
 legth = "legth"
-lengthes = "lengthes"
 lenth = "lenth"
 leran = "leran"
 libary = "libary"
-likey = "likey"
 mantained = "mantained"
 matrics = "matrics"
 mdule = "mdule"

From ffb2c01e3c56fba1902c3f13a18f6d5c7c4dd045 Mon Sep 17 00:00:00 2001
From: Qkd004
Date: Fri, 7 Nov 2025 17:12:20 +0800
Subject: [PATCH 2/3] fix:'likely'

---
 docs/design/dist_train/distributed_training_review.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/design/dist_train/distributed_training_review.md b/docs/design/dist_train/distributed_training_review.md
index c09b7c99159..9f13224728c 100644
--- a/docs/design/dist_train/distributed_training_review.md
+++ b/docs/design/dist_train/distributed_training_review.md
@@ -30,7 +30,7 @@ Synchronous training usually faces scalability and performance issues, if not ca
 Similar to asynchronous training, the benefit of synchronous training diminishes quickly. Depending on the models, increasing the number of trainers (effectively batch size) beyond a point won’t delivers faster converge time or better final model quality.
 
 # Codistillation
-Codistillation is a technique that tries to scale the training further. A few training instance (each training instance can be distributed) are performed during the same period. Each training instance has extra losses that comes from the prediction of other training instances. (likey teacher and student) The training process converges faster and usually converge to a better model quality. [4]
+Codistillation is a technique that tries to scale the training further. A few training instance (each training instance can be distributed) are performed during the same period. Each training instance has extra losses that comes from the prediction of other training instances. (likely teacher and student) The training process converges faster and usually converge to a better model quality. [4]
 
 
 # Reference

From e4928f7c171f7d22df3b57f544dddbdd848f84a9 Mon Sep 17 00:00:00 2001
From: Qkd004
Date: Fri, 7 Nov 2025 17:12:55 +0800
Subject: [PATCH 3/3] fix:'lengths'

---
 docs/design/network/deep_speech_2.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/design/network/deep_speech_2.md b/docs/design/network/deep_speech_2.md
index 31520fbd778..8fa9e3c2c3b 100644
--- a/docs/design/network/deep_speech_2.md
+++ b/docs/design/network/deep_speech_2.md
@@ -127,7 +127,7 @@ Key ingredients about the layers:
 - **Data Layers**:
   - Frame sequences data of audio **spectrogram** (with FFT).
   - Token sequences data of **transcription** text (labels).
-  - These two type of sequences do not have the same lengthes, thus a CTC-loss layer is required.
+  - These two type of sequences do not have the same lengths, thus a CTC-loss layer is required.
 - **2D Convolution Layers**:
   - Not only temporal convolution, but also **frequency convolution**. Like a 2D image convolution, but with a variable dimension (i.e. temporal dimension).
   - With striding for only the first convlution layer.
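The paragraph touched in `docs/design/dist_train/distributed_training_review.md` describes codistillation: several training instances run in parallel, and each one adds an extra loss computed from the other instances' predictions (the "likely teacher and student" relationship). Below is a minimal sketch of that extra loss term, not the repository's actual implementation; it assumes PyTorch and the hypothetical names `logits_a`, `logits_b`, `labels`, and `alpha`.

```python
import torch.nn.functional as F

def codistillation_loss(logits_a, logits_b, labels, alpha=0.1):
    # Ordinary supervised loss for training instance A.
    ce = F.cross_entropy(logits_a, labels)
    # Extra loss: pull A's predicted distribution toward B's predictions.
    # B's logits are detached so this term only updates A; in full
    # codistillation, B adds the mirrored term against A's predictions.
    distill = F.kl_div(
        F.log_softmax(logits_a, dim=-1),
        F.softmax(logits_b.detach(), dim=-1),
        reduction="batchmean",
    )
    return ce + alpha * distill
```

Each training instance would use its own copy of this combined loss, so the instances regularize one another symmetrically rather than having a fixed teacher.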