diff --git a/README.md b/README.md
index 94f993ddaa..290bb73543 100644
--- a/README.md
+++ b/README.md
@@ -113,8 +113,8 @@ python setup.py install
 MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py install
 ```
 
-Alternatively, the build process can build SoX (and codecs such as libmad, lame and flac) statically and torchaudio can link them, by setting environment variable `BUILD_SOX=1`.
-The build process will fetch and build SoX, liblame, libmad, flac before building extension.
+Alternatively, the build process can build libsox and some optional codecs statically, and torchaudio can link against them, by setting the environment variable `BUILD_SOX=1`.
+The build process will fetch and build libmad, lame, flac, vorbis, opus, and libsox before building the extension. This process requires `cmake` and `pkg-config`.
 
 ```bash
 # Linux
diff --git a/build_tools/setup_helpers/build_third_party.sh b/build_tools/setup_helpers/build_third_party.sh
deleted file mode 100755
index 9577776cc9..0000000000
--- a/build_tools/setup_helpers/build_third_party.sh
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/bin/bash
-# Build third party libraries (SoX, lame, libmad, and flac)
-# Usage: ./build_thid_parth.sh [prefix] [download_only?=false]
-
-set -e
-
-this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-root_dir="${this_dir}/../.."
-
-prefix="${1:-}"
-if [ -z "${prefix}" ]; then
-    prefix="${root_dir}"
-fi
-download_only="${2:-false}"
-
-tp_dir="${prefix}/third_party"
-tmp_dir="${tp_dir}/tmp"
-build_dir="${tp_dir}/build"
-
-mkdir -p "${tmp_dir}" "${build_dir}"
-
-. "${this_dir}/build_third_party_helper.sh"
-
-if ! found_lame "${build_dir}" ; then
-    get_lame "${tmp_dir}"
-    if [ "${download_only}" = "false" ]; then
-        build_lame "${tmp_dir}" "${build_dir}"
-    fi
-fi
-
-if ! found_flac "${build_dir}" ; then
-   get_flac "${tmp_dir}"
-   if [ "${download_only}" = "false" ]; then
-       build_flac "${tmp_dir}" "${build_dir}"
-   fi
-fi
-
-if ! found_mad "${build_dir}" ; then
-   get_mad "${tmp_dir}"
-   if [ "${download_only}" = "false" ]; then
-       build_mad "${tmp_dir}" "${build_dir}"
-   fi
-fi
-
-if ! found_sox "${build_dir}" ; then
-   get_sox "${tmp_dir}"
-   if [ "${download_only}" = "false" ]; then
-       build_sox "${tmp_dir}" "${build_dir}"
-   fi
-fi
diff --git a/build_tools/setup_helpers/build_third_party_helper.sh b/build_tools/setup_helpers/build_third_party_helper.sh
deleted file mode 100644
index 7cca812409..0000000000
--- a/build_tools/setup_helpers/build_third_party_helper.sh
+++ /dev/null
@@ -1,217 +0,0 @@
-#!/usr/bin/env bash
-
-set -euo pipefail
-
-# Global options
-CURL_OPTS="-L --retry 10 --connect-timeout 5 --max-time 180"
-MAKE_OPTS="-j"
-CONFIG_OPTS=""
-
-if [ -z ${DEBUG+x} ]; then
-    CURL_OPTS="${CURL_OPTS} --silent --show-error"
-    MAKE_OPTS="${MAKE_OPTS} --quiet"
-    CONFIG_OPTS="${CONFIG_OPTS} --quiet"
-fi
-
-all_found() {
-    dir="$1"
-    shift
-    while [ "$#" -gt 0 ]; do
-        if [ ! -f "${dir}/$1" ]; then
-            return 1
-        fi
-        shift
-    done
-}
-
-
-found_lame() {
-    all_found "$1" 'include/lame/lame.h' 'lib/libmp3lame.a'
-}
-
-found_flac() {
-    all_found "$1" \
-              'include/FLAC/format.h' \
-              'include/FLAC/stream_decoder.h' \
-              'include/FLAC/export.h' \
-              'include/FLAC/ordinals.h' \
-              'include/FLAC/all.h' \
-              'include/FLAC/assert.h' \
-              'include/FLAC/callback.h' \
-              'include/FLAC/metadata.h' \
-              'include/FLAC/stream_encoder.h' \
-              'include/FLAC++/export.h' \
-              'include/FLAC++/decoder.h' \
-              'include/FLAC++/all.h' \
-              'include/FLAC++/metadata.h' \
-              'include/FLAC++/encoder.h' \
-              'lib/libFLAC++.a' \
-              'lib/libFLAC.a'
-}
-
-found_mad() {
-    all_found "$1" 'include/mad.h' 'lib/libmad.a'
-}
-
-found_sox() {
-    all_found "$1" 'include/sox.h' 'lib/libsox.a'
-}
-
-LAME="lame-3.99.5"
-LAME_ARCHIVE="${LAME}.tar.gz"
-
-get_lame() {
-    work_dir="$1"
-    url="https://downloads.sourceforge.net/project/lame/lame/3.99/${LAME_ARCHIVE}"
-    (
-        cd "${work_dir}"
-        if [ ! -d "${LAME}" ]; then
-            if [ ! -f "${LAME_ARCHIVE}" ]; then
-                printf "Fetching liblame from %s\n" "${url}"
-                curl $CURL_OPTS -O "${url}"
-            fi
-        fi
-    )
-}
-
-build_lame() {
-    work_dir="$1"
-    install_dir="$2"
-    (
-        cd "${work_dir}"
-        if [ ! -d "${LAME}" ]; then
-            tar xfp "${LAME_ARCHIVE}"
-        fi
-        cd "${LAME}"
-        # build statically
-        printf "Building liblame\n"
-        if [ ! -f Makefile ]; then
-            ./configure ${CONFIG_OPTS} \
-                        --disable-shared --enable-static --prefix="${install_dir}" CFLAGS=-fPIC CXXFLAGS=-fPIC \
-                        --with-pic --disable-debug --disable-dependency-tracking --enable-nasm
-        fi
-        make ${MAKE_OPTS} > make.log 2>&1
-        make ${MAKE_OPTS} install
-    )
-}
-
-FLAC="flac-1.3.2"
-FLAC_ARCHIVE="${FLAC}.tar.xz"
-
-get_flac() {
-    work_dir="$1"
-    url="https://downloads.sourceforge.net/project/flac/flac-src/${FLAC_ARCHIVE}"
-    (
-        cd "${work_dir}"
-        if [ ! -d "${FLAC}" ]; then
-            if [ ! -f "${FLAC_ARCHIVE}" ]; then
-                printf "Fetching flac from %s\n" "${url}"
-                curl $CURL_OPTS -O "${url}"
-            fi
-        fi
-    )
-}
-
-build_flac() {
-    work_dir="$1"
-    install_dir="$2"
-    (
-        cd "${work_dir}"
-        if [ ! -d "${FLAC}" ]; then
-            tar xfp "${FLAC_ARCHIVE}"
-        fi
-        cd "${FLAC}"
-        # build statically
-        printf "Building flac\n"
-        if [ ! -f Makefile ]; then
-            ./configure ${CONFIG_OPTS} \
-                        --disable-shared --enable-static --prefix="${install_dir}" CFLAGS=-fPIC CXXFLAGS=-fPIC \
-                        --with-pic --disable-debug --disable-dependency-tracking
-        fi
-        make ${MAKE_OPTS} > make.log 2>&1
-        make ${MAKE_OPTS} install
-    )
-}
-
-LIBMAD="libmad-0.15.1b"
-LIBMAD_ARCHIVE="${LIBMAD}.tar.gz"
-
-get_mad() {
-    work_dir="$1"
-    url="https://downloads.sourceforge.net/project/mad/libmad/0.15.1b/${LIBMAD_ARCHIVE}"
-    (
-        cd "${work_dir}"
-        if [ ! -d "${LIBMAD}" ]; then
-            if [ ! -f "${LIBMAD_ARCHIVE}" ]; then
-                printf "Fetching mad from %s\n" "${url}"
-                curl $CURL_OPTS -O "${url}"
-            fi
-        fi
-    )
-}
-
-build_mad() {
-    work_dir="$1"
-    install_dir="$2"
-    (
-        cd "${work_dir}"
-        if [ ! -d "${LIBMAD}" ]; then
-            tar xfp "${LIBMAD_ARCHIVE}"
-        fi
-        cd "${LIBMAD}"
-        # build statically
-        printf "Building mad\n"
-        if [ ! -f Makefile ]; then
-            # See https://stackoverflow.com/a/12864879/23845
-            sed -i.bak 's/-march=i486//' configure
-            ./configure ${CONFIG_OPTS} \
-                        --disable-shared --enable-static --prefix="${install_dir}" CFLAGS=-fPIC CXXFLAGS=-fPIC \
-                        --with-pic --disable-debug --disable-dependency-tracking
-        fi
-        make ${MAKE_OPTS} > make.log 2>&1
-        make ${MAKE_OPTS} install
-    )
-}
-
-SOX="sox-14.4.2"
-SOX_ARCHIVE="${SOX}.tar.bz2"
-
-get_sox() {
-    work_dir="$1"
-    url="https://downloads.sourceforge.net/project/sox/sox/14.4.2/${SOX_ARCHIVE}"
-    (
-        cd "${work_dir}"
-        if [ ! -d "${SOX}" ]; then
-            if [ ! -f "${SOX_ARCHIVE}" ]; then
-                printf "Fetching SoX from %s\n" "${url}"
-                curl $CURL_OPTS -O "${url}"
-            fi
-        fi
-    )
-}
-
-build_sox() {
-    work_dir="$1"
-    install_dir="$2"
-    (
-        cd "${work_dir}"
-        if [ ! -d "${SOX}" ]; then
-            tar xfp "${SOX_ARCHIVE}"
-        fi
-        cd "${SOX}"
-        # build statically
-        printf "Building SoX\n"
-        if [ ! -f Makefile ]; then
-            # --without-png makes OS X build less hazardous; somehow the build
-            # finds png and enables it.  We don't want it; we'd need to package
-            # it statically if we do.
-            ./configure ${CONFIG_OPTS} --disable-shared --enable-static --prefix="${install_dir}" \
-                        LDFLAGS="-L${install_dir}/lib" CPPFLAGS="-I${install_dir}/include" \
-                        --with-lame --with-flac --with-mad --without-alsa --without-coreaudio \
-                        --without-png --without-oggvorbis --without-oss --without-sndfile \
-                        CFLAGS=-fPIC CXXFLAGS=-fPIC --with-pic --disable-debug --disable-dependency-tracking
-        fi
-        make ${MAKE_OPTS} > make.log 2>&1
-        make ${MAKE_OPTS} install
-    )
-}
diff --git a/build_tools/setup_helpers/extension.py b/build_tools/setup_helpers/extension.py
index dcf61ab259..b9fb30e114 100644
--- a/build_tools/setup_helpers/extension.py
+++ b/build_tools/setup_helpers/extension.py
@@ -17,7 +17,7 @@
 _ROOT_DIR = _THIS_DIR.parent.parent.resolve()
 _CSRC_DIR = _ROOT_DIR / 'torchaudio' / 'csrc'
 _TP_BASE_DIR = _ROOT_DIR / 'third_party'
-_TP_INSTALL_DIR = _TP_BASE_DIR / 'build'
+_TP_INSTALL_DIR = _TP_BASE_DIR / 'install'
 
 
 def _get_build_sox():
@@ -76,8 +76,20 @@ def _get_extra_objects():
         # NOTE: The order of the library listed bellow matters.
         #
         # (the most important thing is that dependencies come after a library
-        # e.g., sox comes first)
-        libs = ['libsox.a', 'libmad.a', 'libFLAC.a', 'libmp3lame.a']
+        # e.g., sox comes first, flac/vorbis come before ogg, and
+        # vorbisenc/vorbisfile come before vorbis)
+        libs = [
+            'libsox.a',
+            'libmad.a',
+            'libFLAC.a',
+            'libmp3lame.a',
+            'libopusfile.a',
+            'libopus.a',
+            'libvorbisenc.a',
+            'libvorbisfile.a',
+            'libvorbis.a',
+            'libogg.a',
+        ]
         for lib in libs:
             objs.append(str(_TP_INSTALL_DIR / 'lib' / lib))
     return objs
@@ -87,15 +99,19 @@ def _get_libraries():
     return [] if _BUILD_SOX else ['sox']
 
 
-def _build_codecs():
+def _build_third_party():
+    build_dir = str(_TP_BASE_DIR / 'build')
+    os.makedirs(build_dir, exist_ok=True)
     subprocess.run(
-        args=[str(_THIS_DIR / 'build_third_party.sh')],
+        args=['cmake', '..'],
+        cwd=build_dir,
+        check=True,
+    )
+    subprocess.run(
+        args=['cmake', '--build', '.'],
+        cwd=build_dir,
         check=True,
     )
-
-
-def _configure_third_party():
-    _build_codecs()
 
 
 _EXT_NAME = 'torchaudio._torchaudio'
@@ -120,5 +136,5 @@ def get_ext_modules(debug=False):
 class BuildExtension(TorchBuildExtension):
     def build_extension(self, ext):
         if ext.name == _EXT_NAME and _BUILD_SOX:
-            _configure_third_party()
+            _build_third_party()
         super().build_extension(ext)
diff --git a/packaging/build_conda.sh b/packaging/build_conda.sh
index 3e011d312f..628eaf0bf8 100755
--- a/packaging/build_conda.sh
+++ b/packaging/build_conda.sh
@@ -6,7 +6,7 @@ script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 
 export BUILD_TYPE="conda"
 export NO_CUDA_PACKAGE=1
-setup_env 0.6.0
+setup_env 0.7.0
 export SOURCE_ROOT_DIR="$PWD"
 setup_conda_pytorch_constraint
 conda build $CONDA_CHANNEL_FLAGS --no-anaconda-upload --python "$PYTHON_VERSION" packaging/torchaudio
diff --git a/packaging/build_wheel.sh b/packaging/build_wheel.sh
index d08196cb20..3b25c00cd2 100755
--- a/packaging/build_wheel.sh
+++ b/packaging/build_wheel.sh
@@ -6,7 +6,7 @@ script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 
 export BUILD_TYPE="wheel"
 export NO_CUDA_PACKAGE=1
-setup_env 0.6.0
+setup_env 0.7.0
 setup_wheel_python
 pip_install numpy future
 setup_pip_pytorch_version
diff --git a/packaging/pkg_helpers.bash b/packaging/pkg_helpers.bash
index 4030128a98..3ec49a0269 100644
--- a/packaging/pkg_helpers.bash
+++ b/packaging/pkg_helpers.bash
@@ -171,6 +171,7 @@ setup_pip_pytorch_version() {
   else
     pip_install "torch==$PYTORCH_VERSION$PYTORCH_VERSION_SUFFIX" \
       -f https://download.pytorch.org/whl/torch_stable.html \
+      -f https://download.pytorch.org/whl/test/torch_test.html \
       -f https://download.pytorch.org/whl/nightly/torch_nightly.html
   fi
 }
@@ -184,7 +185,7 @@ setup_conda_pytorch_constraint() {
     export CONDA_CHANNEL_FLAGS="-c pytorch-nightly"
     export PYTORCH_VERSION="$(conda search --json 'pytorch[channel=pytorch-nightly]' | python -c "import sys, json, re; print(re.sub(r'\\+.*$', '', json.load(sys.stdin)['pytorch'][-1]['version']))")"
   else
-    export CONDA_CHANNEL_FLAGS="-c pytorch -c pytorch-nightly"
+    export CONDA_CHANNEL_FLAGS="-c pytorch -c pytorch-test -c pytorch-nightly"
   fi
   if [[ "$CU_VERSION" == cpu ]]; then
     export CONDA_PYTORCH_BUILD_CONSTRAINT="- pytorch==$PYTORCH_VERSION${PYTORCH_VERSION_SUFFIX}"
diff --git a/setup.py b/setup.py
index 5cf1ad680a..aab59b4d8a 100644
--- a/setup.py
+++ b/setup.py
@@ -12,7 +12,7 @@
 
 
 # Creating the version file
-version = '0.6.0a0'
+version = '0.7.0a0'
 sha = 'Unknown'
 
 try:
diff --git a/test/README.md b/test/README.md
index 35a926c120..304bfa9c8e 100644
--- a/test/README.md
+++ b/test/README.md
@@ -41,6 +41,58 @@ The following test modules are defined for corresponding `torchaudio` module/fun
 - [assets/kaldi](./assets/kaldi): Contains Kaldi format matrix files used in [./test_compliance_kaldi.py](./test_compliance_kaldi.py).
 - [compliance](./compliance): Scripts used to generate above Kaldi matrix files.
 
+### Waveforms for Testing Purposes
+
+When testing transforms, we often need waveforms of a specific type (e.g. pure tone, noise, or voice), with a specific sample rate (e.g. 8 or 16 kHz) and number of channels (e.g. mono or stereo). Below are some tips on how to construct waveforms and guidance on the existing audio files.
+
+#### Load a Waveform from a File
+
+```python
+filepath = common_utils.get_asset_path('filename.wav')
+waveform, sample_rate = common_utils.load_wav(filepath)
+```
+
+*Note: Should you choose to contribute an audio file, please leave a comment in the issue or pull request mentioning the content source and licensing information. WAV files are preferred. Other formats should be used only when there is no alternative (e.g. when a dataset implementation comes with a hardcoded non-WAV extension).*
+
+#### Pure Tone
+
+Code:
+
+```python
+waveform = common_utils.get_sinusoid(
+	frequency=300,
+	sample_rate=16000,
+	duration=1,  # seconds
+	n_channels=1,
+	dtype="float32",
+	device="cpu",
+)
+```
+
+#### Noise
+
+Code:
+
+```python
+tensor = common_utils.get_whitenoise()
+```
+
+Files:
+
+* `steam-train-whistle-daniel_simon.wav`
+
+#### Voice
+
+Files:
+
+* `CommonVoice/cv-corpus-4-2019-12-10/tt/clips/common_voice_tt_00000000.wav`
+* `LibriSpeech/dev-clean/1272/128104/1272-128104-0000.flac`
+* `LJSpeech-1.1/wavs/LJ001-0001.wav`
+* `SpeechCommands/speech_commands_v0.02/go/0a9f9af7_nohash_0.wav`
+* `VCTK-Corpus/wav48/p224/p224_002.wav`
+* `waves_yesno/0_1_0_1_0_1_1_0.wav`
+* `vad-go-stereo-44100.wav`
+* `vad-go-mono-32000.wav`
 
 ## Adding test
 
diff --git a/test/assets/CommonVoice/cv-corpus-4-2019-12-10/tt/clips/common_voice_tt_00000000.mp3 b/test/assets/CommonVoice/cv-corpus-4-2019-12-10/tt/clips/common_voice_tt_00000000.mp3
deleted file mode 100644
index d6fe9f44b9..0000000000
Binary files a/test/assets/CommonVoice/cv-corpus-4-2019-12-10/tt/clips/common_voice_tt_00000000.mp3 and /dev/null differ
diff --git a/test/assets/CommonVoice/cv-corpus-4-2019-12-10/tt/clips/common_voice_tt_00000000.wav b/test/assets/CommonVoice/cv-corpus-4-2019-12-10/tt/clips/common_voice_tt_00000000.wav
new file mode 100644
index 0000000000..8f9d80d7e1
Binary files /dev/null and b/test/assets/CommonVoice/cv-corpus-4-2019-12-10/tt/clips/common_voice_tt_00000000.wav differ
diff --git a/test/assets/CommonVoice/cv-corpus-4-2019-12-10/tt/train.tsv b/test/assets/CommonVoice/cv-corpus-4-2019-12-10/tt/train.tsv
index 2b677dbf7f..a5244b01f2 100644
--- a/test/assets/CommonVoice/cv-corpus-4-2019-12-10/tt/train.tsv
+++ b/test/assets/CommonVoice/cv-corpus-4-2019-12-10/tt/train.tsv
@@ -1,3 +1,3 @@
 client_id	path	sentence	up_votes	down_votes	age	gender	accent
-00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000	common_voice_tt_00000000.mp3	test.	1	0	thirties	female	
-00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001	common_voice_tt_00000000.mp3	test.	1	0	thirties	female	
+00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000	common_voice_tt_00000000.wav	test.	1	0	thirties	female	
+00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001	common_voice_tt_00000000.wav	test.	1	0	thirties	female	
diff --git a/test/assets/dtmf_30s_stereo.mp3 b/test/assets/dtmf_30s_stereo.mp3
deleted file mode 100644
index 6c97835ec0..0000000000
Binary files a/test/assets/dtmf_30s_stereo.mp3 and /dev/null differ
diff --git a/test/assets/io/96k_0_1ch.opus b/test/assets/io/96k_0_1ch.opus
new file mode 100644
index 0000000000..df95474ddb
Binary files /dev/null and b/test/assets/io/96k_0_1ch.opus differ
diff --git a/test/assets/io/96k_0_2ch.opus b/test/assets/io/96k_0_2ch.opus
new file mode 100644
index 0000000000..b8837e81e2
Binary files /dev/null and b/test/assets/io/96k_0_2ch.opus differ
diff --git a/test/assets/io/96k_10_1ch.opus b/test/assets/io/96k_10_1ch.opus
new file mode 100644
index 0000000000..56b170d380
Binary files /dev/null and b/test/assets/io/96k_10_1ch.opus differ
diff --git a/test/assets/io/96k_10_2ch.opus b/test/assets/io/96k_10_2ch.opus
new file mode 100644
index 0000000000..e2b147fc7f
Binary files /dev/null and b/test/assets/io/96k_10_2ch.opus differ
diff --git a/test/assets/io/96k_5_1ch.opus b/test/assets/io/96k_5_1ch.opus
new file mode 100644
index 0000000000..a1f5214d3a
Binary files /dev/null and b/test/assets/io/96k_5_1ch.opus differ
diff --git a/test/assets/io/96k_5_2ch.opus b/test/assets/io/96k_5_2ch.opus
new file mode 100644
index 0000000000..007bc813ce
Binary files /dev/null and b/test/assets/io/96k_5_2ch.opus differ
diff --git a/test/assets/io/generate_opus.py b/test/assets/io/generate_opus.py
new file mode 100644
index 0000000000..e6b99c471c
--- /dev/null
+++ b/test/assets/io/generate_opus.py
@@ -0,0 +1,50 @@
+"""Generate opus file for testing load functions"""
+
+import argparse
+import subprocess
+
+import scipy.io.wavfile
+import torch
+
+
+def _parse_args():
+    parser = argparse.ArgumentParser(
+        description='Generate opus files for test'
+    )
+    parser.add_argument('--num-channels', required=True, type=int)
+    parser.add_argument('--compression-level', required=True, type=int, choices=list(range(11)))
+    parser.add_argument('--bitrate', default='96k')
+    return parser.parse_args()
+
+
+def convert_to_opus(
+        src_path, dst_path,
+        *, bitrate, compression_level):
+    """Convert audio file with `ffmpeg` command."""
+    command = ['ffmpeg', '-y', '-i', src_path, '-c:a', 'libopus', '-b:a', bitrate]
+    if compression_level is not None:
+        command += ['-compression_level', str(compression_level)]
+    command += [dst_path]
+    print(' '.join(command))
+    subprocess.run(command, check=True)
+
+
+def _generate(num_channels, compression_level, bitrate):
+    org_path = 'original.wav'
+    ops_path = f'{bitrate}_{compression_level}_{num_channels}ch.opus'
+
+    # Note: ffmpeg forces sample rate 48k Hz for opus https://stackoverflow.com/a/39186779
+    # 1. generate original wav
+    data = torch.linspace(-32768, 32767, 32768, dtype=torch.int16).repeat([num_channels, 1]).t()
+    scipy.io.wavfile.write(org_path, 48000, data.numpy())
+    # 2. convert to opus
+    convert_to_opus(org_path, ops_path, bitrate=bitrate, compression_level=compression_level)
+
+
+def _main():
+    args = _parse_args()
+    _generate(args.num_channels, args.compression_level, args.bitrate)
+
+
+if __name__ == '__main__':
+    _main()
diff --git a/test/common_utils.py b/test/common_utils.py
deleted file mode 100644
index fe045d9db8..0000000000
--- a/test/common_utils.py
+++ /dev/null
@@ -1,171 +0,0 @@
-import os
-import tempfile
-import unittest
-from typing import Union
-from shutil import copytree
-
-import torch
-from torch.testing._internal.common_utils import TestCase as PytorchTestCase
-import torchaudio
-
-_TEST_DIR_PATH = os.path.dirname(os.path.realpath(__file__))
-BACKENDS = torchaudio.list_audio_backends()
-
-
-def get_asset_path(*paths):
-    """Return full path of a test asset"""
-    return os.path.join(_TEST_DIR_PATH, 'assets', *paths)
-
-
-def create_temp_assets_dir():
-    """
-    Creates a temporary directory and moves all files from test/assets there.
-    Returns a Tuple[string, TemporaryDirectory] which is the folder path
-    and object.
-    """
-    tmp_dir = tempfile.TemporaryDirectory()
-    copytree(os.path.join(_TEST_DIR_PATH, "assets"),
-             os.path.join(tmp_dir.name, "assets"))
-    return tmp_dir.name, tmp_dir
-
-
-def random_float_tensor(seed, size, a=22695477, c=1, m=2 ** 32):
-    """ Generates random tensors given a seed and size
-    https://en.wikipedia.org/wiki/Linear_congruential_generator
-    X_{n + 1} = (a * X_n + c) % m
-    Using Borland C/C++ values
-
-    The tensor will have values between [0,1)
-    Inputs:
-        seed (int): an int
-        size (Tuple[int]): the size of the output tensor
-        a (int): the multiplier constant to the generator
-        c (int): the additive constant to the generator
-        m (int): the modulus constant to the generator
-    """
-    num_elements = 1
-    for s in size:
-        num_elements *= s
-
-    arr = [(a * seed + c) % m]
-    for i in range(num_elements - 1):
-        arr.append((a * arr[i] + c) % m)
-
-    return torch.tensor(arr).float().view(size) / m
-
-
-def filter_backends_with_mp3(backends):
-    # Filter out backends that do not support mp3
-    test_filepath = get_asset_path('steam-train-whistle-daniel_simon.mp3')
-
-    def supports_mp3(backend):
-        torchaudio.set_audio_backend(backend)
-        try:
-            torchaudio.load(test_filepath)
-            return True
-        except (RuntimeError, ImportError):
-            return False
-
-    return [backend for backend in backends if supports_mp3(backend)]
-
-
-BACKENDS_MP3 = filter_backends_with_mp3(BACKENDS)
-
-
-def set_audio_backend(backend):
-    """Allow additional backend value, 'default'"""
-    if backend == 'default':
-        if 'sox' in BACKENDS:
-            be = 'sox'
-        elif 'soundfile' in BACKENDS:
-            be = 'soundfile'
-        else:
-            raise unittest.SkipTest('No default backend available')
-    else:
-        be = backend
-
-    torchaudio.set_audio_backend(be)
-
-
-class TestBaseMixin:
-    """Mixin to provide consistent way to define device/dtype/backend aware TestCase"""
-    dtype = None
-    device = None
-    backend = None
-
-    def setUp(self):
-        super().setUp()
-        set_audio_backend(self.backend)
-
-
-class TorchaudioTestCase(TestBaseMixin, PytorchTestCase):
-    pass
-
-
-skipIfNoSoxBackend = unittest.skipIf('sox' not in BACKENDS, 'Sox backend not available')
-skipIfNoCuda = unittest.skipIf(not torch.cuda.is_available(), reason='CUDA not available')
-
-
-def get_whitenoise(
-    *,
-    sample_rate: int = 16000,
-    duration: float = 1,  # seconds
-    n_channels: int = 1,
-    seed: int = 0,
-    dtype: Union[str, torch.dtype] = "float32",
-    device: Union[str, torch.device] = "cpu",
-):
-    """Generate pseudo audio data with whitenoise
-
-    Args:
-        sample_rate: Sampling rate
-        duration: Length of the resulting Tensor in seconds.
-        n_channels: Number of channels
-        seed: Seed value used for random number generation.
-            Note that this function does not modify global random generator state.
-        dtype: Torch dtype
-        device: device
-    Returns:
-        Tensor: shape of (n_channels, sample_rate * duration)
-    """
-    if isinstance(dtype, str):
-        dtype = getattr(torch, dtype)
-    shape = [n_channels, sample_rate * duration]
-    # According to the doc, folking rng on all CUDA devices is slow when there are many CUDA devices,
-    # so we only folk on CPU, generate values and move the data to the given device
-    with torch.random.fork_rng([]):
-        torch.random.manual_seed(seed)
-        tensor = torch.randn(shape, dtype=dtype, device='cpu')
-    tensor /= 2.0
-    tensor.clamp_(-1.0, 1.0)
-    return tensor.to(device=device)
-
-
-def get_sinusoid(
-    *,
-    frequency: float = 300,
-    sample_rate: int = 16000,
-    duration: float = 1,  # seconds
-    n_channels: int = 1,
-    dtype: Union[str, torch.dtype] = "float32",
-    device: Union[str, torch.device] = "cpu",
-):
-    """Generate pseudo audio data with sine wave.
-
-    Args:
-        frequency: Frequency of sine wave
-        sample_rate: Sampling rate
-        duration: Length of the resulting Tensor in seconds.
-        n_channels: Number of channels
-        dtype: Torch dtype
-        device: device
-
-    Returns:
-        Tensor: shape of (n_channels, sample_rate * duration)
-    """
-    if isinstance(dtype, str):
-        dtype = getattr(torch, dtype)
-    pie2 = 2 * 3.141592653589793
-    end = pie2 * frequency * duration
-    theta = torch.linspace(0, end, sample_rate * duration, dtype=dtype, device=device)
-    return torch.sin(theta, out=None).repeat([n_channels, 1])
diff --git a/test/common_utils/__init__.py b/test/common_utils/__init__.py
new file mode 100644
index 0000000000..841cd26713
--- /dev/null
+++ b/test/common_utils/__init__.py
@@ -0,0 +1,31 @@
+from .data_utils import (
+    get_asset_path,
+    get_whitenoise,
+    get_sinusoid,
+)
+from .backend_utils import (
+    set_audio_backend,
+    BACKENDS,
+    BACKENDS_MP3,
+)
+from .case_utils import (
+    TempDirMixin,
+    TestBaseMixin,
+    PytorchTestCase,
+    TorchaudioTestCase,
+    skipIfNoCuda,
+    skipIfNoExec,
+    skipIfNoModule,
+    skipIfNoExtension,
+    skipIfNoSoxBackend,
+)
+from .wav_utils import (
+    get_wav_data,
+    normalize_wav,
+    load_wav,
+    save_wav,
+)
+from .parameterized_utils import (
+    load_params,
+)
+from . import sox_utils
diff --git a/test/common_utils/backend_utils.py b/test/common_utils/backend_utils.py
new file mode 100644
index 0000000000..158fde87ed
--- /dev/null
+++ b/test/common_utils/backend_utils.py
@@ -0,0 +1,41 @@
+import unittest
+
+import torchaudio
+
+from .import data_utils
+
+
+BACKENDS = torchaudio.list_audio_backends()
+
+
+def _filter_backends_with_mp3(backends):
+    # Filter out backends that do not support mp3
+    test_filepath = data_utils.get_asset_path('steam-train-whistle-daniel_simon.mp3')
+
+    def supports_mp3(backend):
+        torchaudio.set_audio_backend(backend)
+        try:
+            torchaudio.load(test_filepath)
+            return True
+        except (RuntimeError, ImportError):
+            return False
+
+    return [backend for backend in backends if supports_mp3(backend)]
+
+
+BACKENDS_MP3 = _filter_backends_with_mp3(BACKENDS)
+
+
+def set_audio_backend(backend):
+    """Allow additional backend value, 'default'"""
+    if backend == 'default':
+        if 'sox' in BACKENDS:
+            be = 'sox'
+        elif 'soundfile' in BACKENDS:
+            be = 'soundfile'
+        else:
+            raise unittest.SkipTest('No default backend available')
+    else:
+        be = backend
+
+    torchaudio.set_audio_backend(be)
diff --git a/test/common_utils/case_utils.py b/test/common_utils/case_utils.py
new file mode 100644
index 0000000000..f3b0c343a6
--- /dev/null
+++ b/test/common_utils/case_utils.py
@@ -0,0 +1,75 @@
+import shutil
+import os.path
+import tempfile
+import unittest
+
+import torch
+from torch.testing._internal.common_utils import TestCase as PytorchTestCase
+import torchaudio
+from torchaudio._internal.module_utils import is_module_available
+
+from .backend_utils import set_audio_backend
+
+
+class TempDirMixin:
+    """Mixin to provide easy access to temp dir"""
+    temp_dir_ = None
+    base_temp_dir = None
+    temp_dir = None
+
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        # If TORCHAUDIO_TEST_TEMP_DIR is set, use it instead of temporary directory.
+        # this is handy for debugging.
+        key = 'TORCHAUDIO_TEST_TEMP_DIR'
+        if key in os.environ:
+            cls.base_temp_dir = os.environ[key]
+        else:
+            cls.temp_dir_ = tempfile.TemporaryDirectory()
+            cls.base_temp_dir = cls.temp_dir_.name
+
+    @classmethod
+    def tearDownClass(cls):
+        super().tearDownClass()
+        if isinstance(cls.temp_dir_, tempfile.TemporaryDirectory):
+            cls.temp_dir_.cleanup()
+
+    def setUp(self):
+        super().setUp()
+        self.temp_dir = os.path.join(self.base_temp_dir, self.id())
+
+    def get_temp_path(self, *paths):
+        path = os.path.join(self.temp_dir, *paths)
+        os.makedirs(os.path.dirname(path), exist_ok=True)
+        return path
+
+
+class TestBaseMixin:
+    """Mixin to provide consistent way to define device/dtype/backend aware TestCase"""
+    dtype = None
+    device = None
+    backend = None
+
+    def setUp(self):
+        super().setUp()
+        set_audio_backend(self.backend)
+
+
+class TorchaudioTestCase(TestBaseMixin, PytorchTestCase):
+    pass
+
+
+def skipIfNoExec(cmd):
+    return unittest.skipIf(shutil.which(cmd) is None, f'`{cmd}` is not available')
+
+
+def skipIfNoModule(module, display_name=None):
+    display_name = display_name or module
+    return unittest.skipIf(not is_module_available(module), f'"{display_name}" is not available')
+
+
+skipIfNoSoxBackend = unittest.skipIf(
+    'sox' not in torchaudio.list_audio_backends(), 'Sox backend not available')
+skipIfNoCuda = unittest.skipIf(not torch.cuda.is_available(), reason='CUDA not available')
+skipIfNoExtension = skipIfNoModule('torchaudio._torchaudio', 'torchaudio C++ extension')
diff --git a/test/common_utils/data_utils.py b/test/common_utils/data_utils.py
new file mode 100644
index 0000000000..b948ce334a
--- /dev/null
+++ b/test/common_utils/data_utils.py
@@ -0,0 +1,94 @@
+import os.path
+from typing import Union
+
+import torch
+
+
+_TEST_DIR_PATH = os.path.realpath(  # parent of the directory that contains this file
+    os.path.join(os.path.dirname(__file__), '..'))
+
+
+def get_asset_path(*paths):
+    """Return the path of a test asset: `paths` joined under `<_TEST_DIR_PATH>/assets`."""
+    return os.path.join(_TEST_DIR_PATH, 'assets', *paths)
+
+
+def get_whitenoise(
+    *,
+    sample_rate: int = 16000,
+    duration: float = 1,  # seconds
+    n_channels: int = 1,
+    seed: int = 0,
+    dtype: Union[str, torch.dtype] = "float32",
+    device: Union[str, torch.device] = "cpu",
+    channels_first=True,
+    scale_factor: float = 1,
+):
+    """Generate pseudo audio data with whitenoise
+    Args:
+        sample_rate: Sampling rate
+        duration: Length of the resulting Tensor in seconds. May be fractional.
+        n_channels: Number of channels. Every channel holds the same samples.
+        seed: Seed value used for random number generation.
+            Note that this function does not modify global random generator state.
+        dtype: Torch dtype (float32, int32, int16 or uint8) or its name
+        device: device
+        channels_first: whether first dimension is n_channels
+        scale_factor: scale the Tensor before clamping and quantization
+    Returns:
+        Tensor: shape of (n_channels, int(sample_rate * duration)); transposed if not channels_first
+    """
+    if isinstance(dtype, str):
+        dtype = getattr(torch, dtype)
+    if dtype not in [torch.float32, torch.int32, torch.int16, torch.uint8]:
+        raise NotImplementedError(f'dtype {dtype} is not supported.')
+    # According to the doc, forking rng on all CUDA devices is slow when there are many CUDA devices,
+    # so we only fork on CPU, generate values (frame count truncated to int) and move them to the device
+    with torch.random.fork_rng([]):
+        torch.random.manual_seed(seed)
+        tensor = torch.randn([int(sample_rate * duration)], dtype=torch.float32, device='cpu')
+    tensor /= 2.0
+    tensor *= scale_factor
+    tensor.clamp_(-1.0, 1.0)
+    if dtype == torch.int32:
+        tensor = tensor.double()  # float32 rounds 2147483647 up to 2**31, which overflows int32
+        tensor *= (tensor > 0) * 2147483647 + (tensor < 0) * 2147483648
+    elif dtype == torch.int16:
+        tensor *= (tensor > 0) * 32767 + (tensor < 0) * 32768
+    elif dtype == torch.uint8:
+        tensor *= (tensor > 0) * 127 + (tensor < 0) * 128
+        tensor += 128
+    tensor = tensor.to(dtype).repeat([n_channels, 1])
+    if not channels_first:
+        tensor = tensor.t()
+    return tensor.to(device=device)
+
+
+def get_sinusoid(
+    *,
+    frequency: float = 300,
+    sample_rate: int = 16000,
+    duration: float = 1,  # seconds
+    n_channels: int = 1,
+    dtype: Union[str, torch.dtype] = "float32",
+    device: Union[str, torch.device] = "cpu",
+):
+    """Generate pseudo audio data with sine wave.
+
+    Args:
+        frequency: Frequency of sine wave
+        sample_rate: Sampling rate
+        duration: Length of the resulting Tensor in seconds. May be fractional.
+        n_channels: Number of channels. Every channel holds the same samples.
+        dtype: Torch dtype or its name
+        device: device
+
+    Returns:
+        Tensor: shape of (n_channels, int(sample_rate * duration))
+    """
+    if isinstance(dtype, str):
+        dtype = getattr(torch, dtype)
+    num_frames = int(sample_rate * duration)  # torch.linspace needs an integer step count
+    end = 2 * 3.141592653589793 * frequency * duration
+    theta = torch.linspace(0, end, num_frames, dtype=dtype, device=device)
+    return torch.sin(theta).repeat([n_channels, 1])
diff --git a/test/common_utils/parameterized_utils.py b/test/common_utils/parameterized_utils.py
new file mode 100644
index 0000000000..24404a6edd
--- /dev/null
+++ b/test/common_utils/parameterized_utils.py
@@ -0,0 +1,10 @@
+import json
+
+from parameterized import param
+
+from .data_utils import get_asset_path
+
+
+def load_params(*paths):
+    with open(get_asset_path(*paths), 'r') as lines:  # every line of the asset is parsed as one JSON document
+        return [param(json.loads(entry)) for entry in lines]
diff --git a/test/common_utils/sox_utils.py b/test/common_utils/sox_utils.py
new file mode 100644
index 0000000000..cd1c247b72
--- /dev/null
+++ b/test/common_utils/sox_utils.py
@@ -0,0 +1,79 @@
+import subprocess
+
+
+def get_encoding(dtype):
+    """Return the encoding name for a dtype name; raises KeyError for any other name."""
+    return {
+        'float32': 'floating-point',
+        'int32': 'signed-integer',
+        'int16': 'signed-integer',
+        'uint8': 'unsigned-integer',
+    }[dtype]
+
+
+def get_bit_depth(dtype):
+    """Return the number of bits per sample for a dtype name; raises KeyError for any other name."""
+    return {
+        'float32': 32,
+        'int32': 32,
+        'int16': 16,
+        'uint8': 8,
+    }[dtype]
+
+
+def gen_audio_file(
+        path, sample_rate, num_channels,
+        *, encoding=None, bit_depth=None, compression=None, attenuation=None, duration=1,
+):
+    """Generate synthetic audio file (a sawtooth wave) with `sox` command; wav targets are rejected."""
+    if path.endswith('.wav'):
+        raise RuntimeError(
+            'Use get_wav_data and save_wav to generate wav file for accurate result.')
+    # -V3: verbose.
+    # -R is supposed to be repeatable, though the implementation looks suspicious
+    # and not setting the seed to a fixed value.
+    # https://fossies.org/dox/sox-14.4.2/sox_8c_source.html (search "sox_globals.repeatable")
+    cmd = ['sox', '-V3', '-R']
+    # NOTE(review): --bits is emitted twice, here and again after --null; presumably once per file - confirm
+    if bit_depth is not None:
+        cmd.extend(['--bits', str(bit_depth)])
+    cmd.extend([
+        '--rate', str(sample_rate),
+        '--null',  # no input
+        '--channels', str(num_channels),
+    ])
+    # Optional settings, appended in this fixed order and only when a value was given.
+    optional = (
+        ('--compression', compression),
+        ('--bits', bit_depth),
+        ('--encoding', encoding),
+    )
+    for flag, value in optional:
+        if value is not None:
+            cmd.extend([flag, str(value)])
+    cmd.extend([
+        str(path),
+        'synth', str(duration),  # synthesizes for the given duration [sec]
+        'sawtooth', '1',
+        # saw tooth covers the both ends of value range, which is a good property for test.
+        # similar to linspace(-1., 1.)
+        # this introduces bigger boundary effect than sine when converted to mp3
+    ])
+    if attenuation is not None:
+        cmd.extend(['vol', f'-{attenuation}dB'])
+    print(' '.join(cmd))
+    subprocess.run(cmd, check=True)
+
+
+def convert_audio_file(
+        src_path, dst_path,
+        *, bit_depth=None, compression=None):
+    """Convert audio file with `sox` command; both paths may be str or path-like objects."""
+    command = ['sox', '-V3', '-R', str(src_path)]
+    if bit_depth is not None:
+        command += ['--bits', str(bit_depth)]
+    if compression is not None:
+        command += ['--compression', str(compression)]
+    command += [str(dst_path)]  # str() so that ' '.join below also accepts a path-like destination
+    print(' '.join(command))
+    subprocess.run(command, check=True)  # check=True raises CalledProcessError when sox fails
diff --git a/test/common_utils/wav_utils.py b/test/common_utils/wav_utils.py
new file mode 100644
index 0000000000..bc122ec6cb
--- /dev/null
+++ b/test/common_utils/wav_utils.py
@@ -0,0 +1,86 @@
+from typing import Optional
+
+import torch
+import scipy.io.wavfile
+
+
+def normalize_wav(tensor: torch.Tensor) -> torch.Tensor:
+    """Scale int32/int16/uint8 samples into float32 within [-1.0, 1.0]; other dtypes are returned as is."""
+    # dtype -> (offset subtracted after the cast, divisor for positives, divisor for negatives)
+    # The two divisors differ because the integer ranges are asymmetric around zero.
+    scales = {
+        torch.int32: (0, 2147483647., 2147483648.),
+        torch.int16: (0, 32767., 32768.),
+        torch.uint8: (128, 127., 128.),
+    }
+    if tensor.dtype not in scales:
+        return tensor
+    offset, pos_div, neg_div = scales[tensor.dtype]
+    result = tensor.to(torch.float32) - offset
+    result[result > 0] /= pos_div
+    result[result < 0] /= neg_div
+    return result
+
+
+def get_wav_data(
+        dtype: str,
+        num_channels: int,
+        *,
+        num_frames: Optional[int] = None,
+        normalize: bool = True,
+        channels_first: bool = True,
+):
+    """Generate linear signal of the given dtype and num_channels
+
+    Data range is
+        [-1.0, 1.0] for float32,
+        [-2147483648, 2147483647] for int32
+        [-32768, 32767] for int16
+        [0, 255] for uint8
+
+    num_frames allows changing the number of interpolation points.
+    Default values are 256 for uint8, else 1 << 16.
+    1 << 16 as default is so that int16 value range is completely covered.
+
+    Every channel holds the same ramp. With normalize=True the result is passed through
+    normalize_wav. Raises NotImplementedError for a dtype other than the four listed above.
+    """
+    ranges = {
+        'uint8': (0, 255),
+        'float32': (-1., 1.),
+        'int32': (-2147483648, 2147483647),
+        'int16': (-32768, 32767),
+    }
+    if dtype not in ranges:
+        # an unsupported name used to fall through every branch and fail on the unbound `base`
+        raise NotImplementedError(f'dtype {dtype} is not supported.')
+    if num_frames is None:
+        num_frames = 256 if dtype == 'uint8' else 1 << 16
+    low, high = ranges[dtype]
+    base = torch.linspace(low, high, num_frames, dtype=getattr(torch, dtype))
+    data = base.repeat([num_channels, 1])
+    if not channels_first:
+        data = data.transpose(1, 0)
+    if normalize:
+        data = normalize_wav(data)
+    return data
+
+
+def load_wav(path: str, normalize=True, channels_first=True) -> tuple:
+    """Load wav file without torchaudio (scipy reads it); returns (Tensor, sample_rate)."""
+    sample_rate, samples = scipy.io.wavfile.read(path)
+    waveform = torch.from_numpy(samples.copy())
+    if waveform.ndim == 1:
+        waveform = waveform.unsqueeze(1)  # 1-D data: add a trailing axis of size one
+    if normalize:
+        waveform = normalize_wav(waveform)
+    if channels_first:
+        waveform = waveform.transpose(1, 0)
+    return waveform, sample_rate
+
+
+def save_wav(path, data, sample_rate, channels_first=True):
+    """Write `data` to `path` as a wav file with scipy, without torchaudio."""
+    # channels-first input is transposed before it is handed to scipy
+    frames = data.transpose(1, 0) if channels_first else data
+    scipy.io.wavfile.write(path, sample_rate, frames.numpy())
diff --git a/test/functional_cpu_test.py b/test/functional_cpu_test.py
index 470d6ab770..ab5fdaed95 100644
--- a/test/functional_cpu_test.py
+++ b/test/functional_cpu_test.py
@@ -10,6 +10,31 @@
 from .functional_impl import Lfilter
 
 
+def random_float_tensor(seed, size, a=22695477, c=1, m=2 ** 32):
+    """Generate a deterministic tensor from a seed and size with a linear congruential generator
+    https://en.wikipedia.org/wiki/Linear_congruential_generator
+    X_{n + 1} = (a * X_n + c) % m
+    The defaults are the Borland C/C++ values
+
+    The tensor will have values between [0,1)
+    Inputs:
+        seed (int): X_0 of the recurrence; the first emitted value is X_1
+        size (Tuple[int]): the size of the output tensor
+        a (int): the multiplier constant to the generator
+        c (int): the additive constant to the generator
+        m (int): the modulus constant to the generator
+    """
+    count = 1
+    for dim in size:
+        count *= dim
+    state = (a * seed + c) % m
+    values = [state]
+    for _ in range(count - 1):
+        state = (a * state + c) % m
+        values.append(state)
+    return torch.tensor(values).float().view(size) / m
+
+
 class TestLFilterFloat32(Lfilter, common_utils.PytorchTestCase):
     dtype = torch.float32
     device = torch.device('cpu')
@@ -49,7 +74,7 @@ def _test_istft_is_inverse_of_stft(kwargs):
     for data_size in [(2, 20), (3, 15), (4, 10)]:
         for i in range(100):
 
-            sound = common_utils.random_float_tensor(i, data_size)
+            sound = random_float_tensor(i, data_size)
 
             stft = torch.stft(sound, **kwargs)
             estimate = torchaudio.functional.istft(stft, length=sound.size(1), **kwargs)
@@ -211,8 +236,8 @@ def test_istft_of_sine(self):
 
     def _test_linearity_of_istft(self, data_size, kwargs, atol=1e-6, rtol=1e-8):
         for i in range(self.number_of_trials):
-            tensor1 = common_utils.random_float_tensor(i, data_size)
-            tensor2 = common_utils.random_float_tensor(i * 2, data_size)
+            tensor1 = random_float_tensor(i, data_size)
+            tensor2 = random_float_tensor(i * 2, data_size)
             a, b = torch.rand(2)
             istft1 = torchaudio.functional.istft(tensor1, **kwargs)
             istft2 = torchaudio.functional.istft(tensor2, **kwargs)
@@ -274,8 +299,6 @@ def test_linearity_of_istft4(self):
 
 
 class TestDetectPitchFrequency(common_utils.TorchaudioTestCase):
-    backend = 'default'
-
     def test_pitch(self):
         test_filepath_100 = common_utils.get_asset_path("100Hz_44100Hz_16bit_05sec.wav")
         test_filepath_440 = common_utils.get_asset_path("440Hz_44100Hz_16bit_05sec.wav")
@@ -287,7 +310,7 @@ def test_pitch(self):
         ]
 
         for filename, freq_ref in tests:
-            waveform, sample_rate = torchaudio.load(filename)
+            waveform, sample_rate = common_utils.load_wav(filename)
 
             freq = torchaudio.functional.detect_pitch_frequency(waveform, sample_rate)
 
diff --git a/test/kaldi_compatibility_impl.py b/test/kaldi_compatibility_impl.py
index ec58a3fa44..7e51620f63 100644
--- a/test/kaldi_compatibility_impl.py
+++ b/test/kaldi_compatibility_impl.py
@@ -1,20 +1,19 @@
 """Test suites for checking numerical compatibility against Kaldi"""
-import json
-import shutil
-import unittest
 import subprocess
 
 import kaldi_io
 import torch
 import torchaudio.functional as F
 import torchaudio.compliance.kaldi
+from parameterized import parameterized
 
-from . import common_utils
-from parameterized import parameterized, param
-
-
-def _not_available(cmd):
-    return shutil.which(cmd) is None
+from .common_utils import (
+    TestBaseMixin,
+    load_params,
+    skipIfNoExec,
+    get_asset_path,
+    load_wav
+)
 
 
 def _convert_args(**kwargs):
@@ -49,19 +48,12 @@ def _run_kaldi(command, input_type, input_value):
     return torch.from_numpy(result.copy())  # copy supresses some torch warning
 
 
-def _load_params(path):
-    with open(path, 'r') as file:
-        return [param(json.loads(line)) for line in file]
-
-
-class Kaldi(common_utils.TestBaseMixin):
-    backend = 'sox'
-
+class Kaldi(TestBaseMixin):
     def assert_equal(self, output, *, expected, rtol=None, atol=None):
         expected = expected.to(dtype=self.dtype, device=self.device)
         self.assertEqual(output, expected, rtol=rtol, atol=atol)
 
-    @unittest.skipIf(_not_available('apply-cmvn-sliding'), '`apply-cmvn-sliding` not available')
+    @skipIfNoExec('apply-cmvn-sliding')
     def test_sliding_window_cmn(self):
         """sliding_window_cmn should be numerically compatible with apply-cmvn-sliding"""
         kwargs = {
@@ -77,34 +69,34 @@ def test_sliding_window_cmn(self):
         kaldi_result = _run_kaldi(command, 'ark', tensor)
         self.assert_equal(result, expected=kaldi_result)
 
-    @parameterized.expand(_load_params(common_utils.get_asset_path('kaldi_test_fbank_args.json')))
-    @unittest.skipIf(_not_available('compute-fbank-feats'), '`compute-fbank-feats` not available')
+    @parameterized.expand(load_params('kaldi_test_fbank_args.json'))
+    @skipIfNoExec('compute-fbank-feats')
     def test_fbank(self, kwargs):
         """fbank should be numerically compatible with compute-fbank-feats"""
-        wave_file = common_utils.get_asset_path('kaldi_file.wav')
-        waveform = torchaudio.load_wav(wave_file)[0].to(dtype=self.dtype, device=self.device)
+        wave_file = get_asset_path('kaldi_file.wav')
+        waveform = load_wav(wave_file, normalize=False)[0].to(dtype=self.dtype, device=self.device)
         result = torchaudio.compliance.kaldi.fbank(waveform, **kwargs)
         command = ['compute-fbank-feats'] + _convert_args(**kwargs) + ['scp:-', 'ark:-']
         kaldi_result = _run_kaldi(command, 'scp', wave_file)
         self.assert_equal(result, expected=kaldi_result, rtol=1e-4, atol=1e-8)
 
-    @parameterized.expand(_load_params(common_utils.get_asset_path('kaldi_test_spectrogram_args.json')))
-    @unittest.skipIf(_not_available('compute-spectrogram-feats'), '`compute-spectrogram-feats` not available')
+    @parameterized.expand(load_params('kaldi_test_spectrogram_args.json'))
+    @skipIfNoExec('compute-spectrogram-feats')
     def test_spectrogram(self, kwargs):
         """spectrogram should be numerically compatible with compute-spectrogram-feats"""
-        wave_file = common_utils.get_asset_path('kaldi_file.wav')
-        waveform = torchaudio.load_wav(wave_file)[0].to(dtype=self.dtype, device=self.device)
+        wave_file = get_asset_path('kaldi_file.wav')
+        waveform = load_wav(wave_file, normalize=False)[0].to(dtype=self.dtype, device=self.device)
         result = torchaudio.compliance.kaldi.spectrogram(waveform, **kwargs)
         command = ['compute-spectrogram-feats'] + _convert_args(**kwargs) + ['scp:-', 'ark:-']
         kaldi_result = _run_kaldi(command, 'scp', wave_file)
         self.assert_equal(result, expected=kaldi_result, rtol=1e-4, atol=1e-8)
 
-    @parameterized.expand(_load_params(common_utils.get_asset_path('kaldi_test_mfcc_args.json')))
-    @unittest.skipIf(_not_available('compute-mfcc-feats'), '`compute-mfcc-feats` not available')
+    @parameterized.expand(load_params('kaldi_test_mfcc_args.json'))
+    @skipIfNoExec('compute-mfcc-feats')
     def test_mfcc(self, kwargs):
         """mfcc should be numerically compatible with compute-mfcc-feats"""
-        wave_file = common_utils.get_asset_path('kaldi_file.wav')
-        waveform = torchaudio.load_wav(wave_file)[0].to(dtype=self.dtype, device=self.device)
+        wave_file = get_asset_path('kaldi_file.wav')
+        waveform = load_wav(wave_file, normalize=False)[0].to(dtype=self.dtype, device=self.device)
         result = torchaudio.compliance.kaldi.mfcc(waveform, **kwargs)
         command = ['compute-mfcc-feats'] + _convert_args(**kwargs) + ['scp:-', 'ark:-']
         kaldi_result = _run_kaldi(command, 'scp', wave_file)
diff --git a/test/sox_io_backend/__init__.py b/test/sox_io_backend/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/test/sox_io_backend/common.py b/test/sox_io_backend/common.py
new file mode 100644
index 0000000000..eb85937236
--- /dev/null
+++ b/test/sox_io_backend/common.py
@@ -0,0 +1,2 @@
+def name_func(func, _, params):  # builds "<function name>_<arg>_<arg>..." from params.args
+    return '_'.join([func.__name__, '_'.join(map(str, params.args))])
diff --git a/test/sox_io_backend/test_info.py b/test/sox_io_backend/test_info.py
new file mode 100644
index 0000000000..da5207a7e5
--- /dev/null
+++ b/test/sox_io_backend/test_info.py
@@ -0,0 +1,125 @@
+import itertools
+from parameterized import parameterized
+
+from torchaudio.backend import sox_io_backend
+
+from ..common_utils import (
+    TempDirMixin,
+    PytorchTestCase,
+    skipIfNoExec,
+    skipIfNoExtension,
+    get_asset_path,
+    get_wav_data,
+    save_wav,
+    sox_utils,
+)
+from .common import (
+    name_func,
+)
+
+
+@skipIfNoExec('sox')
+@skipIfNoExtension
+class TestInfo(TempDirMixin, PytorchTestCase):  # checks the metadata reported by sox_io_backend.info
+    @parameterized.expand(list(itertools.product(
+        ['float32', 'int32', 'int16', 'uint8'],
+        [8000, 16000],
+        [1, 2],
+    )), name_func=name_func)
+    def test_wav(self, dtype, sample_rate, num_channels):
+        """`sox_io_backend.info` can check wav file correctly"""
+        duration = 1  # seconds
+        path = self.get_temp_path('data.wav')
+        data = get_wav_data(dtype, num_channels, normalize=False, num_frames=duration * sample_rate)
+        save_wav(path, data, sample_rate)
+        info = sox_io_backend.info(path)
+        assert info.sample_rate == sample_rate
+        assert info.num_frames == sample_rate * duration
+        assert info.num_channels == num_channels
+
+    @parameterized.expand(list(itertools.product(
+        ['float32', 'int32', 'int16', 'uint8'],
+        [8000, 16000],
+        [4, 8, 16, 32],
+    )), name_func=name_func)
+    def test_wav_multiple_channels(self, dtype, sample_rate, num_channels):
+        """`sox_io_backend.info` can check wav file with channels more than 2 correctly"""
+        duration = 1  # seconds
+        path = self.get_temp_path('data.wav')
+        data = get_wav_data(dtype, num_channels, normalize=False, num_frames=duration * sample_rate)
+        save_wav(path, data, sample_rate)
+        info = sox_io_backend.info(path)
+        assert info.sample_rate == sample_rate
+        assert info.num_frames == sample_rate * duration
+        assert info.num_channels == num_channels
+
+    @parameterized.expand(list(itertools.product(
+        [8000, 16000],
+        [1, 2],
+        [96, 128, 160, 192, 224, 256, 320],
+    )), name_func=name_func)
+    def test_mp3(self, sample_rate, num_channels, bit_rate):
+        """`sox_io_backend.info` can check mp3 file correctly"""
+        duration = 1  # seconds
+        path = self.get_temp_path('data.mp3')
+        sox_utils.gen_audio_file(
+            path, sample_rate, num_channels,
+            compression=bit_rate, duration=duration,  # the bit rate is passed as sox `--compression`
+        )
+        info = sox_io_backend.info(path)
+        assert info.sample_rate == sample_rate
+        # mp3 does not preserve the number of samples
+        # assert info.num_frames == sample_rate * duration
+        assert info.num_channels == num_channels
+
+    @parameterized.expand(list(itertools.product(
+        [8000, 16000],
+        [1, 2],
+        list(range(9)),
+    )), name_func=name_func)
+    def test_flac(self, sample_rate, num_channels, compression_level):
+        """`sox_io_backend.info` can check flac file correctly"""
+        duration = 1  # seconds
+        path = self.get_temp_path('data.flac')
+        sox_utils.gen_audio_file(
+            path, sample_rate, num_channels,
+            compression=compression_level, duration=duration,
+        )
+        info = sox_io_backend.info(path)
+        assert info.sample_rate == sample_rate
+        assert info.num_frames == sample_rate * duration
+        assert info.num_channels == num_channels
+
+    @parameterized.expand(list(itertools.product(
+        [8000, 16000],
+        [1, 2],
+        [-1, 0, 1, 2, 3, 3.6, 5, 10],
+    )), name_func=name_func)
+    def test_vorbis(self, sample_rate, num_channels, quality_level):
+        """`sox_io_backend.info` can check vorbis file correctly"""
+        duration = 1  # seconds
+        path = self.get_temp_path('data.vorbis')
+        sox_utils.gen_audio_file(
+            path, sample_rate, num_channels,
+            compression=quality_level, duration=duration,  # the quality level is passed as sox `--compression`
+        )
+        info = sox_io_backend.info(path)
+        assert info.sample_rate == sample_rate
+        assert info.num_frames == sample_rate * duration
+        assert info.num_channels == num_channels
+
+
+@skipIfNoExtension
+class TestInfoOpus(PytorchTestCase):  # no sox executable is invoked in this class
+    @parameterized.expand(list(itertools.product(
+        ['96k'],
+        [1, 2],
+        [0, 5, 10],
+    )), name_func=name_func)
+    def test_opus(self, bitrate, num_channels, compression_level):
+        """`sox_io_backend.info` can check opus file correctly"""
+        path = get_asset_path('io', f'{bitrate}_{compression_level}_{num_channels}ch.opus')  # file under the `io` asset directory
+        info = sox_io_backend.info(path)
+        assert info.sample_rate == 48000
+        assert info.num_frames == 32768
+        assert info.num_channels == num_channels
diff --git a/test/sox_io_backend/test_load.py b/test/sox_io_backend/test_load.py
new file mode 100644
index 0000000000..8366a01f83
--- /dev/null
+++ b/test/sox_io_backend/test_load.py
@@ -0,0 +1,263 @@
+import itertools
+
+from torchaudio.backend import sox_io_backend
+from parameterized import parameterized
+
+from ..common_utils import (
+    TempDirMixin,
+    PytorchTestCase,
+    skipIfNoExec,
+    skipIfNoExtension,
+    get_asset_path,
+    get_wav_data,
+    load_wav,
+    save_wav,
+    sox_utils,
+)
+from .common import (
+    name_func,
+)
+
+
+class LoadTestBase(TempDirMixin, PytorchTestCase):  # assertion helpers only; defines no test_* method itself
+    def assert_wav(self, dtype, sample_rate, num_channels, normalize, duration):
+        """`sox_io_backend.load` can load wav format correctly.
+
+        Wav data loaded with sox_io backend should match those with scipy
+        """
+        path = self.get_temp_path('reference.wav')
+        data = get_wav_data(dtype, num_channels, normalize=normalize, num_frames=duration * sample_rate)
+        save_wav(path, data, sample_rate)
+        expected = load_wav(path, normalize=normalize)[0]  # [0]: keep the tensor, drop the sample rate
+        data, sr = sox_io_backend.load(path, normalize=normalize)
+        assert sr == sample_rate
+        self.assertEqual(data, expected)
+
+    def assert_mp3(self, sample_rate, num_channels, bit_rate, duration):
+        """`sox_io_backend.load` can load mp3 format.
+
+        mp3 encoding introduces delay and boundary effects so
+        we create reference wav file from mp3
+
+         x
+         |
+         | 1. Generate mp3 with Sox
+         |
+         v    2. Convert to wav with Sox
+        mp3 ------------------------------> wav
+         |                                   |
+         | 3. Load with torchaudio           | 4. Load with scipy
+         |                                   |
+         v                                   v
+        tensor ----------> x <----------- tensor
+                       5. Compare
+
+        Underlying assumptions are:
+        i. Conversion of mp3 to wav with Sox preserves data.
+        ii. Loading wav file with scipy is correct.
+
+        By combining i & ii, steps 2. and 4. allow loading reference mp3 data
+        without using torchaudio
+        """
+        path = self.get_temp_path('1.original.mp3')
+        ref_path = self.get_temp_path('2.reference.wav')
+
+        # 1. Generate mp3 with sox
+        sox_utils.gen_audio_file(
+            path, sample_rate, num_channels,
+            compression=bit_rate, duration=duration)
+        # 2. Convert to wav with sox
+        sox_utils.convert_audio_file(path, ref_path)
+        # 3. Load mp3 with torchaudio
+        data, sr = sox_io_backend.load(path)
+        # 4. Load wav with scipy
+        data_ref = load_wav(ref_path)[0]
+        # 5. Compare
+        assert sr == sample_rate
+        self.assertEqual(data, data_ref, atol=3e-03, rtol=1.3e-06)
+
+    def assert_flac(self, sample_rate, num_channels, compression_level, duration):
+        """`sox_io_backend.load` can load flac format.
+
+        This test takes the same strategy as mp3 to compare the result
+        """
+        path = self.get_temp_path('1.original.flac')
+        ref_path = self.get_temp_path('2.reference.wav')
+
+        # 1. Generate flac with sox
+        sox_utils.gen_audio_file(
+            path, sample_rate, num_channels,
+            compression=compression_level, bit_depth=16, duration=duration)
+        # 2. Convert to wav with sox
+        sox_utils.convert_audio_file(path, ref_path)
+        # 3. Load flac with torchaudio
+        data, sr = sox_io_backend.load(path)
+        # 4. Load wav with scipy
+        data_ref = load_wav(ref_path)[0]
+        # 5. Compare
+        assert sr == sample_rate
+        self.assertEqual(data, data_ref, atol=4e-05, rtol=1.3e-06)
+
+    def assert_vorbis(self, sample_rate, num_channels, quality_level, duration):
+        """`sox_io_backend.load` can load vorbis format.
+
+        This test takes the same strategy as mp3 to compare the result
+        """
+        path = self.get_temp_path('1.original.vorbis')
+        ref_path = self.get_temp_path('2.reference.wav')
+
+        # 1. Generate vorbis with sox
+        sox_utils.gen_audio_file(
+            path, sample_rate, num_channels,
+            compression=quality_level, bit_depth=16, duration=duration)
+        # 2. Convert to wav with sox
+        sox_utils.convert_audio_file(path, ref_path)
+        # 3. Load vorbis with torchaudio
+        data, sr = sox_io_backend.load(path)
+        # 4. Load wav with scipy
+        data_ref = load_wav(ref_path)[0]
+        # 5. Compare
+        assert sr == sample_rate
+        self.assertEqual(data, data_ref, atol=4e-05, rtol=1.3e-06)
+
+
+@skipIfNoExec('sox')
+@skipIfNoExtension
+class TestLoad(LoadTestBase):
+    """Test the correctness of `sox_io_backend.load` for various formats"""
+    @parameterized.expand(list(itertools.product(
+        ['float32', 'int32', 'int16', 'uint8'],
+        [8000, 16000],
+        [1, 2],
+        [False, True],
+    )), name_func=name_func)
+    def test_wav(self, dtype, sample_rate, num_channels, normalize):
+        """`sox_io_backend.load` can load wav format correctly."""
+        self.assert_wav(dtype, sample_rate, num_channels, normalize, duration=1)
+
+    @parameterized.expand(list(itertools.product(
+        ['int16'],
+        [16000],
+        [2],
+        [False],
+    )), name_func=name_func)
+    def test_wav_large(self, dtype, sample_rate, num_channels, normalize):
+        """`sox_io_backend.load` can load large wav file correctly."""
+        two_hours = 2 * 60 * 60  # seconds
+        self.assert_wav(dtype, sample_rate, num_channels, normalize, two_hours)
+
+    @parameterized.expand(list(itertools.product(
+        ['float32', 'int32', 'int16', 'uint8'],
+        [4, 8, 16, 32],
+    )), name_func=name_func)
+    def test_multiple_channels(self, dtype, num_channels):
+        """`sox_io_backend.load` can load wav file with more than 2 channels."""
+        sample_rate = 8000
+        normalize = False
+        self.assert_wav(dtype, sample_rate, num_channels, normalize, duration=1)
+
+    @parameterized.expand(list(itertools.product(
+        [8000, 16000, 44100],
+        [1, 2],
+        [96, 128, 160, 192, 224, 256, 320],
+    )), name_func=name_func)
+    def test_mp3(self, sample_rate, num_channels, bit_rate):
+        """`sox_io_backend.load` can load mp3 format correctly."""
+        self.assert_mp3(sample_rate, num_channels, bit_rate, duration=1)
+
+    @parameterized.expand(list(itertools.product(
+        [16000],
+        [2],
+        [128],
+    )), name_func=name_func)
+    def test_mp3_large(self, sample_rate, num_channels, bit_rate):
+        """`sox_io_backend.load` can load large mp3 file correctly."""
+        two_hours = 2 * 60 * 60  # seconds
+        self.assert_mp3(sample_rate, num_channels, bit_rate, two_hours)
+
+    @parameterized.expand(list(itertools.product(
+        [8000, 16000],
+        [1, 2],
+        list(range(9)),
+    )), name_func=name_func)
+    def test_flac(self, sample_rate, num_channels, compression_level):
+        """`sox_io_backend.load` can load flac format correctly."""
+        self.assert_flac(sample_rate, num_channels, compression_level, duration=1)
+
+    @parameterized.expand(list(itertools.product(
+        [16000],
+        [2],
+        [0],
+    )), name_func=name_func)
+    def test_flac_large(self, sample_rate, num_channels, compression_level):
+        """`sox_io_backend.load` can load large flac file correctly."""
+        two_hours = 2 * 60 * 60  # seconds
+        self.assert_flac(sample_rate, num_channels, compression_level, two_hours)
+
+    @parameterized.expand(list(itertools.product(
+        [8000, 16000],
+        [1, 2],
+        [-1, 0, 1, 2, 3, 3.6, 5, 10],
+    )), name_func=name_func)
+    def test_vorbis(self, sample_rate, num_channels, quality_level):
+        """`sox_io_backend.load` can load vorbis format correctly."""
+        self.assert_vorbis(sample_rate, num_channels, quality_level, duration=1)
+
+    @parameterized.expand(list(itertools.product(
+        [16000],
+        [2],
+        [10],
+    )), name_func=name_func)
+    def test_vorbis_large(self, sample_rate, num_channels, quality_level):
+        """`sox_io_backend.load` can load large vorbis file correctly."""
+        two_hours = 2 * 60 * 60  # seconds
+        self.assert_vorbis(sample_rate, num_channels, quality_level, two_hours)
+
+    @parameterized.expand(list(itertools.product(
+        ['96k'],
+        [1, 2],
+        [0, 5, 10],
+    )), name_func=name_func)
+    def test_opus(self, bitrate, num_channels, compression_level):
+        """`sox_io_backend.load` can load opus file correctly."""
+        ops_path = get_asset_path('io', f'{bitrate}_{compression_level}_{num_channels}ch.opus')  # opus file under the `io` asset directory
+        wav_path = self.get_temp_path(f'{bitrate}_{compression_level}_{num_channels}ch.opus.wav')
+        sox_utils.convert_audio_file(ops_path, wav_path)  # reference wav produced by the sox command
+
+        expected, sample_rate = load_wav(wav_path)
+        found, sr = sox_io_backend.load(ops_path)
+
+        assert sample_rate == sr
+        self.assertEqual(expected, found)
+
+
+@skipIfNoExec('sox')
+@skipIfNoExtension
+class TestLoadParams(TempDirMixin, PytorchTestCase):
+    """Test the correctness of frame parameters of `sox_io_backend.load`"""
+    original = None  # tensor written to `path`; assigned in setUp
+    path = None  # location of the wav file; assigned in setUp
+
+    def setUp(self):
+        super().setUp()
+        sample_rate = 8000
+        self.original = get_wav_data('float32', num_channels=2)
+        self.path = self.get_temp_path('test.wave')  # NOTE(review): '.wave' rather than '.wav' looks like a typo - confirm
+        save_wav(self.path, self.original, sample_rate)
+
+    @parameterized.expand(list(itertools.product(
+        [0, 1, 10, 100, 1000],
+        [-1, 1, 10, 100, 1000],
+    )), name_func=name_func)
+    def test_frame(self, frame_offset, num_frames):
+        """num_frames and frame_offset correctly specify the region of data"""
+        found, _ = sox_io_backend.load(self.path, frame_offset, num_frames)
+        frame_end = None if num_frames == -1 else frame_offset + num_frames  # -1 is treated as "up to the end"
+        self.assertEqual(found, self.original[:, frame_offset:frame_end])
+
+    @parameterized.expand([(True, ), (False, )], name_func=name_func)
+    def test_channels_first(self, channels_first):
+        """channels_first swaps axes"""
+        found, _ = sox_io_backend.load(self.path, channels_first=channels_first)
+        expected = self.original if channels_first else self.original.transpose(1, 0)
+        self.assertEqual(found, expected)
diff --git a/test/sox_io_backend/test_roundtrip.py b/test/sox_io_backend/test_roundtrip.py
new file mode 100644
index 0000000000..2a051bebd5
--- /dev/null
+++ b/test/sox_io_backend/test_roundtrip.py
@@ -0,0 +1,52 @@
+import itertools
+
+from torchaudio.backend import sox_io_backend
+from parameterized import parameterized
+
+from ..common_utils import (
+    TempDirMixin,
+    PytorchTestCase,
+    skipIfNoExec,
+    skipIfNoExtension,
+    get_wav_data,
+)
+from .common import (
+    name_func,
+)
+
+
+@skipIfNoExec('sox')
+@skipIfNoExtension
+class TestRoundTripIO(TempDirMixin, PytorchTestCase):
+    """save/load round trip should not degrade data for lossless formats"""
+    @parameterized.expand(list(itertools.product(
+        ['float32', 'int32', 'int16', 'uint8'],
+        [8000, 16000],
+        [1, 2],
+    )), name_func=name_func)
+    def test_wav(self, dtype, sample_rate, num_channels):
+        """save/load round trip should not degrade data for wav formats"""
+        original = get_wav_data(dtype, num_channels, normalize=False)
+        data = original
+        for i in range(10):
+            path = self.get_temp_path(f'{i}.wav')
+            sox_io_backend.save(path, data, sample_rate)
+            data, sr = sox_io_backend.load(path, normalize=False)
+            assert sr == sample_rate
+            self.assertEqual(original, data)
+
+    @parameterized.expand(list(itertools.product(
+        [8000, 16000],
+        [1, 2],
+        list(range(9)),
+    )), name_func=name_func)
+    def test_flac(self, sample_rate, num_channels, compression_level):
+        """save/load round trip should not degrade data for flac formats"""
+        original = get_wav_data('float32', num_channels)
+        data = original
+        for i in range(10):
+            path = self.get_temp_path(f'{i}.flac')
+            sox_io_backend.save(path, data, sample_rate, compression=compression_level)
+            data, sr = sox_io_backend.load(path)
+            assert sr == sample_rate
+            self.assertEqual(original, data)
diff --git a/test/sox_io_backend/test_save.py b/test/sox_io_backend/test_save.py
new file mode 100644
index 0000000000..53588c456b
--- /dev/null
+++ b/test/sox_io_backend/test_save.py
@@ -0,0 +1,304 @@
+import itertools
+
+from torchaudio.backend import sox_io_backend
+from parameterized import parameterized
+
+from ..common_utils import (
+    TempDirMixin,
+    PytorchTestCase,
+    skipIfNoExec,
+    skipIfNoExtension,
+    get_wav_data,
+    load_wav,
+    save_wav,
+    sox_utils,
+)
+from .common import (
+    name_func,
+)
+
+
+class SaveTestBase(TempDirMixin, PytorchTestCase):
+    def assert_wav(self, dtype, sample_rate, num_channels, num_frames):
+        """`sox_io_backend.save` can save wav format."""
+        path = self.get_temp_path('data.wav')
+        expected = get_wav_data(dtype, num_channels, num_frames=num_frames)
+        sox_io_backend.save(path, expected, sample_rate)
+        found, sr = load_wav(path)
+        assert sample_rate == sr
+        self.assertEqual(found, expected)
+
+    def assert_mp3(self, sample_rate, num_channels, bit_rate, duration):
+        """`sox_io_backend.save` can save mp3 format.
+
+        mp3 encoding introduces delay and boundary effects so
+        we convert the resulting mp3 to wav and compare the results there
+
+                          |
+                          | 1. Generate original wav file with SciPy
+                          |
+                          v
+          -------------- wav ----------------
+         |                                   |
+         | 2.1. load with scipy              | 3.1. Convert to mp3 with Sox
+         | then save with torchaudio         |
+         v                                   v
+        mp3                                 mp3
+         |                                   |
+         | 2.2. Convert to wav with Sox      | 3.2. Convert to wav with Sox
+         |                                   |
+         v                                   v
+        wav                                 wav
+         |                                   |
+         | 2.3. load with scipy              | 3.3. load with scipy
+         |                                   |
+         v                                   v
+        tensor -------> compare <--------- tensor
+
+        """
+        src_path = self.get_temp_path('1.reference.wav')
+        mp3_path = self.get_temp_path('2.1.torchaudio.mp3')
+        wav_path = self.get_temp_path('2.2.torchaudio.wav')
+        mp3_path_sox = self.get_temp_path('3.1.sox.mp3')
+        wav_path_sox = self.get_temp_path('3.2.sox.wav')
+
+        # 1. Generate original wav
+        data = get_wav_data('float32', num_channels, normalize=True, num_frames=duration * sample_rate)
+        save_wav(src_path, data, sample_rate)
+        # 2.1. Convert the original wav to mp3 with torchaudio
+        sox_io_backend.save(
+            mp3_path, load_wav(src_path)[0], sample_rate, compression=bit_rate)
+        # 2.2. Convert the mp3 to wav with Sox
+        sox_utils.convert_audio_file(mp3_path, wav_path)
+        # 2.3. Load
+        found = load_wav(wav_path)[0]
+
+        # 3.1. Convert the original wav to mp3 with SoX
+        sox_utils.convert_audio_file(src_path, mp3_path_sox, compression=bit_rate)
+        # 3.2. Convert the mp3 to wav with Sox
+        sox_utils.convert_audio_file(mp3_path_sox, wav_path_sox)
+        # 3.3. Load
+        expected = load_wav(wav_path_sox)[0]
+
+        self.assertEqual(found, expected)
+
+    def assert_flac(self, sample_rate, num_channels, compression_level, duration):
+        """`sox_io_backend.save` can save flac format.
+
+        This test takes the same strategy as mp3 to compare the result
+        """
+        src_path = self.get_temp_path('1.reference.wav')
+        flc_path = self.get_temp_path('2.1.torchaudio.flac')
+        wav_path = self.get_temp_path('2.2.torchaudio.wav')
+        flc_path_sox = self.get_temp_path('3.1.sox.flac')
+        wav_path_sox = self.get_temp_path('3.2.sox.wav')
+
+        # 1. Generate original wav
+        data = get_wav_data('float32', num_channels, normalize=True, num_frames=duration * sample_rate)
+        save_wav(src_path, data, sample_rate)
+        # 2.1. Convert the original wav to flac with torchaudio
+        sox_io_backend.save(
+            flc_path, load_wav(src_path)[0], sample_rate, compression=compression_level)
+        # 2.2. Convert the flac to wav with Sox
+        # converting to 32 bit because flac file has 24 bit depth which scipy cannot handle.
+        sox_utils.convert_audio_file(flc_path, wav_path, bit_depth=32)
+        # 2.3. Load
+        found = load_wav(wav_path)[0]
+
+        # 3.1. Convert the original wav to flac with SoX
+        sox_utils.convert_audio_file(src_path, flc_path_sox, compression=compression_level)
+        # 3.2. Convert the flac to wav with Sox
+        # converting to 32 bit because flac file has 24 bit depth which scipy cannot handle.
+        sox_utils.convert_audio_file(flc_path_sox, wav_path_sox, bit_depth=32)
+        # 3.3. Load
+        expected = load_wav(wav_path_sox)[0]
+
+        self.assertEqual(found, expected)
+
+    def _assert_vorbis(self, sample_rate, num_channels, quality_level, duration):
+        """`sox_io_backend.save` can save vorbis format.
+
+        This test takes the same strategy as mp3 to compare the result
+        """
+        src_path = self.get_temp_path('1.reference.wav')
+        vbs_path = self.get_temp_path('2.1.torchaudio.vorbis')
+        wav_path = self.get_temp_path('2.2.torchaudio.wav')
+        vbs_path_sox = self.get_temp_path('3.1.sox.vorbis')
+        wav_path_sox = self.get_temp_path('3.2.sox.wav')
+
+        # 1. Generate original wav
+        data = get_wav_data('int16', num_channels, normalize=False, num_frames=duration * sample_rate)
+        save_wav(src_path, data, sample_rate)
+        # 2.1. Convert the original wav to vorbis with torchaudio
+        sox_io_backend.save(
+            vbs_path, load_wav(src_path)[0], sample_rate, compression=quality_level)
+        # 2.2. Convert the vorbis to wav with Sox
+        sox_utils.convert_audio_file(vbs_path, wav_path)
+        # 2.3. Load
+        found = load_wav(wav_path)[0]
+
+        # 3.1. Convert the original wav to vorbis with SoX
+        sox_utils.convert_audio_file(src_path, vbs_path_sox, compression=quality_level)
+        # 3.2. Convert the vorbis to wav with Sox
+        sox_utils.convert_audio_file(vbs_path_sox, wav_path_sox)
+        # 3.3. Load
+        expected = load_wav(wav_path_sox)[0]
+
+        # sox's vorbis encoding has a random boundary effect, which causes a small number of
+        # samples to yield a higher discrepancy than the others,
+        # so we allow a small portion of the data to be outside of the absolute tolerance.
+        # Make sure to pass a reasonably long duration.
+        atol = 1.0e-4
+        max_failure_allowed = 0.01  # fraction of samples allowed to fall outside of atol (0.01 == 1%).
+        failure_ratio = ((found - expected).abs() > atol).sum().item() / found.numel()
+        if failure_ratio > max_failure_allowed:
+            # the check has failed; assertEqual is called to produce a more informative error message.
+            self.assertEqual(found, expected, atol=atol, rtol=1.3e-6)
+
+    def assert_vorbis(self, *args, **kwargs):
+        # sox's vorbis encoding has some randomness, so we retry the test up to max_retry times
+        max_retry = 5
+        error = None
+        for _ in range(max_retry):
+            try:
+                self._assert_vorbis(*args, **kwargs)
+                break
+            except AssertionError as e:
+                error = e
+        else:
+            raise error
+
+
+@skipIfNoExec('sox')
+@skipIfNoExtension
+class TestSave(SaveTestBase):
+    @parameterized.expand(list(itertools.product(
+        ['float32', 'int32', 'int16', 'uint8'],
+        [8000, 16000],
+        [1, 2],
+    )), name_func=name_func)
+    def test_wav(self, dtype, sample_rate, num_channels):
+        """`sox_io_backend.save` can save wav format."""
+        self.assert_wav(dtype, sample_rate, num_channels, num_frames=None)
+
+    @parameterized.expand(list(itertools.product(
+        ['float32'],
+        [16000],
+        [2],
+    )), name_func=name_func)
+    def test_wav_large(self, dtype, sample_rate, num_channels):
+        """`sox_io_backend.save` can save large wav file."""
+        two_hours = 2 * 60 * 60 * sample_rate
+        self.assert_wav(dtype, sample_rate, num_channels, num_frames=two_hours)
+
+    @parameterized.expand(list(itertools.product(
+        ['float32', 'int32', 'int16', 'uint8'],
+        [4, 8, 16, 32],
+    )), name_func=name_func)
+    def test_multiple_channels(self, dtype, num_channels):
+        """`sox_io_backend.save` can save wav with more than 2 channels."""
+        sample_rate = 8000
+        self.assert_wav(dtype, sample_rate, num_channels, num_frames=None)
+
+    @parameterized.expand(list(itertools.product(
+        [8000, 16000],
+        [1, 2],
+        [-4.2, -0.2, 0, 0.2, 96, 128, 160, 192, 224, 256, 320],
+    )), name_func=name_func)
+    def test_mp3(self, sample_rate, num_channels, bit_rate):
+        """`sox_io_backend.save` can save mp3 format."""
+        self.assert_mp3(sample_rate, num_channels, bit_rate, duration=1)
+
+    @parameterized.expand(list(itertools.product(
+        [16000],
+        [2],
+        [128],
+    )), name_func=name_func)
+    def test_mp3_large(self, sample_rate, num_channels, bit_rate):
+        """`sox_io_backend.save` can save large mp3 file."""
+        two_hours = 2 * 60 * 60
+        self.assert_mp3(sample_rate, num_channels, bit_rate, duration=two_hours)
+
+    @parameterized.expand(list(itertools.product(
+        [8000, 16000],
+        [1, 2],
+        list(range(9)),
+    )), name_func=name_func)
+    def test_flac(self, sample_rate, num_channels, compression_level):
+        """`sox_io_backend.save` can save flac format."""
+        self.assert_flac(sample_rate, num_channels, compression_level, duration=1)
+
+    @parameterized.expand(list(itertools.product(
+        [16000],
+        [2],
+        [0],
+    )), name_func=name_func)
+    def test_flac_large(self, sample_rate, num_channels, compression_level):
+        """`sox_io_backend.save` can save large flac file."""
+        two_hours = 2 * 60 * 60
+        self.assert_flac(sample_rate, num_channels, compression_level, duration=two_hours)
+
+    @parameterized.expand(list(itertools.product(
+        [8000, 16000],
+        [1, 2],
+        [-1, 0, 1, 2, 3, 3.6, 5, 10],
+    )), name_func=name_func)
+    def test_vorbis(self, sample_rate, num_channels, quality_level):
+        """`sox_io_backend.save` can save vorbis format."""
+        self.assert_vorbis(sample_rate, num_channels, quality_level, duration=20)
+
+    # note: torchaudio can load a large vorbis file, but cannot save a large vorbis file;
+    # the following test causes a segmentation fault
+    #
+    '''
+    @parameterized.expand(list(itertools.product(
+        [16000],
+        [2],
+        [10],
+    )), name_func=name_func)
+    def test_vorbis_large(self, sample_rate, num_channels, quality_level):
+        """`sox_io_backend.save` can save large vorbis file correctly."""
+        two_hours = 2 * 60 * 60
+        self.assert_vorbis(sample_rate, num_channels, quality_level, two_hours)
+    '''
+
+
+@skipIfNoExec('sox')
+@skipIfNoExtension
+class TestSaveParams(TempDirMixin, PytorchTestCase):
+    """Test the correctness of optional parameters of `sox_io_backend.save`"""
+    @parameterized.expand([(True, ), (False, )], name_func=name_func)
+    def test_channels_first(self, channels_first):
+        """channels_first swaps axes"""
+        path = self.get_temp_path('data.wav')
+        data = get_wav_data('int32', 2, channels_first=channels_first)
+        sox_io_backend.save(
+            path, data, 8000, channels_first=channels_first)
+        found = load_wav(path)[0]
+        expected = data if channels_first else data.transpose(1, 0)
+        self.assertEqual(found, expected)
+
+    @parameterized.expand([
+        'float32', 'int32', 'int16', 'uint8'
+    ], name_func=name_func)
+    def test_noncontiguous(self, dtype):
+        """Noncontiguous tensors are saved correctly"""
+        path = self.get_temp_path('data.wav')
+        expected = get_wav_data(dtype, 4)[::2, ::2]
+        assert not expected.is_contiguous()
+        sox_io_backend.save(path, expected, 8000)
+        found = load_wav(path)[0]
+        self.assertEqual(found, expected)
+
+    @parameterized.expand([
+        'float32', 'int32', 'int16', 'uint8',
+    ])
+    def test_tensor_preserve(self, dtype):
+        """save function should not alter Tensor"""
+        path = self.get_temp_path('data.wav')
+        expected = get_wav_data(dtype, 4)[::2, ::2]
+
+        data = expected.clone()
+        sox_io_backend.save(path, data, 8000)
+
+        self.assertEqual(data, expected)
diff --git a/test/sox_io_backend/test_torchscript.py b/test/sox_io_backend/test_torchscript.py
new file mode 100644
index 0000000000..9a30aab0d2
--- /dev/null
+++ b/test/sox_io_backend/test_torchscript.py
@@ -0,0 +1,149 @@
+import itertools
+from typing import Optional
+
+import torch
+import torchaudio
+from parameterized import parameterized
+
+from ..common_utils import (
+    TempDirMixin,
+    TorchaudioTestCase,
+    skipIfNoExec,
+    skipIfNoExtension,
+    get_wav_data,
+    save_wav,
+    load_wav,
+    sox_utils,
+)
+from .common import (
+    name_func,
+)
+
+
+def py_info_func(filepath: str) -> torchaudio.backend.sox_io_backend.AudioMetaData:
+    return torchaudio.info(filepath)
+
+
+def py_load_func(filepath: str, normalize: bool, channels_first: bool):
+    return torchaudio.load(
+        filepath, normalize=normalize, channels_first=channels_first)
+
+
+def py_save_func(
+        filepath: str,
+        tensor: torch.Tensor,
+        sample_rate: int,
+        channels_first: bool = True,
+        compression: Optional[float] = None,
+):
+    torchaudio.save(filepath, tensor, sample_rate, channels_first, compression)
+
+
+@skipIfNoExec('sox')
+@skipIfNoExtension
+class SoxIO(TempDirMixin, TorchaudioTestCase):
+    """TorchScript-ability Test suite for `sox_io_backend`"""
+    backend = 'sox_io'
+
+    @parameterized.expand(list(itertools.product(
+        ['float32', 'int32', 'int16', 'uint8'],
+        [8000, 16000],
+        [1, 2],
+    )), name_func=name_func)
+    def test_info_wav(self, dtype, sample_rate, num_channels):
+        """`sox_io_backend.info` is torchscript-able and returns the same result"""
+        audio_path = self.get_temp_path(f'{dtype}_{sample_rate}_{num_channels}.wav')
+        data = get_wav_data(dtype, num_channels, normalize=False, num_frames=1 * sample_rate)
+        save_wav(audio_path, data, sample_rate)
+
+        script_path = self.get_temp_path('info_func.zip')
+        torch.jit.script(py_info_func).save(script_path)
+        ts_info_func = torch.jit.load(script_path)
+
+        py_info = py_info_func(audio_path)
+        ts_info = ts_info_func(audio_path)
+
+        assert py_info.sample_rate == ts_info.sample_rate
+        assert py_info.num_frames == ts_info.num_frames
+        assert py_info.num_channels == ts_info.num_channels
+
+    @parameterized.expand(list(itertools.product(
+        ['float32', 'int32', 'int16', 'uint8'],
+        [8000, 16000],
+        [1, 2],
+        [False, True],
+        [False, True],
+    )), name_func=name_func)
+    def test_load_wav(self, dtype, sample_rate, num_channels, normalize, channels_first):
+        """`sox_io_backend.load` is torchscript-able and returns the same result"""
+        audio_path = self.get_temp_path(f'test_load_{dtype}_{sample_rate}_{num_channels}_{normalize}.wav')
+        data = get_wav_data(dtype, num_channels, normalize=False, num_frames=1 * sample_rate)
+        save_wav(audio_path, data, sample_rate)
+
+        script_path = self.get_temp_path('load_func.zip')
+        torch.jit.script(py_load_func).save(script_path)
+        ts_load_func = torch.jit.load(script_path)
+
+        py_data, py_sr = py_load_func(
+            audio_path, normalize=normalize, channels_first=channels_first)
+        ts_data, ts_sr = ts_load_func(
+            audio_path, normalize=normalize, channels_first=channels_first)
+
+        self.assertEqual(py_sr, ts_sr)
+        self.assertEqual(py_data, ts_data)
+
+    @parameterized.expand(list(itertools.product(
+        ['float32', 'int32', 'int16', 'uint8'],
+        [8000, 16000],
+        [1, 2],
+    )), name_func=name_func)
+    def test_save_wav(self, dtype, sample_rate, num_channels):
+        script_path = self.get_temp_path('save_func.zip')
+        torch.jit.script(py_save_func).save(script_path)
+        ts_save_func = torch.jit.load(script_path)
+
+        expected = get_wav_data(dtype, num_channels)
+        py_path = self.get_temp_path(f'test_save_py_{dtype}_{sample_rate}_{num_channels}.wav')
+        ts_path = self.get_temp_path(f'test_save_ts_{dtype}_{sample_rate}_{num_channels}.wav')
+
+        py_save_func(py_path, expected, sample_rate, True, None)
+        ts_save_func(ts_path, expected, sample_rate, True, None)
+
+        py_data, py_sr = load_wav(py_path)
+        ts_data, ts_sr = load_wav(ts_path)
+
+        self.assertEqual(sample_rate, py_sr)
+        self.assertEqual(sample_rate, ts_sr)
+        self.assertEqual(expected, py_data)
+        self.assertEqual(expected, ts_data)
+
+    @parameterized.expand(list(itertools.product(
+        [8000, 16000],
+        [1, 2],
+        list(range(9)),
+    )), name_func=name_func)
+    def test_save_flac(self, sample_rate, num_channels, compression_level):
+        script_path = self.get_temp_path('save_func.zip')
+        torch.jit.script(py_save_func).save(script_path)
+        ts_save_func = torch.jit.load(script_path)
+
+        expected = get_wav_data('float32', num_channels)
+        py_path = self.get_temp_path(f'test_save_py_{sample_rate}_{num_channels}_{compression_level}.flac')
+        ts_path = self.get_temp_path(f'test_save_ts_{sample_rate}_{num_channels}_{compression_level}.flac')
+
+        py_save_func(py_path, expected, sample_rate, True, compression_level)
+        ts_save_func(ts_path, expected, sample_rate, True, compression_level)
+
+        # converting to 32 bit because flac file has 24 bit depth which scipy cannot handle.
+        py_path_wav = f'{py_path}.wav'
+        ts_path_wav = f'{ts_path}.wav'
+        sox_utils.convert_audio_file(py_path, py_path_wav, bit_depth=32)
+        sox_utils.convert_audio_file(ts_path, ts_path_wav, bit_depth=32)
+
+        py_data, py_sr = load_wav(py_path_wav, normalize=True)
+        ts_data, ts_sr = load_wav(ts_path_wav, normalize=True)
+
+        self.assertEqual(sample_rate, py_sr)
+        self.assertEqual(sample_rate, ts_sr)
+        self.assertEqual(expected, py_data)
+        self.assertEqual(expected, ts_data)
diff --git a/test/test_backend.py b/test/test_backend.py
index 1e8f9e4fd6..6b67cb2898 100644
--- a/test/test_backend.py
+++ b/test/test_backend.py
@@ -1,7 +1,4 @@
-import unittest
-
 import torchaudio
-from torchaudio._internal.module_utils import is_module_available
 
 from . import common_utils
 
@@ -28,15 +25,19 @@ class TestBackendSwitch_NoBackend(BackendSwitchMixin, common_utils.TorchaudioTes
     backend_module = torchaudio.backend.no_backend
 
 
-@unittest.skipIf(
-    not is_module_available('torchaudio._torchaudio'),
-    'torchaudio C++ extension not available')
+@common_utils.skipIfNoExtension
 class TestBackendSwitch_SoX(BackendSwitchMixin, common_utils.TorchaudioTestCase):
     backend = 'sox'
     backend_module = torchaudio.backend.sox_backend
 
 
-@unittest.skipIf(not is_module_available('soundfile'), '"soundfile" not available')
+@common_utils.skipIfNoExtension
+class TestBackendSwitch_SoXIO(BackendSwitchMixin, common_utils.TorchaudioTestCase):
+    backend = 'sox_io'
+    backend_module = torchaudio.backend.sox_io_backend
+
+
+@common_utils.skipIfNoModule('soundfile')
 class TestBackendSwitch_soundfile(BackendSwitchMixin, common_utils.TorchaudioTestCase):
     backend = 'soundfile'
     backend_module = torchaudio.backend.soundfile_backend
diff --git a/test/test_datasets.py b/test/test_datasets.py
index 3ac7fae8ac..c3b0c917da 100644
--- a/test/test_datasets.py
+++ b/test/test_datasets.py
@@ -46,9 +46,8 @@ def test_cmuarctic(self):
         data[0]
 
 
-@common_utils.skipIfNoSoxBackend
 class TestCommonVoice(common_utils.TorchaudioTestCase):
-    backend = 'sox'
+    backend = 'default'
     path = common_utils.get_asset_path()
 
     def test_commonvoice(self):
diff --git a/test/test_io.py b/test/test_io.py
index f58f66ed11..2c8aece85e 100644
--- a/test/test_io.py
+++ b/test/test_io.py
@@ -1,11 +1,24 @@
 import os
 import math
+import shutil
+import tempfile
 import unittest
 
 import torch
 import torchaudio
 
-from .common_utils import BACKENDS, BACKENDS_MP3, create_temp_assets_dir
+from .common_utils import BACKENDS, BACKENDS_MP3, get_asset_path
+
+
+def create_temp_assets_dir():
+    """
+    Creates a temporary directory and copies all files from test/assets into an
+    "assets" subdirectory of it. Returns a Tuple[str, TemporaryDirectory]: the path
+    of the temporary directory and the TemporaryDirectory object that owns it.
+    """
+    tmp_dir = tempfile.TemporaryDirectory()
+    shutil.copytree(get_asset_path(), os.path.join(tmp_dir.name, "assets"))
+    return tmp_dir.name, tmp_dir
 
 
 class Test_LoadSave(unittest.TestCase):
@@ -17,11 +30,15 @@ class Test_LoadSave(unittest.TestCase):
 
     def test_1_save(self):
         for backend in BACKENDS_MP3:
+            if backend == 'sox_io':
+                continue
             with self.subTest():
                 torchaudio.set_audio_backend(backend)
                 self._test_1_save(self.test_filepath, False)
 
         for backend in BACKENDS:
+            if backend == 'sox_io':
+                continue
             with self.subTest():
                 torchaudio.set_audio_backend(backend)
                 self._test_1_save(self.test_filepath_wav, True)
@@ -68,6 +85,8 @@ def _test_1_save(self, test_filepath, normalization):
 
     def test_1_save_sine(self):
         for backend in BACKENDS:
+            if backend == 'sox_io':
+                continue
             with self.subTest():
                 torchaudio.set_audio_backend(backend)
                 self._test_1_save_sine()
@@ -101,11 +120,15 @@ def _test_1_save_sine(self):
 
     def test_2_load(self):
         for backend in BACKENDS_MP3:
+            if backend == 'sox_io':
+                continue
             with self.subTest():
                 torchaudio.set_audio_backend(backend)
                 self._test_2_load(self.test_filepath, 278756)
 
         for backend in BACKENDS:
+            if backend == 'sox_io':
+                continue
             with self.subTest():
                 torchaudio.set_audio_backend(backend)
                 self._test_2_load(self.test_filepath_wav, 276858)
@@ -142,6 +165,8 @@ def _test_2_load(self, test_filepath, length):
 
     def test_2_load_nonormalization(self):
         for backend in BACKENDS_MP3:
+            if backend == 'sox_io':
+                continue
             with self.subTest():
                 torchaudio.set_audio_backend(backend)
                 self._test_2_load_nonormalization(self.test_filepath, 278756)
@@ -159,6 +184,8 @@ def _test_2_load_nonormalization(self, test_filepath, length):
 
     def test_3_load_and_save_is_identity(self):
         for backend in BACKENDS:
+            if backend == 'sox_io':
+                continue
             with self.subTest():
                 torchaudio.set_audio_backend(backend)
                 self._test_3_load_and_save_is_identity()
@@ -197,6 +224,8 @@ def _test_3_load_and_save_is_identity_across_backend(self, backend1, backend2):
 
     def test_4_load_partial(self):
         for backend in BACKENDS_MP3:
+            if backend == 'sox_io':
+                continue
             with self.subTest():
                 torchaudio.set_audio_backend(backend)
                 self._test_4_load_partial()
@@ -239,6 +268,8 @@ def _test_4_load_partial(self):
 
     def test_5_get_info(self):
         for backend in BACKENDS:
+            if backend == 'sox_io':
+                continue
             with self.subTest():
                 torchaudio.set_audio_backend(backend)
                 self._test_5_get_info()
diff --git a/test/test_librosa_compatibility.py b/test/test_librosa_compatibility.py
index 62e9d3ca88..aa933535e8 100644
--- a/test/test_librosa_compatibility.py
+++ b/test/test_librosa_compatibility.py
@@ -160,7 +160,8 @@ class TestTransforms(common_utils.TorchaudioTestCase):
     """Test suite for functions in `transforms` module."""
     def assert_compatibilities(self, n_fft, hop_length, power, n_mels, n_mfcc, sample_rate):
         common_utils.set_audio_backend('default')
-        sound, sample_rate = _load_audio_asset('sinewave.wav')
+        path = common_utils.get_asset_path('sinewave.wav')
+        sound, sample_rate = common_utils.load_wav(path)
         sound_librosa = sound.cpu().numpy().squeeze()  # (64000)
 
         # test core spectrogram
@@ -300,9 +301,9 @@ def test_InverseMelScale(self):
         hop_length = n_fft // 4
 
         # Prepare mel spectrogram input. We use torchaudio to compute one.
-        common_utils.set_audio_backend('default')
-        sound, sample_rate = _load_audio_asset(
-            'steam-train-whistle-daniel_simon.wav', offset=2**10, num_frames=2**14)
+        path = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
+        sound, sample_rate = common_utils.load_wav(path)
+        sound = sound[:, 2**10:2**10 + 2**14]
         sound = sound.mean(dim=0, keepdim=True)
         spec_orig = F.spectrogram(
             sound, pad=0, window=torch.hann_window(n_fft), n_fft=n_fft,
diff --git a/test/test_models.py b/test/test_models.py
index 7bd3f3819d..c54a57cebd 100644
--- a/test/test_models.py
+++ b/test/test_models.py
@@ -1,8 +1,10 @@
 import torch
-from torchaudio.models import Wav2Letter, _MelResNet
+from torchaudio.models import Wav2Letter, _MelResNet, _UpsampleNetwork, _WaveRNN
 
+from . import common_utils
 
-class TestWav2Letter:
+
+class TestWav2Letter(common_utils.TorchaudioTestCase):
 
     def test_waveform(self):
         batch_size = 2
@@ -31,21 +33,110 @@ def test_mfcc(self):
         assert out.size() == (batch_size, num_classes, 2)
 
 
-class TestMelResNet:
+class TestMelResNet(common_utils.TorchaudioTestCase):
 
     def test_waveform(self):
+        """Validate the output dimensions of a _MelResNet block.
+        """
 
-        batch_size = 2
-        num_features = 200
-        input_dims = 100
-        output_dims = 128
-        res_blocks = 10
-        hidden_dims = 128
-        pad = 2
+        n_batch = 2
+        n_time = 200
+        n_freq = 100
+        n_output = 128
+        n_res_block = 10
+        n_hidden = 128
+        kernel_size = 5
 
-        model = _MelResNet(res_blocks, input_dims, hidden_dims, output_dims, pad)
+        model = _MelResNet(n_res_block, n_freq, n_hidden, n_output, kernel_size)
 
-        x = torch.rand(batch_size, input_dims, num_features)
+        x = torch.rand(n_batch, n_freq, n_time)
         out = model(x)
 
-        assert out.size() == (batch_size, output_dims, num_features - pad * 2)
+        assert out.size() == (n_batch, n_output, n_time - kernel_size + 1)
+
+
+class TestUpsampleNetwork(common_utils.TorchaudioTestCase):
+
+    def test_waveform(self):
+        """Validate the output dimensions of a _UpsampleNetwork block.
+        """
+
+        upsample_scales = [5, 5, 8]
+        n_batch = 2
+        n_time = 200
+        n_freq = 100
+        n_output = 256
+        n_res_block = 10
+        n_hidden = 128
+        kernel_size = 5
+
+        total_scale = 1
+        for upsample_scale in upsample_scales:
+            total_scale *= upsample_scale
+
+        model = _UpsampleNetwork(upsample_scales, n_res_block, n_freq, n_hidden, n_output, kernel_size)
+
+        x = torch.rand(n_batch, n_freq, n_time)
+        out1, out2 = model(x)
+
+        assert out1.size() == (n_batch, n_freq, total_scale * (n_time - kernel_size + 1))
+        assert out2.size() == (n_batch, n_output, total_scale * (n_time - kernel_size + 1))
+
+
+class TestWaveRNN(common_utils.TorchaudioTestCase):
+
+    def test_waveform(self):
+        """Validate the output dimensions of a _WaveRNN model in waveform mode.
+        """
+
+        upsample_scales = [5, 5, 8]
+        n_rnn = 512
+        n_fc = 512
+        n_bits = 9
+        sample_rate = 24000
+        hop_length = 200
+        n_batch = 2
+        n_time = 200
+        n_freq = 100
+        n_output = 256
+        n_res_block = 10
+        n_hidden = 128
+        kernel_size = 5
+        mode = 'waveform'
+
+        model = _WaveRNN(upsample_scales, n_bits, sample_rate, hop_length, n_res_block,
+                         n_rnn, n_fc, kernel_size, n_freq, n_hidden, n_output, mode)
+
+        x = torch.rand(n_batch, 1, hop_length * (n_time - kernel_size + 1))
+        mels = torch.rand(n_batch, 1, n_freq, n_time)
+        out = model(x, mels)
+
+        assert out.size() == (n_batch, 1, hop_length * (n_time - kernel_size + 1), 2 ** n_bits)
+
+    def test_mol(self):
+        """Validate the output dimensions of a _WaveRNN model in mol mode.
+        """
+
+        upsample_scales = [5, 5, 8]
+        n_rnn = 512
+        n_fc = 512
+        n_bits = 9
+        sample_rate = 24000
+        hop_length = 200
+        n_batch = 2
+        n_time = 200
+        n_freq = 100
+        n_output = 256
+        n_res_block = 10
+        n_hidden = 128
+        kernel_size = 5
+        mode = 'mol'
+
+        model = _WaveRNN(upsample_scales, n_bits, sample_rate, hop_length, n_res_block,
+                         n_rnn, n_fc, kernel_size, n_freq, n_hidden, n_output, mode)
+
+        x = torch.rand(n_batch, 1, hop_length * (n_time - kernel_size + 1))
+        mels = torch.rand(n_batch, 1, n_freq, n_time)
+        out = model(x, mels)
+
+        assert out.size() == (n_batch, 1, hop_length * (n_time - kernel_size + 1), 30)
diff --git a/test/test_sox_compatibility.py b/test/test_sox_compatibility.py
index 23d58a3fb1..a5d9f8633c 100644
--- a/test/test_sox_compatibility.py
+++ b/test/test_sox_compatibility.py
@@ -9,12 +9,30 @@
 
 
 @common_utils.skipIfNoSoxBackend
-class TestFunctionalFiltering(common_utils.TorchaudioTestCase):
+class TestFunctionalFiltering(common_utils.TempDirMixin, common_utils.TorchaudioTestCase):
     backend = 'sox'
 
+    def setUp(self):
+        # 1. Generate an int16 white-noise signal so that it can be saved as a PCM wav.
+        # 2. Write it to a file in the temp directory; the SoX effect chains read this path.
+        # 3. Load the file back with torchaudio.load; the tests feed this tensor to the
+        #    torchaudio.functional counterparts. common_utils.load_wav() would be preferable,
+        #    but its output does not match torchaudio.load and the downstream tests fail.
+        super().setUp()
+
+        self.NOISE_SAMPLE_RATE = 44100  # N.B. 44.1 kHz required by SoX deemph effect
+        noise_waveform_as_int = common_utils.get_whitenoise(
+            sample_rate=self.NOISE_SAMPLE_RATE, duration=5, dtype=torch.int16, scale_factor=0.9,
+        )
+        self.noise_filepath = self.get_temp_path("whitenoise.wav")
+        common_utils.save_wav(
+            self.noise_filepath, noise_waveform_as_int, self.NOISE_SAMPLE_RATE
+        )
+        self.noise_waveform, _ = torchaudio.load(self.noise_filepath, normalization=True)
+
     def test_gain(self):
         test_filepath = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
-        waveform, _ = torchaudio.load(test_filepath)
+        waveform, _ = common_utils.load_wav(test_filepath)
 
         waveform_gain = F.gain(waveform, 3)
         self.assertTrue(waveform_gain.abs().max().item(), 1.)
@@ -28,7 +46,7 @@ def test_gain(self):
 
     def test_dither(self):
         test_filepath = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
-        waveform, _ = torchaudio.load(test_filepath)
+        waveform, _ = common_utils.load_wav(test_filepath)
 
         waveform_dithered = F.dither(waveform)
         waveform_dithered_noiseshaped = F.dither(waveform, noise_shaping=True)
@@ -48,7 +66,7 @@ def test_dither(self):
 
     def test_vctk_transform_pipeline(self):
         test_filepath_vctk = common_utils.get_asset_path('VCTK-Corpus', 'wav48', 'p224', 'p224_002.wav')
-        wf_vctk, sr_vctk = torchaudio.load(test_filepath_vctk)
+        wf_vctk, sr_vctk = common_utils.load_wav(test_filepath_vctk)
 
         # rate
         sample = T.Resample(sr_vctk, 16000, resampling_method='sinc_interpolation')
@@ -71,17 +89,14 @@ def test_lowpass(self):
         """
         Test biquad lowpass filter, compare to SoX implementation
         """
-
         cutoff_freq = 3000
 
-        noise_filepath = common_utils.get_asset_path('whitenoise.wav')
         E = torchaudio.sox_effects.SoxEffectsChain()
-        E.set_input_file(noise_filepath)
+        E.set_input_file(self.noise_filepath)
         E.append_effect_to_chain("lowpass", [cutoff_freq])
         sox_output_waveform, sr = E.sox_build_flow_effects()
 
-        waveform, sample_rate = torchaudio.load(noise_filepath, normalization=True)
-        output_waveform = F.lowpass_biquad(waveform, sample_rate, cutoff_freq)
+        output_waveform = F.lowpass_biquad(self.noise_waveform, self.NOISE_SAMPLE_RATE, cutoff_freq)
 
         self.assertEqual(output_waveform, sox_output_waveform, atol=1e-4, rtol=1e-5)
 
@@ -89,37 +104,30 @@ def test_highpass(self):
         """
         Test biquad highpass filter, compare to SoX implementation
         """
-
         cutoff_freq = 2000
 
-        noise_filepath = common_utils.get_asset_path('whitenoise.wav')
         E = torchaudio.sox_effects.SoxEffectsChain()
-        E.set_input_file(noise_filepath)
+        E.set_input_file(self.noise_filepath)
         E.append_effect_to_chain("highpass", [cutoff_freq])
         sox_output_waveform, sr = E.sox_build_flow_effects()
 
-        waveform, sample_rate = torchaudio.load(noise_filepath, normalization=True)
-        output_waveform = F.highpass_biquad(waveform, sample_rate, cutoff_freq)
+        output_waveform = F.highpass_biquad(self.noise_waveform, self.NOISE_SAMPLE_RATE, cutoff_freq)
 
-        # TBD - this fails at the 1e-4 level, debug why
-        self.assertEqual(output_waveform, sox_output_waveform, atol=1e-3, rtol=1e-5)
+        self.assertEqual(output_waveform, sox_output_waveform, atol=1.5e-3, rtol=1e-5)
 
     def test_allpass(self):
         """
         Test biquad allpass filter, compare to SoX implementation
         """
-
         central_freq = 1000
         q = 0.707
 
-        noise_filepath = common_utils.get_asset_path('whitenoise.wav')
         E = torchaudio.sox_effects.SoxEffectsChain()
-        E.set_input_file(noise_filepath)
+        E.set_input_file(self.noise_filepath)
         E.append_effect_to_chain("allpass", [central_freq, str(q) + 'q'])
         sox_output_waveform, sr = E.sox_build_flow_effects()
 
-        waveform, sample_rate = torchaudio.load(noise_filepath, normalization=True)
-        output_waveform = F.allpass_biquad(waveform, sample_rate, central_freq, q)
+        output_waveform = F.allpass_biquad(self.noise_waveform, self.NOISE_SAMPLE_RATE, central_freq, q)
 
         self.assertEqual(output_waveform, sox_output_waveform, atol=1e-4, rtol=1e-5)
 
@@ -127,19 +135,17 @@ def test_bandpass_with_csg(self):
         """
         Test biquad bandpass filter, compare to SoX implementation
         """
-
         central_freq = 1000
         q = 0.707
         const_skirt_gain = True
 
-        noise_filepath = common_utils.get_asset_path('whitenoise.wav')
         E = torchaudio.sox_effects.SoxEffectsChain()
-        E.set_input_file(noise_filepath)
+        E.set_input_file(self.noise_filepath)
         E.append_effect_to_chain("bandpass", ["-c", central_freq, str(q) + 'q'])
         sox_output_waveform, sr = E.sox_build_flow_effects()
 
-        waveform, sample_rate = torchaudio.load(noise_filepath, normalization=True)
-        output_waveform = F.bandpass_biquad(waveform, sample_rate, central_freq, q, const_skirt_gain)
+        output_waveform = F.bandpass_biquad(self.noise_waveform, self.NOISE_SAMPLE_RATE,
+                                            central_freq, q, const_skirt_gain)
 
         self.assertEqual(output_waveform, sox_output_waveform, atol=1e-4, rtol=1e-5)
 
@@ -147,19 +153,17 @@ def test_bandpass_without_csg(self):
         """
         Test biquad bandpass filter, compare to SoX implementation
         """
-
         central_freq = 1000
         q = 0.707
         const_skirt_gain = False
 
-        noise_filepath = common_utils.get_asset_path('whitenoise.wav')
         E = torchaudio.sox_effects.SoxEffectsChain()
-        E.set_input_file(noise_filepath)
+        E.set_input_file(self.noise_filepath)
         E.append_effect_to_chain("bandpass", [central_freq, str(q) + 'q'])
         sox_output_waveform, sr = E.sox_build_flow_effects()
 
-        waveform, sample_rate = torchaudio.load(noise_filepath, normalization=True)
-        output_waveform = F.bandpass_biquad(waveform, sample_rate, central_freq, q, const_skirt_gain)
+        output_waveform = F.bandpass_biquad(self.noise_waveform, self.NOISE_SAMPLE_RATE,
+                                            central_freq, q, const_skirt_gain)
 
         self.assertEqual(output_waveform, sox_output_waveform, atol=1e-4, rtol=1e-5)
 
@@ -167,18 +171,16 @@ def test_bandreject(self):
         """
         Test biquad bandreject filter, compare to SoX implementation
         """
-
         central_freq = 1000
         q = 0.707
 
-        noise_filepath = common_utils.get_asset_path('whitenoise.wav')
         E = torchaudio.sox_effects.SoxEffectsChain()
-        E.set_input_file(noise_filepath)
+        E.set_input_file(self.noise_filepath)
         E.append_effect_to_chain("bandreject", [central_freq, str(q) + 'q'])
         sox_output_waveform, sr = E.sox_build_flow_effects()
 
-        waveform, sample_rate = torchaudio.load(noise_filepath, normalization=True)
-        output_waveform = F.bandreject_biquad(waveform, sample_rate, central_freq, q)
+        output_waveform = F.bandreject_biquad(self.noise_waveform, self.NOISE_SAMPLE_RATE,
+                                              central_freq, q)
 
         self.assertEqual(output_waveform, sox_output_waveform, atol=1e-4, rtol=1e-5)
 
@@ -186,19 +188,16 @@ def test_band_with_noise(self):
         """
         Test biquad band filter with noise mode, compare to SoX implementation
         """
-
         central_freq = 1000
         q = 0.707
         noise = True
 
-        noise_filepath = common_utils.get_asset_path('whitenoise.wav')
         E = torchaudio.sox_effects.SoxEffectsChain()
-        E.set_input_file(noise_filepath)
+        E.set_input_file(self.noise_filepath)
         E.append_effect_to_chain("band", ["-n", central_freq, str(q) + 'q'])
         sox_output_waveform, sr = E.sox_build_flow_effects()
 
-        waveform, sample_rate = torchaudio.load(noise_filepath, normalization=True)
-        output_waveform = F.band_biquad(waveform, sample_rate, central_freq, q, noise)
+        output_waveform = F.band_biquad(self.noise_waveform, self.NOISE_SAMPLE_RATE, central_freq, q, noise)
 
         self.assertEqual(output_waveform, sox_output_waveform, atol=1e-4, rtol=1e-5)
 
@@ -211,14 +210,12 @@ def test_band_without_noise(self):
         q = 0.707
         noise = False
 
-        noise_filepath = common_utils.get_asset_path('whitenoise.wav')
         E = torchaudio.sox_effects.SoxEffectsChain()
-        E.set_input_file(noise_filepath)
+        E.set_input_file(self.noise_filepath)
         E.append_effect_to_chain("band", [central_freq, str(q) + 'q'])
         sox_output_waveform, sr = E.sox_build_flow_effects()
 
-        waveform, sample_rate = torchaudio.load(noise_filepath, normalization=True)
-        output_waveform = F.band_biquad(waveform, sample_rate, central_freq, q, noise)
+        output_waveform = F.band_biquad(self.noise_waveform, self.NOISE_SAMPLE_RATE, central_freq, q, noise)
 
         self.assertEqual(output_waveform, sox_output_waveform, atol=1e-4, rtol=1e-5)
 
@@ -231,14 +228,12 @@ def test_treble(self):
         q = 0.707
         gain = 40
 
-        noise_filepath = common_utils.get_asset_path('whitenoise.wav')
         E = torchaudio.sox_effects.SoxEffectsChain()
-        E.set_input_file(noise_filepath)
+        E.set_input_file(self.noise_filepath)
         E.append_effect_to_chain("treble", [gain, central_freq, str(q) + 'q'])
         sox_output_waveform, sr = E.sox_build_flow_effects()
 
-        waveform, sample_rate = torchaudio.load(noise_filepath, normalization=True)
-        output_waveform = F.treble_biquad(waveform, sample_rate, gain, central_freq, q)
+        output_waveform = F.treble_biquad(self.noise_waveform, self.NOISE_SAMPLE_RATE, gain, central_freq, q)
 
         self.assertEqual(output_waveform, sox_output_waveform, atol=1e-4, rtol=1e-5)
 
@@ -251,14 +246,12 @@ def test_bass(self):
         q = 0.707
         gain = 40
 
-        noise_filepath = common_utils.get_asset_path('whitenoise.wav')
         E = torchaudio.sox_effects.SoxEffectsChain()
-        E.set_input_file(noise_filepath)
+        E.set_input_file(self.noise_filepath)
         E.append_effect_to_chain("bass", [gain, central_freq, str(q) + 'q'])
         sox_output_waveform, sr = E.sox_build_flow_effects()
 
-        waveform, sample_rate = torchaudio.load(noise_filepath, normalization=True)
-        output_waveform = F.bass_biquad(waveform, sample_rate, gain, central_freq, q)
+        output_waveform = F.bass_biquad(self.noise_waveform, self.NOISE_SAMPLE_RATE, gain, central_freq, q)
 
         self.assertEqual(output_waveform, sox_output_waveform, atol=1.5e-4, rtol=1e-5)
 
@@ -266,15 +259,12 @@ def test_deemph(self):
         """
         Test biquad deemph filter, compare to SoX implementation
         """
-
-        noise_filepath = common_utils.get_asset_path('whitenoise.wav')
         E = torchaudio.sox_effects.SoxEffectsChain()
-        E.set_input_file(noise_filepath)
+        E.set_input_file(self.noise_filepath)
         E.append_effect_to_chain("deemph")
         sox_output_waveform, sr = E.sox_build_flow_effects()
 
-        waveform, sample_rate = torchaudio.load(noise_filepath, normalization=True)
-        output_waveform = F.deemph_biquad(waveform, sample_rate)
+        output_waveform = F.deemph_biquad(self.noise_waveform, self.NOISE_SAMPLE_RATE)
 
         self.assertEqual(output_waveform, sox_output_waveform, atol=1e-4, rtol=1e-5)
 
@@ -282,15 +272,12 @@ def test_riaa(self):
         """
         Test biquad riaa filter, compare to SoX implementation
         """
-
-        noise_filepath = common_utils.get_asset_path('whitenoise.wav')
         E = torchaudio.sox_effects.SoxEffectsChain()
-        E.set_input_file(noise_filepath)
+        E.set_input_file(self.noise_filepath)
         E.append_effect_to_chain("riaa")
         sox_output_waveform, sr = E.sox_build_flow_effects()
 
-        waveform, sample_rate = torchaudio.load(noise_filepath, normalization=True)
-        output_waveform = F.riaa_biquad(waveform, sample_rate)
+        output_waveform = F.riaa_biquad(self.noise_waveform, self.NOISE_SAMPLE_RATE)
 
         self.assertEqual(output_waveform, sox_output_waveform, atol=1e-4, rtol=1e-5)
 
@@ -299,14 +286,13 @@ def test_contrast(self):
         Test contrast effect, compare to SoX implementation
         """
         enhancement_amount = 80.
-        noise_filepath = common_utils.get_asset_path('whitenoise.wav')
+
         E = torchaudio.sox_effects.SoxEffectsChain()
-        E.set_input_file(noise_filepath)
+        E.set_input_file(self.noise_filepath)
         E.append_effect_to_chain("contrast", [enhancement_amount])
         sox_output_waveform, sr = E.sox_build_flow_effects()
 
-        waveform, sample_rate = torchaudio.load(noise_filepath, normalization=True)
-        output_waveform = F.contrast(waveform, enhancement_amount)
+        output_waveform = F.contrast(self.noise_waveform, enhancement_amount)
 
         self.assertEqual(output_waveform, sox_output_waveform, atol=1e-4, rtol=1e-5)
 
@@ -316,14 +302,13 @@ def test_dcshift_with_limiter(self):
         """
         shift = 0.5
         limiter_gain = 0.05
-        noise_filepath = common_utils.get_asset_path('whitenoise.wav')
+
         E = torchaudio.sox_effects.SoxEffectsChain()
-        E.set_input_file(noise_filepath)
+        E.set_input_file(self.noise_filepath)
         E.append_effect_to_chain("dcshift", [shift, limiter_gain])
         sox_output_waveform, sr = E.sox_build_flow_effects()
 
-        waveform, _ = torchaudio.load(noise_filepath, normalization=True)
-        output_waveform = F.dcshift(waveform, shift, limiter_gain)
+        output_waveform = F.dcshift(self.noise_waveform, shift, limiter_gain)
 
         self.assertEqual(output_waveform, sox_output_waveform, atol=1e-4, rtol=1e-5)
 
@@ -332,14 +317,13 @@ def test_dcshift_without_limiter(self):
         Test dcshift effect, compare to SoX implementation
         """
         shift = 0.6
-        noise_filepath = common_utils.get_asset_path('whitenoise.wav')
+
         E = torchaudio.sox_effects.SoxEffectsChain()
-        E.set_input_file(noise_filepath)
+        E.set_input_file(self.noise_filepath)
         E.append_effect_to_chain("dcshift", [shift])
         sox_output_waveform, sr = E.sox_build_flow_effects()
 
-        waveform, _ = torchaudio.load(noise_filepath, normalization=True)
-        output_waveform = F.dcshift(waveform, shift)
+        output_waveform = F.dcshift(self.noise_waveform, shift)
 
         self.assertEqual(output_waveform, sox_output_waveform, atol=1e-4, rtol=1e-5)
 
@@ -349,14 +333,13 @@ def test_overdrive(self):
         """
         gain = 30
         colour = 40
-        noise_filepath = common_utils.get_asset_path('whitenoise.wav')
+
         E = torchaudio.sox_effects.SoxEffectsChain()
-        E.set_input_file(noise_filepath)
+        E.set_input_file(self.noise_filepath)
         E.append_effect_to_chain("overdrive", [gain, colour])
         sox_output_waveform, sr = E.sox_build_flow_effects()
 
-        waveform, _ = torchaudio.load(noise_filepath, normalization=True)
-        output_waveform = F.overdrive(waveform, gain, colour)
+        output_waveform = F.overdrive(self.noise_waveform, gain, colour)
 
         self.assertEqual(output_waveform, sox_output_waveform, atol=1e-4, rtol=1e-5)
 
@@ -369,14 +352,14 @@ def test_phaser_sine(self):
         delay_ms = 2.0
         decay = 0.4
         speed = 0.5
-        noise_filepath = common_utils.get_asset_path('whitenoise.wav')
+
         E = torchaudio.sox_effects.SoxEffectsChain()
-        E.set_input_file(noise_filepath)
+        E.set_input_file(self.noise_filepath)
         E.append_effect_to_chain("phaser", [gain_in, gain_out, delay_ms, decay, speed, "-s"])
         sox_output_waveform, sr = E.sox_build_flow_effects()
 
-        waveform, sample_rate = torchaudio.load(noise_filepath, normalization=True)
-        output_waveform = F.phaser(waveform, sample_rate, gain_in, gain_out, delay_ms, decay, speed, sinusoidal=True)
+        output_waveform = F.phaser(self.noise_waveform, self.NOISE_SAMPLE_RATE,
+                                   gain_in, gain_out, delay_ms, decay, speed, sinusoidal=True)
 
         self.assertEqual(output_waveform, sox_output_waveform, atol=1e-4, rtol=1e-5)
 
@@ -389,14 +372,14 @@ def test_phaser_triangle(self):
         delay_ms = 2.0
         decay = 0.4
         speed = 0.5
-        noise_filepath = common_utils.get_asset_path('whitenoise.wav')
+
         E = torchaudio.sox_effects.SoxEffectsChain()
-        E.set_input_file(noise_filepath)
+        E.set_input_file(self.noise_filepath)
         E.append_effect_to_chain("phaser", [gain_in, gain_out, delay_ms, decay, speed, "-t"])
         sox_output_waveform, sr = E.sox_build_flow_effects()
 
-        waveform, sample_rate = torchaudio.load(noise_filepath, normalization=True)
-        output_waveform = F.phaser(waveform, sample_rate, gain_in, gain_out, delay_ms, decay, speed, sinusoidal=False)
+        output_waveform = F.phaser(self.noise_waveform, self.NOISE_SAMPLE_RATE,
+                                   gain_in, gain_out, delay_ms, decay, speed, sinusoidal=False)
 
         self.assertEqual(output_waveform, sox_output_waveform, atol=1e-4, rtol=1e-5)
 
@@ -410,15 +393,14 @@ def test_flanger_triangle_linear(self):
         width = 0.9
         speed = 0.5
         phase = 30
-        noise_filepath = common_utils.get_asset_path('whitenoise.wav')
+
         E = torchaudio.sox_effects.SoxEffectsChain()
-        E.set_input_file(noise_filepath)
+        E.set_input_file(self.noise_filepath)
         E.append_effect_to_chain("flanger", [delay, depth, regen, width, speed, "triangle", phase, "linear"])
         sox_output_waveform, sr = E.sox_build_flow_effects()
 
-        waveform, sample_rate = torchaudio.load(noise_filepath, normalization=True)
-        output_waveform = F.flanger(waveform, sample_rate, delay, depth, regen, width, speed, phase,
-                                    modulation='triangular', interpolation='linear')
+        output_waveform = F.flanger(self.noise_waveform, self.NOISE_SAMPLE_RATE, delay, depth, regen,
+                                    width, speed, phase, modulation='triangular', interpolation='linear')
 
         torch.testing.assert_allclose(output_waveform, sox_output_waveform, atol=1e-4, rtol=1e-5)
 
@@ -432,15 +414,14 @@ def test_flanger_triangle_quad(self):
         width = 0.4
         speed = 0.5
         phase = 40
-        noise_filepath = common_utils.get_asset_path('whitenoise.wav')
+
         E = torchaudio.sox_effects.SoxEffectsChain()
-        E.set_input_file(noise_filepath)
+        E.set_input_file(self.noise_filepath)
         E.append_effect_to_chain("flanger", [delay, depth, regen, width, speed, "triangle", phase, "quadratic"])
         sox_output_waveform, sr = E.sox_build_flow_effects()
 
-        waveform, sample_rate = torchaudio.load(noise_filepath, normalization=True)
-        output_waveform = F.flanger(waveform, sample_rate, delay, depth, regen, width, speed, phase,
-                                    modulation='triangular', interpolation='quadratic')
+        output_waveform = F.flanger(self.noise_waveform, self.NOISE_SAMPLE_RATE, delay, depth,
+                                    regen, width, speed, phase, modulation='triangular', interpolation='quadratic')
 
         torch.testing.assert_allclose(output_waveform, sox_output_waveform, atol=1e-4, rtol=1e-5)
 
@@ -454,15 +435,14 @@ def test_flanger_sine_linear(self):
         width = 0.23
         speed = 1.3
         phase = 60
-        noise_filepath = common_utils.get_asset_path('whitenoise.wav')
+
         E = torchaudio.sox_effects.SoxEffectsChain()
-        E.set_input_file(noise_filepath)
+        E.set_input_file(self.noise_filepath)
         E.append_effect_to_chain("flanger", [delay, depth, regen, width, speed, "sine", phase, "linear"])
         sox_output_waveform, sr = E.sox_build_flow_effects()
 
-        waveform, sample_rate = torchaudio.load(noise_filepath, normalization=True)
-        output_waveform = F.flanger(waveform, sample_rate, delay, depth, regen, width, speed, phase,
-                                    modulation='sinusoidal', interpolation='linear')
+        output_waveform = F.flanger(self.noise_waveform, self.NOISE_SAMPLE_RATE, delay, depth,
+                                    regen, width, speed, phase, modulation='sinusoidal', interpolation='linear')
 
         torch.testing.assert_allclose(output_waveform, sox_output_waveform, atol=1e-4, rtol=1e-5)
 
@@ -476,15 +456,14 @@ def test_flanger_sine_quad(self):
         width = 0.23
         speed = 1.3
         phase = 25
-        noise_filepath = common_utils.get_asset_path('whitenoise.wav')
+
         E = torchaudio.sox_effects.SoxEffectsChain()
-        E.set_input_file(noise_filepath)
+        E.set_input_file(self.noise_filepath)
         E.append_effect_to_chain("flanger", [delay, depth, regen, width, speed, "sine", phase, "quadratic"])
         sox_output_waveform, sr = E.sox_build_flow_effects()
 
-        waveform, sample_rate = torchaudio.load(noise_filepath, normalization=True)
-        output_waveform = F.flanger(waveform, sample_rate, delay, depth, regen, width, speed, phase,
-                                    modulation='sinusoidal', interpolation='quadratic')
+        output_waveform = F.flanger(self.noise_waveform, self.NOISE_SAMPLE_RATE, delay, depth,
+                                    regen, width, speed, phase, modulation='sinusoidal', interpolation='quadratic')
 
         torch.testing.assert_allclose(output_waveform, sox_output_waveform, atol=1e-4, rtol=1e-5)
 
@@ -497,21 +476,17 @@ def test_equalizer(self):
         q = 0.707
         gain = 1
 
-        noise_filepath = common_utils.get_asset_path('whitenoise.wav')
         E = torchaudio.sox_effects.SoxEffectsChain()
-        E.set_input_file(noise_filepath)
+        E.set_input_file(self.noise_filepath)
         E.append_effect_to_chain("equalizer", [center_freq, q, gain])
         sox_output_waveform, sr = E.sox_build_flow_effects()
 
-        waveform, sample_rate = torchaudio.load(noise_filepath, normalization=True)
-        output_waveform = F.equalizer_biquad(waveform, sample_rate, center_freq, gain, q)
+        output_waveform = F.equalizer_biquad(self.noise_waveform, self.NOISE_SAMPLE_RATE, center_freq, gain, q)
 
         self.assertEqual(output_waveform, sox_output_waveform, atol=1e-4, rtol=1e-5)
 
     def test_perf_biquad_filtering(self):
 
-        fn_sine = common_utils.get_asset_path('whitenoise.wav')
-
         b0 = 0.4
         b1 = 0.2
         b2 = 0.9
@@ -521,13 +496,12 @@ def test_perf_biquad_filtering(self):
 
         # SoX method
         E = torchaudio.sox_effects.SoxEffectsChain()
-        E.set_input_file(fn_sine)
+        E.set_input_file(self.noise_filepath)
         E.append_effect_to_chain("biquad", [b0, b1, b2, a0, a1, a2])
         waveform_sox_out, _ = E.sox_build_flow_effects()
 
-        waveform, _ = torchaudio.load(fn_sine, normalization=True)
         waveform_lfilter_out = F.lfilter(
-            waveform, torch.tensor([a0, a1, a2]), torch.tensor([b0, b1, b2])
+            self.noise_waveform, torch.tensor([a0, a1, a2]), torch.tensor([b0, b1, b2])
         )
 
         self.assertEqual(waveform_lfilter_out, waveform_sox_out, atol=1e-4, rtol=1e-5)
diff --git a/test/test_sox_effects.py b/test/test_sox_effects.py
index 68440db8d2..6b12d7cd06 100644
--- a/test/test_sox_effects.py
+++ b/test/test_sox_effects.py
@@ -1,3 +1,4 @@
+import sys
 import math
 import unittest
 
@@ -11,7 +12,7 @@
 class Test_SoxEffectsChain(common_utils.TorchaudioTestCase):
     backend = 'sox'
 
-    test_filepath = common_utils.get_asset_path("steam-train-whistle-daniel_simon.mp3")
+    test_filepath = common_utils.get_asset_path("steam-train-whistle-daniel_simon.wav")
 
     def test_single_channel(self):
         fn_sine = common_utils.get_asset_path("sinewave.wav")
@@ -34,6 +35,7 @@ def test_rate_channels(self):
         self.assertEqual(sr, target_rate)
         self.assertEqual(x.size(0), target_channels)
 
+    @unittest.skipIf(sys.platform == 'darwin', 'This test is known to fail on macOS')
     def test_lowpass_speed(self):
         speed = .8
         si, _ = torchaudio.info(self.test_filepath)
diff --git a/test/test_transforms.py b/test/test_transforms.py
index af2b834fdb..6df9562bc8 100644
--- a/test/test_transforms.py
+++ b/test/test_transforms.py
@@ -45,7 +45,7 @@ def test_mu_law_companding(self):
 
     def test_AmplitudeToDB(self):
         filepath = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
-        waveform, sample_rate = torchaudio.load(filepath)
+        waveform = common_utils.load_wav(filepath)[0]
 
         mag_to_db_transform = transforms.AmplitudeToDB('magnitude', 80.)
         power_to_db_transform = transforms.AmplitudeToDB('power', 80.)
@@ -115,7 +115,7 @@ def test_mel2(self):
         self.assertTrue(mel_transform2.mel_scale.fb.sum(1).ge(0.).all())
         # check on multi-channel audio
         filepath = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
-        x_stereo, sr_stereo = torchaudio.load(filepath)  # (2, 278756), 44100
+        x_stereo = common_utils.load_wav(filepath)[0]  # (2, 278756), 44100
         spectrogram_stereo = s2db(mel_transform(x_stereo))  # (2, 128, 1394)
         self.assertTrue(spectrogram_stereo.dim() == 3)
         self.assertTrue(spectrogram_stereo.size(0) == 2)
@@ -166,7 +166,7 @@ def test_mfcc(self):
 
     def test_resample_size(self):
         input_path = common_utils.get_asset_path('sinewave.wav')
-        waveform, sample_rate = torchaudio.load(input_path)
+        waveform, sample_rate = common_utils.load_wav(input_path)
 
         upsample_rate = sample_rate * 2
         downsample_rate = sample_rate // 2
diff --git a/test/torchscript_consistency_impl.py b/test/torchscript_consistency_impl.py
index 2527cd5fbf..76444dc18a 100644
--- a/test/torchscript_consistency_impl.py
+++ b/test/torchscript_consistency_impl.py
@@ -2,7 +2,6 @@
 import unittest
 
 import torch
-import torchaudio
 import torchaudio.functional as F
 import torchaudio.transforms as T
 
@@ -616,6 +615,5 @@ def test_SlidingWindowCmn(self):
 
     def test_Vad(self):
         filepath = common_utils.get_asset_path("vad-go-mono-32000.wav")
-        common_utils.set_audio_backend('default')
-        waveform, sample_rate = torchaudio.load(filepath)
+        waveform, sample_rate = common_utils.load_wav(filepath)
         self._assert_consistency(T.Vad(sample_rate=sample_rate), waveform)
diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt
new file mode 100644
index 0000000000..c4be4cb7b0
--- /dev/null
+++ b/third_party/CMakeLists.txt
@@ -0,0 +1,80 @@
+cmake_minimum_required(VERSION 3.1)
+
+project(torchaudio_third_parties)
+include(ExternalProject)
+# All codecs are built as static PIC libraries and installed under ./install; tarballs are kept in ./archives.
+set(INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/install)
+set(ARCHIVE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/archives)
+set(COMMON_ARGS --quiet --disable-shared --enable-static --prefix=${INSTALL_DIR} --with-pic --disable-dependency-tracking --disable-debug --disable-examples --disable-doc)
+
+ExternalProject_Add(libmad
+  PREFIX ${CMAKE_CURRENT_SOURCE_DIR}
+  DOWNLOAD_DIR ${ARCHIVE_DIR}
+  URL https://downloads.sourceforge.net/project/mad/libmad/0.15.1b/libmad-0.15.1b.tar.gz
+  URL_HASH SHA256=bbfac3ed6bfbc2823d3775ebb931087371e142bb0e9bb1bee51a76a6e0078690
+  PATCH_COMMAND patch < ${CMAKE_CURRENT_SOURCE_DIR}/patch/libmad.patch
+  CONFIGURE_COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/src/libmad/configure ${COMMON_ARGS}
+)
+
+ExternalProject_Add(libmp3lame
+  PREFIX ${CMAKE_CURRENT_SOURCE_DIR}
+  DOWNLOAD_DIR ${ARCHIVE_DIR}
+  URL https://downloads.sourceforge.net/project/lame/lame/3.99/lame-3.99.5.tar.gz
+  URL_HASH SHA256=24346b4158e4af3bd9f2e194bb23eb473c75fb7377011523353196b19b9a23ff
+  CONFIGURE_COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/src/libmp3lame/configure ${COMMON_ARGS} --enable-nasm
+)
+
+ExternalProject_Add(libogg
+  PREFIX ${CMAKE_CURRENT_SOURCE_DIR}
+  DOWNLOAD_DIR ${ARCHIVE_DIR}
+  URL https://ftp.osuosl.org/pub/xiph/releases/ogg/libogg-1.3.3.tar.gz
+  URL_HASH SHA256=c2e8a485110b97550f453226ec644ebac6cb29d1caef2902c007edab4308d985
+  CONFIGURE_COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/src/libogg/configure ${COMMON_ARGS}
+)
+# The remaining projects are configured through build_codec_helper.sh so that they find the libraries in ./install.
+ExternalProject_Add(libflac
+  PREFIX ${CMAKE_CURRENT_SOURCE_DIR}
+  DEPENDS libogg
+  DOWNLOAD_DIR ${ARCHIVE_DIR}
+  URL https://ftp.osuosl.org/pub/xiph/releases/flac/flac-1.3.2.tar.xz
+  URL_HASH SHA256=91cfc3ed61dc40f47f050a109b08610667d73477af6ef36dcad31c31a4a8d53f
+  CONFIGURE_COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/build_codec_helper.sh ${CMAKE_CURRENT_SOURCE_DIR}/src/libflac/configure ${COMMON_ARGS} --with-ogg
+)
+
+ExternalProject_Add(libvorbis
+  PREFIX ${CMAKE_CURRENT_SOURCE_DIR}
+  DEPENDS libogg
+  DOWNLOAD_DIR ${ARCHIVE_DIR}
+  URL https://ftp.osuosl.org/pub/xiph/releases/vorbis/libvorbis-1.3.6.tar.gz
+  URL_HASH SHA256=6ed40e0241089a42c48604dc00e362beee00036af2d8b3f46338031c9e0351cb
+  CONFIGURE_COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/build_codec_helper.sh ${CMAKE_CURRENT_SOURCE_DIR}/src/libvorbis/configure ${COMMON_ARGS} --with-ogg
+)
+
+ExternalProject_Add(libopus
+  PREFIX ${CMAKE_CURRENT_SOURCE_DIR}
+  DEPENDS libogg
+  DOWNLOAD_DIR ${ARCHIVE_DIR}
+  URL https://ftp.osuosl.org/pub/xiph/releases/opus/opus-1.3.1.tar.gz
+  URL_HASH SHA256=65b58e1e25b2a114157014736a3d9dfeaad8d41be1c8179866f144a2fb44ff9d
+  CONFIGURE_COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/build_codec_helper.sh ${CMAKE_CURRENT_SOURCE_DIR}/src/libopus/configure ${COMMON_ARGS} --with-ogg
+)
+
+ExternalProject_Add(opusfile
+  PREFIX ${CMAKE_CURRENT_SOURCE_DIR}
+  DEPENDS libopus
+  DOWNLOAD_DIR ${ARCHIVE_DIR}
+  STAMP_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src/opusfile-stamp
+  SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src/opusfile
+  URL https://ftp.osuosl.org/pub/xiph/releases/opus/opusfile-0.12.tar.gz
+  URL_HASH SHA256=118d8601c12dd6a44f52423e68ca9083cc9f2bfe72da7a8c1acb22a80ae3550b
+  CONFIGURE_COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/build_codec_helper.sh ${CMAKE_CURRENT_SOURCE_DIR}/src/opusfile/configure ${COMMON_ARGS} --disable-http
+)
+# libsox comes last: it DEPENDS on the codecs above (libopus via opusfile) and enables them with --with-* flags.
+ExternalProject_Add(libsox
+  PREFIX ${CMAKE_CURRENT_SOURCE_DIR}
+  DEPENDS libogg libflac libvorbis opusfile libmp3lame libmad
+  DOWNLOAD_DIR ${ARCHIVE_DIR}
+  URL https://downloads.sourceforge.net/project/sox/sox/14.4.2/sox-14.4.2.tar.bz2
+  URL_HASH SHA256=81a6956d4330e75b5827316e44ae381e6f1e8928003c6aa45896da9041ea149c
+  CONFIGURE_COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/build_codec_helper.sh ${CMAKE_CURRENT_SOURCE_DIR}/src/libsox/configure ${COMMON_ARGS} --with-lame --with-flac --with-mad --with-oggvorbis --without-alsa --without-coreaudio --without-png --without-oss --without-sndfile --with-opus
+)
diff --git a/third_party/build_codec_helper.sh b/third_party/build_codec_helper.sh
new file mode 100755
index 0000000000..e7f2614781
--- /dev/null
+++ b/third_party/build_codec_helper.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+# Helper script for building codecs depending on libogg, such as libvorbis and libopus.
+# It is difficult to set environment variable inside of ExternalProject_Add,
+# so this script sets necessary environment variables before running the given command
+
+this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+install_dir="${this_dir}/install"
+
+export PKG_CONFIG_PATH="${install_dir}/lib/pkgconfig"
+export LDFLAGS="-L${install_dir}/lib ${LDFLAGS}"
+export CPPFLAGS="-I${install_dir}/include ${CPPFLAGS}"
+# Run the given command; quoting "$@" keeps every argument intact (no word splitting or globbing).
+"$@"
diff --git a/third_party/patch/libmad.patch b/third_party/patch/libmad.patch
new file mode 100644
index 0000000000..a805787831
--- /dev/null
+++ b/third_party/patch/libmad.patch
@@ -0,0 +1,86 @@
+See the following for the origin of this patch
+http://www.linuxfromscratch.org/blfs/view/svn/multimedia/libmad.html
+http://www.linuxfromscratch.org/patches/blfs/svn/libmad-0.15.1b-fixes-1.patch
+--- src/libmad/configure	2004-02-05 09:34:07.000000000 +0000
++++ src/libmad/configure.new	2020-06-30 21:10:28.528018931 +0000
+@@ -19083,71 +19083,7 @@
+ 
+ if test "$GCC" = yes
+ then
+-    if test -z "$arch"
+-    then
+-	case "$host" in
+-	    i386-*)           ;;
+-	    i?86-*)           arch="-march=i486" ;;
+-	    arm*-empeg-*)     arch="-march=armv4 -mtune=strongarm1100" ;;
+-	    armv4*-*)         arch="-march=armv4 -mtune=strongarm" ;;
+-	    powerpc-*)        ;;
+-	    mips*-agenda-*)   arch="-mcpu=vr4100" ;;
+-	    mips*-luxsonor-*) arch="-mips1 -mcpu=r3000 -Wa,-m4010" ;;
+-	esac
+-    fi
+-
+-    case "$optimize" in
+-	-O|"-O "*)
+-	    optimize="-O"
+-	    optimize="$optimize -fforce-mem"
+-	    optimize="$optimize -fforce-addr"
+-	    : #x optimize="$optimize -finline-functions"
+-	    : #- optimize="$optimize -fstrength-reduce"
+-	    optimize="$optimize -fthread-jumps"
+-	    optimize="$optimize -fcse-follow-jumps"
+-	    optimize="$optimize -fcse-skip-blocks"
+-	    : #x optimize="$optimize -frerun-cse-after-loop"
+-	    : #x optimize="$optimize -frerun-loop-opt"
+-	    : #x optimize="$optimize -fgcse"
+-	    optimize="$optimize -fexpensive-optimizations"
+-	    optimize="$optimize -fregmove"
+-	    : #* optimize="$optimize -fdelayed-branch"
+-	    : #x optimize="$optimize -fschedule-insns"
+-	    optimize="$optimize -fschedule-insns2"
+-	    : #? optimize="$optimize -ffunction-sections"
+-	    : #? optimize="$optimize -fcaller-saves"
+-	    : #> optimize="$optimize -funroll-loops"
+-	    : #> optimize="$optimize -funroll-all-loops"
+-	    : #x optimize="$optimize -fmove-all-movables"
+-	    : #x optimize="$optimize -freduce-all-givs"
+-	    : #? optimize="$optimize -fstrict-aliasing"
+-	    : #* optimize="$optimize -fstructure-noalias"
+-
+-	    case "$host" in
+-		arm*-*)
+-		    optimize="$optimize -fstrength-reduce"
+-		    ;;
+-		mips*-*)
+-		    optimize="$optimize -fstrength-reduce"
+-		    optimize="$optimize -finline-functions"
+-		    ;;
+-		i?86-*)
+-		    optimize="$optimize -fstrength-reduce"
+-		    ;;
+-		powerpc-apple-*)
+-		    # this triggers an internal compiler error with gcc2
+-		    : #optimize="$optimize -fstrength-reduce"
+-
+-		    # this is really only beneficial with gcc3
+-		    : #optimize="$optimize -finline-functions"
+-		    ;;
+-		*)
+-		    # this sometimes provokes bugs in gcc 2.95.2
+-		    : #optimize="$optimize -fstrength-reduce"
+-		    ;;
+-	    esac
+-	    ;;
+-    esac
++    optimize="-O2"
+ fi
+ 
+ case "$host" in
+@@ -21497,6 +21433,7 @@
+ then
+     case "$host" in
+ 	i?86-*)     FPM="INTEL"  ;;
++	x86_64*)    FPM="64BIT"  ;;
+ 	arm*-*)     FPM="ARM"    ;;
+ 	mips*-*)    FPM="MIPS"   ;;
+ 	sparc*-*)   FPM="SPARC"  ;;
diff --git a/torchaudio/backend/sox_io_backend.py b/torchaudio/backend/sox_io_backend.py
new file mode 100644
index 0000000000..4664f733c6
--- /dev/null
+++ b/torchaudio/backend/sox_io_backend.py
@@ -0,0 +1,134 @@
+from typing import Tuple, Optional
+
+import torch
+from torchaudio._internal import (
+    module_utils as _mod_utils,
+)
+
+
+class AudioMetaData:  # plain container for the values returned by ``info``
+    def __init__(self, sample_rate: int, num_frames: int, num_channels: int):
+        self.sample_rate = sample_rate  # sampling rate reported for the file
+        self.num_frames = num_frames  # number of frames (samples per channel)
+        self.num_channels = num_channels  # number of audio channels
+
+
+@_mod_utils.requires_module('torchaudio._torchaudio')
+def info(filepath: str) -> AudioMetaData:
+    """Get the sample rate, number of frames and number of channels of an audio file."""
+    sinfo = torch.ops.torchaudio.sox_io_get_info(filepath)  # SignalInfo object from the C++ extension
+    return AudioMetaData(sinfo.get_sample_rate(), sinfo.get_num_frames(), sinfo.get_num_channels())
+
+
+@_mod_utils.requires_module('torchaudio._torchaudio')
+def load(
+        filepath: str,
+        frame_offset: int = 0,
+        num_frames: int = -1,
+        normalize: bool = True,
+        channels_first: bool = True,
+) -> Tuple[torch.Tensor, int]:
+    """Load audio data from file.
+
+    This function can handle all the codecs that the underlying libsox can handle, with the
+    following caveats.
+
+    Note:
+        This function is tested on the following formats:
+         - WAV
+            - 32-bit floating-point
+            - 32-bit signed integer
+            - 16-bit signed integer
+            -  8-bit unsigned integer
+         - MP3
+         - FLAC
+         - OGG/VORBIS
+
+    By default, this function returns Tensor with ``float32`` dtype and the shape of ``[channel, time]``.
+    The samples are normalized to fit in the range of ``[-1.0, 1.0]``.
+
+    When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit
+    signed integer and 8-bit unsigned integer (24-bit signed integer is not supported),
+    by providing ``normalize=False``, this function can return integer Tensor, where the samples
+    are expressed within the whole range of the corresponding dtype, that is, ``int32`` tensor
+    for 32-bit signed PCM, ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM.
+
+    ``normalize`` parameter has no effect on 32-bit floating-point WAV and other formats, such as
+    flac and mp3. For these formats, this function always returns ``float32`` Tensor with values
+    normalized to ``[-1.0, 1.0]``.
+
+    Args:
+        filepath: Path to audio file
+        frame_offset: Number of frames to skip before reading data. Must be non-negative.
+        num_frames: Maximum number of frames to read; must be -1 or positive. -1 reads all the
+            remaining samples, starting from ``frame_offset``. This function may return fewer
+            frames if there are not enough frames in the given file.
+        normalize: When ``True``, this function always returns ``float32``, and sample values are
+            normalized to ``[-1.0, 1.0]``. If input file is integer WAV, giving ``False`` will change
+            the resulting Tensor type to integer type. This argument has no effect for formats other
+            than integer WAV type.
+        channels_first: When True, the returned Tensor has dimension ``[channel, time]``.
+            Otherwise, the returned Tensor's dimension is ``[time, channel]``.
+
+    Returns:
+        Tuple[torch.Tensor, int]: The audio Tensor and its sample rate. The Tensor has integer type
+            if the input is integer WAV and ``normalize=False``, else ``float32`` type. Its shape is
+            ``[channel, time]`` if ``channels_first=True``, else ``[time, channel]``.
+    """
+    signal = torch.ops.torchaudio.sox_io_load_audio_file(
+        filepath, frame_offset, num_frames, normalize, channels_first)
+    return signal.get_tensor(), signal.get_sample_rate()
+
+
+@_mod_utils.requires_module('torchaudio._torchaudio')
+def save(
+        filepath: str,
+        tensor: torch.Tensor,
+        sample_rate: int,
+        channels_first: bool = True,
+        compression: Optional[float] = None,
+):
+    """Save audio data to file.
+
+    Supported formats are:
+     - WAV
+        - 32-bit floating-point
+        - 32-bit signed integer
+        - 16-bit signed integer
+        -  8-bit unsigned integer
+     - MP3
+     - FLAC
+     - OGG/VORBIS
+
+    Args:
+        filepath: Path to save file. The format is selected from the file extension.
+        tensor: Audio data to save. must be 2D tensor.
+        sample_rate: sampling rate
+        channels_first: If True, the given tensor is interpreted as ``[channel, time]``.
+        compression: Used for formats other than WAV. This corresponds to ``-C`` option
+            of ``sox`` command.
+            See the detail at http://sox.sourceforge.net/soxformat.html.
+            - MP3: Either bitrate [kbps] with quality factor, such as ``128.2`` or
+                VBR encoding with quality factor such as ``-4.2``. Default: ``-4.5``
+            - FLAC: compression level. Whole number from ``0`` to ``8``.
+                ``8`` is default and highest compression.
+            - OGG/VORBIS: number from -1 to 10; -1 is the highest compression and lowest
+                quality. Default: ``3``.
+    """
+    if compression is None:
+        ext = str(filepath).rsplit('.', 1)[-1].lower()  # text after the last '.', so 'flac'/'vorbis' match
+        if ext == 'wav':
+            compression = 0.
+        elif ext == 'mp3':
+            compression = -4.5
+        elif ext == 'flac':
+            compression = 8.
+        elif ext in ['ogg', 'vorbis']:
+            compression = 3.
+        else:
+            raise RuntimeError(f'Unsupported file type: "{ext}"')
+    signal = torch.classes.torchaudio.TensorSignal(tensor, sample_rate, channels_first)
+    torch.ops.torchaudio.sox_io_save_audio_file(filepath, signal, compression)
+
+
+load_wav = load  # alias: ``load_wav`` is the same callable as ``load``
diff --git a/torchaudio/backend/utils.py b/torchaudio/backend/utils.py
index d537f01daf..cb53b3e02f 100644
--- a/torchaudio/backend/utils.py
+++ b/torchaudio/backend/utils.py
@@ -7,6 +7,7 @@
 from . import (
     no_backend,
     sox_backend,
+    sox_io_backend,
     soundfile_backend,
 )
 
@@ -24,6 +25,7 @@ def list_audio_backends() -> List[str]:
         backends.append('soundfile')
     if is_module_available('torchaudio._torchaudio'):
         backends.append('sox')
+        backends.append('sox_io')
     return backends
 
 
@@ -43,6 +45,8 @@ def set_audio_backend(backend: Optional[str]) -> None:
         module = no_backend
     elif backend == 'sox':
         module = sox_backend
+    elif backend == 'sox_io':
+        module = sox_io_backend
     elif backend == 'soundfile':
         module = soundfile_backend
     else:
@@ -69,6 +73,8 @@ def get_audio_backend() -> Optional[str]:
         return None
     if torchaudio.load == sox_backend.load:
         return 'sox'
+    if torchaudio.load == sox_io_backend.load:
+        return 'sox_io'
     if torchaudio.load == soundfile_backend.load:
         return 'soundfile'
     raise ValueError('Unknown backend.')
diff --git a/torchaudio/csrc/register.cpp b/torchaudio/csrc/register.cpp
index 81b1a84c96..3c03232941 100644
--- a/torchaudio/csrc/register.cpp
+++ b/torchaudio/csrc/register.cpp
@@ -1,17 +1,64 @@
 #ifndef TORCHAUDIO_REGISTER_H
 #define TORCHAUDIO_REGISTER_H
 
-#include <torchaudio/csrc/typedefs.h>
+#include <torchaudio/csrc/sox_effects.h>
+#include <torchaudio/csrc/sox_io.h>
+#include <torchaudio/csrc/sox_utils.h>
 
 namespace torchaudio {
 namespace {
 
+////////////////////////////////////////////////////////////////////////////////
+// sox_utils.h
+////////////////////////////////////////////////////////////////////////////////
+static auto registerTensorSignal = // used from Python as torch.classes.torchaudio.TensorSignal
+    torch::class_<sox_utils::TensorSignal>("torchaudio", "TensorSignal")
+        .def(torch::init<torch::Tensor, int64_t, bool>()) // (tensor, sample_rate, channels_first)
+        .def("get_tensor", &sox_utils::TensorSignal::getTensor)
+        .def("get_sample_rate", &sox_utils::TensorSignal::getSampleRate)
+        .def("get_channels_first", &sox_utils::TensorSignal::getChannelsFirst);
+
+////////////////////////////////////////////////////////////////////////////////
+// sox_io.h
+////////////////////////////////////////////////////////////////////////////////
 static auto registerSignalInfo =
-    torch::class_<SignalInfo>("torchaudio", "SignalInfo")
-        .def(torch::init<int64_t, int64_t, int64_t>())
-        .def("get_sample_rate", &SignalInfo::getSampleRate)
-        .def("get_num_channels", &SignalInfo::getNumChannels)
-        .def("get_num_samples", &SignalInfo::getNumSamples);
+    torch::class_<sox_io::SignalInfo>("torchaudio", "SignalInfo")
+        .def("get_sample_rate", &sox_io::SignalInfo::getSampleRate)
+        .def("get_num_channels", &sox_io::SignalInfo::getNumChannels)
+        .def("get_num_frames", &sox_io::SignalInfo::getNumFrames);
+
+static auto registerGetInfo = torch::RegisterOperators().op( // torch.ops.torchaudio.sox_io_get_info
+    torch::RegisterOperators::options()
+        .schema(
+            "torchaudio::sox_io_get_info(str path) -> __torch__.torch.classes.torchaudio.SignalInfo info")
+        .catchAllKernel<decltype(sox_io::get_info), &sox_io::get_info>());
+
+static auto registerLoadAudioFile = torch::RegisterOperators().op( // torch.ops.torchaudio.sox_io_load_audio_file
+    torch::RegisterOperators::options()
+        .schema(
+            "torchaudio::sox_io_load_audio_file(str path, int frame_offset, int num_frames, bool normalize, bool channels_first) -> __torch__.torch.classes.torchaudio.TensorSignal signal")
+        .catchAllKernel<
+            decltype(sox_io::load_audio_file),
+            &sox_io::load_audio_file>());
+
+static auto registerSaveAudioFile = torch::RegisterOperators().op( // torch.ops.torchaudio.sox_io_save_audio_file
+    torch::RegisterOperators::options()
+        .schema(
+            "torchaudio::sox_io_save_audio_file(str path, __torch__.torch.classes.torchaudio.TensorSignal signal, float compression) -> ()")
+        .catchAllKernel<
+            decltype(sox_io::save_audio_file),
+            &sox_io::save_audio_file>());
+
+////////////////////////////////////////////////////////////////////////////////
+// sox_effects.h
+////////////////////////////////////////////////////////////////////////////////
+static auto registerSoxEffects = // registers the three sox_effects functions as torchaudio:: operators
+    torch::RegisterOperators(
+        "torchaudio::sox_effects_initialize_sox_effects",
+        &sox_effects::initialize_sox_effects)
+        .op("torchaudio::sox_effects_shutdown_sox_effects",
+            &sox_effects::shutdown_sox_effects)
+        .op("torchaudio::sox_effects_list_effects", &sox_effects::list_effects);
 
 } // namespace
 } // namespace torchaudio
diff --git a/torchaudio/csrc/sox.cpp b/torchaudio/csrc/sox.cpp
index 3ae81bef19..0f099946fd 100644
--- a/torchaudio/csrc/sox.cpp
+++ b/torchaudio/csrc/sox.cpp
@@ -82,17 +82,6 @@ std::tuple<sox_signalinfo_t, sox_encodinginfo_t> get_info(
   return std::make_tuple(fd->signal, fd->encoding);
 }
 
-std::vector<std::string> get_effect_names() {
-  sox_effect_fn_t const * fns = sox_get_effect_fns();
-  std::vector<std::string> sv;
-  for(int i = 0; fns[i]; ++i) {
-    const sox_effect_handler_t *eh = fns[i] ();
-    if(eh && eh->name)
-      sv.push_back(eh->name);
-  }
-  return sv;
-}
-
 int read_audio_file(
     const std::string& file_name,
     at::Tensor output,
@@ -186,16 +175,6 @@ void write_audio_file(
   }
 }
 
-int initialize_sox() {
-  /* Initialization for sox effects.  Only initialize once  */
-  return sox_init();
-}
-
-int shutdown_sox() {
-  /* Shutdown for sox effects.  Do not shutdown between multiple calls  */
-  return sox_quit();
-}
-
 int build_flow_effects(const std::string& file_name,
                        at::Tensor otensor,
                        bool ch_first,
@@ -489,20 +468,8 @@ PYBIND11_MODULE(_torchaudio, m) {
       "get_info",
       &torch::audio::get_info,
       "Gets information about an audio file");
-  m.def(
-      "get_effect_names",
-      &torch::audio::get_effect_names,
-      "Gets the names of all available effects");
   m.def(
       "build_flow_effects",
       &torch::audio::build_flow_effects,
       "build effects and flow chain into tensors");
-  m.def(
-      "initialize_sox",
-      &torch::audio::initialize_sox,
-      "initialize sox for effects");
-  m.def(
-      "shutdown_sox",
-      &torch::audio::shutdown_sox,
-      "shutdown sox for effects");
 }
diff --git a/torchaudio/csrc/sox.h b/torchaudio/csrc/sox.h
index 8d851c9b21..8093f0732e 100644
--- a/torchaudio/csrc/sox.h
+++ b/torchaudio/csrc/sox.h
@@ -45,13 +45,6 @@ void write_audio_file(
 std::tuple<sox_signalinfo_t, sox_encodinginfo_t> get_info(
     const std::string& file_name);
 
-// get names of all sox effects
-std::vector<std::string> get_effect_names();
-
-// Initialize and Shutdown SoX effects chain.  These functions should only be run once.
-int initialize_sox();
-int shutdown_sox();
-
 // Struct for build_flow_effects function
 struct SoxEffect {
   SoxEffect() : ename(""), eopts({""})  { }
diff --git a/torchaudio/csrc/sox_effects.cpp b/torchaudio/csrc/sox_effects.cpp
new file mode 100644
index 0000000000..9a0c2ddc6f
--- /dev/null
+++ b/torchaudio/csrc/sox_effects.cpp
@@ -0,0 +1,54 @@
+#include <sox.h>
+#include <torchaudio/csrc/sox_effects.h>
+
+using namespace torch::indexing;
+
+namespace torchaudio {
+namespace sox_effects {
+
+namespace {
+
+enum SoxEffectsResourceState { NotInitialized, Initialized, ShutDown }; // lifecycle states of the sox effects resource
+SoxEffectsResourceState SOX_RESOURCE_STATE = NotInitialized; // current state; changed by initialize_/shutdown_sox_effects()
+
+} // namespace
+
+void initialize_sox_effects() {
+  // After a shutdown the effects cannot be initialized again.
+  if (SOX_RESOURCE_STATE == ShutDown)
+    throw std::runtime_error(
+        "SoX Effects has been shut down. Cannot initialize again.");
+  // Already initialized: nothing left to do.
+  if (SOX_RESOURCE_STATE != NotInitialized)
+    return;
+  if (sox_init() != SOX_SUCCESS)
+    throw std::runtime_error("Failed to initialize sox effects.");
+  SOX_RESOURCE_STATE = Initialized;
+}
+
+void shutdown_sox_effects() {
+  if (SOX_RESOURCE_STATE == NotInitialized) { // nothing was ever initialized
+    throw std::runtime_error(
+        "SoX Effects is not initialized. Cannot shutdown.");
+  }
+  if (SOX_RESOURCE_STATE == Initialized) { // no-op when already ShutDown
+    if (sox_quit() != SOX_SUCCESS) {
+      throw std::runtime_error("Failed to shut down sox effects.");
+    }
+    SOX_RESOURCE_STATE = ShutDown; // initialize_sox_effects() throws from now on
+  }
+}
+
+std::vector<std::string> list_effects() {
+  std::vector<std::string> effect_names;
+  // Walk the array returned by sox_get_effect_fns() until its null entry.
+  for (const sox_effect_fn_t* fn = sox_get_effect_fns(); *fn; ++fn) {
+    const sox_effect_handler_t* eh = (*fn)();
+    if (eh != nullptr && eh->name != nullptr)
+      effect_names.push_back(eh->name);
+  }
+  return effect_names;
+}
+
+} // namespace sox_effects
+} // namespace torchaudio
diff --git a/torchaudio/csrc/sox_effects.h b/torchaudio/csrc/sox_effects.h
new file mode 100644
index 0000000000..14bdbbfabc
--- /dev/null
+++ b/torchaudio/csrc/sox_effects.h
@@ -0,0 +1,18 @@
+#ifndef TORCHAUDIO_SOX_EFFECTS_H
+#define TORCHAUDIO_SOX_EFFECTS_H
+
+#include <torch/script.h>
+
+namespace torchaudio {
+namespace sox_effects {
+
+void initialize_sox_effects(); // calls sox_init() on first use; throws once shut down
+
+void shutdown_sox_effects(); // calls sox_quit(); throws if never initialized
+
+std::vector<std::string> list_effects(); // names of the effect handlers libsox reports
+
+} // namespace sox_effects
+} // namespace torchaudio
+
+#endif
diff --git a/torchaudio/csrc/sox_io.cpp b/torchaudio/csrc/sox_io.cpp
new file mode 100644
index 0000000000..5d308027bb
--- /dev/null
+++ b/torchaudio/csrc/sox_io.cpp
@@ -0,0 +1,170 @@
+#include <sox.h>
+#include <torchaudio/csrc/sox_io.h>
+#include <torchaudio/csrc/sox_utils.h>
+
+using namespace torch::indexing;
+using namespace torchaudio::sox_utils;
+
+namespace torchaudio {
+namespace sox_io {
+
+SignalInfo::SignalInfo( // stores the three values verbatim
+    const int64_t sample_rate_,
+    const int64_t num_channels_,
+    const int64_t num_frames_)
+    : sample_rate(sample_rate_),
+      num_channels(num_channels_),
+      num_frames(num_frames_){};
+
+int64_t SignalInfo::getSampleRate() const { // bound to Python as get_sample_rate
+  return sample_rate;
+}
+
+int64_t SignalInfo::getNumChannels() const { // bound to Python as get_num_channels
+  return num_channels;
+}
+
+int64_t SignalInfo::getNumFrames() const { // bound to Python as get_num_frames
+  return num_frames;
+}
+
+c10::intrusive_ptr<SignalInfo> get_info(const std::string& path) {
+  SoxFormat handle(sox_open_read(
+      path.c_str(),
+      /*signal=*/nullptr,
+      /*encoding=*/nullptr,
+      /*filetype=*/nullptr));
+  if (static_cast<sox_format_t*>(handle) == nullptr) {
+    throw std::runtime_error("Error opening audio file");
+  }
+  const auto& sig = handle->signal;
+  // Frames = total sample count divided by the number of channels.
+  return c10::make_intrusive<SignalInfo>(
+      static_cast<int64_t>(sig.rate),
+      static_cast<int64_t>(sig.channels),
+      static_cast<int64_t>(sig.length / sig.channels));
+}
+
+c10::intrusive_ptr<TensorSignal> load_audio_file(
+    const std::string& path,
+    const int64_t frame_offset,
+    const int64_t num_frames,
+    const bool normalize,
+    const bool channels_first) {
+  if (frame_offset < 0) {
+    throw std::runtime_error(
+        "Invalid argument: frame_offset must be non-negative.");
+  }
+  if (num_frames == 0 || num_frames < -1) {
+    throw std::runtime_error(
+        "Invalid argument: num_frames must be -1 or greater than 0.");
+  }
+
+  SoxFormat sf(sox_open_read(
+      path.c_str(),
+      /*signal=*/nullptr,
+      /*encoding=*/nullptr,
+      /*filetype=*/nullptr));
+
+  validate_input_file(sf);
+
+  const int64_t num_channels = sf->signal.channels;
+  const int64_t num_total_samples = sf->signal.length;
+  const int64_t sample_start = sf->signal.channels * frame_offset;
+
+  if (sox_seek(sf, sample_start, 0) == SOX_EOF) {
+    throw std::runtime_error("Error reading audio file: offset past EOF.");
+  }
+
+  const int64_t sample_end = [&]() {
+    if (num_frames == -1)
+      return num_total_samples;
+    const int64_t sample_end_ = num_channels * num_frames + sample_start;
+    if (num_total_samples < sample_end_) {
+      // For lossy encoding, it is difficult to predict exact size of buffer for
+      // reading the number of samples required.
+      // So we allocate buffer size of given `num_frames` and ask sox to read as
+      // much as possible. For lossless format, sox reads exact number of
+      // samples, but for lossy encoding, sox can end up reading less. (e.g.
+      // mp3) For the consistent behavior specification between lossy/lossless
+      // format, we allow users to provide `num_frames` value that exceeds #of
+      // available samples, and we adjust it here.
+      return num_total_samples;
+    }
+    return sample_end_;
+  }();
+
+  const int64_t max_samples = sample_end - sample_start;
+
+  // Read samples into buffer (resize, not reserve: sox_read writes elements)
+  std::vector<sox_sample_t> buffer;
+  buffer.resize(max_samples);
+  const int64_t num_samples = sox_read(sf, buffer.data(), max_samples);
+  if (num_samples == 0) {
+    throw std::runtime_error(
+        "Error reading audio file: empty file or read operation failed.");
+  }
+  // NOTE: num_samples may be smaller than max_samples if the input
+  // format is compressed (e.g. mp3).
+
+  // Convert to Tensor
+  auto tensor = convert_to_tensor(
+      buffer.data(),
+      num_samples,
+      num_channels,
+      get_dtype(sf->encoding.encoding, sf->signal.precision),
+      normalize,
+      channels_first);
+
+  return c10::make_intrusive<TensorSignal>(
+      tensor, static_cast<int64_t>(sf->signal.rate), channels_first);
+}
+
+void save_audio_file(
+    const std::string& file_name,
+    const c10::intrusive_ptr<TensorSignal>& signal,
+    const double compression) {
+  const auto tensor = signal->getTensor();
+  const auto sample_rate = signal->getSampleRate();
+  const auto channels_first = signal->getChannelsFirst();
+  // Throws unless the tensor is a 2D CPU tensor of a supported dtype.
+  validate_input_tensor(tensor);
+  // The lower-cased file extension selects the encoding and precision.
+  const auto filetype = get_filetype(file_name);
+  const auto signal_info =
+      get_signalinfo(tensor, sample_rate, channels_first, filetype);
+  const auto encoding_info =
+      get_encodinginfo(filetype, tensor.dtype(), compression);
+
+  SoxFormat sf(sox_open_write(
+      file_name.c_str(),
+      &signal_info,
+      &encoding_info,
+      /*filetype=*/filetype.c_str(),
+      /*oob=*/nullptr,
+      /*overwrite_permitted=*/nullptr));
+
+  if (static_cast<sox_format_t*>(sf) == nullptr) {
+    throw std::runtime_error("Error saving audio file: failed to open file.");
+  }
+  // Work on a [time, channel] view so that dimension 0 indexes frames.
+  auto tensor_ = tensor;
+  if (channels_first) {
+    tensor_ = tensor_.t();
+  }
+  // Convert to int32 and write frames_per_chunk frames at a time.
+  const int64_t frames_per_chunk = 65536;
+  for (int64_t i = 0; i < tensor_.size(0); i += frames_per_chunk) {
+    auto chunk = tensor_.index({Slice(i, i + frames_per_chunk), Slice()});
+    chunk = unnormalize_wav(chunk).contiguous();
+
+    const size_t numel = chunk.numel();
+    if (sox_write(sf, chunk.data_ptr<int32_t>(), numel) != numel) { // short write is an error
+      throw std::runtime_error(
+          "Error saving audio file: failed to write the entier buffer.");
+    }
+  }
+}
+
+} // namespace sox_io
+} // namespace torchaudio
diff --git a/torchaudio/csrc/sox_io.h b/torchaudio/csrc/sox_io.h
new file mode 100644
index 0000000000..5288e911e8
--- /dev/null
+++ b/torchaudio/csrc/sox_io.h
@@ -0,0 +1,41 @@
+#ifndef TORCHAUDIO_SOX_IO_H
+#define TORCHAUDIO_SOX_IO_H
+
+#include <torch/script.h>
+#include <torchaudio/csrc/sox_utils.h>
+
+namespace torchaudio {
+namespace sox_io {
+
+struct SignalInfo : torch::CustomClassHolder { // metadata returned by get_info()
+  int64_t sample_rate; // sampling rate of the file
+  int64_t num_channels; // number of channels
+  int64_t num_frames; // total samples divided by number of channels
+
+  SignalInfo(
+      const int64_t sample_rate_,
+      const int64_t num_channels_,
+      const int64_t num_frames_);
+  int64_t getSampleRate() const;
+  int64_t getNumChannels() const;
+  int64_t getNumFrames() const;
+};
+
+c10::intrusive_ptr<SignalInfo> get_info(const std::string& path); // throws std::runtime_error if the file cannot be opened
+
+c10::intrusive_ptr<torchaudio::sox_utils::TensorSignal> load_audio_file( // reads at most num_frames frames after frame_offset
+    const std::string& path,
+    const int64_t frame_offset = 0, // must be non-negative
+    const int64_t num_frames = -1, // -1 reads to the end; 0 is rejected
+    const bool normalize = true,
+    const bool channels_first = true);
+
+void save_audio_file( // file type is taken from the extension of file_name
+    const std::string& file_name,
+    const c10::intrusive_ptr<torchaudio::sox_utils::TensorSignal>& signal,
+    const double compression = 0.);
+
+} // namespace sox_io
+} // namespace torchaudio
+
+#endif
diff --git a/torchaudio/csrc/sox_utils.cpp b/torchaudio/csrc/sox_utils.cpp
new file mode 100644
index 0000000000..c1fd8383a8
--- /dev/null
+++ b/torchaudio/csrc/sox_utils.cpp
@@ -0,0 +1,245 @@
+#include <c10/core/ScalarType.h>
+#include <sox.h>
+#include <torchaudio/csrc/sox_utils.h>
+
+namespace torchaudio {
+namespace sox_utils {
+
+TensorSignal::TensorSignal( // stores the three values verbatim; no validation here
+    torch::Tensor tensor_,
+    int64_t sample_rate_,
+    bool channels_first_)
+    : tensor(tensor_),
+      sample_rate(sample_rate_),
+      channels_first(channels_first_){};
+
+torch::Tensor TensorSignal::getTensor() const { // bound to Python as get_tensor
+  return tensor;
+}
+int64_t TensorSignal::getSampleRate() const { // bound to Python as get_sample_rate
+  return sample_rate;
+}
+bool TensorSignal::getChannelsFirst() const { // bound to Python as get_channels_first
+  return channels_first;
+}
+
+SoxFormat::SoxFormat(sox_format_t* fd) noexcept : fd_(fd) {} // fd may be null (failed open)
+SoxFormat::~SoxFormat() {
+  if (fd_ != nullptr) { // only close a handle that was actually opened
+    sox_close(fd_);
+  }
+}
+sox_format_t* SoxFormat::operator->() const noexcept { // member access on the wrapped handle
+  return fd_;
+}
+SoxFormat::operator sox_format_t*() const noexcept { // lets the wrapper be passed to sox_* functions
+  return fd_;
+}
+
+void validate_input_file(const SoxFormat& sf) { // throws std::runtime_error on any failed check
+  if (static_cast<sox_format_t*>(sf) == nullptr) { // wrapper holds no handle
+    throw std::runtime_error("Error loading audio file: failed to open file.");
+  }
+  if (sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
+    throw std::runtime_error("Error loading audio file: unknown encoding.");
+  }
+  if (sf->signal.length == 0) { // libsox reported a sample count of zero
+    throw std::runtime_error("Error reading audio file: unkown length.");
+  }
+}
+
+void validate_input_tensor(const torch::Tensor tensor) { // throws std::runtime_error on any failed check
+  if (!tensor.device().is_cpu()) {
+    throw std::runtime_error("Input tensor has to be on CPU.");
+  }
+
+  if (tensor.ndimension() != 2) {
+    throw std::runtime_error("Input tensor has to be 2D.");
+  }
+
+  const auto dtype = tensor.dtype();
+  if (!(dtype == torch::kFloat32 || dtype == torch::kInt32 ||
+        dtype == torch::kInt16 || dtype == torch::kUInt8)) { // the only dtypes unnormalize_wav handles
+    throw std::runtime_error(
+        "Input tensor has to be one of float32, int32, int16 or uint8 type.");
+  }
+}
+
+caffe2::TypeMeta get_dtype( // maps a sox encoding/precision pair to the Tensor dtype
+    const sox_encoding_t encoding,
+    const unsigned precision) {
+  const auto dtype = [&]() {
+    switch (encoding) {
+      case SOX_ENCODING_UNSIGNED: // 8-bit PCM WAV
+        return torch::kUInt8;
+      case SOX_ENCODING_SIGN2: // 16-bit or 32-bit PCM WAV
+        switch (precision) {
+          case 16:
+            return torch::kInt16;
+          case 32:
+            return torch::kInt32;
+          default:
+            throw std::runtime_error(
+                "Only 16 and 32 bits are supported for signed PCM.");
+        }
+      default:
+        // default to float32 for the other formats, including
+        // 32-bit floating-point WAV,
+        // MP3,
+        // FLAC,
+        // VORBIS etc...
+        return torch::kFloat32;
+    }
+  }();
+  return c10::scalarTypeToTypeMeta(dtype);
+}
+
+torch::Tensor convert_to_tensor(
+    sox_sample_t* buffer,
+    const int32_t num_samples,
+    const int32_t num_channels,
+    const caffe2::TypeMeta dtype,
+    const bool normalize,
+    const bool channels_first) {
+  auto t = torch::from_blob( // int32 view of shape [frames, channels] over the caller's buffer
+      buffer, {num_samples / num_channels, num_channels}, torch::kInt32);
+  // Note: Tensor created from_blob does not own the data but borrows it.
+  // So make sure to create a new copy after processing samples.
+  if (normalize || dtype == torch::kFloat32) {
+    t = t.to(torch::kFloat32);
+    t *= (t > 0) / 2147483647. + (t < 0) / 2147483648.; // positives scale by 1/(2^31-1), negatives by 1/2^31
+  } else if (dtype == torch::kInt32) {
+    t = t.clone();
+  } else if (dtype == torch::kInt16) {
+    t.floor_divide_(1 << 16); // in place: rewrites the values in `buffer`
+    t = t.to(torch::kInt16);
+  } else if (dtype == torch::kUInt8) {
+    t.floor_divide_(1 << 24); // in place: rewrites the values in `buffer`
+    t += 128; // shift the signed range to unsigned 8-bit
+    t = t.to(torch::kUInt8);
+  } else {
+    throw std::runtime_error("Unsupported dtype.");
+  }
+  if (channels_first) {
+    t = t.transpose(1, 0); // [frames, channels] -> [channels, frames]
+  }
+  return t.contiguous();
+}
+
+torch::Tensor unnormalize_wav(const torch::Tensor input_tensor) { // returns the samples as an int32 tensor
+  const auto dtype = input_tensor.dtype();
+  auto tensor = input_tensor;
+  if (dtype == torch::kFloat32) {
+    double multi_pos = 2147483647.;
+    double multi_neg = -2147483648.;
+    auto mult = (tensor > 0) * multi_pos - (tensor < 0) * multi_neg; // 2^31-1 for positives, 2^31 for negatives
+    tensor = tensor.to(torch::dtype(torch::kFloat64));
+    tensor *= mult;
+    tensor.clamp_(multi_neg, multi_pos); // keep the result inside the int32 range
+    tensor = tensor.to(torch::dtype(torch::kInt32));
+  } else if (dtype == torch::kInt32) {
+    // already denormalized
+  } else if (dtype == torch::kInt16) {
+    tensor = tensor.to(torch::dtype(torch::kInt32));
+    tensor *= ((tensor != 0) * 65536); // scale by 2^16; zeros stay zero
+  } else if (dtype == torch::kUInt8) {
+    tensor = tensor.to(torch::dtype(torch::kInt32));
+    tensor -= 128; // unsigned 8-bit -> signed
+    tensor *= 16777216; // scale by 2^24
+  } else {
+    throw std::runtime_error("Unexpected dtype.");
+  }
+  return tensor;
+}
+
+const std::string get_filetype(const std::string path) { // lower-cased text after the last '.'
+  std::string ext = path.substr(path.find_last_of(".") + 1); // no '.': npos + 1 wraps to 0, so the whole path is used
+  std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower);
+  return ext;
+}
+
+sox_encoding_t get_encoding( // sox encoding for a file type; dtype only matters for wav
+    const std::string filetype,
+    const caffe2::TypeMeta dtype) {
+  if (filetype == "mp3")
+    return SOX_ENCODING_MP3;
+  if (filetype == "flac")
+    return SOX_ENCODING_FLAC;
+  if (filetype == "ogg" || filetype == "vorbis")
+    return SOX_ENCODING_VORBIS;
+  if (filetype == "wav") {
+    if (dtype == torch::kUInt8)
+      return SOX_ENCODING_UNSIGNED;
+    if (dtype == torch::kInt16)
+      return SOX_ENCODING_SIGN2;
+    if (dtype == torch::kInt32)
+      return SOX_ENCODING_SIGN2;
+    if (dtype == torch::kFloat32)
+      return SOX_ENCODING_FLOAT;
+    throw std::runtime_error("Unsupported dtype."); // wav with any other dtype
+  }
+  throw std::runtime_error("Unsupported file type."); // extension not handled above
+}
+
+unsigned get_precision( // bits per sample for a file type; dtype only matters for wav
+    const std::string filetype,
+    const caffe2::TypeMeta dtype) {
+  if (filetype == "mp3")
+    return SOX_UNSPEC;
+  if (filetype == "flac")
+    return 24; // flac always uses 24 here, regardless of dtype
+  if (filetype == "ogg" || filetype == "vorbis")
+    return SOX_UNSPEC;
+  if (filetype == "wav") {
+    if (dtype == torch::kUInt8)
+      return 8;
+    if (dtype == torch::kInt16)
+      return 16;
+    if (dtype == torch::kInt32)
+      return 32;
+    if (dtype == torch::kFloat32)
+      return 32;
+    throw std::runtime_error("Unsupported dtype."); // wav with any other dtype
+  }
+  throw std::runtime_error("Unsupported file type."); // extension not handled above
+}
+
+sox_signalinfo_t get_signalinfo( // builds the sox signal description for a 2D tensor
+    const torch::Tensor& tensor,
+    const int64_t sample_rate,
+    const bool channels_first,
+    const std::string filetype) {
+  return sox_signalinfo_t{
+      /*rate=*/static_cast<sox_rate_t>(sample_rate),
+      /*channels=*/static_cast<unsigned>(tensor.size(channels_first ? 0 : 1)), // channel dim depends on layout
+      /*precision=*/get_precision(filetype, tensor.dtype()),
+      /*length=*/static_cast<uint64_t>(tensor.numel())}; // total samples over all channels
+}
+
+sox_encodinginfo_t get_encodinginfo( // builds the sox encoding description for a file type
+    const std::string filetype,
+    const caffe2::TypeMeta dtype,
+    const double compression) {
+  const double compression_ = [&]() { // compression is passed through, except wav which always gets 0
+    if (filetype == "mp3")
+      return compression;
+    if (filetype == "flac")
+      return compression;
+    if (filetype == "ogg" || filetype == "vorbis")
+      return compression;
+    if (filetype == "wav")
+      return 0.;
+    throw std::runtime_error("Unsupported file type.");
+  }();
+
+  return sox_encodinginfo_t{/*encoding=*/get_encoding(filetype, dtype),
+                            /*bits_per_sample=*/get_precision(filetype, dtype),
+                            /*compression=*/compression_,
+                            /*reverse_bytes=*/sox_option_default,
+                            /*reverse_nibbles=*/sox_option_default,
+                            /*reverse_bits=*/sox_option_default,
+                            /*opposite_endian=*/sox_false};
+}
+
+} // namespace sox_utils
+} // namespace torchaudio
diff --git a/torchaudio/csrc/sox_utils.h b/torchaudio/csrc/sox_utils.h
new file mode 100644
index 0000000000..665187c840
--- /dev/null
+++ b/torchaudio/csrc/sox_utils.h
@@ -0,0 +1,100 @@
+#ifndef TORCHAUDIO_SOX_UTILS_H
+#define TORCHAUDIO_SOX_UTILS_H
+
+#include <sox.h>
+#include <torch/script.h>
+
+namespace torchaudio {
+namespace sox_utils {
+
+struct TensorSignal : torch::CustomClassHolder {
+  torch::Tensor tensor;
+  int64_t sample_rate;
+  bool channels_first;
+
+  TensorSignal(
+      torch::Tensor tensor_,
+      int64_t sample_rate_,
+      bool channels_first_);
+
+  torch::Tensor getTensor() const;
+  int64_t getSampleRate() const;
+  bool getChannelsFirst() const;
+};
+
+/// helper class to automatically close sox_format_t*
+struct SoxFormat {
+  explicit SoxFormat(sox_format_t* fd) noexcept;
+  SoxFormat(const SoxFormat& other) = delete;
+  SoxFormat(SoxFormat&& other) = delete;
+  SoxFormat& operator=(const SoxFormat& other) = delete;
+  SoxFormat& operator=(SoxFormat&& other) = delete;
+  ~SoxFormat();
+  sox_format_t* operator->() const noexcept;
+  operator sox_format_t*() const noexcept;
+
+ private:
+  sox_format_t* fd_;
+};
+
+///
+/// Verify that input file is found, has known encoding, and not empty
+void validate_input_file(const SoxFormat& sf);
+
+///
+/// Verify that input Tensor is 2D, CPU and either uint8, int16, int32 or float32
+void validate_input_tensor(const torch::Tensor);
+
+///
+/// Get target dtype for the given encoding and precision.
+caffe2::TypeMeta get_dtype(
+    const sox_encoding_t encoding,
+    const unsigned precision);
+
+///
+/// Convert sox_sample_t buffer to uint8/int16/int32/float32 Tensor
+/// NOTE: This function might modify the values in the input buffer to
+/// reduce the number of memory copies.
+/// @param buffer Pointer to buffer that contains audio data.
+/// @param num_samples The number of samples to read.
+/// @param num_channels The number of channels. Used to reshape the resulting
+/// Tensor.
+/// @param dtype Target dtype. Determines the output dtype and value range in
+/// conjunction with normalization.
+/// @param normalize Perform normalization. Only effective when dtype is not
+/// kFloat32. When effective, the output tensor is kFloat32 type and value range
+/// is [-1.0, 1.0]
+/// @param channels_first When True, output Tensor has shape of [num_channels,
+/// num_frames].
+torch::Tensor convert_to_tensor(
+    sox_sample_t* buffer,
+    const int32_t num_samples,
+    const int32_t num_channels,
+    const caffe2::TypeMeta dtype,
+    const bool normalize,
+    const bool channels_first);
+
+///
+/// Convert float32/int32/int16/uint8 Tensor to int32 for Torch -> Sox
+/// conversion.
+torch::Tensor unnormalize_wav(const torch::Tensor);
+
+/// Extract extension from file path
+const std::string get_filetype(const std::string path);
+
+/// Get sox_signalinfo_t for passing a torch::Tensor object.
+sox_signalinfo_t get_signalinfo(
+    const torch::Tensor& tensor,
+    const int64_t sample_rate,
+    const bool channels_first,
+    const std::string filetype);
+
+/// Get sox_encodinginfo_t for saving audio file
+sox_encodinginfo_t get_encodinginfo(
+    const std::string filetype,
+    const caffe2::TypeMeta dtype,
+    const double compression);
+
+} // namespace sox_utils
+} // namespace torchaudio
+#endif
diff --git a/torchaudio/csrc/typedefs.cpp b/torchaudio/csrc/typedefs.cpp
deleted file mode 100644
index 7b81d665dc..0000000000
--- a/torchaudio/csrc/typedefs.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-#include <torchaudio/csrc/typedefs.h>
-
-namespace torchaudio {
-SignalInfo::SignalInfo(
-    const int64_t sample_rate_,
-    const int64_t num_channels_,
-    const int64_t num_samples_)
-    : sample_rate(sample_rate_),
-      num_channels(num_channels_),
-      num_samples(num_samples_){};
-
-int64_t SignalInfo::getSampleRate() const {
-  return sample_rate;
-}
-
-int64_t SignalInfo::getNumChannels() const {
-  return num_channels;
-}
-
-int64_t SignalInfo::getNumSamples() const {
-  return num_samples;
-}
-} // namespace torchaudio
diff --git a/torchaudio/csrc/typedefs.h b/torchaudio/csrc/typedefs.h
deleted file mode 100644
index 646ed09f3d..0000000000
--- a/torchaudio/csrc/typedefs.h
+++ /dev/null
@@ -1,23 +0,0 @@
-#ifndef TORCHAUDIO_TYPDEFS_H
-#define TORCHAUDIO_TYPDEFS_H
-
-#include <torch/script.h>
-
-namespace torchaudio {
-struct SignalInfo : torch::CustomClassHolder {
-  int64_t sample_rate;
-  int64_t num_channels;
-  int64_t num_samples;
-
-  SignalInfo(
-      const int64_t sample_rate_,
-      const int64_t num_channels_,
-      const int64_t num_samples_);
-  int64_t getSampleRate() const;
-  int64_t getNumChannels() const;
-  int64_t getNumSamples() const;
-};
-
-} // namespace torchaudio
-
-#endif
diff --git a/torchaudio/extension/extension.py b/torchaudio/extension/extension.py
index 4a2ab82124..b01ba13e39 100644
--- a/torchaudio/extension/extension.py
+++ b/torchaudio/extension/extension.py
@@ -12,38 +12,9 @@ def _init_extension():
         _init_script_module(ext)
     else:
         warnings.warn('torchaudio C++ extension is not available.')
-        _init_dummy_module()
 
 
 def _init_script_module(module):
     path = importlib.util.find_spec(module).origin
     torch.classes.load_library(path)
     torch.ops.load_library(path)
-
-
-def _init_dummy_module():
-    class SignalInfo:
-        """Data class for audio format information
-
-        Used when torchaudio C++ extension is not available for annotating
-        sox_io backend functions so that torchaudio is still importable
-        without extension.
-        This class has to implement the same interface as C++ equivalent.
-        """
-        def __init__(self, sample_rate: int, num_channels: int, num_samples: int):
-            self.sample_rate = sample_rate
-            self.num_channels = num_channels
-            self.num_samples = num_samples
-
-        def get_sample_rate(self):
-            return self.sample_rate
-
-        def get_num_channels(self):
-            return self.num_channels
-
-        def get_num_samples(self):
-            return self.num_samples
-
-    DummyModule = namedtuple('torchaudio', ['SignalInfo'])
-    module = DummyModule(SignalInfo)
-    setattr(torch.classes, 'torchaudio', module)
diff --git a/torchaudio/functional.py b/torchaudio/functional.py
index 28cb6a3fa2..78c8c594c9 100644
--- a/torchaudio/functional.py
+++ b/torchaudio/functional.py
@@ -485,9 +485,10 @@ def complex_norm(
     Returns:
         Tensor: Power of the normed input tensor. Shape of `(..., )`
     """
-    if power == 1.0:
-        return torch.norm(complex_tensor, 2, -1)
-    return torch.norm(complex_tensor, 2, -1).pow(power)
+
+    # Replace by torch.norm once issue is fixed
+    # https://github.com/pytorch/pytorch/issues/34279
+    return complex_tensor.pow(2.).sum(-1).pow(0.5 * power)
 
 
 def angle(
diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py
index 04155fb87c..cd2e89a10c 100644
--- a/torchaudio/models/_wavernn.py
+++ b/torchaudio/models/_wavernn.py
@@ -1,105 +1,331 @@
+from typing import List
+
+import torch
 from torch import Tensor
 from torch import nn
 
-__all__ = ["_ResBlock", "_MelResNet"]
+__all__ = ["_ResBlock", "_MelResNet", "_Stretch2d", "_UpsampleNetwork", "_WaveRNN"]
 
 
 class _ResBlock(nn.Module):
-    r"""This is a ResNet block layer. This layer is based on the paper "Deep Residual Learning
-    for Image Recognition". Kaiming He,  Xiangyu Zhang, Shaoqing Ren, Jian Sun. CVPR, 2016.
-    It is a block used in WaveRNN. WaveRNN is based on the paper "Efficient Neural Audio Synthesis".
-    Nal Kalchbrenner, Erich Elsen, Karen Simonyan, Seb Noury, Norman Casagrande, Edward Lockhart,
-    Florian Stimberg, Aaron van den Oord, Sander Dieleman, Koray Kavukcuoglu. arXiv:1802.08435, 2018.
+    r"""ResNet block based on "Deep Residual Learning for Image Recognition"
+
+    The paper link is https://arxiv.org/pdf/1512.03385.pdf.
 
     Args:
-        num_dims: the number of compute dimensions in the input (default=128).
+        n_freq: the number of bins in a spectrogram (default=128)
 
-    Examples::
-        >>> resblock = _ResBlock(num_dims=128)
-        >>> input = torch.rand(10, 128, 512)
-        >>> output = resblock(input)
+    Examples
+        >>> resblock = _ResBlock()
+        >>> input = torch.rand(10, 128, 512)  # a random spectrogram
+        >>> output = resblock(input)  # shape: (10, 128, 512)
     """
 
-    def __init__(self, num_dims: int = 128) -> None:
+    def __init__(self, n_freq: int = 128) -> None:
         super().__init__()
 
         self.resblock_model = nn.Sequential(
-            nn.Conv1d(in_channels=num_dims, out_channels=num_dims, kernel_size=1, bias=False),
-            nn.BatchNorm1d(num_dims),
+            nn.Conv1d(in_channels=n_freq, out_channels=n_freq, kernel_size=1, bias=False),
+            nn.BatchNorm1d(n_freq),
             nn.ReLU(inplace=True),
-            nn.Conv1d(in_channels=num_dims, out_channels=num_dims, kernel_size=1, bias=False),
-            nn.BatchNorm1d(num_dims)
+            nn.Conv1d(in_channels=n_freq, out_channels=n_freq, kernel_size=1, bias=False),
+            nn.BatchNorm1d(n_freq)
         )
 
-    def forward(self, x: Tensor) -> Tensor:
+    def forward(self, specgram: Tensor) -> Tensor:
         r"""Pass the input through the _ResBlock layer.
-
         Args:
-            x: the input sequence to the _ResBlock layer (required).
+            specgram (Tensor): the input sequence to the _ResBlock layer (n_batch, n_freq, n_time).
 
-        Shape:
-            - x: :math:`(N, S, T)`.
-            - output: :math:`(N, S, T)`.
-        where N is the batch size, S is the number of input sequence,
-        T is the length of input sequence.
+        Return:
+            Tensor shape: (n_batch, n_freq, n_time)
         """
 
-        residual = x
-        return self.resblock_model(x) + residual
+        return self.resblock_model(specgram) + specgram
 
 
 class _MelResNet(nn.Module):
-    r"""This is a MelResNet layer based on a stack of ResBlocks. It is a block used in WaveRNN.
-    WaveRNN is based on the paper "Efficient Neural Audio Synthesis". Nal Kalchbrenner, Erich Elsen,
-    Karen Simonyan, Seb Noury, Norman Casagrande, Edward Lockhart, Florian Stimberg, Aaron van den Oord,
-    Sander Dieleman, Koray Kavukcuoglu. arXiv:1802.08435, 2018.
+    r"""MelResNet layer uses a stack of ResBlocks on spectrogram.
 
     Args:
-        res_blocks: the number of ResBlock in stack (default=10).
-        input_dims: the number of input sequence (default=100).
-        hidden_dims: the number of compute dimensions (default=128).
-        output_dims: the number of output sequence (default=128).
-        pad: the number of kernal size (pad * 2 + 1) in the first Conv1d layer (default=2).
-
-    Examples::
-        >>> melresnet = _MelResNet(res_blocks=10, input_dims=100,
-                                   hidden_dims=128, output_dims=128, pad=2)
-        >>> input = torch.rand(10, 100, 512)
-        >>> output = melresnet(input)
+        n_res_block: the number of ResBlock in stack (default=10)
+        n_freq: the number of bins in a spectrogram (default=128)
+        n_hidden: the number of hidden dimensions (default=128)
+        n_output: the number of output dimensions (default=128)
+        kernel_size: the kernel size of the first Conv1d layer (default=5)
+
+    Examples
+        >>> melresnet = _MelResNet()
+        >>> input = torch.rand(10, 128, 512)  # a random spectrogram
+        >>> output = melresnet(input)  # shape: (10, 128, 508)
     """
 
-    def __init__(self, res_blocks: int = 10,
-                 input_dims: int = 100,
-                 hidden_dims: int = 128,
-                 output_dims: int = 128,
-                 pad: int = 2) -> None:
+    def __init__(self,
+                 n_res_block: int = 10,
+                 n_freq: int = 128,
+                 n_hidden: int = 128,
+                 n_output: int = 128,
+                 kernel_size: int = 5) -> None:
         super().__init__()
 
-        kernel_size = pad * 2 + 1
-        ResBlocks = []
-
-        for i in range(res_blocks):
-            ResBlocks.append(_ResBlock(hidden_dims))
+        ResBlocks = [_ResBlock(n_hidden) for _ in range(n_res_block)]
 
         self.melresnet_model = nn.Sequential(
-            nn.Conv1d(in_channels=input_dims, out_channels=hidden_dims, kernel_size=kernel_size, bias=False),
-            nn.BatchNorm1d(hidden_dims),
+            nn.Conv1d(in_channels=n_freq, out_channels=n_hidden, kernel_size=kernel_size, bias=False),
+            nn.BatchNorm1d(n_hidden),
             nn.ReLU(inplace=True),
             *ResBlocks,
-            nn.Conv1d(in_channels=hidden_dims, out_channels=output_dims, kernel_size=1)
+            nn.Conv1d(in_channels=n_hidden, out_channels=n_output, kernel_size=1)
         )
 
-    def forward(self, x: Tensor) -> Tensor:
+    def forward(self, specgram: Tensor) -> Tensor:
         r"""Pass the input through the _MelResNet layer.
+        Args:
+            specgram (Tensor): the input sequence to the _MelResNet layer (n_batch, n_freq, n_time).
+
+        Return:
+            Tensor shape: (n_batch, n_output, n_time - kernel_size + 1)
+        """
+
+        return self.melresnet_model(specgram)
+
+
+class _Stretch2d(nn.Module):
+    r"""Upscale the frequency and time dimensions of a spectrogram.
+
+    Args:
+        time_scale: the scale factor in time dimension
+        freq_scale: the scale factor in frequency dimension
+
+    Examples
+        >>> stretch2d = _Stretch2d(time_scale=10, freq_scale=5)
+
+        >>> input = torch.rand(10, 100, 512)  # a random spectrogram
+        >>> output = stretch2d(input)  # shape: (10, 500, 5120)
+    """
+
+    def __init__(self,
+                 time_scale: int,
+                 freq_scale: int) -> None:
+        super().__init__()
+
+        self.freq_scale = freq_scale
+        self.time_scale = time_scale
+
+    def forward(self, specgram: Tensor) -> Tensor:
+        r"""Pass the input through the _Stretch2d layer.
 
         Args:
-            x: the input sequence to the _MelResNet layer (required).
+            specgram (Tensor): the input sequence to the _Stretch2d layer (..., n_freq, n_time).
 
-        Shape:
-            - x: :math:`(N, S, T)`.
-            - output: :math:`(N, P, T - 2 * pad)`.
-        where N is the batch size, S is the number of input sequence,
-        P is the number of output sequence, T is the length of input sequence.
+        Return:
+            Tensor shape: (..., n_freq * freq_scale, n_time * time_scale)
         """
 
-        return self.melresnet_model(x)
+        return specgram.repeat_interleave(self.freq_scale, -2).repeat_interleave(self.time_scale, -1)
+
+
+class _UpsampleNetwork(nn.Module):
+    r"""Upscale the dimensions of a spectrogram.
+
+    Args:
+        upsample_scales: the list of upsample scales
+        n_res_block: the number of ResBlock in stack (default=10)
+        n_freq: the number of bins in a spectrogram (default=128)
+        n_hidden: the number of hidden dimensions (default=128)
+        n_output: the number of output dimensions (default=128)
+        kernel_size: the kernel size of the first Conv1d layer (default=5)
+
+    Examples
+        >>> upsamplenetwork = _UpsampleNetwork(upsample_scales=[4, 4, 16])
+        >>> input = torch.rand(10, 128, 10)  # a random spectrogram
+        >>> output = upsamplenetwork(input)  # shape: (10, 128, 1536), (10, 128, 1536)
+    """
+
+    def __init__(self,
+                 upsample_scales: List[int],
+                 n_res_block: int = 10,
+                 n_freq: int = 128,
+                 n_hidden: int = 128,
+                 n_output: int = 128,
+                 kernel_size: int = 5) -> None:
+        super().__init__()
+
+        total_scale = 1
+        for upsample_scale in upsample_scales:
+            total_scale *= upsample_scale
+
+        self.indent = (kernel_size - 1) // 2 * total_scale
+        self.resnet = _MelResNet(n_res_block, n_freq, n_hidden, n_output, kernel_size)
+        self.resnet_stretch = _Stretch2d(total_scale, 1)
+
+        up_layers = []
+        for scale in upsample_scales:
+            stretch = _Stretch2d(scale, 1)
+            conv = nn.Conv2d(in_channels=1,
+                             out_channels=1,
+                             kernel_size=(1, scale * 2 + 1),
+                             padding=(0, scale),
+                             bias=False)
+            conv.weight.data.fill_(1. / (scale * 2 + 1))
+            up_layers.append(stretch)
+            up_layers.append(conv)
+        self.upsample_layers = nn.Sequential(*up_layers)
+
+    def forward(self, specgram: Tensor) -> Tensor:
+        r"""Pass the input through the _UpsampleNetwork layer.
+
+        Args:
+            specgram (Tensor): the input sequence to the _UpsampleNetwork layer (n_batch, n_freq, n_time)
+
+        Return:
+            Tensor shape: (n_batch, n_freq, (n_time - kernel_size + 1) * total_scale),
+                          (n_batch, n_output, (n_time - kernel_size + 1) * total_scale)
+        where total_scale is the product of all elements in upsample_scales.
+        """
+
+        resnet_output = self.resnet(specgram).unsqueeze(1)
+        resnet_output = self.resnet_stretch(resnet_output)
+        resnet_output = resnet_output.squeeze(1)
+
+        specgram = specgram.unsqueeze(1)
+        upsampling_output = self.upsample_layers(specgram)
+        upsampling_output = upsampling_output.squeeze(1)[:, :, self.indent:-self.indent]
+
+        return upsampling_output, resnet_output
+
+
+class _WaveRNN(nn.Module):
+    r"""WaveRNN model based on the implementation from `fatchord <https://github.com/fatchord/WaveRNN>`_.
+
+    The original implementation was introduced in
+    `"Efficient Neural Audio Synthesis" <https://arxiv.org/pdf/1802.08435.pdf>`_.
+    The input channels of waveform and spectrogram have to be 1. The product of
+    `upsample_scales` must equal `hop_length`.
+
+    Args:
+        upsample_scales: the list of upsample scales
+        n_bits: the bits of output waveform
+        sample_rate: the rate of audio dimensions (samples per second)
+        hop_length: the number of samples between the starts of consecutive frames
+        n_res_block: the number of ResBlock in stack (default=10)
+        n_rnn: the dimension of RNN layer (default=512)
+        n_fc: the dimension of fully connected layer (default=512)
+        kernel_size: the kernel size of the first Conv1d layer (default=5)
+        n_freq: the number of bins in a spectrogram (default=128)
+        n_hidden: the number of hidden dimensions (default=128)
+        n_output: the number of output dimensions (default=128)
+        mode: the mode of waveform in ['waveform', 'mol'] (default='waveform')
+
+    Example
+        >>> wavernn = _WaveRNN(upsample_scales=[5,5,8], n_bits=9, sample_rate=24000, hop_length=200)
+        >>> waveform, sample_rate = torchaudio.load(file)
+        >>> # waveform shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length)
+        >>> specgram = MelSpectrogram(sample_rate)(waveform)  # shape: (n_batch, n_channel, n_freq, n_time)
+        >>> output = wavernn(waveform, specgram)
+        >>> # output shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length, 2 ** n_bits)
+    """
+
+    def __init__(self,
+                 upsample_scales: List[int],
+                 n_bits: int,
+                 sample_rate: int,
+                 hop_length: int,
+                 n_res_block: int = 10,
+                 n_rnn: int = 512,
+                 n_fc: int = 512,
+                 kernel_size: int = 5,
+                 n_freq: int = 128,
+                 n_hidden: int = 128,
+                 n_output: int = 128,
+                 mode: str = 'waveform') -> None:
+        super().__init__()
+
+        self.mode = mode
+        self.kernel_size = kernel_size
+
+        if self.mode == 'waveform':
+            self.n_classes = 2 ** n_bits
+        elif self.mode == 'mol':
+            self.n_classes = 30
+        else:
+            raise ValueError(f"Expected mode: `waveform` or `mol`, but found {self.mode}")
+
+        self.n_rnn = n_rnn
+        self.n_aux = n_output // 4
+        self.hop_length = hop_length
+        self.sample_rate = sample_rate
+
+        total_scale = 1
+        for upsample_scale in upsample_scales:
+            total_scale *= upsample_scale
+        if total_scale != self.hop_length:
+            raise ValueError(f"Expected: total_scale == hop_length, but found {total_scale} != {hop_length}")
+
+        self.upsample = _UpsampleNetwork(upsample_scales, n_res_block, n_freq, n_hidden, n_output, kernel_size)
+        self.fc = nn.Linear(n_freq + self.n_aux + 1, n_rnn)
+
+        self.rnn1 = nn.GRU(n_rnn, n_rnn, batch_first=True)
+        self.rnn2 = nn.GRU(n_rnn + self.n_aux, n_rnn, batch_first=True)
+
+        self.relu1 = nn.ReLU(inplace=True)
+        self.relu2 = nn.ReLU(inplace=True)
+
+        self.fc1 = nn.Linear(n_rnn + self.n_aux, n_fc)
+        self.fc2 = nn.Linear(n_fc + self.n_aux, n_fc)
+        self.fc3 = nn.Linear(n_fc, self.n_classes)
+
+    def forward(self, waveform: Tensor, specgram: Tensor) -> Tensor:
+        r"""Pass the input through the _WaveRNN model.
+
+        Args:
+            waveform: the input waveform to the _WaveRNN layer (n_batch, 1, (n_time - kernel_size + 1) * hop_length)
+            specgram: the input spectrogram to the _WaveRNN layer (n_batch, 1, n_freq, n_time)
+
+        Return:
+            Tensor shape: (n_batch, 1, (n_time - kernel_size + 1) * hop_length, 2 ** n_bits)
+        """
+
+        assert waveform.size(1) == 1, 'Require the input channel of waveform is 1'
+        assert specgram.size(1) == 1, 'Require the input channel of specgram is 1'
+        # remove channel dimension until the end
+        waveform, specgram = waveform.squeeze(1), specgram.squeeze(1)
+
+        batch_size = waveform.size(0)
+        h1 = torch.zeros(1, batch_size, self.n_rnn, dtype=waveform.dtype, device=waveform.device)
+        h2 = torch.zeros(1, batch_size, self.n_rnn, dtype=waveform.dtype, device=waveform.device)
+        # output of upsample:
+        # specgram: (n_batch, n_freq, (n_time - kernel_size + 1) * total_scale)
+        # aux: (n_batch, n_output, (n_time - kernel_size + 1) * total_scale)
+        specgram, aux = self.upsample(specgram)
+        specgram = specgram.transpose(1, 2)
+        aux = aux.transpose(1, 2)
+
+        aux_idx = [self.n_aux * i for i in range(5)]
+        a1 = aux[:, :, aux_idx[0]:aux_idx[1]]
+        a2 = aux[:, :, aux_idx[1]:aux_idx[2]]
+        a3 = aux[:, :, aux_idx[2]:aux_idx[3]]
+        a4 = aux[:, :, aux_idx[3]:aux_idx[4]]
+
+        x = torch.cat([waveform.unsqueeze(-1), specgram, a1], dim=-1)
+        x = self.fc(x)
+        res = x
+        x, _ = self.rnn1(x, h1)
+
+        x = x + res
+        res = x
+        x = torch.cat([x, a2], dim=-1)
+        x, _ = self.rnn2(x, h2)
+
+        x = x + res
+        x = torch.cat([x, a3], dim=-1)
+        x = self.fc1(x)
+        x = self.relu1(x)
+
+        x = torch.cat([x, a4], dim=-1)
+        x = self.fc2(x)
+        x = self.relu2(x)
+        x = self.fc3(x)
+
+        # bring back channel dimension
+        return x.unsqueeze(1)
diff --git a/torchaudio/sox_effects/__init__.py b/torchaudio/sox_effects/__init__.py
index 115b70c895..507dc5c3af 100644
--- a/torchaudio/sox_effects/__init__.py
+++ b/torchaudio/sox_effects/__init__.py
@@ -9,4 +9,6 @@
 
 
 if _mod_utils.is_module_available('torchaudio._torchaudio'):
+    import atexit
     init_sox_effects()
+    atexit.register(shutdown_sox_effects)
diff --git a/torchaudio/sox_effects/sox_effects.py b/torchaudio/sox_effects/sox_effects.py
index 6a6b5be0a5..0aee312126 100644
--- a/torchaudio/sox_effects/sox_effects.py
+++ b/torchaudio/sox_effects/sox_effects.py
@@ -1,4 +1,3 @@
-import atexit
 from typing import Any, Callable, List, Optional, Tuple, Union
 
 import torch
@@ -13,19 +12,8 @@
     from torchaudio import _torchaudio
 
 
-_SOX_INITIALIZED: Optional[bool] = False
-# This variable has a micro lifecycle. (False -> True -> None)
-# False: Not initialized
-# True: Initialized
-# None: Already shut down (should not be initialized again.)
-
-_SOX_SUCCESS_CODE = 0
-# defined at
-# https://fossies.org/dox/sox-14.4.2/sox_8h.html#a8e07e80cebeff3339265d89c387cea93a9ef2b87ec303edfe40751d9a85fadeeb
-
-
 @_mod_utils.requires_module('torchaudio._torchaudio')
-def init_sox_effects() -> int:
+def init_sox_effects() -> None:
     """Initialize resources required to use ``SoxEffectsChain``
 
     You do not need to call this function manually. It is called automatically.
@@ -33,50 +21,26 @@ def init_sox_effects() -> int:
     Once initialized, you do not need to call this function again across the multiple call of
     ``SoxEffectsChain.sox_build_flow_effects``, though it is safe to do so as long as
     ``shutdown_sox_effects`` is not called yet.
-    Once ``shutdown_sox_effects`` is called, you can no longer use SoX effects and calling
-    this function results in `RuntimeError`.
+    Once ``shutdown_sox_effects`` is called, you can no longer use SoX effects and
+    initializing again will result in error.
 
     Note:
         This function is not required for simple loading.
-
-    Returns:
-        int: Code corresponding to sox_error_t enum. See
-        https://fossies.org/dox/sox-14.4.2/sox_8h.html#a8e07e80cebeff3339265d89c387cea93
     """
-    global _SOX_INITIALIZED
-    if _SOX_INITIALIZED is None:
-        raise RuntimeError('SoX effects chain has been already shut down. Can not initialize again.')
-    if not _SOX_INITIALIZED:
-        code = _torchaudio.initialize_sox()
-        if code == _SOX_SUCCESS_CODE:
-            _SOX_INITIALIZED = True
-            atexit.register(shutdown_sox_effects)
-        return code
-    return _SOX_SUCCESS_CODE
+    torch.ops.torchaudio.sox_effects_initialize_sox_effects()
 
 
 @_mod_utils.requires_module("torchaudio._torchaudio")
-def shutdown_sox_effects() -> int:
+def shutdown_sox_effects() -> None:
     """Clean up resources required to use ``SoxEffectsChain``
 
     You do not need to call this function manually. It is called automatically.
 
     It is safe to call this function multiple times.
-    Once ``shutdown_sox_effects`` is called, you can no longer use SoX effects and calling
-    this function results in `RuntimeError`.
-
-
-    Returns:
-        int: Code corresponding to sox_error_t enum. See
-        https://fossies.org/dox/sox-14.4.2/sox_8h.html#a8e07e80cebeff3339265d89c387cea93
+    Once ``shutdown_sox_effects`` is called, you can no longer use SoX effects and
+    initializing again will result in error.
     """
-    global _SOX_INITIALIZED
-    if _SOX_INITIALIZED:
-        code = _torchaudio.shutdown_sox()
-        if code == _SOX_INITIALIZED:
-            _SOX_INITIALIZED = None
-        return code
-    return _SOX_SUCCESS_CODE
+    torch.ops.torchaudio.sox_effects_shutdown_sox_effects()
 
 
 @_mod_utils.requires_module('torchaudio._torchaudio')
@@ -88,7 +52,7 @@ def effect_names() -> List[str]:
     Example
         >>> EFFECT_NAMES = torchaudio.sox_effects.effect_names()
     """
-    return _torchaudio.get_effect_names()
+    return torch.ops.torchaudio.sox_effects_list_effects()
 
 
 @_mod_utils.requires_module('torchaudio._torchaudio')