diff --git a/.azure-pipelines/scripts/codeScan/pylint/pylint.sh b/.azure-pipelines/scripts/codeScan/pylint/pylint.sh index b15da8c91b3..938c5ecdc6c 100644 --- a/.azure-pipelines/scripts/codeScan/pylint/pylint.sh +++ b/.azure-pipelines/scripts/codeScan/pylint/pylint.sh @@ -10,13 +10,13 @@ pip install -r /neural-compressor/requirements.txt pip install torch==1.12.0 python -m pylint -f json --disable=R,C,W,E1129 --enable=line-too-long --max-line-length=120 --extension-pkg-whitelist=numpy --ignored-classes=TensorProto,NodeProto \ ---ignored-modules=tensorflow,torch,torch.quantization,torch.tensor,torchvision,mxnet,onnx,onnxruntime,intel_extension_for_pytorch /neural-compressor/neural_compressor \ +--ignored-modules=tensorflow,torch,torch.quantization,torch.tensor,torchvision,fairseq,mxnet,onnx,onnxruntime,intel_extension_for_pytorch /neural-compressor/neural_compressor \ > $log_dir/pylint.json exit_code=$? $BOLD_YELLOW && echo " ----------------- Current pylint cmd start --------------------------" && $RESET -echo "python -m pylint -f json --disable=R,C,W,E1129 --enable=line-too-long --max-line-length=120 --extension-pkg-whitelist=numpy --ignored-classes=TensorProto,NodeProto --ignored-modules=tensorflow,torch,torch.quantization,torch.tensor,torchvision,mxnet,onnx,onnxruntime,intel_extension_for_pytorch /neural-compressor/neural_compressor > $log_dir/pylint.json" +echo "python -m pylint -f json --disable=R,C,W,E1129 --enable=line-too-long --max-line-length=120 --extension-pkg-whitelist=numpy --ignored-classes=TensorProto,NodeProto --ignored-modules=tensorflow,torch,torch.quantization,torch.tensor,torchvision,fairseq,mxnet,onnx,onnxruntime,intel_extension_for_pytorch /neural-compressor/neural_compressor > $log_dir/pylint.json" $BOLD_YELLOW && echo " ----------------- Current pylint cmd end --------------------------" && $RESET $BOLD_YELLOW && echo " ----------------- Current log file output start --------------------------" && $RESET diff --git a/.gitignore b/.gitignore 
index d039b651c1f..509d3f1d1a9 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ .idea /venv/ */__pycache__ +.ipynb_checkpoints/ *.snapshot *.csv *.pb @@ -17,4 +18,4 @@ build/ _build lpot_workspace/ .torch/ -node_modules \ No newline at end of file +node_modules diff --git a/docs/source/NAS.md b/docs/source/NAS.md index 98eac4d8217..8ad4a43554a 100644 --- a/docs/source/NAS.md +++ b/docs/source/NAS.md @@ -81,7 +81,7 @@ class NASBase(object): def search(self, res_save_path=None): # NAS search process. - ... + ... def estimate(self, model): # pragma: no cover # Estimate performance of the model. Depends on specific NAS algorithm. @@ -175,3 +175,5 @@ Following examples are supported in Intel® Neural Compressor: - DyNAS MobileNetV3 supernet Example: - [DyNAS MobileNetV3 supernet Example](../examples/notebook/dynas/MobileNetV3_Supernet_NAS.ipynb): DyNAS with MobileNetV3 supernet on ImageNet dataset. +- DyNAS Transformer LT supernet Example: + - [DyNAS Transformer LT supernet Example](../examples/notebook/dynas/Transformer_LT_Supernet_NAS.ipynb): DyNAS with Transformer LT supernet on WMT En-De dataset. diff --git a/examples/notebook/dynas/MobileNetV3_Supernet_NAS.ipynb b/examples/notebook/dynas/MobileNetV3_Supernet_NAS.ipynb index 4fdbc291284..cbbd678b4bc 100644 --- a/examples/notebook/dynas/MobileNetV3_Supernet_NAS.ipynb +++ b/examples/notebook/dynas/MobileNetV3_Supernet_NAS.ipynb @@ -13,7 +13,7 @@ "\n", "#### Super-Networks\n", "\n", - "The computational overhead of evaluating DNN architectures during the neural architecture search process can be very costly due to the training and validation cycles. To address the training overhead, novel weight-sharing approaches known as one-shot or super-networks have offered a way to mitigate the training overhead by reducing training times from thousands to a few GPU days. 
These approaches train a task-specific super-network architecture with a weight-sharing mechanism that allows the sub-networks to be treated as unique individual architectures. This enables sub-network model extraction and validation without a separate training cycle. This tutorial offers pre-trained Once-for-All (OFA) super-networks [1] for the image classification task on ImageNet-ilsvrc2012.\n", + "The computational overhead of evaluating DNN architectures during the neural architecture search process can be very costly due to the training and validation cycles. To address the training overhead, novel weight-sharing approaches known as one-shot or super-networks have offered a way to mitigate the training overhead by reducing training times from thousands to a few GPU days. These approaches train a task-specific super-network architecture with a weight-sharing mechanism that allows the sub-networks to be treated as unique individual architectures. This enables sub-network model extraction and validation without a separate training cycle. 
This tutorial offers pre-trained Once-for-All (OFA) super-networks [1] for the image classification on ImageNet-ilsvrc2012 as well as Transformer Language Translation (based on [6]) for the language translation tasks.\n", "\n", "#### Methodology\n", "\n", @@ -38,7 +38,25 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install neural_compressor autograd==1.4 fvcore==0.1.5.post20220119 numpy==1.19.2 ofa==0.1.0.post202203231606 pandas==1.1.5 pymoo==0.5.0 pyyaml==5.4.1 scikit-learn==0.24.2 scipy==1.5.4 torch==1.10.1 torchvision==0.11.2" + "!pip -q install neural_compressor autograd==1.4 fvcore==0.1.5.post20220119 numpy ofa==0.1.0.post202203231606 pandas==1.1.5 pymoo==0.5.0 pyyaml==5.4.1 scikit-learn==0.24.2 scipy==1.5.4 torch==1.10.1 torchvision==0.11.2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively, if you have a local copy of https://github.com/intel/neural-compressor, you can uncomment and run the code below:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# import sys\n", + "# sys.path.insert(0,'')\n", + "# !pip install -q autograd==1.4 fvcore==0.1.5.post20220119 numpy ofa==0.1.0.post202203231606 pandas==1.1.5 pymoo==0.5.0 pyyaml==5.4.1 scikit-learn==0.24.2 scipy==1.5.4 torch==1.10.1 torchvision==0.11.2 sacremoses==0.0.53 torchprofile==0.0.4 fairseq==0.12.2" ] }, { @@ -84,12 +102,16 @@ "metadata": {}, "source": [ "### Define Architecture\n", - "We currently leverage pre-trained Once-for-All (OFA) super-networks [4] for the image classification task on ImageNet-ilsvrc2012. In the case where the super-network PyTorch model download fails, you can manually copy the pre-trained models from https://github.com/mit-han-lab/once-for-all and place them in the `.torch/ofa_nets` path. \n", + "We currently support pre-trained super-networks:\n", + "\n", + "1. Once-for-All (OFA) super-networks [4] for the image classification task on ImageNet-ilsvrc2012. 
In the case where the super-network PyTorch model download fails, you can manually copy the pre-trained models from https://github.com/mit-han-lab/once-for-all and place them in the `.torch/ofa_nets` path.\n", + "2. Hardware-Aware-Transformers (HAT) supernetwork [6] for language translation task on WMT14 En-De. To run this supernetwork you have to manually download preprocessed dataset from https://github.com/mit-han-lab/hardware-aware-transformers/blob/master/configs/wmt14.en-de/get_preprocessed.sh and pretrained model from https://www.dropbox.com/s/pkdddxvvpw9a4vq/HAT_wmt14ende_super_space0.pt?dl=0\n", "\n", "Super-network options (choose 1): \n", "- `ofa_resnet50` - based on the ResNet50 architecture [4]. Search space of ~$10^{15}$ architectures.\n", "- `ofa_mbv3_d234_e346_k357_w1.0` - based on the MobileNetV3 architecture [5], width multiplier 1.0. Search space of ~$10^{19}$ architectures.\n", - "- `ofa_mbv3_d234_e346_k357_w1.2` - based on the MobileNetV3 architecture [5], width multiplier 1.2. Search space of ~$10^{19}$ architectures. " + "- `ofa_mbv3_d234_e346_k357_w1.2` - based on the MobileNetV3 architecture [5], width multiplier 1.2. Search space of ~$10^{19}$ architectures. \n", + "- `transformer_lt_wmt_en_de` - based on the Transformer architecture [7]." ] }, { @@ -113,7 +135,7 @@ "* `['acc', 'lat']` \n", "\n", "Description:\n", - "* `'acc'` - ImageNet Top-1 Accuracy (%)\n", + "* `'acc'` - ImageNet Top-1 Accuracy (%) (for OFA supernetworks) and BLEU (for Transformer LT)\n", "* `'macs'` - Multiply-and-accumulates as measured from FVCore. \n", "* `'lat'` - Latency (inference time) measurement (ms)" ] @@ -137,7 +159,8 @@ "* `config.dynas.num_evals` - Validation measurement count, a higher count comes with greater computational cost but a higher chance of finding optimal sub-networks\n", "* `config.dynas.results_csv_path` - Location of the search (validation measurement) results. This file is also used to provide training data to the metric predictors. 
\n", "* `config.dynas.batch_size` - Batch size used during latency measurements.\n", - "* `config.dynas.dataset_path` - Path to the imagenet-ilsvrc2012 dataset. This can be obtained at: https://www.image-net.org/download.php" + "* `config.dynas.dataset_path` - For OFA it's a path to the imagenet-ilsvrc2012 dataset. This can be obtained at: https://www.image-net.org/download.php; For Transformer LT it's a path to preprocessed WMT EnDe directory (`(...)/data/binary/wmt16_en_de`)\n", + "* `config.dynas.supernet_ckpt_path` - Transformer LT only. Path to downloaded pretrained super-network (`HAT_wmt14ende_super_space0.pt` file)." ] }, { @@ -272,8 +295,10 @@ "[1] Cai, H., Gan, C., & Han, S. (2020). Once for All: Train One Network and Specialize it for Efficient Deployment. ArXiv, abs/1908.09791. \n", "[2] K. Deb, A. Pratap, S. Agarwal and T. Meyarivan, \"A fast and elitist multiobjective genetic algorithm: NSGA-II,\" in IEEE Transactions on Evolutionary Computation, vol. 6, no. 2, pp. 182-197, April 2002, doi: 10.1109/4235.996017. \n", "[3] Cummings, D., Sarah, A., Sridhar, S.N., Szankin, M., Muñoz, J.P., & Sundaresan, S. (2022). A Hardware-Aware Framework for Accelerating Neural Architecture Search Across Modalities. ArXiv, abs/2205.10358. \n", - "[4] He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep Residual Learning for Image Recognition. 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 770-778. \n", - "[5] Howard, A.G., Sandler, M., Chu, G., Chen, L., Chen, B., Tan, M., Wang, W., Zhu, Y., Pang, R., Vasudevan, V., Le, Q.V., & Adam, H. (2019). Searching for MobileNetV3. 2019 IEEE/CVF International Conference on Computer Vision (ICCV), 1314-1324. " + "[4] He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep Residual Learning for Image Recognition. 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 770-778. 
\n", + "[5] Howard, A.G., Sandler, M., Chu, G., Chen, L., Chen, B., Tan, M., Wang, W., Zhu, Y., Pang, R., Vasudevan, V., Le, Q.V., & Adam, H. (2019). Searching for MobileNetV3. 2019 IEEE/CVF International Conference on Computer Vision (ICCV), 1314-1324. \n", + "[6] Wang, H., Wu, Z., Liu, Z., Cai, H., Zhu, L., Gan, C. and Han, S., 2020. Hat: Hardware-aware transformers for efficient natural language processing. arXiv preprint arXiv:2005.14187. \n", + "[7] Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, Ł. and Polosukhin, I., 2017. Attention is all you need. Advances in neural information processing systems, 30." ] }, { @@ -300,7 +325,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.7.11" } }, "nbformat": 4, diff --git a/examples/notebook/dynas/Transformer_LT_Supernet_NAS.ipynb b/examples/notebook/dynas/Transformer_LT_Supernet_NAS.ipynb new file mode 100644 index 00000000000..1e7ffcd71b5 --- /dev/null +++ b/examples/notebook/dynas/Transformer_LT_Supernet_NAS.ipynb @@ -0,0 +1,310 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction\n", + "\n", + "This tutorial demonstrates how to perform a multi-objective neural architecture search (NAS) on a Transformer LT one-shot weight-sharing super-network [6] using the Intel® Neural Compressor Dynamic NAS (DyNAS) search approach. \n", + "\n", + "#### Background\n", + "Neural architecture search, the study of automating the discovery of optimal deep neural network architectures for tasks in domains such as computer vision and natural language processing, has seen rapid growth in the machine learning research community. While there have been many recent advancements in NAS, there is still a significant focus on reducing the computational cost incurred when validating discovered architectures by making search more efficient. 
Evolutionary algorithms, specifically genetic algorithms, have a history of usage in NAS and continue to gain popularity as a highly efficient way to explore the architecture objective space. In this tutorial, we show how evolutionary algorithms [2] can be paired with lightly trained objective predictors in an iterative cycle to accelerate multi-objective architectural exploration. Specifically, we use a bi-level optimization approach [3] denoted as `dynas`. This technique is ~4x more sample efficient than typical one-shot predictor-based NAS approaches. \n", + "\n", + "#### Super-Networks\n", + "\n", + "The computational overhead of evaluating DNN architectures during the neural architecture search process can be very costly due to the training and validation cycles. To address the training overhead, novel weight-sharing approaches known as one-shot or super-networks have offered a way to mitigate the training overhead by reducing training times from thousands to a few GPU days. These approaches train a task-specific super-network architecture with a weight-sharing mechanism that allows the sub-networks to be treated as unique individual architectures. This enables sub-network model extraction and validation without a separate training cycle. This tutorial offers pre-trained Once-for-All (OFA) super-networks [1] for the image classification on ImageNet-ilsvrc2012 as well as Transformer Language Translation (based on [6]) for the language translation tasks.\n", + "\n", + "#### Methodology\n", + "\n", + "The flow of the DyNAS approach (`approach='dynas'`) is shown in the following figure. Currently, three pre-trained super-network options for the image classification task are provided. In the first phase of the search, a small population (`config.dynas.population`) of sub-networks are randomly sampled and evaluated (validation measurement) to provide the initial training set for the inner predictor loop. 
After the predictors are trained, a multi-objective evolutionary search (`search_algorithm`) is performed in the predictor objective space. After an extensive search is performed, the best performing sub-network configurations are selected to be the next iteration's validation population. The cycle continues until the search concludes when the user defined evaluation count (`config.dynas.num_evals`) is met. \n", + " \n", + "
\n", + "
\n", + "\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "\n", + "For released version of Neural Compressor:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -q neural_compressor autograd==1.4 fvcore==0.1.5.post20220119 numpy ofa==0.1.0.post202203231606 pandas==1.1.5 pymoo==0.5.0 pyyaml==5.4.1 scikit-learn==0.24.2 scipy==1.5.4 torch==1.10.1 torchvision==0.11.2 sacremoses==0.0.53 torchprofile==0.0.4 fairseq==0.12.2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively, if you have a local copy of https://github.com/intel/neural-compressor, you can uncomment and run the code below:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# import sys\n", + "# sys.path.insert(0,'')\n", + "# !pip install -q autograd==1.4 fvcore==0.1.5.post20220119 numpy ofa==0.1.0.post202203231606 pandas==1.1.5 pymoo==0.5.0 pyyaml==5.4.1 scikit-learn==0.24.2 scipy==1.5.4 torch==1.10.1 torchvision==0.11.2 sacremoses==0.0.53 torchprofile==0.0.4 fairseq==0.12.2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import Packages" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from neural_compressor.conf.config import NASConfig\n", + "from neural_compressor.experimental.nas import NAS\n", + "from neural_compressor.experimental.nas.dynast.dynas_utils import TorchVisionReference" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Configure NAS Algorithm\n", + "\n", + "The `NASConfig` class allows us to define the appropriate parameters for determining how the neural architecture search is performed. 
Currently, the following multi-objective evolutionary algorithms are supported by the `dynas` approach: \n", + "* `'nsga2'`\n", + "* `'age'`" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "config = NASConfig(approach='dynas', search_algorithm='nsga2')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Architecture\n", + "We currently support pre-trained super-networks:\n", + "\n", + "1. Once-for-All (OFA) super-networks [4] for the image classification task on ImageNet-ilsvrc2012. In the case where the super-network PyTorch model download fails, you can manually copy the pre-trained models from https://github.com/mit-han-lab/once-for-all and place them in the `.torch/ofa_nets` path.\n", + "2. Hardware-Aware-Transformers (HAT) supernetwork [6] for language translation task on WMT14 En-De. To run this supernetwork you have to manually download preprocessed dataset from https://github.com/mit-han-lab/hardware-aware-transformers/blob/master/configs/wmt14.en-de/get_preprocessed.sh and pretrained model from https://www.dropbox.com/s/pkdddxvvpw9a4vq/HAT_wmt14ende_super_space0.pt?dl=0\n", + "\n", + "Super-network options (choose 1): \n", + "- `ofa_resnet50` - based on the ResNet50 architecture [4]. Search space of ~$10^{15}$ architectures.\n", + "- `ofa_mbv3_d234_e346_k357_w1.0` - based on the MobileNetV3 architecture [5], width multiplier 1.0. Search space of ~$10^{19}$ architectures.\n", + "- `ofa_mbv3_d234_e346_k357_w1.2` - based on the MobileNetV3 architecture [5], width multiplier 1.2. Search space of ~$10^{19}$ architectures. \n", + "- `transformer_lt_wmt_en_de` - based on the Transformer architecture [7]." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "config.dynas.supernet = 'transformer_lt_wmt_en_de'\n", + "config.seed = 42" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Select performance metrics\n", + "\n", + "Performance metric options are as follows. Currently, the `dynas` approach supports the use of exactly 2 objectives.\n", + "* `['acc', 'macs'] `\n", + "* `['acc', 'lat']` \n", + "\n", + "Description:\n", + "* `'acc'` - ImageNet Top-1 Accuracy (%) (for OFA supernetworks) and BLEU (for Transformer LT)\n", + "* `'macs'` - Multiply-and-accumulates as measured from FVCore. \n", + "* `'lat'` - Latency (inference time) measurement (ms)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "config.dynas.metrics = ['acc', 'macs']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Search parameters\n", + "\n", + "* `config.dynas.population` - Size of the population for evolutionary/genetic algorithm (50 recommended)\n", + "* `config.dynas.num_evals` - Validation measurement count, a higher count comes with greater computational cost but a higher chance of finding optimal sub-networks\n", + "* `config.dynas.results_csv_path` - Location of the search (validation measurement) results. This file is also used to provide training data to the metric predictors. \n", + "* `config.dynas.batch_size` - Batch size used during latency measurements.\n", + "* `config.dynas.dataset_path` - For OFA it's a path to the imagenet-ilsvrc2012 dataset. This can be obtained at: https://www.image-net.org/download.php; For Transformer LT it's a path to preprocessed WMT EnDe directory (`(...)/data/binary/wmt16_en_de`)\n", + "* `config.dynas.supernet_ckpt_path` - Transformer LT only. Path to downloaded pretrained super-network (`HAT_wmt14ende_super_space0.pt` file)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "config.dynas.population = 50\n", + "config.dynas.num_evals = 250\n", + "config.dynas.results_csv_path = 'results_transformerlt_macs.csv'\n", + "config.dynas.batch_size = 64\n", + "config.dynas.dataset_path = '/datasets/hat_dataset/data/binary/wmt16_en_de' # example\n", + "config.dynas.supernet_ckpt_path ='/datasets/hat_dataset/HAT_wmt14ende_super_space0.pt' # example" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Perform Search\n", + "\n", + "After the DyNAS configuration parameters are set, the search process can be started. Depending on how many evaluations `config.dynas.num_evals` were defined, the search time can vary from hours to days. \n", + "The search process will populate the `config.dynas.results_csv_path` file and will also return a list of the final iteration's best sub-network population recommendation. \n", + "\n", + "Note: example search results are provided for the plotting section if you wish to skip this step for now. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "agent = NAS(config)\n", + "results = agent.search()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Plot Search Results in the Multi-Objective Space" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "from matplotlib.cm import ScalarMappable\n", + "\n", + "fig, ax = plt.subplots(figsize=(7,5))\n", + "\n", + "number_of_evals = 500\n", + "df_dynas = pd.read_csv(config.dynas.results_csv_path)[:number_of_evals]\n", + "df_dynas.columns = ['config', 'date', 'lat', 'macs', 'top1']\n", + "\n", + "cm = plt.cm.get_cmap('viridis_r')\n", + "count = [x for x in range(len(df_dynas))]\n", + "\n", + "ax.scatter(df_dynas['macs'].values, df_dynas['top1'].values, marker='^', alpha=0.8, c=count, \n", + " cmap=cm, label='Discovered DNN Model', s=10)\n", + "ax.set_title(f'Intel® Neural Compressor\\nDynamic NAS (DyNAS)\\nSupernet:{config.dynas.supernet}')\n", + "ax.set_xlabel('MACs', fontsize=13)\n", + "ax.set_ylabel('BLEU Score (%)', fontsize=13)\n", + "ax.legend(fancybox=True, fontsize=10, framealpha=1, borderpad=0.2, loc='lower right')\n", + "ax.grid(True, alpha=0.3)\n", + "\n", + "# Eval Count bar\n", + "norm = plt.Normalize(0, len(df_dynas))\n", + "sm = ScalarMappable(norm=norm, cmap=cm)\n", + "cbar = fig.colorbar(sm, ax=ax, shrink=0.85)\n", + "cbar.ax.set_title(\" Evaluation\\n Count\", fontsize=8)\n", + "\n", + "fig.tight_layout(pad=2)\n", + "plt.show();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# References" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[1] Cai, H., Gan, C., & Han, S. (2020). Once for All: Train One Network and Specialize it for Efficient Deployment. ArXiv, abs/1908.09791. \n", + "[2] K. Deb, A. Pratap, S. Agarwal and T. Meyarivan, \"A fast and elitist multiobjective genetic algorithm: NSGA-II,\" in IEEE Transactions on Evolutionary Computation, vol. 6, no. 2, pp. 182-197, April 2002, doi: 10.1109/4235.996017. \n", + "[3] Cummings, D., Sarah, A., Sridhar, S.N., Szankin, M., Muñoz, J.P., & Sundaresan, S. (2022). 
A Hardware-Aware Framework for Accelerating Neural Architecture Search Across Modalities. ArXiv, abs/2205.10358. \n", + "[4] He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep Residual Learning for Image Recognition. 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 770-778. \n", + "[5] Howard, A.G., Sandler, M., Chu, G., Chen, L., Chen, B., Tan, M., Wang, W., Zhu, Y., Pang, R., Vasudevan, V., Le, Q.V., & Adam, H. (2019). Searching for MobileNetV3. 2019 IEEE/CVF International Conference on Computer Vision (ICCV), 1314-1324. \n", + "[6] Wang, H., Wu, Z., Liu, Z., Cai, H., Zhu, L., Gan, C. and Han, S., 2020. Hat: Hardware-aware transformers for efficient natural language processing. arXiv preprint arXiv:2005.14187. \n", + "[7] Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, Ł. and Polosukhin, I., 2017. Attention is all you need. Advances in neural information processing systems, 30." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/notebook/dynas/results_transformerlt_macs.csv b/examples/notebook/dynas/results_transformerlt_macs.csv new file mode 100644 index 00000000000..326d9894762 --- /dev/null +++ b/examples/notebook/dynas/results_transformerlt_macs.csv @@ -0,0 +1,501 @@ +Sub-network,Date,Latency (ms),MACs,BLEU +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 3072, 2048, 3072, 2048, 2048], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 2048, 2048], 
'decoder_layer_num': [1], 'encoder_self_attention_heads': [4, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_ende_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [2, 2, 1, 2, -1, -1]}",2022-11-29 22:54:58.796773,0,1397702484,23.35221720436182 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [640], 'encoder_ffn_embed_dim': [3072, 3072, 2048, 2048, 1024, 3072], 'decoder_ffn_embed_dim': [3072, 2048, 2048, 1024, 1024, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [4, 4, 4, 4, 4, 8], 'decoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_ende_attention_heads': [4, 4, 4, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, 1, 1, 2, 2, -1]}",2022-11-29 22:55:36.708362,0,2117790828,25.699488742308187 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [640], 'encoder_ffn_embed_dim': [2048, 3072, 1024, 3072, 1024, 2048], 'decoder_ffn_embed_dim': [3072, 2048, 3072, 3072, 3072, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 8, 4, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': [-1, -1, 2, -1, -1, 1]}",2022-11-29 22:56:06.143948,0,1700582490,25.0628359775166 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [3072, 2048, 2048, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 2048], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 8], 'decoder_self_attention_heads': [8, 8, 4, 8, 8, 4], 'decoder_ende_attention_heads': [4, 4, 4, 4, 4, 8], 'decoder_arbitrary_ende_attn': [2, 2, 2, -1, 1, -1]}",2022-11-29 22:56:40.372306,0,1593972576,25.51774692114225 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 3072, 2048, 2048, 1024, 3072], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 3072, 1024, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [4, 8, 4, 8, 8, 4], 
'decoder_self_attention_heads': [4, 8, 4, 4, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [-1, -1, 1, -1, 1, 1]}",2022-11-29 22:57:09.483908,0,1234590804,22.56186718543443 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [640], 'encoder_ffn_embed_dim': [3072, 3072, 3072, 2048, 3072, 3072], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 3072], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_ende_attention_heads': [8, 8, 4, 4, 8, 4], 'decoder_arbitrary_ende_attn': [2, 1, 1, 1, -1, 1]}",2022-11-29 22:57:47.479253,0,2320469868,26.46877217919795 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [3072, 1024, 3072, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 3072, 3072, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 4, 4, 4, 8], 'decoder_self_attention_heads': [4, 8, 8, 4, 4, 8], 'decoder_ende_attention_heads': [4, 8, 4, 4, 8, 4], 'decoder_arbitrary_ende_attn': [2, -1, 1, 1, 1, -1]}",2022-11-29 22:58:16.629295,0,1269811290,24.64774544301779 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [640], 'encoder_ffn_embed_dim': [3072, 3072, 2048, 2048, 3072, 3072], 'decoder_ffn_embed_dim': [3072, 1024, 1024, 1024, 1024, 2048], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 4, 4, 8, 4, 4], 'decoder_self_attention_heads': [4, 4, 4, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_arbitrary_ende_attn': [2, 2, -1, 1, -1, 2]}",2022-11-29 22:58:57.917629,0,2481530994,26.07415311884126 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [640], 'encoder_ffn_embed_dim': [3072, 3072, 1024, 1024, 1024, 2048], 'decoder_ffn_embed_dim': [1024, 2048, 2048, 1024, 1024, 2048], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 4, 4, 4], 'decoder_ende_attention_heads': 
[8, 4, 4, 4, 8, 4], 'decoder_arbitrary_ende_attn': [2, 1, 2, 1, 1, -1]}",2022-11-29 22:59:26.905633,0,1319024724,22.493311676649537 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 3072, 1024, 1024, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 1024, 3072], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 8], 'decoder_self_attention_heads': [4, 8, 8, 8, 8, 4], 'decoder_ende_attention_heads': [4, 8, 4, 4, 8, 4], 'decoder_arbitrary_ende_attn': [2, -1, 1, 2, 2, 1]}",2022-11-29 23:00:07.328829,0,1880709234,26.00344571579533 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [3072, 3072, 1024, 2048, 2048, 2048], 'decoder_ffn_embed_dim': [3072, 2048, 1024, 1024, 1024, 1024], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 4, 8, 8, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [4, 4, 4, 4, 4, 4], 'decoder_arbitrary_ende_attn': [-1, -1, 1, 2, -1, 1]}",2022-11-29 23:00:38.792088,0,1671939936,25.692425623480723 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [640], 'encoder_ffn_embed_dim': [2048, 1024, 1024, 3072, 2048, 3072], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 1024, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 4, 8, 8, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_arbitrary_ende_attn': [2, 2, 2, -1, 1, -1]}",2022-11-29 23:01:14.719074,0,1804297062,26.07342689295033 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [640], 'encoder_ffn_embed_dim': [1024, 2048, 2048, 2048, 3072, 2048], 'decoder_ffn_embed_dim': [3072, 2048, 3072, 2048, 3072, 2048], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 4, 4, 8, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 4, 8], 'decoder_arbitrary_ende_attn': [-1, 1, 2, 1, 2, 
-1]}",2022-11-29 23:01:56.128203,0,2350798194,26.332192395799687 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 3072, 2048], 'decoder_ffn_embed_dim': [2048, 2048, 2048, 1024, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 4, 8], 'decoder_arbitrary_ende_attn': [2, 2, -1, 1, -1, 1]}",2022-11-29 23:02:26.540354,0,1397483610,25.69929087830039 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 3072, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [1024, 2048, 2048, 2048, 3072, 1024], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 2, 1, 1, 2]}",2022-11-29 23:03:07.262003,0,2119699314,26.35980541802738 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 1024, 1024, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 3072, 1024, 1024, 3072, 2048], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 8, 4, 4, 4, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [2, 1, -1, 2, -1, -1]}",2022-11-29 23:03:35.089457,0,1110604884,22.97494000005183 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [640], 'encoder_ffn_embed_dim': [2048, 1024, 3072, 2048, 2048, 2048], 'decoder_ffn_embed_dim': [3072, 3072, 1024, 2048, 1024, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 8, 4, 4, 4, 4], 'decoder_ende_attention_heads': [4, 4, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, 1, 1, -1, 1, 2]}",2022-11-29 23:04:06.642167,0,1801651290,25.757473996484833 
+"{'encoder_embed_dim': [512], 'decoder_embed_dim': [640], 'encoder_ffn_embed_dim': [1024, 2048, 2048, 3072, 2048, 2048], 'decoder_ffn_embed_dim': [1024, 3072, 3072, 1024, 1024, 3072], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 4, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 4, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [-1, 2, 2, -1, -1, -1]}",2022-11-29 23:04:41.625679,0,1888961382,25.85426108217189 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [640], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 2048, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 3072, 1024, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, 1, -1, 1, 1, 1]}",2022-11-29 23:05:11.833000,0,1490960730,25.63143521434478 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 3072, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 3072, 2048, 3072, 1024, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_self_attention_heads': [8, 8, 4, 4, 4, 8], 'decoder_ende_attention_heads': [8, 4, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, -1, 1, 1, 1, 2]}",2022-11-29 23:05:41.444763,0,1364183130,25.072061221515387 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [640], 'encoder_ffn_embed_dim': [3072, 2048, 2048, 1024, 1024, 3072], 'decoder_ffn_embed_dim': [3072, 1024, 1024, 1024, 3072, 1024], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [4, 8, 4, 4, 8, 8], 'decoder_self_attention_heads': [4, 8, 4, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': [-1, 1, -1, 1, 1, -1]}",2022-11-29 23:06:20.219204,0,2281236594,26.08920225424034 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [640], 
'encoder_ffn_embed_dim': [1024, 2048, 2048, 3072, 1024, 3072], 'decoder_ffn_embed_dim': [1024, 2048, 1024, 3072, 2048, 1024], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 4, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [4, 4, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [2, 2, 2, 1, -1, -1]}",2022-11-29 23:06:54.484344,0,1688332896,25.54971935098368 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 1024, 3072, 3072, 3072], 'decoder_ffn_embed_dim': [1024, 2048, 3072, 1024, 3072, 2048], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 4, 8, 8, 8], 'decoder_ende_attention_heads': [8, 4, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, 1, 1, 1, 1, 2]}",2022-11-29 23:07:31.477666,0,1540039776,25.66937359699742 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [640], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 2048, 3072, 3072, 2048, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 8, 4, 8], 'decoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 4, 8, 8], 'decoder_arbitrary_ende_attn': [-1, -1, 2, 2, 1, 1]}",2022-11-29 23:08:00.927883,0,1543246170,25.23650526106691 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 1024, 2048, 3072, 1024], 'decoder_ffn_embed_dim': [3072, 2048, 1024, 3072, 1024, 2048], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 8], 'decoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_ende_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [2, 1, 2, 2, -1, 2]}",2022-11-29 23:08:37.748235,0,1840608102,25.84950449942653 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 2048, 2048, 2048, 3072], 
'decoder_ffn_embed_dim': [1024, 2048, 2048, 3072, 1024, 3072], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 4, 8, 8, 4, 4], 'decoder_self_attention_heads': [4, 4, 4, 8, 8, 8], 'decoder_ende_attention_heads': [4, 4, 4, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, 2, 1, 2, 2, -1]}",2022-11-29 23:09:14.377541,0,1662908256,25.748175360241753 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [3072, 3072, 2048, 1024, 3072, 3072], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 3072, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [4, 4, 4, 8, 4, 4], 'decoder_self_attention_heads': [8, 8, 4, 8, 4, 8], 'decoder_ende_attention_heads': [8, 4, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [2, 1, 1, 2, 1, -1]}",2022-11-29 23:09:53.068127,0,2074525548,26.02259252150837 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [640], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 2048, 3072, 2048, 3072, 2048], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [4, 8, 4, 8, 4, 8], 'decoder_self_attention_heads': [8, 8, 8, 4, 8, 4], 'decoder_ende_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, 2, 1, -1, 2, 2]}",2022-11-29 23:10:31.871226,0,2061818988,26.071718195164653 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 1024, 3072], 'decoder_ffn_embed_dim': [2048, 2048, 3072, 3072, 1024, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 8, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 4, 4, 8, 4, 4], 'decoder_arbitrary_ende_attn': [2, 1, -1, 2, 1, 2]}",2022-11-29 23:11:01.734997,0,1412290650,25.399141175298542 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [3072, 2048, 3072, 2048, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 3072, 3072, 2048, 1024, 1024], 'decoder_layer_num': 
[6], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 8], 'decoder_self_attention_heads': [8, 8, 4, 8, 4, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 4, 4], 'decoder_arbitrary_ende_attn': [1, 2, -1, -1, -1, 1]}",2022-11-29 23:11:40.315729,0,1971394674,26.21330617046487 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 2048, 2048, 2048, 1024], 'decoder_ffn_embed_dim': [3072, 2048, 3072, 2048, 3072, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 4, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [2, 1, 1, 1, 1, -1]}",2022-11-29 23:12:16.427543,0,1830900582,26.184771020867597 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [640], 'encoder_ffn_embed_dim': [3072, 1024, 2048, 3072, 1024, 1024], 'decoder_ffn_embed_dim': [3072, 3072, 3072, 2048, 3072, 2048], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [4, 8, 4, 8, 8, 8], 'decoder_self_attention_heads': [4, 8, 4, 4, 4, 8], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [2, 1, 1, 2, 2, 1]}",2022-11-29 23:12:57.379424,0,2421538668,26.175261088262666 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 1024, 3072, 1024, 1024], 'decoder_ffn_embed_dim': [2048, 3072, 3072, 3072, 3072, 2048], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 4, 4, 4, 4, 4], 'decoder_self_attention_heads': [8, 8, 4, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, 2, 1, -1]}",2022-11-29 23:13:25.381877,0,1251290964,23.320679652947288 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 3072, 1024, 3072, 1024, 2048], 'decoder_ffn_embed_dim': [1024, 3072, 2048, 1024, 3072, 2048], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 8, 4, 8, 8], 
'decoder_self_attention_heads': [4, 8, 4, 8, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, -1, 1]}",2022-11-29 23:13:58.935098,0,1588224102,25.972021275557776 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 1024, 2048], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 3072, 3072, 2048], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 8], 'decoder_self_attention_heads': [4, 8, 8, 4, 4, 8], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [-1, 1, 1, -1, 1, 2]}",2022-11-29 23:14:35.478235,0,1960738668,26.14494989795422 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [3072, 2048, 2048, 2048, 3072, 1024], 'decoder_ffn_embed_dim': [3072, 2048, 3072, 2048, 1024, 2048], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [4, 8, 8, 8, 4, 8], 'decoder_self_attention_heads': [8, 4, 4, 4, 4, 8], 'decoder_ende_attention_heads': [8, 4, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': [-1, -1, -1, -1, 2, 2]}",2022-11-29 23:15:03.906597,0,1419452244,22.7811520313731 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 3072, 3072, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [3072, 2048, 2048, 3072, 1024, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 4, 8, 8, 4], 'decoder_self_attention_heads': [4, 4, 4, 8, 8, 8], 'decoder_ende_attention_heads': [4, 4, 4, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, 2, 2, 2]}",2022-11-29 23:15:33.346144,0,1316075610,24.898481627702125 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 3072, 3072, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 2048, 2048, 2048], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 4, 8, 8, 8, 4], 'decoder_self_attention_heads': [4, 4, 4, 4, 8, 8], 
'decoder_ende_attention_heads': [8, 4, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, 2, 1, 2, 1, 2]}",2022-11-29 23:16:12.674460,0,1893596268,26.420978678385804 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 3072, 2048], 'decoder_ffn_embed_dim': [1024, 3072, 1024, 1024, 1024, 1024], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 8, 8, 4, 8, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_arbitrary_ende_attn': [-1, 1, 2, -1, 2, -1]}",2022-11-29 23:16:45.186860,0,1642325856,25.982442735663543 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [3072, 1024, 1024, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 2048, 2048, 2048, 1024, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 4, 4, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 4, 4, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, 2, 2, 2, 2, 2]}",2022-11-29 23:17:21.824602,0,1856336742,25.94442144683277 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [3072, 3072, 3072, 3072, 1024, 2048], 'decoder_ffn_embed_dim': [2048, 2048, 3072, 1024, 1024, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 8, 4, 8, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_ende_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_arbitrary_ende_attn': [-1, 1, 2, -1, -1, 1]}",2022-11-29 23:17:58.260936,0,2023653228,26.07457768169323 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [640], 'encoder_ffn_embed_dim': [1024, 3072, 2048, 1024, 3072, 2048], 'decoder_ffn_embed_dim': [3072, 1024, 3072, 3072, 1024, 3072], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 4, 4, 4, 4, 8], 'decoder_self_attention_heads': [4, 4, 4, 8, 4, 4], 'decoder_ende_attention_heads': [4, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': 
[2, 2, 1, -1, 2, 2]}",2022-11-29 23:18:27.089691,0,1582529364,23.950252879196924 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [3072, 3072, 2048, 3072, 3072, 2048], 'decoder_ffn_embed_dim': [2048, 3072, 1024, 3072, 3072, 2048], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 8, 8, 4, 4], 'decoder_self_attention_heads': [8, 8, 4, 4, 4, 8], 'decoder_ende_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_arbitrary_ende_attn': [2, 1, 2, -1, 2, -1]}",2022-11-29 23:19:02.726800,0,2035372902,26.447028779186226 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [3072, 1024, 3072, 2048, 2048, 3072], 'decoder_ffn_embed_dim': [2048, 2048, 2048, 2048, 3072, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 4, 4, 8, 8, 4], 'decoder_self_attention_heads': [8, 4, 4, 8, 4, 4], 'decoder_ende_attention_heads': [4, 8, 4, 4, 4, 8], 'decoder_arbitrary_ende_attn': [-1, 2, 2, 2, 2, 2]}",2022-11-29 23:19:39.439886,0,1945854822,26.003986822056245 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [640], 'encoder_ffn_embed_dim': [1024, 1024, 3072, 3072, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 3072, 3072, 3072, 1024, 2048], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [4, 4, 8, 4, 4, 4], 'decoder_self_attention_heads': [8, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [-1, 1, 2, -1, 1, 1]}",2022-11-29 23:20:17.584270,0,2299887468,26.442901941442834 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [640], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 3072, 3072], 'decoder_ffn_embed_dim': [3072, 2048, 3072, 1024, 1024, 2048], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 4, 4, 8, 4, 4], 'decoder_self_attention_heads': [8, 8, 4, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 2, 2]}",2022-11-29 23:20:58.503235,0,2382305394,26.517600251211515 
+"{'encoder_embed_dim': [512], 'decoder_embed_dim': [640], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 1024, 2048, 2048], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 4, 4, 8, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 4, 8], 'decoder_ende_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [2, 1, 2, -1, 1, -1]}",2022-11-29 23:21:34.739346,0,1693274982,25.604765879724265 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [640], 'encoder_ffn_embed_dim': [2048, 1024, 3072, 3072, 1024, 2048], 'decoder_ffn_embed_dim': [2048, 3072, 1024, 3072, 3072, 3072], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [4, 4, 4, 8, 8, 4], 'decoder_self_attention_heads': [8, 8, 4, 8, 4, 8], 'decoder_ende_attention_heads': [8, 4, 4, 4, 4, 8], 'decoder_arbitrary_ende_attn': [1, 1, 2, 2, 2, 1]}",2022-11-29 23:22:15.291570,0,2189491308,25.71548559680124 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [640], 'encoder_ffn_embed_dim': [1024, 3072, 3072, 3072, 1024, 3072], 'decoder_ffn_embed_dim': [3072, 2048, 3072, 2048, 3072, 2048], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_self_attention_heads': [4, 4, 4, 4, 8, 8], 'decoder_ende_attention_heads': [8, 4, 8, 8, 4, 4], 'decoder_arbitrary_ende_attn': [1, 2, -1, 1, -1, 2]}",2022-11-29 23:22:53.173666,0,2225126508,26.022116504070834 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [640], 'encoder_ffn_embed_dim': [3072, 1024, 2048, 2048, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 2048, 2048, 3072, 1024, 2048], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 4, 4, 8, 4, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 4, 8, 4, 8], 'decoder_arbitrary_ende_attn': [1, 2, 2, 1, 1, 1]}",2022-11-29 23:23:30.358756,0,1890804582,25.70580338518658 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': 
[2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 1024, 2048, 2048], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 4, 4, 4, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 4, 4, 8], 'decoder_ende_attention_heads': [4, 4, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, 1, 1, -1]}",2022-11-29 23:24:17.798475,0,1156869204,23.410008497520735 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 3072, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 3072, 3072, 2048, 2048, 3072], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, 2, 1, 2, 1, -1]}",2022-11-29 23:24:59.431502,0,2226789234,26.12743322887944 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 1024, 2048, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 4, 4, 4, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 4, 4, 8], 'decoder_ende_attention_heads': [4, 4, 4, 8, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, 1, 2, -1]}",2022-11-29 23:25:28.533368,0,1253161050,24.92578691671575 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 3072, 2048, 1024, 2048, 3072], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 8, 8, 4, 8, 4], 'decoder_ende_attention_heads': [4, 4, 8, 8, 4, 4], 'decoder_arbitrary_ende_attn': [2, 2, 1, 2, 1, -1]}",2022-11-29 23:25:56.800152,0,1173519444,23.28780146013261 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 2048, 3072, 
1024, 2048, 3072], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 4], 'decoder_self_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 4, 4], 'decoder_arbitrary_ende_attn': [2, 2, 1, 1, 2, -1]}",2022-11-29 23:26:33.876814,0,1510425696,25.417654573154596 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 2048, 3072, 1024, 2048, 3072], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [2, 2, 1, 2, 1, -1]}",2022-11-29 23:27:12.966148,0,1784417388,25.760464304216683 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 1024, 2048, 3072], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 4], 'decoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 4, 4], 'decoder_arbitrary_ende_attn': [2, 2, 1, 1, 2, -1]}",2022-11-29 23:27:50.697432,0,1478968416,25.19809949508387 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 2048, 3072, 1024, 2048, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 4, 4], 'decoder_arbitrary_ende_attn': [2, 2, 1, 1, 2, -1]}",2022-11-29 23:28:21.688106,0,1334569050,24.839033226584537 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 3072, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 3072, 3072, 2048, 2048, 3072], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 
4, 4, 4, 4], 'decoder_self_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, 2, 1, 2, 1, -1]}",2022-11-29 23:28:58.223688,0,1765847142,25.798742018362613 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 2048, 3072, 1024, 2048, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [2, 2, 1, 2, 2, -1]}",2022-11-29 23:29:28.855348,0,1334569050,24.797771375743167 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 2048, 3072, 1024, 2048, 3072], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [2, 2, 1, 2, 1, -1]}",2022-11-29 23:30:10.461923,0,1943623794,25.731563707029 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 3072, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 3072, 3072, 1024, 2048, 3072], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, 2, 1, 2, 1, -1]}",2022-11-29 23:30:51.410770,0,2006538354,25.93412075285396 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 2048, 3072, 1024, 2048, 3072], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 
'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, 2, 1, 2, 1, -1]}",2022-11-29 23:31:27.644610,0,1640018022,25.48925301539262 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 1024, 2048, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 4], 'decoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 4, 4], 'decoder_arbitrary_ende_attn': [2, 2, 1, 1, 2, -1]}",2022-11-29 23:31:58.151027,0,1303111770,24.636635975207156 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 2048, 3072, 1024, 2048, 3072], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, 2, 1, 2, 2, -1]}",2022-11-29 23:32:34.035788,0,1623367782,25.55649399276896 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 1024, 2048, 3072], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, 2, 1, 2, 2, -1]}",2022-11-29 23:33:09.644672,0,1591910502,25.363108576481086 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 3072, 3072, 1024, 2048, 3072], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': 
[2, 2, 1, 2, 1, -1]}",2022-11-29 23:33:50.627277,0,1943623794,25.471976693004432 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 3072, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 2048, 3072, 2048, 2048, 3072], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 4, 8], 'decoder_arbitrary_ende_attn': [2, 2, 1, 2, 1, -1]}",2022-11-29 23:34:27.530520,0,1734389862,26.022537181002058 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 3072, 3072, 2048, 2048, 3072], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, 2, 1, 2, 1, -1]}",2022-11-29 23:35:08.733562,0,2148146034,25.76800258460572 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 2048, 3072, 1024, 2048, 3072], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [2, 2, 1, 2, 1, -1]}",2022-11-29 23:35:48.076930,0,1784417388,25.746269422993464 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 3072, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 2048, 3072, 2048, 2048, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [2, 2, 1, 2, 1, -1]}",2022-11-29 23:36:18.846293,0,1397483610,24.957529704052245 
+"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 3072, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 3072, 3072, 2048, 2048, 3072], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, 2, 1, 2, 1, -1]}",2022-11-29 23:37:00.512837,0,2226789234,26.00635517076823 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 2048, 3072, 1024, 2048, 3072], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, 2, 1, 2, 1, -1]}",2022-11-29 23:37:41.235777,0,1912166514,25.566969946150067 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 3072, 3072, 1024, 2048, 3072], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, 2, 1, 2, 2, -1]}",2022-11-29 23:38:18.262099,0,1541882976,25.33456038818163 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 3072, 3072, 1024, 2048, 3072], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [2, 2, 1, 2, 1, -1]}",2022-11-29 23:38:57.094675,0,1815874668,25.598195468771692 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': 
[2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 2048, 3072, 1024, 2048, 3072], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 8, 8, 4, 4, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [2, 2, 1, 2, 1, -1]}",2022-11-29 23:39:33.146325,0,1640018022,25.552118216389637 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 1024, 2048, 3072], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, 2, 1, 1, 2, -1]}",2022-11-29 23:40:09.706551,0,1462318176,25.29731961246495 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 2048, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 4, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, 1, 1, -1]}",2022-11-29 23:40:38.756573,0,1253161050,24.86012555534481 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 1024, 2048, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [4, 4, 4, 8, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, 1, 2, -1]}",2022-11-29 23:41:07.530840,0,1253161050,24.9440878831812 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 3072, 2048, 
1024, 2048, 2048], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 8, 8, 4, 8, 4], 'decoder_ende_attention_heads': [4, 4, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, 2, -1, 2, 1, -1]}",2022-11-29 23:41:34.610233,0,1156869204,23.274975491818346 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 1024, 2048, 3072], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, 2, 1, 1, 2, -1]}",2022-11-29 23:42:01.811817,0,1173519444,23.245718341488995 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 1024, 2048, 3072], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, 2, 1, 2, 2, -1]}",2022-11-29 23:42:37.425300,0,1591910502,25.413674094921433 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 1024, 2048, 3072], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 4, 4], 'decoder_arbitrary_ende_attn': [2, 2, 1, 2, 1, -1]}",2022-11-29 23:43:13.878058,0,1608560742,25.365752658832324 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 1024, 2048, 3072], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 
8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [2, 2, 1, 2, 2, -1]}",2022-11-29 23:43:50.046834,0,1608560742,25.36745967365502 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 2048, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 4, 4, 4, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, 1, 1, -1]}",2022-11-29 23:44:18.851277,0,1253161050,25.005251094503805 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 1024, 2048, 3072], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, 2, 1, 1, 2, -1]}",2022-11-29 23:44:46.110775,0,1156869204,23.277213964898888 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 1024, 2048, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [4, 4, 4, 8, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, 1, 2, -1]}",2022-11-29 23:45:14.844805,0,1253161050,24.834074772858695 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 1024, 2048, 3072], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 4], 
'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, 2, 1, 1, 2, -1]}",2022-11-29 23:45:42.403812,0,1156869204,23.178044546083612 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 2048, 3072, 2048, 2048, 3072], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, 2, 1, 2, 1, -1]}",2022-11-29 23:46:23.812450,0,2116688754,25.77854866366567 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 3072, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 2048, 3072, 1024, 2048, 3072], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, 2, 1, 2, 1, -1]}",2022-11-29 23:47:03.270121,0,1815874668,25.683722427952674 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 2048, 3072, 1024, 2048, 3072], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 4, 4], 'decoder_arbitrary_ende_attn': [2, 2, 1, 1, 1, -1]}",2022-11-29 23:47:43.427381,0,1926973554,25.6886566287921 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 1024, 2048, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 8], 'decoder_self_attention_heads': [8, 4, 8, 4, 4, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 4, 4], 'decoder_arbitrary_ende_attn': [2, 
2, 1, 1, 2, -1]}",2022-11-29 23:48:14.256367,0,1303111770,24.786229852100394 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 3072, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 2048, 3072, 1024, 2048, 3072], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, 2, 1, 2, 1, -1]}",2022-11-29 23:48:50.842520,0,1702932582,25.571343061345555 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 3072, 3072, 2048, 2048, 3072], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, 2, 1, 2, 1, -1]}",2022-11-29 23:49:31.529598,0,1975081074,25.722542126362086 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 3072, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 2048, 3072, 1024, 2048, 3072], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, 2, 1, 2, 1, -1]}",2022-11-29 23:50:13.472772,0,2163874674,26.02132010597597 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 2048, 3072, 2048, 2048, 3072], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, 2, 1, 2, 1, -1]}",2022-11-29 23:50:49.447197,0,1671475302,25.60807605923095 
+"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 1024, 2048, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 4, 4], 'decoder_arbitrary_ende_attn': [2, 2, 1, 2, 1, -1]}",2022-11-29 23:51:20.051939,0,1303111770,24.844209249533108 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 1024, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 3072, 3072, 1024, 2048, 3072], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, 2, 1, 2, 1, -1]}",2022-11-29 23:52:01.725725,0,2116688754,25.774689748379366 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 3072, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 3072, 3072, 2048, 2048, 3072], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, 2, 1, 2, 1, -1]}",2022-11-29 23:52:43.523756,0,2037995634,25.839133901981196 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 3072, 1024, 2048, 2048], 'decoder_ffn_embed_dim': [2048, 2048, 3072, 1024, 2048, 3072], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 8, 4, 4], 'decoder_arbitrary_ende_attn': [2, 2, 1, 1, 2, -1]}",2022-11-29 23:53:22.095038,0,1573340256,25.369471344799734 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': 
[1024, 1024, 1024, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 3072, 3072, 2048], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 4, 8, 4, 4, 8], 'decoder_self_attention_heads': [8, 8, 4, 4, 8, 4], 'decoder_ende_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': [-1, 1, -1, -1, 1, -1]}",2022-11-29 23:54:11.017282,0,982932564,21.75634266526977 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 3072, 1024], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, 1, 1, -1, 1, 1]}",2022-11-29 23:54:50.536084,0,2047077234,26.357977252559444 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 3072, 2048, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 4, 8, 4, 4, 8], 'decoder_self_attention_heads': [8, 8, 4, 4, 8, 4], 'decoder_ende_attention_heads': [4, 4, 8, 8, 4, 8], 'decoder_arbitrary_ende_attn': [-1, -1, -1, -1, 1, -1]}",2022-11-29 23:55:18.749444,0,1079224410,23.671710345039983 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 2048, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 4, 4, 4, 4], 'decoder_ende_attention_heads': [4, 4, 8, 8, 4, 8], 'decoder_arbitrary_ende_attn': [1, 1, 1, -1, 1, 1]}",2022-11-29 23:55:45.598303,0,1062497364,22.630106123134603 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 
2048, 2048, 2048, 1024], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 4, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, 1, 1, -1, 1, 1]}",2022-11-29 23:56:23.927095,0,1799301234,26.237657577301754 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, 1, 1, -1, 1, 1]}",2022-11-29 23:57:02.821107,0,1799301234,26.235752390680105 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 4, 8, 4, 4, 4], 'decoder_self_attention_heads': [8, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, 1, 1, -1, 1, 1]}",2022-11-29 23:57:41.722481,0,1976298354,26.409237334320427 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 3072, 1024], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, 1, 1, -1, 1, 1]}",2022-11-29 23:58:20.422891,0,1862215794,26.41037129035317 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 3072, 2048, 2048], 'decoder_layer_num': [3], 'encoder_self_attention_heads': 
[8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, 1, -1]}",2022-11-29 23:58:51.406394,0,1334645856,25.47756540184132 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 2048, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, 1, 1, -1, 1, 1]}",2022-11-29 23:59:24.731374,0,1479045222,25.751217013602062 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 2048, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 4, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, 1, 1]}",2022-11-29 23:59:57.442326,0,1462394982,25.872011121812324 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [-1, 1, 1, -1, 1, 1]}",2022-11-30 00:00:30.249710,0,1399480422,25.17096109375348 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 4, 4, 8, 4], 
'decoder_ende_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, 1, 1, -1, 1, 1]}",2022-11-30 00:01:04.136585,0,1573417062,26.100985377959983 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, 1, 1, -1, 1, 1]}",2022-11-30 00:01:43.420676,0,2015619954,26.590522766056612 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 3072, 1024], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, 1, 1, -1, 1, 1]}",2022-11-30 00:02:22.331936,0,1830758514,26.2075294199728 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 3072, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 8], 'decoder_self_attention_heads': [8, 8, 4, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 4, 8], 'decoder_arbitrary_ende_attn': [1, 1, 1, -1, 1, -1]}",2022-11-30 00:02:51.964958,0,1269811290,25.475704676620662 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 4, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': 
[1, 1, -1, -1, 1, 1]}",2022-11-30 00:03:29.946063,0,1751193714,26.105643502743355 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 4, 4, 4, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, 1, 1]}",2022-11-30 00:04:07.937498,0,1751193714,25.93245158011976 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 3072, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 2048, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 4, 8], 'decoder_arbitrary_ende_attn': [1, 1, 1, -1, 1, 1]}",2022-11-30 00:04:37.760555,0,1206896730,24.91733058699306 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 2048, 2048, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, 1, 1, -1, 1, 1]}",2022-11-30 00:05:12.938706,0,1541959782,25.94168073173522 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 2048, 1024], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 4, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, 1, 1]}",2022-11-30 00:05:50.741638,0,1719736434,26.03682929942318 
+"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 4, 8], 'decoder_arbitrary_ende_attn': [1, 1, 1, -1, 1, -1]}",2022-11-30 00:06:25.032545,0,1447587942,25.535328515473058 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 2048], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, 1, 1, -1, 1, 1]}",2022-11-30 00:06:58.207200,0,1319838816,25.37435112751941 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 3072, 1024], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, 1, 1, -1, 1, 1]}",2022-11-30 00:07:37.942726,0,2007755634,26.51729964072713 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 2048, 1024], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [4, 4, 8, 8, 4, 4], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, 1, 1]}",2022-11-30 00:08:09.189292,0,1271731296,25.166134982432663 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': 
[1024, 1024, 2048, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 2048, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, 1, 1, -1, 1, 1]}",2022-11-30 00:08:43.141792,0,1416130662,25.449204232481016 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 3072, 3072, 2048], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 4, 8, 4, 4, 8], 'decoder_self_attention_heads': [8, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, 1, -1]}",2022-11-30 00:09:11.020741,0,999582804,21.951713272811833 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 3072, 2048, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, 1, -1]}",2022-11-30 00:09:40.649180,0,1238354010,24.98037680186457 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 4, 8, 4, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [4, 4, 8, 8, 4, 4], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, 1, 1]}",2022-11-30 00:10:13.227827,0,1303188576,25.448120268873193 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 
1024, 2048, 2048, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 4, 8], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, 1, 1]}",2022-11-30 00:10:43.351246,0,1175439450,24.62386693490815 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 3072, 1024], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, 1, 1, -1, 1, 1]}",2022-11-30 00:11:23.467151,0,2007755634,26.232304249661833 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 3072, 3072, 2048], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 4, 8, 4, 4, 8], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, 1, -1]}",2022-11-30 00:11:51.316006,0,1031040084,22.17006946878601 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 2048, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [4, 4, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, 1, 1]}",2022-11-30 00:12:21.453576,0,1238354010,24.98730164778759 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': 
[8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 8, 4, 4, 8, 4], 'decoder_ende_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, 1, 1]}",2022-11-30 00:12:51.154193,0,1175439450,24.370328478496752 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, 1, 1, -1, 1, -1]}",2022-11-30 00:13:21.253989,0,1143982170,24.242154307788027 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 3072, 2048, 2048], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 4, 8, 4, 4, 8], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, 1, 1, -1, 1, -1]}",2022-11-30 00:13:48.722881,0,1062497364,22.741194138868078 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 3072, 2048, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 4, 8, 4, 4, 4], 'decoder_self_attention_heads': [8, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 8, 8, 4, 8], 'decoder_arbitrary_ende_attn': [-1, 1, -1, -1, 1, 1]}",2022-11-30 00:14:18.241161,0,1127331930,24.234335725130748 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 4, 4, 8, 4], 
'decoder_ende_attention_heads': [4, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, 1, 1, -1, 1, 1]}",2022-11-30 00:14:57.044258,0,1830758514,26.44666962861665 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, 1, 1, -1, 1, 1]}",2022-11-30 00:15:32.368557,0,1541959782,26.050701171511186 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 2048, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [-1, 1, -1, -1, 1, 1]}",2022-11-30 00:16:01.654068,0,1221703770,24.996951152986856 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 2048, 1024], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 4, 8], 'decoder_arbitrary_ende_attn': [1, 1, 1, -1, 1, 1]}",2022-11-30 00:16:33.903998,0,1288381536,25.159811999467085 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 2048, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': 
[1, 1, -1, -1, 1, -1]}",2022-11-30 00:17:07.162316,0,1399480422,25.443390153441023 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 3072, 2048, 2048], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 8, 4, 4, 8, 4], 'decoder_ende_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [-1, 1, -1, -1, 1, -1]}",2022-11-30 00:17:35.512007,0,982932564,21.871299717957186 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 2048, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 4, 8], 'decoder_arbitrary_ende_attn': [1, 1, 1, -1, 1, 1]}",2022-11-30 00:18:09.340609,0,1416130662,25.447142262784027 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [-1, 1, 1, -1, 1, 1]}",2022-11-30 00:18:37.410682,0,1045847124,22.47970882205465 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 3072, 2048, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 4, 8, 4, 4, 4], 'decoder_self_attention_heads': [8, 8, 4, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 8, 8, 4, 8], 'decoder_arbitrary_ende_attn': [-1, 1, -1, -1, 1, 1]}",2022-11-30 
00:19:06.749493,0,1095874650,23.814093767809908 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 2048], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 4, 8, 4, 4], 'decoder_arbitrary_ende_attn': [1, 1, 1, -1, 1, -1]}",2022-11-30 00:19:39.893348,0,1319838816,25.478955124339844 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 3072, 3072, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 4, 8, 4, 4, 8], 'decoder_self_attention_heads': [8, 8, 4, 4, 8, 4], 'decoder_ende_attention_heads': [4, 4, 8, 8, 4, 8], 'decoder_arbitrary_ende_attn': [-1, 1, -1, -1, 1, -1]}",2022-11-30 00:20:09.251167,0,1095874650,23.750972433853825 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 3072, 2048, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 4, 8, 4, 4, 8], 'decoder_self_attention_heads': [8, 8, 4, 4, 8, 4], 'decoder_ende_attention_heads': [4, 4, 8, 8, 4, 8], 'decoder_arbitrary_ende_attn': [-1, 1, -1, -1, 1, -1]}",2022-11-30 00:20:38.348608,0,1127331930,23.993329632521927 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 3072, 2048, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [-1, 1, 1, -1, 1, 1]}",2022-11-30 00:21:06.810591,0,1014389844,22.26199233258481 +"{'encoder_embed_dim': 
[640], 'decoder_embed_dim': [640], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 2048], 'decoder_ffn_embed_dim': [2048, 3072, 2048, 2048, 2048, 1024], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [1, 1, 2, -1, 1, 1]}",2022-11-30 00:22:10.464888,0,2500270194,26.490267758415033 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [2048, 3072, 1024, 2048, 3072, 3072], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 4, 8, 4, 4, 8], 'decoder_self_attention_heads': [8, 4, 4, 8, 8, 8], 'decoder_ende_attention_heads': [4, 4, 4, 8, 4, 8], 'decoder_arbitrary_ende_attn': [-1, 1, -1, -1, 1, 2]}",2022-11-30 00:22:38.706298,0,1077304404,23.03639520261316 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 4, 8, 4, 4, 8], 'decoder_self_attention_heads': [8, 4, 4, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 4, 8, 4, 4], 'decoder_arbitrary_ende_attn': [-1, -1, -1, -1, 1, 2]}",2022-11-30 00:23:08.174166,0,1142138970,24.176833511635046 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 1024, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 2048, 2048, 2048, 3072, 3072], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, -1, 1]}",2022-11-30 00:23:37.202278,0,1156869204,23.451597618181914 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 
2048, 3072, 3072, 2048], 'decoder_ffn_embed_dim': [2048, 3072, 2048, 2048, 2048, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, 1, 1]}",2022-11-30 00:24:12.037672,0,1894982502,26.570692923549505 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 2048, 2048, 2048, 2048, 3072], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, 1, 1]}",2022-11-30 00:24:48.692945,0,1795538028,26.656160779307765 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 1024, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 3072, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, -1, 1]}",2022-11-30 00:25:18.655715,0,1221703770,24.876611482664103 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 3072, 2048], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, -1, 1]}",2022-11-30 00:25:53.952187,0,1571573862,26.243286807849493 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 1024, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 
3072, 3072], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 4, 4], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, -1, 2]}",2022-11-30 00:26:28.198620,0,1397560416,25.575803459047332 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 3072, 2048], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 4, 8], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, -1, 1]}",2022-11-30 00:27:03.351000,0,1619681382,26.29731327668079 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 1024, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 3072, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 4, 4], 'decoder_arbitrary_ende_attn': [-1, -1, -1, -1, -1, 1]}",2022-11-30 00:27:32.454206,0,1205053530,24.652752198325597 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 1024, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 3072, 2048, 2048, 3072, 3072], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 4, 8, 4, 4, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [4, 4, 4, 8, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 00:28:00.103253,0,1125411924,23.2436921421545 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [640], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 2048], 'decoder_ffn_embed_dim': [2048, 3072, 2048, 2048, 2048, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 
8, 4, 4, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, 1, 1]}",2022-11-30 00:28:38.079240,0,2318626668,26.646036771408653 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 2048, 2048, 2048, 2048, 1024], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, 1, 1]}",2022-11-30 00:29:17.534918,0,2097273714,26.452370488677506 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 2048, 2048, 2048, 2048, 2048], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, -1, 1]}",2022-11-30 00:29:52.746931,0,1651138662,26.151231823189608 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 4, 4, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 1]}",2022-11-30 00:30:22.172110,0,1284618330,25.49140283243417 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 3072, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 8], 'decoder_self_attention_heads': [8, 4, 4, 4, 8, 8], 
'decoder_ende_attention_heads': [4, 8, 4, 8, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, -1, 1]}",2022-11-30 00:30:52.032834,0,1284618330,25.483324803555185 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 2048], 'decoder_ffn_embed_dim': [2048, 2048, 2048, 2048, 2048, 2048], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, -1, 1]}",2022-11-30 00:31:26.157137,0,1682595942,26.243800774630134 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 2048, 1024], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 00:32:02.794238,0,1412367456,25.73672237614388 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 3072, 3072], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, -1, 1]}",2022-11-30 00:32:38.498129,0,1412367456,25.694815754009024 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 3072, 3072], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 4, 4], 
'decoder_arbitrary_ende_attn': [1, 1, -1, -1, -1, 2]}",2022-11-30 00:33:14.792750,0,1460474976,25.87514706229428 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [640], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 3072, 2048, 2048, 2048, 1024], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, 1, 1]}",2022-11-30 00:33:55.738731,0,2419783794,26.642604563885367 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 2048], 'decoder_ffn_embed_dim': [2048, 3072, 2048, 2048, 2048, 1024], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, 1, 1]}",2022-11-30 00:34:35.635995,0,2168052594,26.669695273057105 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 2048], 'decoder_ffn_embed_dim': [2048, 2048, 2048, 2048, 2048, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, 1, 2]}",2022-11-30 00:35:13.689253,0,2015788908,26.70123813916216 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 4, 8], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, -1, 1]}",2022-11-30 
00:35:50.004608,0,1460474976,25.829650262816568 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 1024, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 3072, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 4, 4, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, -1, 1]}",2022-11-30 00:36:19.755425,0,1221703770,24.917846107035597 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 3072, 3072], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 4, 4, 4, 4, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, -1, 1]}",2022-11-30 00:36:48.610538,0,1093954644,23.08549976883442 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 2048, 3072], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 4, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, -1, 1]}",2022-11-30 00:37:23.140196,0,1571573862,26.495457209152665 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 1024, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 3072, 3072], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, -1, 1]}",2022-11-30 00:37:58.602077,0,1349452896,25.103371841310192 +"{'encoder_embed_dim': 
[512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 2048, 2048, 2048, 2048, 2048], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 4, 8], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, -1, 1]}",2022-11-30 00:38:34.600291,0,1523389536,26.028214026012492 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 1024, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 3072, 3072], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 4, 4, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 4, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, -1, 1]}",2022-11-30 00:39:10.001278,0,1349452896,24.972561656938478 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 2048], 'decoder_ffn_embed_dim': [2048, 3072, 2048, 2048, 3072, 2048], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 4, 8], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, -1, 1]}",2022-11-30 00:39:44.443816,0,1714053222,26.16022517555216 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 3072, 1024], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 4, 8], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, -1, 1]}",2022-11-30 00:40:21.100754,0,1491932256,25.983857779417527 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [640], 'encoder_ffn_embed_dim': [2048, 2048, 
2048, 3072, 3072, 2048], 'decoder_ffn_embed_dim': [2048, 3072, 2048, 2048, 2048, 1024], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, 1, 1]}",2022-11-30 00:41:01.553620,0,2459105394,26.362258167185217 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 2048], 'decoder_ffn_embed_dim': [2048, 2048, 2048, 2048, 2048, 1024], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, 1, 1]}",2022-11-30 00:41:41.438856,0,2136595314,26.728063906094565 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [2048, 3072, 1024, 2048, 3072, 3072], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 4, 8, 4, 4, 8], 'decoder_self_attention_heads': [8, 4, 4, 8, 8, 8], 'decoder_ende_attention_heads': [4, 4, 4, 8, 4, 8], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, 1, 2]}",2022-11-30 00:42:09.810325,0,1093954644,22.9713116425012 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 1024, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 3072, 3072], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, -1, 1]}",2022-11-30 00:42:38.643170,0,1156869204,23.4523958088576 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 3072, 
1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 4, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 00:43:07.305868,0,1093954644,23.166873916592454 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 4, 4, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 1]}",2022-11-30 00:43:36.751041,0,1316075610,25.51338789773949 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 1024, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 2048, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 4, 4, 4, 4, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 00:44:07.237775,0,1253161050,25.004771948649424 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 1]}",2022-11-30 00:44:43.589272,0,1747430508,26.62802459531885 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 4, 4, 
4, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [4, 4, 4, 8, 8, 4], 'decoder_arbitrary_ende_attn': [-1, -1, -1, -1, 1, 1]}",2022-11-30 00:45:13.240897,0,1142138970,24.356415715383232 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 3072, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 4, 4, 4, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 4, 4], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, 1, 1]}",2022-11-30 00:45:43.655137,0,1301268570,25.49676417511692 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 3072, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 4, 4], 'decoder_arbitrary_ende_attn': [-1, -1, -1, -1, 1, 2]}",2022-11-30 00:46:13.344754,0,1142138970,24.224366625169438 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 2048, 3072], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, 1, 2]}",2022-11-30 00:46:50.784832,0,1945010028,26.612602269295685 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 1024, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 3072, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 4, 4, 4, 4, 8], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 8], 
'decoder_ende_attention_heads': [4, 8, 4, 8, 4, 4], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, 1, 1]}",2022-11-30 00:47:21.162395,0,1238354010,24.912296082302873 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_self_attention_heads': [8, 4, 4, 8, 8, 8], 'decoder_ende_attention_heads': [4, 4, 4, 8, 4, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 00:47:50.863610,0,1190246490,24.44551868443464 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 3072, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 4, 8, 4, 8, 8], 'decoder_self_attention_heads': [8, 4, 4, 8, 8, 8], 'decoder_ende_attention_heads': [4, 4, 4, 8, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 00:48:20.655219,0,1190246490,24.332970362633144 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 2048], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, 1, 2]}",2022-11-30 00:48:57.884646,0,1984331628,26.656284353019814 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 2048, 2048, 2048, 2048, 3072], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 4, 4, 4, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 8, 4], 
'decoder_arbitrary_ende_attn': [1, 1, -1, -1, 1, 2]}",2022-11-30 00:49:36.098176,0,1976467308,26.4326896541673 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 3072, 2048, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 8, 8, 4, 4], 'decoder_ende_attention_heads': [4, 8, 4, 4, 8, 8], 'decoder_arbitrary_ende_attn': [-1, -1, 1, -1, 1, 2]}",2022-11-30 00:50:27.405530,0,982932564,21.902795709633452 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [3072, 2048, 2048, 2048, 2048, 2048], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 8, 4, 8, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [8, 4, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 2]}",2022-11-30 00:51:08.032064,0,2180770674,26.599782674768583 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 2048, 3072, 2048, 2048, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 2]}",2022-11-30 00:51:35.807356,0,1062497364,22.371131012623422 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 3072, 1024, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 8, 4, 4], 'decoder_ende_attention_heads': [4, 4, 4, 4, 4, 8], 'decoder_arbitrary_ende_attn': [-1, -1, 1, 2, 1, 2]}",2022-11-30 
00:52:05.763177,0,1079224410,23.52409715160256 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 3072, 2048], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [4, 4, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 00:52:39.045430,0,1571573862,26.256831208948178 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 3072, 2048], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [4, 4, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 00:53:12.642643,0,1508659302,26.00462272595473 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 3072, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 2048, 1024, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 4, 4], 'decoder_ende_attention_heads': [4, 8, 4, 4, 4, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 00:53:42.368319,0,1158789210,24.50401996100462 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 3072, 3072, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 8, 4, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 2]}",2022-11-30 00:54:12.059480,0,1127331930,24.087230950865788 +"{'encoder_embed_dim': [512], 
'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [3072, 2048, 2048, 2048, 2048, 1024], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 2]}",2022-11-30 00:54:51.957601,0,1956587634,26.41692984127425 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [3072, 2048, 2048, 2048, 2048, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 4, 8, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [8, 4, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 2]}",2022-11-30 00:55:29.312918,0,1826995308,26.542101655879033 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [3072, 2048, 2048, 2048, 2048, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 4, 8, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [8, 4, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 2]}",2022-11-30 00:56:07.198728,0,2007924588,26.616730692591272 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [3072, 2048, 2048, 2048, 2048, 1024], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 2]}",2022-11-30 00:56:48.032095,0,2149313394,26.572361617382416 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 
1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 2048, 1024, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 4, 4], 'decoder_ende_attention_heads': [4, 8, 4, 4, 4, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 00:57:18.152332,0,1095874650,23.782891130205417 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 3072, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 3072, 3072, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 4, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [-1, -1, 1, -1, 1, 2]}",2022-11-30 00:57:47.584518,0,1173596250,24.547205997391597 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 2048, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 4, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [-1, -1, 1, -1, 1, 2]}",2022-11-30 00:58:17.345898,0,1110681690,23.94407626548769 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 3072, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 3072, 1024, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 4, 8], 'decoder_ende_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 2]}",2022-11-30 00:58:47.056374,0,1190246490,24.491800170896195 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 3072, 2048], 
'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 4, 8], 'decoder_ende_attention_heads': [4, 8, 4, 4, 4, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 2]}",2022-11-30 00:59:16.745616,0,1253161050,25.254425153820556 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 3072, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 4, 4], 'decoder_ende_attention_heads': [4, 8, 4, 4, 8, 8], 'decoder_arbitrary_ende_attn': [-1, -1, 1, -1, 1, 2]}",2022-11-30 00:59:47.189514,0,1236510810,25.16579743713827 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 4, 8], 'decoder_ende_attention_heads': [4, 8, 4, 4, 4, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 01:00:16.840252,0,1347532890,25.676132056093994 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 2048, 3072, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 4, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 01:00:46.759507,0,1221703770,24.768354572582403 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 3072, 2048, 2048], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 
'decoder_self_attention_heads': [8, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 01:01:21.302832,0,1412367456,25.746119527915003 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [3072, 1024, 2048, 2048, 2048, 2048], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 01:01:55.146488,0,1634488422,26.23053149772474 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 2048, 2048, 2048, 2048, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [4, 8, 4, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 2]}",2022-11-30 01:02:23.005266,0,1062497364,22.439008935256105 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [3072, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 01:02:59.282910,0,1778887788,26.56040601873973 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [3072, 2048, 2048, 2048, 2048, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 
'decoder_ende_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 2]}",2022-11-30 01:03:36.281891,0,1826995308,26.583100181248412 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [3072, 1024, 2048, 1024, 2048, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 01:04:12.687089,0,1715973228,26.303092956441024 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [3072, 1024, 2048, 3072, 2048, 2048], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 01:04:48.509486,0,1475282016,25.672530277500815 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [3072, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 01:05:25.099950,0,1747430508,26.33985155284849 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 2048, 3072, 2048, 2048, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 8, 8], 
'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 2]}",2022-11-30 01:05:53.329815,0,1031040084,22.262405116542624 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [3072, 1024, 2048, 2048, 2048, 2048], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 2]}",2022-11-30 01:06:30.890827,0,1976467308,26.580438511490023 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 3072, 2048, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 8, 4, 8], 'decoder_ende_attention_heads': [4, 8, 4, 4, 8, 8], 'decoder_arbitrary_ende_attn': [-1, -1, 1, -1, 1, 2]}",2022-11-30 01:07:00.560411,0,1079224410,23.728343099659895 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 1024, 2048, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 4, 4], 'decoder_ende_attention_heads': [4, 4, 4, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 2]}",2022-11-30 01:07:28.617193,0,999582804,21.72906013722468 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 2048, 3072, 2048, 3072, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [-1, -1, 1, -1, 1, 2]}",2022-11-30 
01:07:57.530176,0,1014389844,22.471135438983048 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [3072, 2048, 2048, 2048, 2048, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 4, 8, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 2]}",2022-11-30 01:08:32.633405,0,1682595942,26.278826842615302 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 3072, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 4, 8], 'decoder_ende_attention_heads': [4, 8, 4, 4, 4, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 2]}",2022-11-30 01:09:03.054659,0,1284618330,25.50001897765904 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 2048, 2048, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [4, 8, 4, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 2]}",2022-11-30 01:09:31.687974,0,999582804,21.967989901189092 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [3072, 1024, 2048, 3072, 2048, 2048], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 01:10:06.993955,0,1506739296,25.812328256697807 +"{'encoder_embed_dim': [512], 
'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 4, 8], 'decoder_ende_attention_heads': [4, 8, 4, 4, 4, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 01:10:36.948650,0,1284618330,25.440948757479756 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 3072, 1024], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [8, 4, 8, 4, 4, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 01:11:11.600524,0,1380910176,25.703810906558953 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 3072, 1024], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 01:11:46.811777,0,1380910176,25.597256331992767 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 3072, 2048, 2048], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 4, 4, 8, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 4, 8], 'decoder_ende_attention_heads': [4, 8, 4, 4, 8, 4], 'decoder_arbitrary_ende_attn': [-1, -1, 1, -1, 1, 2]}",2022-11-30 01:12:14.807619,0,982932564,21.644880427454428 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 
3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 2048, 2048, 2048, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 4, 8], 'decoder_ende_attention_heads': [8, 8, 4, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 2]}",2022-11-30 01:12:45.339890,0,1378990170,25.719053744735888 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [3072, 1024, 2048, 2048, 2048, 2048], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 4, 8, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 2]}",2022-11-30 01:13:19.554133,0,1651138662,26.296467387644455 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 2048], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 4, 8], 'decoder_ende_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 01:13:55.039207,0,1443824736,25.92682154046769 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 2048, 2048, 2048, 2048, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 4, 8], 'decoder_ende_attention_heads': [4, 8, 4, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 01:14:25.339862,0,1378990170,25.618097500353073 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 3072, 2048, 
1024], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 8, 4, 8, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 01:14:58.311883,0,1443824736,26.05836967026129 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [3072, 2048, 2048, 2048, 2048, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [8, 4, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 2]}",2022-11-30 01:15:35.880349,0,1826995308,26.665846653894516 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 4, 8], 'decoder_ende_attention_heads': [4, 8, 4, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 01:16:05.964065,0,1316075610,25.603591515089885 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 4, 4, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 4, 8], 'decoder_ende_attention_heads': [4, 8, 4, 4, 4, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 01:16:35.543255,0,1316075610,25.726771790425637 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 2048], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 4, 8, 
4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 4, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 2]}",2022-11-30 01:17:09.354304,0,1588224102,26.44933722401916 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 2048], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 3072, 1024], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 1]}",2022-11-30 01:18:13.173994,0,2136595314,26.65406908362065 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_self_attention_heads': [4, 4, 4, 4, 4, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 4, 8], 'decoder_arbitrary_ende_attn': [-1, -1, 1, -1, 1, 1]}",2022-11-30 01:18:40.903167,0,982932564,21.753935985565093 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 1024, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [4, 4, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [-1, -1, 1, -1, 2, 1]}",2022-11-30 01:19:09.095827,0,1077304404,22.592892971887863 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_self_attention_heads': [4, 4, 4, 4, 4, 4], 
'decoder_ende_attention_heads': [4, 8, 8, 4, 4, 8], 'decoder_arbitrary_ende_attn': [-1, -1, 1, -1, 1, -1]}",2022-11-30 01:19:38.452316,0,1079224410,23.708705837824187 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 2048], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 1024, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 2, 1]}",2022-11-30 01:20:15.347627,0,1651138662,26.467237911633077 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 2048], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 3072, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 1]}",2022-11-30 01:20:53.408627,0,2015788908,26.720937123019265 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 2048, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 1]}",2022-11-30 01:21:30.489630,0,1732623468,26.539145946856273 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 2048, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 8], 
'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 1]}",2022-11-30 01:22:04.657299,0,1588224102,26.359690429140286 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 2048], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 1]}",2022-11-30 01:22:41.410352,0,1795538028,26.7016667761388 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_self_attention_heads': [4, 4, 4, 4, 4, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [-1, -1, 1, -1, 1, 1]}",2022-11-30 01:23:11.191925,0,1045847124,22.30727022285813 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 1]}",2022-11-30 01:23:48.495707,0,1945010028,26.92853123605158 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 4, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 4, 4, 4, 8], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [-1, -1, 1, -1, 2, 1]}",2022-11-30 
01:24:18.247397,0,1142138970,24.635872154023456 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_self_attention_heads': [4, 4, 4, 4, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 1]}",2022-11-30 01:24:48.348618,0,1127331930,24.1530263191461 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 2048], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 1024, 1024], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, -1]}",2022-11-30 01:25:26.938625,0,1860372594,26.730853774460698 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 2048, 1024], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 4, 4, 4, 8], 'decoder_ende_attention_heads': [8, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 1]}",2022-11-30 01:25:59.999243,0,1366103136,25.823752934217687 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 1]}",2022-11-30 01:26:38.109367,0,1460474976,25.87027954805624 +"{'encoder_embed_dim': [512], 
'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 2048], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 3072, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 1]}",2022-11-30 01:27:14.992048,0,1826995308,26.80291796821574 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 4, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 4, 4, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [-1, -1, -1, -1, 2, 1]}",2022-11-30 01:27:44.593561,0,1110681690,24.51106684934533 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_self_attention_heads': [4, 4, 4, 4, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 1]}",2022-11-30 01:28:13.315260,0,999582804,21.754138520655324 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_self_attention_heads': [4, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 1]}",2022-11-30 01:28:43.054739,0,1095874650,23.72840125694677 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 2048, 3072, 
3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 2048, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 2, 1]}",2022-11-30 01:29:16.453346,0,1540116582,26.383471679803876 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_self_attention_heads': [4, 4, 4, 4, 4, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [-1, -1, 1, -1, 1, 1]}",2022-11-30 01:29:44.596431,0,1014389844,21.82697530935213 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 2048], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 1]}",2022-11-30 01:30:22.103158,0,1523389536,25.605380278566003 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 2048, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 8], 'decoder_self_attention_heads': [4, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 4, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 1]}",2022-11-30 01:30:52.162081,0,1158789210,24.84491236430293 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 2048, 1024], 
'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 1]}",2022-11-30 01:31:27.756932,0,1571573862,26.52406333073416 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 1024, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 4, 4, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 4, 4, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 1]}",2022-11-30 01:31:58.252949,0,1253161050,25.329648091984946 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 1024, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 1]}",2022-11-30 01:32:34.692819,0,1669708908,26.5312087342803 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 2048, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_self_attention_heads': [4, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 2, 1]}",2022-11-30 01:33:04.851690,0,1316075610,25.720879408387013 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 8], 
'decoder_self_attention_heads': [4, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 4, 8], 'decoder_arbitrary_ende_attn': [-1, -1, 1, -1, 2, 1]}",2022-11-30 01:33:34.153425,0,1079224410,24.103822174703968 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 2048], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 1024, 1024], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 4, 4, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 1]}",2022-11-30 01:34:13.928399,0,2073680754,26.650073433583966 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 1]}",2022-11-30 01:34:52.003469,0,1491932256,25.68166831220549 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 4, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 1]}",2022-11-30 01:35:22.255164,0,1316075610,25.63882428357699 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 4], 
'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, -1]}",2022-11-30 01:35:51.968182,0,1347532890,25.833426111273635 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 2048], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 1024, 1024], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 1]}",2022-11-30 01:36:31.293492,0,1877022834,26.470763981649853 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 8], 'decoder_self_attention_heads': [4, 4, 4, 4, 4, 8], 'decoder_ende_attention_heads': [8, 8, 8, 4, 4, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 1]}",2022-11-30 01:37:01.112958,0,1062497364,22.484020387193706 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 2048], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 1]}",2022-11-30 01:37:41.228117,0,2105138034,26.83517275846072 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 1024, 1024], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [4, 4, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 4], 
'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 2, 1]}",2022-11-30 01:38:19.780658,0,1443824736,25.801099953730787 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 2048, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_self_attention_heads': [4, 4, 4, 4, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 4, 8], 'decoder_arbitrary_ende_attn': [-1, -1, 1, -1, 1, 1]}",2022-11-30 01:38:50.000595,0,1077304404,22.347079305577246 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 2048, 1024], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 4, 4, 4, 8], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 1]}",2022-11-30 01:39:21.802091,0,1380910176,25.69437359008145 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 2048, 2048, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, -1]}",2022-11-30 01:39:51.661673,0,1253161050,25.278872112854447 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 2048, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 1]}",2022-11-30 
01:40:28.495428,0,1701166188,26.376182230389777 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_self_attention_heads': [4, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 2, 1]}",2022-11-30 01:40:58.659095,0,1284618330,25.38778340159123 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 2048, 2048, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 1024, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 2, 1]}",2022-11-30 01:41:29.227300,0,1284618330,25.42396262714132 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 2048], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 2048, 1024], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 4, 4, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 2, 1]}",2022-11-30 01:42:01.693769,0,1491932256,26.049173770932143 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 2048, 2048, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 1024, 1024], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [4, 4, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 2, 1]}",2022-11-30 01:42:39.003275,0,1412367456,25.450957445704358 +"{'encoder_embed_dim': [512], 
'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 8], 'decoder_self_attention_heads': [4, 4, 4, 4, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [-1, -1, 1, -1, 1, 1]}",2022-11-30 01:43:08.560591,0,1205053530,24.982224495498585 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 8], 'decoder_self_attention_heads': [4, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 4, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 2, 1]}",2022-11-30 01:43:38.655869,0,1190246490,24.935927330385375 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 1024, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_self_attention_heads': [4, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [-1, -1, 1, -1, 2, 1]}",2022-11-30 01:44:07.301698,0,1045847124,22.58554082987191 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 1024, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 8], 'decoder_self_attention_heads': [4, 4, 4, 4, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 4, 4, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 1]}",2022-11-30 01:44:37.461976,0,1190246490,24.671958393096833 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 
1024, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_self_attention_heads': [4, 4, 4, 4, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 4, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 2, 1]}",2022-11-30 01:45:07.650100,0,1221703770,24.956348430879675 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 3072, 2048, 1024, 3072, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 2]}",2022-11-30 01:46:01.404524,0,1031040084,22.152066191159324 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [640], 'encoder_ffn_embed_dim': [2048, 3072, 3072, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 1024, 3072, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, -1, -1, -1, 1, 2]}",2022-11-30 01:46:38.986518,0,2082647148,26.632249118046865 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [2048, 3072, 2048, 1024, 3072, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 01:47:07.608826,0,1125411924,23.429789667811992 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 3072, 1024], 
'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 4, 4, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, 1, -1, 2]}",2022-11-30 01:47:37.867613,0,1127331930,24.145917266173598 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 3072, 3072, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 1024, 3072, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, -1, -1, -1, 1, 2]}",2022-11-30 01:48:15.090054,0,1826995308,26.770198001318732 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [640], 'encoder_ffn_embed_dim': [2048, 3072, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 3072, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, -1, -1, -1, 1, 2]}",2022-11-30 01:48:53.000294,0,2011868268,26.415120168334965 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 3072, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 4, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 01:49:26.441014,0,1382830182,25.316083834952625 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 3072, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 8, 8, 4, 
4], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, -1, 2]}",2022-11-30 01:49:59.544683,0,1414287462,25.711571467826957 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 1024, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 4, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, 1, -1, -1, 1, 2]}",2022-11-30 01:50:30.849009,0,1223546970,24.831513194585497 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [2048, 3072, 1024, 1024, 2048, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 8, 4, 4, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, -1, 1, -1, 1, 2]}",2022-11-30 01:51:01.107149,0,1079147604,22.675323576984212 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 4, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, -1, -1, 1, -1, 2]}",2022-11-30 01:51:31.946530,0,1143982170,24.30020519062015 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 4, 8], 
'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 01:52:01.944178,0,1158789210,24.760354519096623 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, -1, -1, -1, 1, 2]}",2022-11-30 01:52:32.644831,0,1175439450,24.745534135337422 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [2048, 3072, 2048, 1024, 3072, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, 1, 1, 2]}",2022-11-30 01:53:01.116235,0,1062497364,22.613029220889654 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 3072, 3072, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 3072, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 8, 8, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, -1, -1, -1, 1, 2]}",2022-11-30 01:53:35.041431,0,1619681382,26.2082672911056 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 1024, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 4, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 
'decoder_arbitrary_ende_attn': [2, 1, -1, -1, 1, 2]}",2022-11-30 01:54:05.943378,0,1255004250,25.072608209842993 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [2048, 3072, 1024, 1024, 2048, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 8, 4, 4, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, -1, 1, -1, 1, 2]}",2022-11-30 01:54:35.408367,0,1110604884,23.1627621167102 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 1024, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 4, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, 1, -1, -1, 1, 2]}",2022-11-30 01:55:06.650719,0,1317918810,25.578281446432214 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 2]}",2022-11-30 01:55:36.475238,0,1127331930,24.221637357380914 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 2048, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, -1, -1, -1, 1, 2]}",2022-11-30 
01:56:12.790186,0,1669708908,26.54140556368842 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 1024, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, -1, -1, -1, 1, 2]}",2022-11-30 01:56:43.360950,0,1206896730,24.98000861493876 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 3072, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 2048, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, -1, -1, -1, 1, 2]}",2022-11-30 01:57:13.953244,0,1364183130,25.71705181089393 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 1024, 2048, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, -1, -1, -1, 1, 2]}",2022-11-30 01:57:51.090401,0,1701166188,26.74516092088886 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 3072, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 2048, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 4], 'decoder_arbitrary_ende_attn': [2, -1, -1, -1, 1, 2]}",2022-11-30 01:58:27.981238,0,1732623468,26.832353130261968 +"{'encoder_embed_dim': [512], 
'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 2048, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, -1, 1, -1, 1, 2]}",2022-11-30 01:58:56.696835,0,1047690324,21.97995176988117 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 3072, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 3072, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 8, 8, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 4, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, -1, -1, -1, 1, 2]}",2022-11-30 01:59:31.345276,0,1588224102,26.3358070373563 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 3072, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 4, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 2]}",2022-11-30 02:00:00.113102,0,1062497364,22.362618000056415 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 4, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, 1, 1, 2]}",2022-11-30 02:00:29.945995,0,1127331930,24.648057370763627 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 
3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 1024, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, -1, -1, -1, 1, 2]}",2022-11-30 02:01:00.265931,0,1301268570,25.436935433814305 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 3072, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 1024, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 4, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, 1, -1, -1, 1, 2]}",2022-11-30 02:01:31.151981,0,1380833370,25.654131643295717 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 3072, 3072, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 3072, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, -1, -1, -1, 1, 2]}",2022-11-30 02:02:09.301256,0,1795538028,26.84364156318439 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 2048, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 2]}",2022-11-30 02:02:37.517073,0,1031040084,21.937041004808965 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 3072, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 3072, 1024], 
'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 8, 8, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, -1, -1, -1, 1, 2]}",2022-11-30 02:03:12.651254,0,1556766822,26.361614746242726 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 1024, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 4, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, 1, -1, -1, 1, 2]}",2022-11-30 02:03:43.953623,0,1286461530,25.33590186847325 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, 1, -1, 2]}",2022-11-30 02:04:14.640998,0,1127331930,24.26800692018226 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 2048, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 4, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 02:04:48.236910,0,1445744742,25.8605605687801 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 3072, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 8, 8, 4, 4], 
'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, -1, -1, -1, -1, 2]}",2022-11-30 02:05:22.538414,0,1430937702,25.736368197822884 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 3072, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 02:05:51.408503,0,1031040084,22.160851653528795 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 3072, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 8, 8, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 4, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, -1, -1, -1, 1, 2]}",2022-11-30 02:06:26.648873,0,1525309542,26.283505470568386 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 3072, 3072, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 1024, 2048, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 8, 8, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 4], 'decoder_arbitrary_ende_attn': [2, -1, -1, -1, 1, 2]}",2022-11-30 02:07:01.841145,0,1651138662,26.200923396852648 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 3072, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 4, 8], 
'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, -1, -1, -1, 1, 2]}",2022-11-30 02:07:36.826535,0,1493852262,26.022938040890846 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 3072, 3072, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 1024, 3072, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 8, 8, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, -1, -1, -1, 1, 2]}",2022-11-30 02:08:11.743622,0,1651138662,26.323507870111342 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [2048, 3072, 2048, 1024, 3072, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 8, 4, 4, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 02:08:40.810079,0,1093954644,23.10914434974903 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [2048, 3072, 2048, 1024, 3072, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 4, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 2]}",2022-11-30 02:09:10.497316,0,1093954644,22.93525727677559 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 2048, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 
'decoder_arbitrary_ende_attn': [2, -1, -1, -1, 1, 2]}",2022-11-30 02:09:45.035662,0,1525309542,26.303765162822714 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 1024, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 2048, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [4, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, -1, 1, -1, 1, 2]}",2022-11-30 02:10:15.853041,0,1269811290,25.14969354058586 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 3072, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 1024, 2048, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, -1, -1, -1, 1, 2]}",2022-11-30 02:10:53.034327,0,1764080748,26.719747425528375 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 2048, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 8, 8, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, -1, -1, -1, -1, 2]}",2022-11-30 02:11:27.448102,0,1493852262,26.147631963028342 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 3072, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, -1, -1, -1, -1, 2]}",2022-11-30 
02:12:01.745228,0,1462394982,25.806638854264037 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 4, 8], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [2, 1, -1, 1, 1, 2]}",2022-11-30 02:12:33.064042,0,1192089690,24.67646662934568 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 2048, 1024, 3072, 3072, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_self_attention_heads': [8, 8, 4, 4, 4, 8], 'decoder_ende_attention_heads': [8, 4, 4, 8, 8, 4], 'decoder_arbitrary_ende_attn': [-1, -1, 2, 1, -1, 1]}",2022-11-30 02:13:31.905219,0,1014389844,22.07444316375671 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 3072], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 2048, 1024, 2048], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 8, 8, 8, 4, 8], 'decoder_self_attention_heads': [4, 8, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 2, -1]}",2022-11-30 02:14:13.239756,0,2196499314,26.808937868203532 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 3072, 1024, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_arbitrary_ende_attn': [-1, -1, 2, -1, 2, 1]}",2022-11-30 02:14:45.016226,0,1108761684,22.99185474514479 +"{'encoder_embed_dim': [512], 
'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 3072, 1024, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [-1, -1, 2, -1, 2, 1]}",2022-11-30 02:15:16.151531,0,1110681690,24.451311787293157 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 1024, 3072], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 1]}",2022-11-30 02:15:52.689296,0,1653058668,26.388890341662112 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 2048, 1024, 2048], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 2, -1]}",2022-11-30 02:16:30.799650,0,1945010028,26.832269839482606 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 3072], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 2048, 1024, 3072], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 1]}",2022-11-30 02:17:09.628703,0,1810345068,26.87035240576452 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 
2048, 1024, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 1024, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 2, 1]}",2022-11-30 02:17:41.163075,0,1158789210,24.73994162716646 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 3072, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 2, 1]}",2022-11-30 02:18:12.461543,0,1253161050,25.18416515633912 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 3072, 1024, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 1024, 2048], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 2, 1]}",2022-11-30 02:18:46.947088,0,1445744742,25.761961494652613 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 1024, 2048], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 2, 1]}",2022-11-30 02:19:21.548056,0,1540116582,26.36816465236295 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 1024, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 1024, 
2048], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 1]}",2022-11-30 02:19:55.543285,0,1477202022,26.259735292233113 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 3072], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 2048, 1024, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 2, 1]}",2022-11-30 02:20:26.257628,0,1410447450,25.769535189140193 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 1024, 2048], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 2, 1]}",2022-11-30 02:21:00.935996,0,1508659302,26.45714906422908 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 3072], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 2048, 1024, 2048], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 8, 8, 4, 8], 'decoder_self_attention_heads': [4, 8, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 2, -1]}",2022-11-30 02:21:39.693771,0,2064818028,27.05141228767061 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 2048, 1024, 2048], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 8, 8, 
8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 2, 1]}",2022-11-30 02:22:16.858717,0,1634488422,26.322548970809848 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 2048, 1024, 3072], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 1]}",2022-11-30 02:22:53.592330,0,1715973228,26.657027696998984 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 3072, 1024, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [-1, -1, 2, -1, 2, 1]}",2022-11-30 02:23:23.753177,0,1142138970,24.539293515717535 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 3072], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 2048, 1024, 2048], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 8, 8, 4, 8], 'decoder_self_attention_heads': [4, 8, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 2, -1]}",2022-11-30 02:24:02.314804,0,1860295788,27.020732069299534 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 3072], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 2048, 1024, 2048], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 8, 8, 4, 8], 'decoder_self_attention_heads': [4, 8, 8, 8, 8, 4], 
'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 2, -1]}",2022-11-30 02:24:40.894881,0,2023653228,26.816836839483663 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 3072], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 2048, 1024, 2048], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 8, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 2, -1]}",2022-11-30 02:25:19.593071,0,1860295788,26.957189752523504 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 1024, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 2, 1]}",2022-11-30 02:25:50.458625,0,1284618330,25.61733025863949 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 2048, 1024, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 4], 'decoder_arbitrary_ende_attn': [-1, -1, 2, -1, 1, 1]}",2022-11-30 02:26:20.744436,0,1236510810,25.25415313294958 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 2048, 3072, 3072, 3072, 3072], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 4, 4, 8], 'decoder_ende_attention_heads': [8, 4, 4, 8, 8, 4], 
'decoder_arbitrary_ende_attn': [-1, -1, 2, -1, 1, -1]}",2022-11-30 02:26:53.298888,0,1045847124,22.08090715962085 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 1024, 3072], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [-1, -1, -1, -1, 1, 1]}",2022-11-30 02:27:27.137456,0,1492009062,26.052263535617904 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 1024, 2048], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 2, 1]}",2022-11-30 02:28:01.945411,0,1571573862,26.118237209470863 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 2048, 1024, 3072], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, 1]}",2022-11-30 02:28:38.594669,0,1747430508,26.903183105861135 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 2048, 1024, 2048], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 8, 8, 8, 4, 8], 'decoder_self_attention_heads': [4, 8, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 2, -1]}",2022-11-30 
02:29:18.253776,0,2076691314,26.879803077690028 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 3072], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 2048, 1024, 3072], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, -1]}",2022-11-30 02:29:56.298060,0,2003070828,26.839090098499142 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 2048, 1024, 3072], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 2, 1]}",2022-11-30 02:30:33.772050,0,1603031142,26.482417706814886 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 3072], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 2048, 1024, 2048], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 8, 8, 8, 4, 8], 'decoder_self_attention_heads': [4, 8, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 2, -1]}",2022-11-30 02:31:13.542551,0,2155334514,26.87436116692265 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 3072, 3072, 3072], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 4, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 4, 4, 8], 'decoder_ende_attention_heads': [8, 4, 4, 8, 8, 4], 'decoder_arbitrary_ende_attn': [-1, -1, 2, -1, 1, -1]}",2022-11-30 02:31:47.410084,0,1077304404,22.604758748152854 +"{'encoder_embed_dim': 
[512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 3072, 1024, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 4, 4, 8, 8, 4], 'decoder_arbitrary_ende_attn': [-1, -1, -1, -1, 2, -1]}",2022-11-30 02:32:17.856179,0,1173596250,24.7651262424877 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 3072, 1024, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, -1]}",2022-11-30 02:32:48.975305,0,1127331930,24.704098583912252 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 1024, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': [-1, -1, -1, -1, 1, -1]}",2022-11-30 02:33:19.260398,0,1205053530,25.137127615371046 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 1024, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, 1, -1, -1, 2, 1]}",2022-11-30 02:33:51.759557,0,1364183130,25.966442456302147 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 
1024, 2048, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 1024, 2048], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 2, 1]}",2022-11-30 02:34:21.926894,0,1093954644,23.220261036558885 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 3072, 1024, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 2, -1]}",2022-11-30 02:34:53.358603,0,1190246490,25.1359189611972 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 1024, 2048], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 2, 1]}",2022-11-30 02:35:22.930318,0,1062497364,22.7755095138295 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 1024, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 2048, 1024, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 2, 1]}",2022-11-30 02:35:53.858887,0,1284618330,25.537416313175953 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 2048, 1024, 
2048], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 2, -1]}",2022-11-30 02:36:31.595724,0,1764080748,26.882073104251866 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 2048, 1024, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 2, 1]}",2022-11-30 02:37:02.267669,0,1316075610,25.570143211664274 +"{'encoder_embed_dim': [640], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 3072], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 2048, 1024, 2048], 'decoder_layer_num': [6], 'encoder_self_attention_heads': [8, 8, 8, 8, 4, 8], 'decoder_self_attention_heads': [4, 8, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, -1, -1, 1, -1]}",2022-11-30 02:37:41.647836,0,2134752114,26.898452516723687 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 2048, 3072, 2048, 1024, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [8, 4, 4, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 2, 1]}",2022-11-30 02:38:13.005215,0,1031040084,22.282220573694513 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 2048, 1024, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 
8, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 2, 1]}",2022-11-30 02:38:44.151950,0,1347532890,25.784134961825835 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 3072, 1024, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 2, -1]}",2022-11-30 02:39:15.266274,0,1221703770,25.202375572779086 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 1024, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, 1, 2, -1, 2, 1]}",2022-11-30 02:39:46.953641,0,1332725850,25.851297056506393 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 3072, 1024, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 8, 8, 4], 'decoder_ende_attention_heads': [8, 8, 4, 8, 8, 4], 'decoder_arbitrary_ende_attn': [-1, -1, 2, -1, 1, -1]}",2022-11-30 02:40:17.653376,0,1173596250,24.972344992193374 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 2048, 3072, 3072, 3072, 3072], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 4, 4, 4], 
'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 1, -1]}",2022-11-30 02:40:49.609980,0,1062497364,22.535667407831653 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [-1, -1, 2, -1, 2, 1]}",2022-11-30 02:41:20.761069,0,1110681690,24.40576457445596 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 3072, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 2048, 1024, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 2, 1]}",2022-11-30 02:42:27.496173,0,1828838508,26.85145321742662 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 1024, 2048, 3072], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 4, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 4, 4, 4], 'decoder_ende_attention_heads': [4, 8, 4, 4, 4, 4], 'decoder_arbitrary_ende_attn': [-1, 1, 2, -1, -1, -1]}",2022-11-30 02:42:57.919093,0,982932564,21.761604389716634 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 1024, 2048, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 4, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_ende_attention_heads': [4, 8, 4, 4, 4, 4], 
'decoder_arbitrary_ende_attn': [-1, -1, 2, -1, -1, -1]}",2022-11-30 02:43:28.655608,0,1079224410,23.718368472465446 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 1024, 1024, 3072], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 8, 4, 8], 'decoder_ende_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, -1, -1]}",2022-11-30 02:43:58.512911,0,1062497364,22.74182664332034 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 1024], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 1, -1]}",2022-11-30 02:44:32.816447,0,1445667936,26.19212122197997 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 1024, 1024], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 1, 1]}",2022-11-30 02:45:10.899684,0,1477125216,26.010153160637532 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 2048, 2048, 3072], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 4, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 4, 4, 4], 'decoder_ende_attention_heads': [4, 8, 4, 4, 4, 4], 'decoder_arbitrary_ende_attn': [-1, 1, 2, -1, -1, -1]}",2022-11-30 
02:45:41.313907,0,1045847124,22.827474314422002 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 1024, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 8, 8, 4, 8], 'decoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 1, -1]}",2022-11-30 02:46:19.207836,0,1749273708,26.60418713411001 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 3072], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 8, 8, 8, 4, 8], 'decoder_self_attention_heads': [8, 4, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, -1, 1]}",2022-11-30 02:46:53.592587,0,1414210656,26.00726305388265 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 1024, 1024, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 8, 4, 8], 'decoder_ende_attention_heads': [8, 8, 4, 4, 8, 4], 'decoder_arbitrary_ende_attn': [-1, -1, 2, -1, 1, 1]}",2022-11-30 02:47:23.850237,0,1142138970,24.6489542661405 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 1024, 2048, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 4, 8], 'decoder_self_attention_heads': [8, 8, 8, 4, 4, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, -1, 1]}",2022-11-30 02:47:54.746779,0,1316075610,25.689218653291295 +"{'encoder_embed_dim': [512], 
'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 1024, 1024, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 4, 8], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 2, 1]}",2022-11-30 02:48:25.548201,0,1347532890,25.751911210030475 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 2048, 1024, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 8, 8, 4, 8], 'decoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 1, 1]}",2022-11-30 02:49:04.198324,0,1780730988,27.00579454971298 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 3072, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 2048, 1024, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_self_attention_heads': [8, 8, 8, 8, 4, 8], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 1, 1]}",2022-11-30 02:49:42.700124,0,1812188268,26.874801178813083 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 2048, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, -1, -1]}",2022-11-30 02:50:13.662203,0,1158789210,24.921368325030492 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 
1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 1024, 1024, 3072], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 4, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, -1, -1]}",2022-11-30 02:50:43.904715,0,999582804,22.026558500552152 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 1, -1]}",2022-11-30 02:51:22.015853,0,1686359148,26.62215586535193 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 3072, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 1024, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, -1, 1]}",2022-11-30 02:52:00.755827,0,1667788902,26.61057212093059 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 2048, 2048, 3072], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_self_attention_heads': [8, 8, 8, 8, 4, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 2, -1]}",2022-11-30 02:52:37.454316,0,1636331622,26.6512297923577 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 1024, 2048, 1024], 
'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 4, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 4, 4, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [-1, 1, 2, -1, -1, 1]}",2022-11-30 02:53:07.714810,0,1014389844,22.393618015178536 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 1024, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_self_attention_heads': [8, 8, 8, 8, 4, 8], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, -1, 1]}",2022-11-30 02:53:43.144481,0,1510502502,25.993038659880476 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, -1, -1]}",2022-11-30 02:54:19.052536,0,1573417062,26.421662873667255 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 1024, 1024, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, -1, -1]}",2022-11-30 02:54:49.657459,0,1095874650,23.883133083803862 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 1024, 2048, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 4, 
4], 'decoder_self_attention_heads': [8, 4, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, 1, 2, -1, -1, -1]}",2022-11-30 02:55:21.070738,0,1206896730,25.148678288815 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 1024, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 2048, 2048, 3072], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 4, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 4, 4, 4], 'decoder_ende_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [-1, 1, 2, -1, -1, -1]}",2022-11-30 02:55:52.009648,0,1014389844,22.06913488473211 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 1024], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, -1, -1]}",2022-11-30 02:56:25.509329,0,1382753376,25.54875613062997 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 2048, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 1024, 2048, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_self_attention_heads': [8, 8, 8, 8, 4, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_arbitrary_ende_attn': [1, 1, 2, -1, -1, -1]}",2022-11-30 02:56:56.957773,0,1301268570,25.649734062232383 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 1024, 1024, 1024], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 8, 8, 8, 4, 8], 'decoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 
'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, 1, 2, -1, 1, 1]}",2022-11-30 02:57:31.518411,0,1493775456,26.10832383442528 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 1024, 1024, 3072], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 8, 4, 8], 'decoder_ende_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, -1, -1]}",2022-11-30 02:58:01.613773,0,1031040084,22.362002479551165 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 1024, 1024, 3072], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 4, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 8, 4, 8], 'decoder_ende_attention_heads': [4, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, -1, -1]}",2022-11-30 02:58:31.705698,0,1062497364,22.819328594276566 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 1024, 1024, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 8, 4, 8], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [-1, -1, 2, -1, -1, 1]}",2022-11-30 02:59:02.697733,0,1173596250,24.988685667678173 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 2048, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 4, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 4, 4], 
'decoder_arbitrary_ende_attn': [1, -1, 2, -1, -1, -1]}",2022-11-30 02:59:33.659490,0,1221703770,25.302962108892185 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 1024, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 8, 8, 4, 8], 'decoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, -1, 1]}",2022-11-30 03:00:09.446922,0,1541959782,26.112077468690867 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 1024, 1024, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [-1, -1, 2, -1, -1, -1]}",2022-11-30 03:00:39.473024,0,1079224410,23.955772671551667 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 1024, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, -1, -1]}",2022-11-30 03:01:15.155182,0,1541959782,26.31776013053777 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 1024, 2048, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 4, 4, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, -1, -1]}",2022-11-30 
03:01:46.072247,0,1253161050,25.489643989727142 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 1024, 1024, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 4, 4, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, 1, 2, -1, -1, 1]}",2022-11-30 03:02:17.596453,0,1364183130,25.815981278057624 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 1024, 2048, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 4, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 4, 4, 8], 'decoder_ende_attention_heads': [8, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [-1, 1, 2, -1, -1, -1]}",2022-11-30 03:02:48.933726,0,1045847124,22.32432265753782 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 1024, 2048, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 4, 4, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_arbitrary_ende_attn': [1, 1, 2, -1, -1, -1]}",2022-11-30 03:03:21.469495,0,1269811290,25.513369223069635 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 2048, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 1024, 2048, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 4, 4, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, -1, -1]}",2022-11-30 03:03:53.160286,0,1284618330,25.46582706181441 +"{'encoder_embed_dim': 
[512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 1, -1]}",2022-11-30 03:04:31.684525,0,1717816428,26.710511535879036 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [8, 8, 8, 8, 4, 8], 'decoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 1, -1]}",2022-11-30 03:05:10.042233,0,1717816428,26.84283267727433 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 2048, 1024, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 2048, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, -1, -1]}",2022-11-30 03:05:41.855922,0,1190246490,24.970583908771232 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 2048, 2048, 1024, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_self_attention_heads': [8, 8, 8, 8, 4, 8], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 2, -1]}",2022-11-30 03:06:20.184503,0,1636331622,26.504345633971067 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 
1024, 1024, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 1024, 1024, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, -1, -1]}",2022-11-30 03:06:51.460598,0,1127331930,24.268248941796838 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, -1, 1]}",2022-11-30 03:07:28.142691,0,1573417062,26.430998131687453 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 1024, 2048, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 4, 8], 'decoder_self_attention_heads': [8, 8, 8, 8, 4, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, -1, 1]}",2022-11-30 03:07:59.749480,0,1253161050,25.38845190370804 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 2048, 1024, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_self_attention_heads': [8, 4, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, -1, -1]}",2022-11-30 03:08:30.871157,0,1190246490,25.041571400304573 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 2048, 
2048, 3072], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 4, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_ende_attention_heads': [4, 8, 4, 4, 4, 4], 'decoder_arbitrary_ende_attn': [-1, -1, 2, -1, -1, -1]}",2022-11-30 03:09:02.664589,0,1110681690,24.006237352801346 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 1024, 2048, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 8, 4, 4], 'decoder_ende_attention_heads': [8, 8, 8, 4, 4, 4], 'decoder_arbitrary_ende_attn': [1, 1, 2, -1, -1, -1]}",2022-11-30 03:09:34.238610,0,1238354010,25.34276030239052 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 3072, 2048], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, -1, 2]}",2022-11-30 03:10:34.311288,0,999582804,21.80118151877014 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 2048, 1024, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [2, -1, 2, -1, 1, 1]}",2022-11-30 03:11:13.593310,0,1797381228,27.043140669460854 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 2048, 1024, 2048], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 4, 
8, 4, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, -1, 1]}",2022-11-30 03:11:43.503387,0,1093954644,23.007414463752117 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 3072, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_self_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, -1, 2]}",2022-11-30 03:12:14.163528,0,1095874650,23.79086433289906 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 1024, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 8, 8, 8, 4], 'decoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [2, -1, 2, -1, 1, 1]}",2022-11-30 03:12:50.848022,0,1621524582,26.025864778342207 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_self_attention_heads': [8, 8, 8, 8, 4, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, 1, 1, 1]}",2022-11-30 03:13:21.556759,0,1190246490,25.077358466768757 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 1024, 1024, 2048], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 
'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 1]}",2022-11-30 03:13:55.398877,0,1429017696,26.12059006524399 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 2048], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, 1, -1, 1]}",2022-11-30 03:14:26.326007,0,999582804,22.127524836343532 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 2048, 2048, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, -1, 1]}",2022-11-30 03:14:57.762642,0,1221703770,25.21914419948441 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 2048, 1024, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 1, 1]}",2022-11-30 03:15:36.252929,0,1667788902,26.6927938370969 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 2048, 1024, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': 
[2, -1, 2, -1, 1, 1]}",2022-11-30 03:16:14.254276,0,1652981862,26.66556123105007 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 4, 8], 'decoder_ende_attention_heads': [4, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, -1, 2]}",2022-11-30 03:16:45.445260,0,1127331930,24.13548008816253 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, -1, 2]}",2022-11-30 03:17:15.639603,0,1031040084,22.152066191159324 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 2048], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 1, 1]}",2022-11-30 03:17:54.414138,0,1477125216,26.149095479536744 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 1024, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 8, 4, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, 2, 1, 1]}",2022-11-30 
03:18:25.964419,0,1347532890,25.790673448942552 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 2048], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, 1, 1, 1]}",2022-11-30 03:18:56.523414,0,1031040084,22.518290658576667 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 8, 8, 8, 4, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, 2, 1, 1]}",2022-11-30 03:19:27.666139,0,1316075610,25.42608049975173 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 8, 4, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, 1, 1, 1]}",2022-11-30 03:19:59.551900,0,1221703770,25.355575788780996 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, -1, 2]}",2022-11-30 03:20:30.914307,0,1062497364,22.58490426133199 +"{'encoder_embed_dim': [512], 
'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 1024, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, 2, 1, 1]}",2022-11-30 03:21:03.259812,0,1316075610,25.632790518407255 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 2048], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 1, 1]}",2022-11-30 03:21:37.308205,0,1445667936,26.01769230789309 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 2048, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 8, 8, 8, 4, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 2]}",2022-11-30 03:22:13.654111,0,1493852262,26.206006379852774 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 8, 4, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 1]}",2022-11-30 03:22:44.661526,0,1127331930,24.704098583912252 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 3072, 
1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 3072, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_self_attention_heads': [4, 8, 8, 4, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 2]}",2022-11-30 03:23:16.791537,0,1158789210,24.4930221168223 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 1024, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 2048], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 4, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, 1, -1, 1]}",2022-11-30 03:23:48.818752,0,1062497364,22.608186762946183 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 1024, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 8, 8, 8, 4], 'decoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [2, -1, 2, -1, 1, 1]}",2022-11-30 03:24:26.151864,0,1590067302,26.365434081423825 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 2048], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 1, 1]}",2022-11-30 03:25:04.997659,0,1686359148,26.898935796393708 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 2048], 
'decoder_layer_num': [3], 'encoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 1]}",2022-11-30 03:25:38.612617,0,1397560416,26.061090394566637 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [2, -1, 2, -1, 1, 1]}",2022-11-30 03:26:18.290877,0,1734466668,26.887018361917384 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 4, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, 1, 1, 1]}",2022-11-30 03:26:50.457197,0,1284618330,25.456784215173144 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 2048], 'decoder_layer_num': [3], 'encoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 1]}",2022-11-30 03:27:24.405212,0,1397560416,26.060964017387292 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 
'decoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [2, -1, 2, -1, 1, 1]}",2022-11-30 03:28:00.762632,0,1558610022,26.24033664128142 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, -1, 1]}",2022-11-30 03:28:30.909491,0,1093954644,22.91490934985308 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 8, 8, 8, 4], 'decoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 1, 1]}",2022-11-30 03:29:06.861622,0,1573417062,26.51638836290174 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 2048, 2048, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 1]}",2022-11-30 03:29:38.763394,0,1253161050,25.32548928774925 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 2048], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 
'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 1, 1]}",2022-11-30 03:30:14.354144,0,1541959782,26.36876929191619 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 3072, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 4, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 1]}",2022-11-30 03:30:45.758366,0,1095874650,24.13357019256553 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [2, -1, 2, -1, 1, 1]}",2022-11-30 03:31:22.344668,0,1527152742,26.27025764535305 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 1024, 2048, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 8, 4, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, 1, 1, 1]}",2022-11-30 03:31:54.461060,0,1253161050,25.41836880480659 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 4, 8, 8, 8, 8], 'decoder_ende_attention_heads': [4, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': 
[1, -1, 1, -1, 1, 1]}",2022-11-30 03:32:26.060470,0,1284618330,25.472978060833974 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_layer_num': [4], 'encoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 1, 1]}",2022-11-30 03:33:02.551992,0,1510502502,26.241380252439953 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 1024, 2048, 2048, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 1]}",2022-11-30 03:33:33.960758,0,1347532890,25.707442276004617 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 1024, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 8, 8, 8, 4, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 1, 1]}",2022-11-30 03:34:12.877897,0,1717816428,26.937139740910155 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 3072, 2048], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_ende_attention_heads': [4, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 2]}",2022-11-30 
03:34:43.973196,0,1095874650,23.88536811980862 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 2048, 1024, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 1, 1]}",2022-11-30 03:35:22.799736,0,1749273708,27.035785346779857 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [2048, 1024, 3072, 2048, 1024, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, -1, 1, 1]}",2022-11-30 03:36:01.488138,0,1780730988,26.94260959266693 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [2048, 2048, 2048, 3072, 3072, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 3072, 2048, 1024, 1024], 'decoder_layer_num': [5], 'encoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [2, -1, 2, -1, 1, 1]}",2022-11-30 03:36:41.098531,0,1765923948,27.1368398440508 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 2048], 'decoder_layer_num': [1], 'encoder_self_attention_heads': [8, 8, 8, 4, 8, 4], 'decoder_self_attention_heads': [8, 4, 8, 4, 4, 8], 'decoder_ende_attention_heads': [4, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 2, 1, 1, 1]}",2022-11-30 03:37:11.314304,0,1062497364,22.486957414672982 +"{'encoder_embed_dim': [512], 
'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 2048, 2048, 2048, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 2048, 1024, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 4, 8, 8, 8, 8], 'decoder_self_attention_heads': [4, 8, 8, 8, 8, 8], 'decoder_ende_attention_heads': [8, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, 1, 1]}",2022-11-30 03:37:43.678526,0,1190246490,24.977323840246868 +"{'encoder_embed_dim': [512], 'decoder_embed_dim': [512], 'encoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 1024, 1024], 'decoder_ffn_embed_dim': [1024, 1024, 1024, 1024, 3072, 1024], 'decoder_layer_num': [2], 'encoder_self_attention_heads': [4, 4, 8, 4, 8, 8], 'decoder_self_attention_heads': [8, 8, 8, 4, 4, 8], 'decoder_ende_attention_heads': [4, 8, 8, 8, 8, 4], 'decoder_arbitrary_ende_attn': [1, -1, 1, -1, -1, 2]}",2022-11-30 03:38:15.756806,0,1095874650,23.82580846204717 diff --git a/neural_compressor/conf/config.py b/neural_compressor/conf/config.py index 64a100dc113..f3bb4b5206e 100644 --- a/neural_compressor/conf/config.py +++ b/neural_compressor/conf/config.py @@ -1091,6 +1091,7 @@ def percent_to_float(data): Optional("num_evals", default=100000): int, Optional("results_csv_path", default=None): str, Optional("dataset_path", default=None): str, + Optional("supernet_ckpt_path", default=None): str, Optional("batch_size", default=64): int, }, }, diff --git a/neural_compressor/experimental/nas/dynas.py b/neural_compressor/experimental/nas/dynas.py index 5f061f6006f..3c63e19d27f 100644 --- a/neural_compressor/experimental/nas/dynas.py +++ b/neural_compressor/experimental/nas/dynas.py @@ -15,8 +15,8 @@ # limitations under the License. 
import os -import pandas as pd +import pandas as pd from neural_compressor.conf.config import Conf, NASConfig from neural_compressor.utils import logger @@ -37,36 +37,73 @@ class DyNAS(NASBase): def __init__(self, conf_fname_or_obj): """Initialize the attributes.""" - from .dynast.dynas_manager import ParameterManager + from .dynast.dynas_manager import (ParameterManager, + TransformerLTEncoding) from .dynast.dynas_predictor import Predictor - from .dynast.dynas_search import ProblemMultiObjective, SearchAlgoManager + from .dynast.dynas_search import (ProblemMultiObjective, + SearchAlgoManager) from .dynast.dynas_utils import (EvaluationInterfaceMobileNetV3, - EvaluationInterfaceResNet50, OFARunner) + EvaluationInterfaceResNet50, + EvaluationInterfaceTransformerLT, + OFARunner, TransformerLTRunner) + self.ParameterManager = ParameterManager self.Predictor = Predictor self.ProblemMultiObjective = ProblemMultiObjective self.SearchAlgoManager = SearchAlgoManager - self.OFARunner = OFARunner self.SUPERNET_PARAMETERS = { - 'ofa_resnet50': - {'d' : {'count' : 5, 'vars' : [0, 1, 2]}, - 'e' : {'count' : 18, 'vars' : [0.2, 0.25, 0.35]}, - 'w' : {'count' : 6, 'vars' : [0, 1, 2]} }, - 'ofa_mbv3_d234_e346_k357_w1.0': - {'ks' : {'count' : 20, 'vars' : [3, 5, 7]}, - 'e' : {'count' : 20, 'vars' : [3, 4, 6]}, - 'd' : {'count' : 5, 'vars' : [2, 3, 4]} }, - 'ofa_mbv3_d234_e346_k357_w1.2': - {'ks' : {'count' : 20, 'vars' : [3, 5, 7]}, - 'e' : {'count' : 20, 'vars' : [3, 4, 6]}, - 'd' : {'count' : 5, 'vars' : [2, 3, 4]} } - } + 'ofa_resnet50': { + 'd': {'count': 5, 'vars': [0, 1, 2]}, + 'e': {'count': 18, 'vars': [0.2, 0.25, 0.35]}, + 'w': {'count': 6, 'vars': [0, 1, 2]}, + }, + 'ofa_mbv3_d234_e346_k357_w1.0': { + 'ks': {'count': 20, 'vars': [3, 5, 7]}, + 'e': {'count': 20, 'vars': [3, 4, 6]}, + 'd': {'count': 5, 'vars': [2, 3, 4]}, + }, + 'ofa_mbv3_d234_e346_k357_w1.2': { + 'ks': {'count': 20, 'vars': [3, 5, 7]}, + 'e': {'count': 20, 'vars': [3, 4, 6]}, + 'd': {'count': 5, 'vars': 
[2, 3, 4]}, + }, + 'transformer_lt_wmt_en_de': { + 'encoder_embed_dim': {'count': 1, 'vars': [640, 512]}, + 'decoder_embed_dim': {'count': 1, 'vars': [640, 512]}, + 'encoder_ffn_embed_dim': {'count': 6, 'vars': [3072, 2048, 1024]}, + 'decoder_ffn_embed_dim': {'count': 6, 'vars': [3072, 2048, 1024]}, + 'decoder_layer_num': {'count': 1, 'vars': [6, 5, 4, 3, 2, 1]}, + 'encoder_self_attention_heads': {'count': 6, 'vars': [8, 4]}, + 'decoder_self_attention_heads': {'count': 6, 'vars': [8, 4]}, + 'decoder_ende_attention_heads': {'count': 6, 'vars': [8, 4]}, + 'decoder_arbitrary_ende_attn': {'count': 6, 'vars': [-1, 1, 2]}, + }, + } + self.RUNNERS = { + 'ofa_resnet50': OFARunner, + 'ofa_mbv3_d234_e346_k357_w1.0': OFARunner, + 'ofa_mbv3_d234_e346_k357_w1.2': OFARunner, + 'transformer_lt_wmt_en_de': TransformerLTRunner, + } + self.EVALUATION_INTERFACE = {'ofa_resnet50': EvaluationInterfaceResNet50, 'ofa_mbv3_d234_e346_k357_w1.0': EvaluationInterfaceMobileNetV3, - 'ofa_mbv3_d234_e346_k357_w1.2': EvaluationInterfaceMobileNetV3} + 'ofa_mbv3_d234_e346_k357_w1.2': EvaluationInterfaceMobileNetV3, + 'transformer_lt_wmt_en_de': EvaluationInterfaceTransformerLT} + self.LINAS_INNERLOOP_EVALS = {'ofa_resnet50': 5000, 'ofa_mbv3_d234_e346_k357_w1.0': 20000, - 'ofa_mbv3_d234_e346_k357_w1.2': 20000} + 'ofa_mbv3_d234_e346_k357_w1.2': 20000, + 'transformer_lt_wmt_en_de': 10000} + + self.SUPERNET_ENCODING = { + 'ofa_resnet50': ParameterManager, + 'ofa_mbv3_d234_e346_k357_w1.0': ParameterManager, + 'ofa_mbv3_d234_e346_k357_w1.2': ParameterManager, + 'ofa_proxyless_d234_e346_k357_w1.3': ParameterManager, + 'transformer_lt_wmt_en_de': TransformerLTEncoding, + } + super().__init__() self.acc_predictor = None self.macs_predictor = None @@ -74,7 +111,6 @@ def __init__(self, conf_fname_or_obj): self.results_csv_path = None self.init_cfg(conf_fname_or_obj) - def estimate(self, individual): """Estimate performance of the model. 
@@ -85,19 +121,19 @@ def estimate(self, individual): def init_for_search(self): """Initialize the search configuration.""" - self.supernet_manager = self.ParameterManager( - param_dict=self.SUPERNET_PARAMETERS[self.supernet], - seed=self.seed + self.supernet_manager = self.SUPERNET_ENCODING[self.supernet]( + param_dict=self.SUPERNET_PARAMETERS[self.supernet], seed=self.seed ) # Validation High-Fidelity Measurement Runner - self.runner_validate = self.OFARunner( + self.runner_validate = self.RUNNERS[self.supernet]( supernet=self.supernet, acc_predictor=None, macs_predictor=None, latency_predictor=None, - imagenetpath=self.dataset_path, + datasetpath=self.dataset_path, batch_size=self.batch_size, + checkpoint_path=self.supernet_ckpt_path, ) # Setup validation interface @@ -121,16 +157,23 @@ def search(self): # Randomly sample search space for initial population # if number of results in results_csv_path smaller than population. + + if not os.path.exists(self.results_csv_path): + # Clear also creates empty CSV file. 
+ self.validation_interface.clear_csv() + df = pd.read_csv(self.results_csv_path) - latest_population = [self.supernet_manager.random_sample() \ - for _ in range(max(self.population - df.shape[0], 0))] + latest_population = [self.supernet_manager.random_sample() + for _ in range(max(self.population - df.shape[0], 0))] # Start Lightweight Iterative Neural Architecture Search (LINAS) num_loops = round(self.num_evals/self.population) for loop in range(num_loops): - logger.info('[DyNAS-T] Starting LINAS loop {} of {}.'.format(loop+1, num_loops)) - for individual in latest_population: + for i, individual in enumerate(latest_population): + logger.info( + '[DyNAS-T] Starting eval {} of {} in LINAS loop {} of {}.'.format( + i+1, len(latest_population), loop+1, num_loops)) self.validation_interface.eval_subnet(individual) self.create_acc_predictor() @@ -138,13 +181,14 @@ def search(self): self.create_latency_predictor() # Inner-loop Low-Fidelity Predictor Runner, need to re-instantiate every loop - runner_predict = self.OFARunner( + runner_predict = self.RUNNERS[self.supernet]( supernet=self.supernet, acc_predictor=self.acc_predictor, macs_predictor=self.macs_predictor, latency_predictor=self.latency_predictor, - imagenetpath=self.dataset_path, + datasetpath=self.dataset_path, batch_size=self.batch_size, + checkpoint_path=self.supernet_ckpt_path ) # Setup validation interface @@ -153,7 +197,7 @@ def search(self): manager=self.supernet_manager, metrics=self.metrics, csv_path=None, - predictor_mode = True + predictor_mode=True ) problem = self.ProblemMultiObjective( @@ -163,19 +207,22 @@ def search(self): ) if self.search_algo == 'age': - search_manager = self.SearchAlgoManager(algorithm='age', seed=self.seed) + search_manager = self.SearchAlgoManager( + algorithm='age', seed=self.seed) search_manager.configure_age(population=self.population, - num_evals=self.LINAS_INNERLOOP_EVALS[self.supernet]) + num_evals=self.LINAS_INNERLOOP_EVALS[self.supernet]) else: - search_manager = 
self.SearchAlgoManager(algorithm='nsga2', seed=self.seed) + search_manager = self.SearchAlgoManager( + algorithm='nsga2', seed=self.seed) search_manager.configure_nsga2(population=self.population, - num_evals=self.LINAS_INNERLOOP_EVALS[self.supernet]) + num_evals=self.LINAS_INNERLOOP_EVALS[self.supernet]) results = search_manager.run_search(problem) latest_population = results.pop.get('X') - logger.info("[DyNAS-T] Validated model architectures in file: {}".format(self.results_csv_path)) + logger.info( + "[DyNAS-T] Validated model architectures in file: {}".format(self.results_csv_path)) output = list() for individual in latest_population: @@ -193,11 +240,11 @@ def select_model_arch(self): # pragma: no cover def create_acc_predictor(self): """Create the accuracy predictor.""" if 'acc' in self.metrics: - logger.info('Building Accuracy Predictor') + logger.info('[DyNAS-T] Building Accuracy Predictor') df = self.supernet_manager.import_csv(self.results_csv_path, config='config', objective='acc', - column_names=['config','date','lat','macs','acc']) + column_names=['config', 'date', 'lat', 'macs', 'acc']) features, labels = self.supernet_manager.create_training_set(df) self.acc_predictor = self.Predictor() self.acc_predictor.train(features, labels.ravel()) @@ -207,11 +254,11 @@ def create_acc_predictor(self): def create_macs_predictor(self): """Create the MACs predictor.""" if 'macs' in self.metrics: - logger.info('Building MACs Predictor') + logger.info('[DyNAS-T] Building MACs Predictor') df = self.supernet_manager.import_csv(self.results_csv_path, config='config', objective='macs', - column_names=['config','date','lat','macs','acc']) + column_names=['config', 'date', 'lat', 'macs', 'acc']) features, labels = self.supernet_manager.create_training_set(df) self.macs_predictor = self.Predictor() self.macs_predictor.train(features, labels.ravel()) @@ -221,11 +268,11 @@ def create_macs_predictor(self): def create_latency_predictor(self): """Create the latency predictor.""" 
if 'lat' in self.metrics: - logger.info('Building Latency Predictor') + logger.info('[DyNAS-T] Building Latency Predictor') df = self.supernet_manager.import_csv(self.results_csv_path, config='config', objective='lat', - column_names=['config','date','lat','macs','acc']) + column_names=['config', 'date', 'lat', 'macs', 'acc']) features, labels = self.supernet_manager.create_training_set(df) self.latency_predictor = self.Predictor() self.latency_predictor.train(features, labels.ravel()) @@ -240,11 +287,11 @@ def init_cfg(self, conf_fname_or_obj): elif isinstance(conf_fname_or_obj, NASConfig): conf_fname_or_obj.validate() self.conf = conf_fname_or_obj.usr_cfg - else: # pragma: no cover + else: # pragma: no cover raise NotImplementedError( "Please provide a str path to the config file or an object of NASConfig." ) - #self.init_search_cfg(self.conf.nas) + # self.init_search_cfg(self.conf.nas) assert 'dynas' in self.conf.nas, "Must specify dynas section." dynas_config = self.conf.nas.dynas self.search_algo = self.conf.nas.search.search_algorithm @@ -253,8 +300,9 @@ def init_cfg(self, conf_fname_or_obj): self.num_evals = dynas_config.num_evals self.results_csv_path = dynas_config.results_csv_path self.dataset_path = dynas_config.dataset_path + self.supernet_ckpt_path = dynas_config.supernet_ckpt_path self.batch_size = dynas_config.batch_size - if dynas_config.population < 10: # pragma: no cover + if dynas_config.population < 10: # pragma: no cover raise NotImplementedError( "Please specify a population size >= 10" ) diff --git a/neural_compressor/experimental/nas/dynast/__init__.py b/neural_compressor/experimental/nas/dynast/__init__.py index eeb58f6f0a7..c93d1e9d016 100644 --- a/neural_compressor/experimental/nas/dynast/__init__.py +++ b/neural_compressor/experimental/nas/dynast/__init__.py @@ -15,4 +15,4 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and -# limitations under the License. \ No newline at end of file +# limitations under the License. diff --git a/neural_compressor/experimental/nas/dynast/dynas_manager.py b/neural_compressor/experimental/nas/dynast/dynas_manager.py index 3a4868d41b5..4ba28d087be 100644 --- a/neural_compressor/experimental/nas/dynast/dynas_manager.py +++ b/neural_compressor/experimental/nas/dynast/dynas_manager.py @@ -23,9 +23,8 @@ import numpy as np import pandas as pd -from sklearn.model_selection import train_test_split - from neural_compressor.utils import logger +from sklearn.model_selection import train_test_split class ParameterManager: @@ -67,21 +66,25 @@ def process_param_dict(self) -> Tuple[list, list, int]: for i in range(options['count']): parameter_upperbound.append(len(options['vars']) - 1) index_simple = [x for x in range(len(options['vars']))] - parameter_mapper.append(dict(zip(index_simple, options['vars']))) + parameter_mapper.append( + dict(zip(index_simple, options['vars']))) - if self.verbose: # pragma: no cover + if self.verbose: # pragma: no cover logger.info( - '[DyNAS-T] Problem definition variables: {}'.format(parameter_count) + '[DyNAS-T] Problem definition variables: {}'.format( + parameter_count) ) logger.info( - '[DyNAS-T] Variable Upper Bound array: {}'.format(parameter_upperbound) + '[DyNAS-T] Variable Upper Bound array: {}'.format( + parameter_upperbound) ) logger.info( '[DyNAS-T] Mapping dictionary created of length: {}'.format( len(parameter_mapper) ) ) - logger.info('[DyNAS-T] Parameter Bound: {}'.format(parameter_bound)) + logger.info( + '[DyNAS-T] Parameter Bound: {}'.format(parameter_bound)) return parameter_mapper, parameter_upperbound, parameter_count @@ -142,7 +145,8 @@ def random_samples(self, size: int = 100, trial_limit: int = 100000) -> List[lis trials += 1 if trials >= trial_limit: - logger.warning('[DyNAS-T] Unable to create unique list of samples.') + logger.warning( + 
'[DyNAS-T] Unable to create unique list of samples.') return pymoo_vector_list @@ -172,7 +176,8 @@ def translate2pymoo(self, parameters: dict) -> list: param_counter = 0 for i in range(value['count']): output.append( - self.inv_mapper[mapper_counter][parameters[key][param_counter]] + self.inv_mapper[mapper_counter][parameters[key] + [param_counter]] ) mapper_counter += 1 param_counter += 1 @@ -277,3 +282,193 @@ def create_training_set( ) ) return features_train, features_test, labels_train, labels_test + + +class TransformerLTEncoding(ParameterManager): #noqa: D101 + def __init__(self, param_dict: dict, verbose: bool = False, seed: int = 0): #noqa: D107 + super().__init__(param_dict, verbose, seed) + + def onehot_custom(self, subnet_cfg, provide_onehot=True): #noqa: D102 + + features = [] + features.extend(subnet_cfg['encoder_embed_dim']) + + encode_layer_num_int = 6 + + # Encoder FFN Embed Dim + encoder_ffn_embed_dim = subnet_cfg['encoder_ffn_embed_dim'] + + if encode_layer_num_int < 6: + encoder_ffn_embed_dim.extend([0]*(6-encode_layer_num_int)) + features.extend(encoder_ffn_embed_dim) + + # Encoder Self-Attn Heads + + encoder_self_attention_heads = subnet_cfg['encoder_self_attention_heads'][:encode_layer_num_int] + + if encode_layer_num_int < 6: + encoder_self_attention_heads.extend([0]*(6-encode_layer_num_int)) + features.extend(encoder_self_attention_heads) + + features.extend(subnet_cfg['decoder_embed_dim']) + + decoder_layer_num = subnet_cfg['decoder_layer_num'] + decoder_layer_num_int = decoder_layer_num[0] + features.extend(decoder_layer_num) + + # Decoder FFN Embed Dim + decoder_ffn_embed_dim = subnet_cfg['decoder_ffn_embed_dim'][:decoder_layer_num_int] + + if decoder_layer_num_int < 6: + decoder_ffn_embed_dim.extend([0]*(6-decoder_layer_num_int)) + features.extend(decoder_ffn_embed_dim) + + # Decoder Attn Heads + decoder_self_attention_heads = subnet_cfg['decoder_self_attention_heads'][:decoder_layer_num_int] + + if decoder_layer_num_int < 6: + 
decoder_self_attention_heads.extend([0]*(6-decoder_layer_num_int)) + features.extend(decoder_self_attention_heads) + + # Decoder ENDE HEADS + + decoder_ende_attention_heads = subnet_cfg['decoder_ende_attention_heads'][:decoder_layer_num_int] + + if decoder_layer_num_int < 6: + decoder_ende_attention_heads.extend([0]*(6-decoder_layer_num_int)) + + features.extend(decoder_ende_attention_heads) + + arbitrary_ende_attn_trans = [] + for i in range(decoder_layer_num_int): + if subnet_cfg['decoder_arbitrary_ende_attn'][i] == -1: + arbitrary_ende_attn_trans.append(1) + elif subnet_cfg['decoder_arbitrary_ende_attn'][i] == 1: + arbitrary_ende_attn_trans.append(2) + elif subnet_cfg['decoder_arbitrary_ende_attn'][i] == 2: + arbitrary_ende_attn_trans.append(3) + + if decoder_layer_num_int < 6: + arbitrary_ende_attn_trans.extend([0]*(6-decoder_layer_num_int)) + features.extend(arbitrary_ende_attn_trans) + + if provide_onehot == True: + examples = np.array([features]) + one_hot_count = 0 + unique_values = self.unique_values + + for unique in unique_values: + one_hot_count += len(unique.tolist()) + + one_hot_examples = np.zeros((examples.shape[0], one_hot_count)) + for e, example in enumerate(examples): + offset = 0 + for f in range(len(example)): + index = np.where(unique_values[f] == example[f])[ + 0] + offset + one_hot_examples[e, index] = 1.0 + offset += len(unique_values[f]) + return one_hot_examples + + else: + return features + + def import_csv( + self, + filepath: str, + config: str, + objective: str, + column_names: List[str] = None, + drop_duplicates: bool = True, + ) -> pd.DataFrame: + """Import a csv file generated from a supernetwork search for the purpose of training a predictor. + + filepath - path of the csv to be imported. + config - the subnetwork configuration + objective - target/label for the subnet configuration (e.g. 
accuracy, latency) + column_names - a list of column names for the dataframe + df - the output dataframe that contains the original config dict, pymoo, and 1-hot + equivalent vector for training. + """ + if column_names == None: + df = pd.read_csv(filepath) + else: + df = pd.read_csv(filepath) + df.columns = column_names + df = df[[config, objective]] + # Old corner case coverage + df[config] = df[config].replace({'null': 'None'}, regex=True) + + if drop_duplicates: + df.drop_duplicates(subset=[config], inplace=True) + df.reset_index(drop=True, inplace=True) + + convert_to_dict = list() + convert_to_pymoo = list() + convert_to_onehot = list() + for i in range(len(df)): + # Elastic Param Config format + config_as_dict = ast.literal_eval(df[config].iloc[i]) + convert_to_dict.append(config_as_dict) + # PyMoo 1-D vector format + config_as_pymoo = self.translate2pymoo(config_as_dict) + convert_to_pymoo.append(config_as_pymoo) + # Onehot predictor format + config_as_onehot = self.onehot_custom( + config_as_dict, provide_onehot=False) + convert_to_onehot.append(config_as_onehot) + df[config] = convert_to_dict + df['config_pymoo'] = convert_to_pymoo + df['config_onehot'] = convert_to_onehot + + return df + + # @staticmethod + def create_training_set( + self, + dataframe: pd.DataFrame, + train_with_all: bool = True, + split: float = 0.33, + seed: bool = None, + ) -> Tuple[list, list, list, list]: + """Create a sklearn compatible test/train. + + The set is created from an imported results csv after "import_csv" method is run. 
+ """ + collect_rows = list() + for i in range(len(dataframe)): + collect_rows.append(np.asarray(dataframe['config_onehot'].iloc[i])) + features = np.asarray(collect_rows) + labels = dataframe.drop( + columns=['config', 'config_pymoo', 'config_onehot']).values + + assert len(features) == len(labels) + one_hot_count = 0 + unique_values = [] + + for c in range(features.shape[1]): + unique_values.append(np.unique(features[:, c])) + one_hot_count += len(unique_values[-1]) + one_hot_examples = np.zeros((features.shape[0], one_hot_count)) + for e, example in enumerate(features): + offset = 0 + for f in range(len(example)): + index = np.where(unique_values[f] == example[f])[0] + offset + one_hot_examples[e, index] = 1.0 + offset += len(unique_values[f]) + + features = one_hot_examples + self.unique_values = unique_values + if train_with_all: + logger.info('[DyNAS-T] Training set size={}'.format(len(labels))) + return features, labels + else: + features_train, features_test, labels_train, labels_test = train_test_split( + features, labels, test_size=split, random_state=seed + ) + logger.info( + '[DyNAS-T] Test ({}) Train ({}) ratio is {}.'.format( + len(labels_train), len(labels_test), split + ) + ) + return features_train, features_test, labels_train, labels_test diff --git a/neural_compressor/experimental/nas/dynast/dynas_predictor.py b/neural_compressor/experimental/nas/dynast/dynas_predictor.py index fd1d80bf2b6..15b167bb86d 100644 --- a/neural_compressor/experimental/nas/dynast/dynas_predictor.py +++ b/neural_compressor/experimental/nas/dynast/dynas_predictor.py @@ -48,15 +48,25 @@ def __init__(self, alphas=DEFAULT_ALPHAS, cost_factors=DEFAULT_COST_FACTORS, self.best_index = 0 # Create lists of regressors and associated hyper-parameters - regressors = [linear_model.Ridge(max_iter=max_iterations), - svm.SVR(kernel='rbf', gamma='auto', epsilon=0.0, max_iter=max_iterations)] + regressors = [ + linear_model.Ridge(max_iter=max_iterations), + svm.SVR(kernel='rbf', 
gamma='auto', + epsilon=0.0, max_iter=max_iterations), + ] hyper_parameters = [{'alpha': alphas}, {'C': cost_factors}] # Create list of hyper-parameter searchers self.searchers = [] for regressor, parameters in zip(regressors, hyper_parameters): - self.searchers.append(GridSearchCV(estimator=regressor, param_grid=parameters, n_jobs=-1, - scoring='neg_mean_absolute_percentage_error', verbose=SEARCHER_VERBOSITY if (verbose) else 0)) + self.searchers.append( + GridSearchCV( + estimator=regressor, + param_grid=parameters, + n_jobs=-1, + scoring='neg_mean_absolute_percentage_error', + verbose=SEARCHER_VERBOSITY if (verbose) else 0, + ) + ) def train(self, examples, labels): """Train the predictor on the specified examples and labels using the underlying regressor. @@ -65,8 +75,14 @@ def train(self, examples, labels): examples: Examples to be used for training. labels: Labels to be used for training. """ + # Compute normalization factor + max_label = np.amax(np.abs(labels)) + if max_label > 0.0: + self.normalization_factor = 10 ** (np.floor(np.log10(max_label)) - 1.0) + else: + self.normalization_factor = 1.0 + # Compute normalized labels - self.normalization_factor = 10 ** (np.floor(np.log10(np.amax(labels))) - 1.0) normalized_labels = labels / self.normalization_factor # Train regressors with optimal parameters @@ -101,7 +117,7 @@ def get_parameters(self): Optimal parameter values of the underlying regressor. 
""" # Retrieve optimal parameters - parameters = {} + parameters = {'best_index': self.best_index} for searcher in self.searchers: regressor_name = searcher.best_estimator_.__class__.__name__ for key in searcher.best_params_: diff --git a/neural_compressor/experimental/nas/dynast/dynas_search.py b/neural_compressor/experimental/nas/dynast/dynas_search.py index d1a0996eb7e..e8c1c15224c 100644 --- a/neural_compressor/experimental/nas/dynast/dynas_search.py +++ b/neural_compressor/experimental/nas/dynast/dynas_search.py @@ -22,15 +22,15 @@ import autograd.numpy as anp import numpy as np import pymoo +from neural_compressor.experimental.nas.dynast.dynas_utils import \ + EvaluationInterface +from neural_compressor.utils import logger from pymoo.algorithms.moo.age import AGEMOEA from pymoo.algorithms.moo.nsga2 import NSGA2 from pymoo.core.problem import Problem from pymoo.factory import get_crossover, get_mutation, get_sampling from pymoo.optimize import minimize -from neural_compressor.experimental.nas.dynast.dynas_utils import EvaluationInterface -from neural_compressor.utils import logger - class SearchAlgoManager: """Manage the search parameters for the DyNAS-T single/multi-objective search. 
@@ -60,9 +60,10 @@ def __init__( elif self.algorithm == 'age': self.configure_age() self.engine = 'pymoo' - else: # pragma: no cover + else: # pragma: no cover logger.error( - '[DyNAS-T] algorithm "{}" not implemented.'.format(self.algorithm) + '[DyNAS-T] algorithm "{}" not implemented.'.format( + self.algorithm) ) raise NotImplementedError @@ -88,8 +89,10 @@ def configure_nsga2( self.algorithm_def = NSGA2( pop_size=population, sampling=sample_strategy, - crossover=get_crossover("int_sbx", prob=crossover_prob, eta=crossover_eta), - mutation=get_mutation("int_pm", prob=mutation_prob, eta=mutation_eta), + crossover=get_crossover( + "int_sbx", prob=crossover_prob, eta=crossover_eta), + mutation=get_mutation( + "int_pm", prob=mutation_prob, eta=mutation_eta), eliminate_duplicates=True, ) @@ -116,8 +119,10 @@ def configure_age( self.algorithm_def = AGEMOEA( pop_size=population, sampling=sample_strategy, - crossover=get_crossover("int_sbx", prob=crossover_prob, eta=crossover_eta), - mutation=get_mutation("int_pm", prob=mutation_prob, eta=mutation_eta), + crossover=get_crossover( + "int_sbx", prob=crossover_prob, eta=crossover_eta), + mutation=get_mutation( + "int_pm", prob=mutation_prob, eta=mutation_eta), eliminate_duplicates=True, ) @@ -140,7 +145,7 @@ def run_search( save_history=save_history, verbose=self.verbose, ) - else: # pragma: no cover + else: # pragma: no cover logger.error('[DyNAS-T] Invalid algorithm engine configuration!') raise NotImplementedError @@ -194,12 +199,11 @@ def _evaluate( # Measure new individuals for i in range(len(x)): - _, objective_x, objective_y = self.evaluation_interface.eval_subnet(x[i]) + _, objective_x, objective_y = self.evaluation_interface.eval_subnet( + x[i]) objective_x_arr.append(objective_x) objective_y_arr.append(objective_y) - print('.', end='', flush=True) - # Update PyMoo with evaluation data out["F"] = anp.column_stack([objective_x_arr, objective_y_arr]) diff --git 
a/neural_compressor/experimental/nas/dynast/dynas_utils.py b/neural_compressor/experimental/nas/dynast/dynas_utils.py index 133010ef9dc..9c9e4b1f4ce 100644 --- a/neural_compressor/experimental/nas/dynast/dynas_utils.py +++ b/neural_compressor/experimental/nas/dynast/dynas_utils.py @@ -27,16 +27,23 @@ import numpy as np import ofa from fvcore.nn import FlopCountAnalysis -from ofa.imagenet_classification.data_providers.imagenet import ImagenetDataProvider -from ofa.imagenet_classification.run_manager import ImagenetRunConfig, RunManager -from ofa.tutorial.flops_table import rm_bn_from_net - -from neural_compressor.experimental.nas.dynast.dynas_manager import ParameterManager +from neural_compressor.experimental.nas.dynast.dynas_manager import \ + ParameterManager from neural_compressor.experimental.nas.dynast.dynas_predictor import Predictor +# from neural_compressor.experimental.nas.dynast.supernetwork.machine_translation.transformer_interface import ( +# compute_bleu, compute_latency, compute_macs) from neural_compressor.utils.utility import LazyImport, logger +from ofa.imagenet_classification.data_providers.imagenet import \ + ImagenetDataProvider +from ofa.imagenet_classification.run_manager import (ImagenetRunConfig, + RunManager) +from ofa.tutorial.flops_table import rm_bn_from_net torch = LazyImport('torch') torchvision = LazyImport('torchvision') +transformer_interface = LazyImport( + 'neural_compressor.experimental.nas.dynast.supernetwork.machine_translation.transformer_interface' +) def get_macs( @@ -171,8 +178,9 @@ def __init__( acc_predictor: Predictor, macs_predictor: Predictor, latency_predictor: Predictor, - imagenetpath: str, + datasetpath: str, batch_size: int, + **kwargs, ) -> None: """Initialize the attributes.""" self.supernet = supernet @@ -181,7 +189,7 @@ def __init__( self.latency_predictor = latency_predictor self.device = 'cpu' self.test_size = None - ImagenetDataProvider.DEFAULT_PATH = imagenetpath + ImagenetDataProvider.DEFAULT_PATH = 
datasetpath self.ofa_network = ofa.model_zoo.ofa_net(supernet, pretrained=True) self.run_config = ImagenetRunConfig(test_batch_size=64, n_worker=20) self.batch_size = batch_size @@ -251,7 +259,8 @@ def validate_top1( run_manager.reset_running_statistics(net=subnet) # Test sampled subnet - self.run_config.data_provider.assign_active_img_size(subnet_cfg['r'][0]) + self.run_config.data_provider.assign_active_img_size( + subnet_cfg['r'][0]) loss, acc = run_manager.validate(net=subnet, no_logs=False) top1 = acc[0] return top1 @@ -271,7 +280,7 @@ def validate_macs( model = self.get_subnet(subnet_cfg) input_size = (self.batch_size, 3, 224, 224) macs = get_macs(model=model, input_size=input_size, device=self.device) - logger.info('Model\'s macs: {}'.format(macs)) + logger.info('[DyNAS-T] Model\'s macs: {}'.format(macs)) return macs @torch.no_grad() @@ -299,7 +308,8 @@ def measure_latency( measure_steps=measure_steps, device=self.device, ) - logger.info('Model\'s latency: {} +/- {}'.format(latency_mean, latency_std)) + logger.info( + '[DyNAS-T] Model\'s latency: {} +/- {}'.format(latency_mean, latency_std)) return latency_mean, latency_std @@ -329,6 +339,95 @@ def get_subnet( return self.subnet +class TransformerLTRunner(Runner): #noqa: D101 + + def __init__( + self, + supernet: str, + acc_predictor: Predictor, + macs_predictor: Predictor, + latency_predictor: Predictor, + datasetpath: str, + batch_size: int, + checkpoint_path: str, + **kwargs, + ) -> None: #noqa: D107 + self.supernet = supernet + self.acc_predictor = acc_predictor + self.macs_predictor = macs_predictor + self.latency_predictor = latency_predictor + self.device = 'cpu' + self.test_size = None + self.batch_size = batch_size + self.dataset_path = datasetpath + self.checkpoint_path = checkpoint_path + + def estimate_accuracy_bleu( + self, + subnet_cfg: dict, + ) -> float: #noqa: D102 + top1 = self.acc_predictor.predict(subnet_cfg) + return top1 + + def estimate_macs( + self, + subnet_cfg: dict, + ) -> int: 
#noqa: D102 + macs = self.macs_predictor.predict(subnet_cfg) + return macs + + def estimate_latency( + self, + subnet_cfg: dict, + ) -> float: #noqa: D102 + latency = self.latency_predictor.predict(subnet_cfg) + return latency + + def validate_bleu( + self, + subnet_cfg: dict, + ) -> float: #noqa: D102 + + bleu = transformer_interface.compute_bleu(subnet_cfg, self.dataset_path, + self.checkpoint_path) + return bleu + + def validate_macs( + self, + subnet_cfg: dict, + ) -> float: + """Measure Torch model's FLOPs/MACs as per FVCore calculation. + + Args: + subnet_cfg: sub-network Torch model + Returns: + `macs` + """ + macs = transformer_interface.compute_macs(subnet_cfg, self.dataset_path) + logger.info('[DyNAS-T] Model\'s macs: {}'.format(macs)) + + return macs + + @torch.no_grad() + def measure_latency( + self, + subnet_cfg: dict, + ) -> Tuple[float, float]: + """Measure model's latency. + + Args: + subnet_cfg: sub-network Torch model + Returns: + mean latency; std latency + """ + latency_mean, latency_std = transformer_interface.compute_latency( + subnet_cfg, self.dataset_path, self.batch_size) + logger.info( + '[DyNAS-T] Model\'s latency: {} +/- {}'.format(latency_mean, latency_std)) + + return latency_mean, latency_std + + class EvaluationInterface: """Evaluation Interface class. 
class EvaluationInterfaceTransformerLT(EvaluationInterface):
    """Evaluation interface for Transformer LT sub-networks.

    Translates a search vector into an encoder/decoder configuration, obtains
    BLEU plus MACs and/or latency for it (predicted or measured), optionally
    appends the result to a CSV file, and returns the objectives.
    """

    def __init__(
        self,
        evaluator: Runner,
        manager: ParameterManager,
        metrics=None,
        predictor_mode=False,
        csv_path=None,
    ) -> None:
        """Forward the configuration to `EvaluationInterface`.

        Args:
            evaluator: runner providing the estimate/validate methods.
            manager: parameter manager used to decode and one-hot encode `x`.
            metrics: metric names; defaults to ``['acc', 'macs']``.
            predictor_mode: use predictors instead of real measurements.
            csv_path: optional CSV file that receives one row per evaluation.
        """
        # A list literal in the signature would be shared by every instance
        # created with the default, so build a fresh one per call.
        if metrics is None:
            metrics = ['acc', 'macs']
        super().__init__(evaluator, manager, metrics, predictor_mode, csv_path)

    def eval_subnet(
        self,
        x: list,
    ) -> Tuple[dict, float, float]:
        """Evaluate one sub-network.

        Args:
            x: search vector, decoded with `manager.translate2param`.
        Returns:
            ``(sample, lat, -bleu)`` when ``'lat'`` is in `metrics`, otherwise
            ``(sample, macs, -bleu)``.
        """
        # PyMoo vector to Elastic Parameter Mapping
        param_dict = self.manager.translate2param(x)

        sample = {
            'encoder': {
                'encoder_embed_dim': param_dict['encoder_embed_dim'][0],
                # Fixed to 6; param_dict['encoder_layer_num'] is not consulted.
                'encoder_layer_num': 6,
                'encoder_ffn_embed_dim': param_dict['encoder_ffn_embed_dim'],
                'encoder_self_attention_heads': param_dict['encoder_self_attention_heads'],
            },
            'decoder': {
                'decoder_embed_dim': param_dict['decoder_embed_dim'][0],
                'decoder_layer_num': param_dict['decoder_layer_num'][0],
                'decoder_ffn_embed_dim': param_dict['decoder_ffn_embed_dim'],
                'decoder_self_attention_heads': param_dict['decoder_self_attention_heads'],
                'decoder_ende_attention_heads': param_dict['decoder_ende_attention_heads'],
                'decoder_arbitrary_ende_attn': param_dict['decoder_arbitrary_ende_attn'],
            },
        }

        # Always evaluate/predict BLEU
        lat, macs = 0, 0
        if self.predictor_mode == True:  # noqa: E712 - same comparison as the base class
            # Encode once; every predictor receives the same feature row.
            features = self.manager.onehot_custom(param_dict).reshape(1, -1)
            bleu = self.evaluator.estimate_accuracy_bleu(features)[0]
            if 'macs' in self.metrics:
                macs = self.evaluator.estimate_macs(features)[0]
            if 'lat' in self.metrics:
                lat = self.evaluator.estimate_latency(features)[0]
        else:
            # The evaluator gets its own copy so `sample` is returned untouched.
            subnet_sample = copy.deepcopy(sample)
            bleu = self.evaluator.validate_bleu(subnet_sample)
            macs = self.evaluator.validate_macs(subnet_sample)
            if 'lat' in self.metrics:
                lat, _ = self.evaluator.measure_latency(subnet_sample)

        if self.csv_path:
            # newline='' lets the csv module control line endings itself.
            with open(self.csv_path, 'a', newline='') as f:
                csv.writer(f).writerow(
                    [param_dict, str(datetime.now()), lat, macs, bleu])

        # PyMoo only minimizes objectives, thus accuracy needs to be negative
        # Requires format: subnetwork, objective x, objective y
        if 'lat' in self.metrics:
            return sample, lat, -bleu
        return sample, macs, -bleu

    def clear_csv(self) -> None:
        """Truncate the CSV file (if configured) and write the header row."""
        if not self.csv_path:
            return
        # The context manager closes the file even if the write fails.
        with open(self.csv_path, 'w', newline='') as f:
            csv.writer(f).writerow(
                ['Sub-network', 'Date', 'Latency (ms)', 'MACs', 'BLEU'])
AttributeError as ae: # pragma: no cover + except AttributeError as ae: # pragma: no cover logger.error( 'Model {model_name} not available. This can be due to either a typo or the model is not ' 'available in torchvision=={torchvision_version}. \nAvailable models: {available_models}'.format( model_name=model_name, torchvision_version=torchvision.__version__, available_models=', '.join( - [m for m in dir(torchvision.models) if not m.startswith('_')] + [m for m in dir(torchvision.models) + if not m.startswith('_')] ), ) ) diff --git a/neural_compressor/experimental/nas/dynast/supernetwork/__init__.py b/neural_compressor/experimental/nas/dynast/supernetwork/__init__.py new file mode 100644 index 00000000000..451e864f2c7 --- /dev/null +++ b/neural_compressor/experimental/nas/dynast/supernetwork/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2022 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""This module contains all code related to the supernets.""" \ No newline at end of file diff --git a/neural_compressor/experimental/nas/dynast/supernetwork/machine_translation/__init__.py b/neural_compressor/experimental/nas/dynast/supernetwork/machine_translation/__init__.py new file mode 100644 index 00000000000..9003687dcb2 --- /dev/null +++ b/neural_compressor/experimental/nas/dynast/supernetwork/machine_translation/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2022 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This module contains all code related to the machine translation (Transformer LT) supernet.""" diff --git a/neural_compressor/experimental/nas/dynast/supernetwork/machine_translation/modules_supernetwork.py b/neural_compressor/experimental/nas/dynast/supernetwork/machine_translation/modules_supernetwork.py new file mode 100644 index 00000000000..1a5c9739372 --- /dev/null +++ b/neural_compressor/experimental/nas/dynast/supernetwork/machine_translation/modules_supernetwork.py @@ -0,0 +1,638 @@ +#noqa: D100 +# https://github.com/mit-han-lab/hardware-aware-transformers/blob/master/LICENSE +# +# Copyright (c) 2022 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# Per-class counters; each module instance draws its id from its class's counter.
INCREMENTAL_STATE_INSTANCE_ID = defaultdict(int)


def _get_full_incremental_state_key(module_instance, key):
    """Return ``'<ClassName>.<instance id>.<key>'`` for `module_instance`.

    The instance id is assigned on first use and cached on the instance as
    ``_fairseq_instance_id`` so that incremental state is not shared across
    module instances.
    """
    module_name = module_instance.__class__.__name__

    if not hasattr(module_instance, '_fairseq_instance_id'):
        next_id = INCREMENTAL_STATE_INSTANCE_ID[module_name] + 1
        INCREMENTAL_STATE_INSTANCE_ID[module_name] = next_id
        module_instance._fairseq_instance_id = next_id

    return '{}.{}.{}'.format(module_name, module_instance._fairseq_instance_id, key)


def get_incremental_state(module, incremental_state, key):
    """Helper for getting incremental state for an nn.Module.

    Returns None when `incremental_state` is None or holds no entry for the
    module/key pair.
    """
    # The key is built first so the module receives its instance id even when
    # no state dictionary is supplied.
    full_key = _get_full_incremental_state_key(module, key)
    if incremental_state is None:
        return None
    return incremental_state.get(full_key)


def set_incremental_state(module, incremental_state, key, value):
    """Helper for setting incremental state for an nn.Module.

    Does nothing when `incremental_state` is None.
    """
    if incremental_state is None:
        return
    incremental_state[_get_full_incremental_state_key(module, key)] = value
class EmbeddingSuper(nn.Embedding):
    """Embedding whose output width can be sampled separately per model part.

    The weight is allocated at `super_embed_dim`; after `set_sample_config`
    the forward pass for that part uses only the first `sample_embed_dim`
    columns of the weight.
    """

    def __init__(self, num_embeddings, super_embed_dim, padding_idx, *args, **kwargs):
        """Create the full-size embedding.

        Args:
            num_embeddings: number of rows of the embedding matrix.
            super_embed_dim: the largest embedding dimension.
            padding_idx: row that is zero-initialised, or None for no padding row.
            *args, **kwargs: forwarded to `nn.Embedding`.
        """
        super().__init__(num_embeddings, super_embed_dim, padding_idx, *args, **kwargs)

        # the largest embed dim
        self.super_embed_dim = {
            'encoder': super_embed_dim, 'decoder': super_embed_dim}

        # the current sampled embed dim
        self.sample_embed_dim = {'encoder': None, 'decoder': None}

        # cached weight slices, filled by `_sample_parameters`
        self.samples = {'encoder': {}, 'decoder': {}}
        self.profiling = False
        self.reset_parameters()

    def profile(self, mode=True):
        """Enable/disable profiling; while enabled, weights are re-sliced on every access."""
        self.profiling = mode

    def reset_parameters(self):
        """Re-initialise the weight with N(0, embedding_dim ** -0.5) and zero the padding row."""
        super().reset_parameters()
        nn.init.normal_(self.weight, mean=0, std=self.embedding_dim ** -0.5)
        # Indexing with None yields a view of the whole matrix, so an unguarded
        # fill would zero every embedding when there is no padding row.
        if self.padding_idx is not None:
            nn.init.constant_(self.weight[self.padding_idx], 0)

    def set_sample_config(self, sample_embed_dim, part):
        """Set the sampled width for `part` ('encoder' or 'decoder') and cache its weight slice."""
        self.sample_embed_dim[part] = sample_embed_dim
        self._sample_parameters(part)

    def _sample_parameters(self, part):
        # Slice the last dimension down to the sampled width and cache the view.
        weight = self.weight[..., :self.sample_embed_dim[part]]
        self.samples[part]['weight'] = weight

        return self.samples

    def sample_parameters(self, part, resample=False):
        """Return the sample cache, re-slicing `part` first when profiling or `resample` is set."""
        return self._sample_parameters(part) if self.profiling or resample else self.samples

    def sampled_weight(self, part):
        """Return the weight slice currently sampled for `part`."""
        return self.sample_parameters(part)[part]['weight']

    def forward(self, input, part='encoder'):
        """Embed `input` using the weight slice sampled for `part`."""
        return F.embedding(
            input,
            self.sampled_weight(part),
            self.padding_idx,
            self.max_norm,
            self.norm_type,
            self.scale_grad_by_freq,
            self.sparse,
        )
class LinearSuper(nn.Linear):
    """Linear layer whose input/output widths can be sampled from a larger weight."""

    def __init__(self, super_in_dim, super_out_dim, bias=True, uniform_=None, non_linear='linear'):
        """Create the full-size layer.

        Args:
            super_in_dim: the largest input dimension.
            super_out_dim: the largest output dimension.
            bias: whether the layer has a bias.
            uniform_: optional initialiser called as ``uniform_(weight, non_linear=...)``;
                Xavier-uniform is used when None.
            non_linear: forwarded to `uniform_`.
        """
        super().__init__(super_in_dim, super_out_dim, bias=bias)

        # super_in_dim and super_out_dim indicate the largest network!
        self.super_in_dim = super_in_dim
        self.super_out_dim = super_out_dim

        # input_dim and output_dim indicate the current sampled size
        self.sample_in_dim = None
        self.sample_out_dim = None

        # cached weight/bias slices, filled by `_sample_parameters`
        self.samples = {}

        self._reset_parameters(bias, uniform_, non_linear)
        self.profiling = False

    def profile(self, mode=True):
        """Enable/disable profiling; while enabled, parameters are re-sliced on every access."""
        self.profiling = mode

    def sample_parameters(self, resample=False):
        """Return the sample cache, re-slicing first when profiling or `resample` is set."""
        needs_refresh = self.profiling or resample
        return self._sample_parameters() if needs_refresh else self.samples

    def _reset_parameters(self, bias, uniform_, non_linear):
        if uniform_ is None:
            nn.init.xavier_uniform_(self.weight)
        else:
            uniform_(self.weight, non_linear=non_linear)
        if bias:
            nn.init.constant_(self.bias, 0.)

    def set_sample_config(self, sample_in_dim, sample_out_dim):
        """Set the sampled input/output widths and cache the matching slices."""
        self.sample_in_dim = sample_in_dim
        self.sample_out_dim = sample_out_dim

        self._sample_parameters()

    def _sample_parameters(self):
        self.samples['weight'] = sample_weight(
            self.weight, self.sample_in_dim, self.sample_out_dim)
        if self.bias is None:
            self.samples['bias'] = None
        else:
            self.samples['bias'] = sample_bias(self.bias, self.sample_out_dim)
        return self.samples

    def forward(self, x):
        """Apply the linear map using the currently sampled weight and bias."""
        params = self.sample_parameters()
        return F.linear(x, params['weight'], params['bias'])

    def calc_sampled_param_num(self):
        """Return the number of elements in the sampled weight plus sampled bias."""
        assert 'weight' in self.samples
        total = self.samples['weight'].numel()

        sampled_bias = self.samples['bias']
        if sampled_bias is not None:
            total += sampled_bias.numel()

        return total


def sample_weight(weight, sample_in_dim, sample_out_dim):
    """Return the top-left ``sample_out_dim x sample_in_dim`` slice of `weight`."""
    return weight[:sample_out_dim, :sample_in_dim]


def sample_bias(bias, sample_out_dim):
    """Return the first `sample_out_dim` entries of `bias`."""
    return bias[:sample_out_dim]
def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True, export=False):
    """Build a layer-norm module.

    apex's ``FusedLayerNorm`` is tried only when `export` is False and CUDA is
    available; on ImportError, or otherwise, ``torch.nn.LayerNorm`` is returned.
    """
    if export or not torch.cuda.is_available():
        return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)
    try:
        from apex.normalization import FusedLayerNorm
        return FusedLayerNorm(normalized_shape, eps, elementwise_affine)
    except ImportError:
        return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)


class LayerNormSuper(torch.nn.LayerNorm):
    """LayerNorm whose normalised width can be sampled from a larger weight/bias."""

    def __init__(self, super_embed_dim):
        """Create the full-size layer norm over `super_embed_dim` features."""
        super().__init__(super_embed_dim)

        # the largest embed dim
        self.super_embed_dim = super_embed_dim

        # the current sampled embed dim
        self.sample_embed_dim = None

        # cached weight/bias slices, filled by `_sample_parameters`
        self.samples = {}
        self.profiling = False

    def profile(self, mode=True):
        """Enable/disable profiling; while enabled, parameters are re-sliced on every access."""
        self.profiling = mode

    def sample_parameters(self, resample=False):
        """Return the sample cache, re-slicing first when profiling or `resample` is set."""
        needs_refresh = self.profiling or resample
        return self._sample_parameters() if needs_refresh else self.samples

    def _sample_parameters(self):
        width = self.sample_embed_dim
        self.samples['weight'] = self.weight[:width]
        self.samples['bias'] = self.bias[:width]
        return self.samples

    def set_sample_config(self, sample_embed_dim):
        """Set the sampled width and cache the matching weight/bias slices."""
        self.sample_embed_dim = sample_embed_dim
        self._sample_parameters()

    def forward(self, x):
        """Layer-normalise `x` over its last `sample_embed_dim` features."""
        params = self.sample_parameters()
        return F.layer_norm(
            x,
            (self.sample_embed_dim,),
            weight=params['weight'],
            bias=params['bias'],
            eps=self.eps,
        )

    def calc_sampled_param_num(self):
        """Return the number of elements in the sampled weight plus sampled bias."""
        assert 'weight' in self.samples
        assert 'bias' in self.samples
        return self.samples['weight'].numel() + self.samples['bias'].numel()
class MultiheadAttentionSuper(nn.Module):
    """Multi-headed attention with a sampleable input width and head count.

    See "Attention Is All You Need" for more details.

    Projection parameters are allocated at the super-network size. After
    `set_sample_config` the input projections read only the first
    ``sample_*_embed_dim`` input columns, while `qkv_dim` stays fixed and is
    split across the sampled number of heads.
    """

    def __init__(self, super_embed_dim, num_heads, is_encoder, super_kdim=None, super_vdim=None, dropout=0., bias=True,
                 add_bias_kv=False, add_zero_attn=False, self_attention=False,
                 encoder_decoder_attention=False, out_dim=None, qkv_dim=None):
        """Allocate the super-network parameters.

        Args:
            super_embed_dim: largest query embedding dimension.
            num_heads: initial number of heads; must divide `qkv_dim`.
            is_encoder: stored as ``self.encoder``.
            super_kdim, super_vdim: largest key/value embedding dimension; must be
                equal when given, defaults to `super_embed_dim`.
            dropout: dropout probability applied to the attention weights.
            bias: whether the input and output projections have a bias.
            add_bias_kv: append a learned bias entry to the keys and values.
            add_zero_attn: append an all-zero entry to the keys and values.
            self_attention: project q, k and v from `query` in one pass.
            encoder_decoder_attention: project k and v from `key`.
            out_dim: output dimension of `out_proj`; defaults to `super_embed_dim`.
            qkv_dim: total projection width; defaults to `super_embed_dim`.
        """
        super().__init__()

        # the configs of super arch
        self.super_q_embed_dim = super_embed_dim
        self.super_kv_embed_dim = None

        # the configs of current sampled arch
        self.sample_q_embed_dim = None
        self.sample_kv_embed_dim = None

        if super_kdim is not None:
            assert super_kdim == super_vdim
            self.super_kv_embed_dim = super_kdim
        else:
            self.super_kv_embed_dim = self.super_q_embed_dim

        if qkv_dim is None:
            self.qkv_dim = self.super_q_embed_dim
        else:
            self.qkv_dim = qkv_dim

        # this qkv same dim means the input dim for qkv are the same, not the output dim
        self.qkv_same_dim = self.super_kv_embed_dim == self.super_q_embed_dim
        self.encoder = is_encoder

        # Caution! these actually are the sampled num_heads, head_dim and scaling
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = self.qkv_dim // num_heads
        assert self.head_dim * num_heads == self.qkv_dim, "qkv must be divisible by num_heads"
        self.scaling = self.head_dim ** -0.5

        self.self_attention = self_attention
        self.encoder_decoder_attention = encoder_decoder_attention

        assert not self.self_attention or self.qkv_same_dim, 'Self-attention requires query, key and ' \
            'value to be of the same size'

        if self.qkv_same_dim:
            # one stacked matrix: rows [0, qkv) -> q, [qkv, 2*qkv) -> k, [2*qkv, 3*qkv) -> v
            self.in_proj_weight = Parameter(torch.Tensor(
                3 * self.qkv_dim, self.super_q_embed_dim))
        else:
            self.k_proj_weight = Parameter(torch.Tensor(
                self.qkv_dim, self.super_kv_embed_dim))
            self.v_proj_weight = Parameter(torch.Tensor(
                self.qkv_dim, self.super_kv_embed_dim))
            self.q_proj_weight = Parameter(torch.Tensor(
                self.qkv_dim, self.super_q_embed_dim))

        if bias:
            self.in_proj_bias = Parameter(torch.Tensor(3 * self.qkv_dim))
        else:
            self.register_parameter('in_proj_bias', None)

        if out_dim is None:
            out_dim = self.super_q_embed_dim
        self.out_proj = LinearSuper(
            super_in_dim=self.qkv_dim, super_out_dim=out_dim, bias=bias)

        if add_bias_kv:
            self.bias_k = Parameter(torch.Tensor(1, 1, self.super_q_embed_dim))
            self.bias_v = Parameter(torch.Tensor(1, 1, self.super_q_embed_dim))
        else:
            self.bias_k = self.bias_v = None

        self.add_zero_attn = add_zero_attn

        self.reset_parameters()

        self.onnx_trace = False

        # The fused torch attention path is never used by this module.
        self.enable_torch_version = False

    def calc_sampled_param_num(self):
        """Return the parameter count of the sampled input projections.

        Counts the q, k and v weight slices plus the whole input-projection
        bias (0 when there is no bias). The output projection is not counted
        here because it is counted by its own `LinearSuper` layer.
        """
        in_proj_q_weight_numel = self.sample_q_embed_dim * self.qkv_dim
        in_proj_v_weight_numel = in_proj_k_weight_numel = self.sample_kv_embed_dim * self.qkv_dim
        # Works for both the stacked and the separate projection layout, and
        # for bias-free modules.
        in_proj_bias_numel = 0 if self.in_proj_bias is None else self.in_proj_bias.numel()

        return in_proj_q_weight_numel + in_proj_k_weight_numel + in_proj_v_weight_numel + in_proj_bias_numel

    def set_sample_config(self, sample_q_embed_dim, sample_attention_heads, sample_kv_embed_dim=None):
        """Select the sampled embedding widths and number of heads.

        Args:
            sample_q_embed_dim: sampled query embedding dimension.
            sample_attention_heads: sampled number of heads; must divide `qkv_dim`.
            sample_kv_embed_dim: sampled key/value embedding dimension; defaults
                to `sample_q_embed_dim`.
        """
        self.sample_q_embed_dim = sample_q_embed_dim
        if sample_kv_embed_dim is None:
            self.sample_kv_embed_dim = sample_q_embed_dim
        else:
            self.sample_kv_embed_dim = sample_kv_embed_dim

        self.num_heads = sample_attention_heads
        self.head_dim = self.qkv_dim // self.num_heads
        assert self.head_dim * \
            self.num_heads == self.qkv_dim, "qkv_dim must be divisible by sampled num_heads"
        self.scaling = self.head_dim ** -0.5

        self.out_proj.set_sample_config(
            sample_in_dim=self.qkv_dim, sample_out_dim=self.sample_q_embed_dim)

    def prepare_for_onnx_export_(self):
        """Switch `forward` to its ONNX-tracing code paths."""
        self.onnx_trace = True

    def reset_parameters(self):
        """Xavier-initialise the projection weights and zero the biases."""
        if self.qkv_same_dim:
            nn.init.xavier_uniform_(self.in_proj_weight)
        else:
            nn.init.xavier_uniform_(self.k_proj_weight)
            nn.init.xavier_uniform_(self.v_proj_weight)
            nn.init.xavier_uniform_(self.q_proj_weight)

        nn.init.xavier_uniform_(self.out_proj.weight)
        if self.in_proj_bias is not None:
            # out_proj is built with the same `bias` flag, so its bias exists
            # exactly when in_proj_bias does.
            nn.init.constant_(self.in_proj_bias, 0.)
            nn.init.constant_(self.out_proj.bias, 0.)
        if self.bias_k is not None:
            nn.init.xavier_normal_(self.bias_k)
        if self.bias_v is not None:
            nn.init.xavier_normal_(self.bias_v)

    def forward(self, query, key, value, key_padding_mask=None, incremental_state=None,
                need_weights=True, static_kv=False, attn_mask=None):
        """Input shape: Time x Batch x Channel.

        Timesteps can be masked by supplying a T x T mask in the
        `attn_mask` argument. Padding elements can be excluded from
        the key by passing a binary ByteTensor (`key_padding_mask`) with shape:
        batch x src_len, where padding elements are indicated by 1s.

        Returns:
            ``(attn, attn_weights)``; `attn_weights` is averaged over heads, or
            None when `need_weights` is False.
        """
        tgt_len, bsz, embed_dim = query.size()

        if incremental_state is not None:
            saved_state = self._get_input_buffer(incremental_state)
            if 'prev_key' in saved_state:
                # previous time steps are cached - no need to recompute
                # key and value if they are static
                if static_kv:
                    assert self.encoder_decoder_attention and not self.self_attention
                    key = value = None
        else:
            saved_state = None

        if self.self_attention:
            # self-attention
            q, k, v = self.in_proj_qkv(query)
        elif self.encoder_decoder_attention:
            # encoder-decoder attention
            q = self.in_proj_q(query)
            if key is None:
                assert value is None
                k = v = None
            else:
                k = self.in_proj_k(key)
                v = self.in_proj_v(key)

        else:
            q = self.in_proj_q(query)
            k = self.in_proj_k(key)
            v = self.in_proj_v(value)

        q = q * self.scaling

        if self.bias_k is not None:
            assert self.bias_v is not None
            k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
            v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
            if attn_mask is not None:
                attn_mask = torch.cat(
                    [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1)
            if key_padding_mask is not None:
                key_padding_mask = torch.cat(
                    [key_padding_mask, key_padding_mask.new_zeros(key_padding_mask.size(0), 1)], dim=1)

        # (len, bsz, qkv_dim) -> (bsz * num_heads, len, head_dim)
        q = q.contiguous().view(tgt_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)

        if k is not None:
            k = k.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)
        if v is not None:
            v = v.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)

        if saved_state is not None:
            # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
            if 'prev_key' in saved_state:
                prev_key = saved_state['prev_key'].view(
                    bsz * self.num_heads, -1, self.head_dim)
                if static_kv:
                    k = prev_key
                else:
                    k = torch.cat((prev_key, k), dim=1)
            if 'prev_value' in saved_state:
                prev_value = saved_state['prev_value'].view(
                    bsz * self.num_heads, -1, self.head_dim)
                if static_kv:
                    v = prev_value
                else:
                    v = torch.cat((prev_value, v), dim=1)
            saved_state['prev_key'] = k.view(
                bsz, self.num_heads, -1, self.head_dim)
            saved_state['prev_value'] = v.view(
                bsz, self.num_heads, -1, self.head_dim)

            self._set_input_buffer(incremental_state, saved_state)

        src_len = k.size(1)

        # This is part of a workaround to get around fork/join parallelism
        # not supporting Optional types.
        if key_padding_mask is not None and key_padding_mask.shape == torch.Size([]):
            key_padding_mask = None

        if key_padding_mask is not None:
            # Extend the mask with ones so that its width equals src_len.
            fil = key_padding_mask.new_ones(
                key_padding_mask.size(0), src_len-key_padding_mask.size(1))
            key_padding_mask = torch.cat((key_padding_mask, fil), dim=1)
            assert key_padding_mask.size(0) == bsz
            assert key_padding_mask.size(1) == src_len

        if self.add_zero_attn:
            src_len += 1
            k = torch.cat(
                [k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
            v = torch.cat(
                [v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
            if attn_mask is not None:
                attn_mask = torch.cat(
                    [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1)
            if key_padding_mask is not None:
                key_padding_mask = torch.cat(
                    [key_padding_mask, torch.zeros(key_padding_mask.size(0), 1).type_as(key_padding_mask)], dim=1)

        attn_weights = torch.bmm(q, k.transpose(1, 2))
        attn_weights = self.apply_sparse_mask(
            attn_weights, tgt_len, src_len, bsz)

        assert list(attn_weights.size()) == [
            bsz * self.num_heads, tgt_len, src_len]

        if attn_mask is not None:
            attn_mask = attn_mask.unsqueeze(0)
            if self.onnx_trace:
                attn_mask = attn_mask.repeat(attn_weights.size(0), 1, 1)
            attn_weights += attn_mask

        if key_padding_mask is not None:
            attn_weights = attn_weights.view(
                bsz, self.num_heads, tgt_len, src_len)
            if self.onnx_trace:
                # NOTE(review): the -inf tensor is created on the default device;
                # confirm ONNX tracing is only ever run with CPU tensors.
                attn_weights = torch.where(
                    key_padding_mask.unsqueeze(1).unsqueeze(2),
                    torch.Tensor([float("-Inf")]),
                    attn_weights.float()
                ).type_as(attn_weights)
            else:
                attn_weights = attn_weights.masked_fill(
                    key_padding_mask.unsqueeze(1).unsqueeze(2),
                    float('-inf'),
                )
            attn_weights = attn_weights.view(
                bsz * self.num_heads, tgt_len, src_len)

        attn_weights = fairseq.utils.softmax(
            attn_weights, dim=-1, onnx_trace=self.onnx_trace,
        ).type_as(attn_weights)
        attn_weights = F.dropout(
            attn_weights, p=self.dropout, training=self.training)

        attn = torch.bmm(attn_weights, v)

        assert list(attn.size()) == [
            bsz * self.num_heads, tgt_len, self.head_dim]

        if (self.onnx_trace and attn.size(1) == 1):
            # when ONNX tracing a single decoder step (sequence length == 1)
            # the transpose is a no-op copy before view, thus unnecessary
            attn = attn.contiguous().view(tgt_len, bsz, self.qkv_dim)
        else:
            attn = attn.transpose(0, 1).contiguous().view(
                tgt_len, bsz, self.qkv_dim)
        attn = self.out_proj(attn)

        if need_weights:
            # average attention weights over heads
            attn_weights = attn_weights.view(
                bsz, self.num_heads, tgt_len, src_len)

            attn_weights = attn_weights.sum(dim=1) / self.num_heads
        else:
            attn_weights = None

        return attn, attn_weights

    def in_proj_qkv(self, query):
        """Project `query` with the stacked weight and split the result into q, k, v."""
        return self._in_proj(query, sample_dim=self.sample_q_embed_dim).chunk(3, dim=-1)

    def in_proj_q(self, query):
        """Project `query` to q using the first `sample_q_embed_dim` input columns."""
        if self.qkv_same_dim:
            return self._in_proj(query, end=self.qkv_dim, sample_dim=self.sample_q_embed_dim)
        else:
            bias = self.in_proj_bias
            if bias is not None:
                bias = bias[:self.qkv_dim]
            return F.linear(query, self.q_proj_weight[..., :self.sample_q_embed_dim], bias)

    def in_proj_k(self, key):
        """Project `key` to k using the first `sample_kv_embed_dim` input columns."""
        if self.qkv_same_dim:
            return self._in_proj(key, start=self.qkv_dim, end=2 * self.qkv_dim, sample_dim=self.sample_kv_embed_dim)
        else:
            weight = self.k_proj_weight
            bias = self.in_proj_bias
            if bias is not None:
                bias = bias[self.qkv_dim:2 * self.qkv_dim]
            return F.linear(key, weight[..., :self.sample_kv_embed_dim], bias)

    def in_proj_v(self, value):
        """Project `value` to v using the first `sample_kv_embed_dim` input columns."""
        if self.qkv_same_dim:
            return self._in_proj(value, start=2 * self.qkv_dim, sample_dim=self.sample_kv_embed_dim)
        else:
            weight = self.v_proj_weight
            bias = self.in_proj_bias
            if bias is not None:
                bias = bias[2 * self.qkv_dim:]
            return F.linear(value, weight[..., :self.sample_kv_embed_dim], bias)

    def _in_proj(self, input, sample_dim, start=0, end=None):
        # Rows [start, end) pick q, k and/or v from the stacked weight; columns
        # are cut to the sampled input width.
        weight = self.in_proj_weight
        bias = self.in_proj_bias
        weight = weight[start:end, :sample_dim]
        if bias is not None:
            bias = bias[start:end]
        return F.linear(input, weight, bias)

    def reorder_incremental_state(self, incremental_state, new_order):
        """Reorder buffered internal state (for incremental generation)."""
        input_buffer = self._get_input_buffer(incremental_state)
        if input_buffer is not None:
            for k in input_buffer.keys():
                input_buffer[k] = input_buffer[k].index_select(0, new_order)
            self._set_input_buffer(incremental_state, input_buffer)

    def _get_input_buffer(self, incremental_state):
        # Falls back to an empty dict when nothing has been stored yet.
        return get_incremental_state(
            self,
            incremental_state,
            'attn_state',
        ) or {}

    def _set_input_buffer(self, incremental_state, buffer):
        set_incremental_state(
            self,
            incremental_state,
            'attn_state',
            buffer,
        )

    def apply_sparse_mask(self, attn_weights, tgt_len, src_len, bsz):
        """Return `attn_weights` unchanged."""
        return attn_weights

    def __repr__(self):
        """Return the module name, sampled head count and qkv_dim, followed by child modules."""
        # We treat the extra repr like the sub-module, one item per line
        extra_lines = []
        extra_repr = self.extra_repr()
        # empty string will be split into list ['']
        if extra_repr:
            extra_lines = extra_repr.split('\n')
        child_lines = []
        for key, module in self._modules.items():
            mod_str = repr(module)
            mod_str = _addindent(mod_str, 2)
            child_lines.append('(' + key + '): ' + mod_str)
        lines = extra_lines + child_lines

        main_str = self._get_name() + '\tnum_heads:' + str(self.num_heads) + \
            '\t qkv_dim:' + str(self.qkv_dim)
        if lines:
            # simple one-liner info, which most builtin Modules will use
            if len(extra_lines) == 1 and not child_lines:
                main_str += extra_lines[0]
            else:
                main_str += '\n  ' + '\n  '.join(lines) + '\n'

        main_str += ')'
        return main_str
def compute_bleu(config, dataset_path, checkpoint_path):
    """Measure BLEU score of the Transformer-based model.

    Args:
        config: sub-network configuration passed to `model.set_sample_config`.
        dataset_path: dataset location given to the fairseq argument parser and
            stored as ``args.data``.
        checkpoint_path: checkpoint file; its ``'model'`` entry is loaded
            strictly into the super-network.
    Returns:
        Mean of the per-batch BLEU scores over the ``test`` subset.
    """
    options = fairseq.options
    utils = fairseq.utils
    tasks = fairseq.tasks
    MosesTokenizer = fairseq.data.encoders.moses_tokenizer.MosesTokenizer
    progress_bar = fairseq.progress_bar

    # Single source for the evaluation batch size (args and batch iterator).
    batch_size = 128

    parser = options.get_generation_parser()

    args = options.parse_args_and_arch(parser, [dataset_path])

    args.data = dataset_path
    args.beam = 5
    args.remove_bpe = '@@ '
    args.gen_subset = 'test'
    args.lenpen = 0.6
    args.source_lang = 'en'
    args.target_lang = 'de'
    args.batch_size = batch_size
    args.eval_bleu_remove_bpe = '@@ '
    args.eval_bleu_detok = 'moses'

    utils.import_user_module(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # when running on CPU, use fp32 as default
    if not use_cuda:
        args.fp16 = False

    torch.manual_seed(args.seed)

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)

    task.tokenizer = MosesTokenizer(args)

    # Load the super-network weights.
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    model = TransformerSuperNetwork(task)
    state = torch.load(checkpoint_path, map_location=torch.device('cpu'))

    model.load_state_dict(state['model'],
                          strict=True)

    # The model is moved before set_sample_config; the original statement
    # order is kept on purpose.
    if use_cuda:
        model.cuda()
    model.set_sample_config(config)
    model.make_generation_fast_(
        beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
        need_attn=args.print_alignment,
    )
    if args.fp16:
        model.half()
    if use_cuda:
        model.cuda()

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=batch_size,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions()]
        ),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    generator = task.build_generator([model], args)

    bleu_list = []
    with progress_bar.build_progress_bar(args, itr) as t:
        for sample in t:
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            if 'net_input' not in sample:
                continue

            bleu = task._inference_with_bleu(generator, sample, model)
            bleu_list.append(bleu.score)

    bleu_score = np.mean(np.array(bleu_list))
    return bleu_score
args.latcpu = True + args.latiter = 100 + + # Initialize CUDA and distributed training + if torch.cuda.is_available() and not args.cpu: + torch.cuda.set_device(args.device_id) + torch.manual_seed(args.seed) + + # Optimize ensemble for generation + # Load dataset splits + task = tasks.setup_task(args) + task.load_dataset(args.gen_subset) + + # Load ensemble + model = TransformerSuperNetwork(task) + + # specify the length of the dummy input for profile + # for iwslt, the average length is 23, for wmt, that is 30 + dummy_sentence_length_dict = {'iwslt': 23, 'wmt': 30} + + dummy_sentence_length = dummy_sentence_length_dict['wmt'] + + dummy_src_tokens = [2] + [7] * (dummy_sentence_length - 1) + dummy_prev = [7] * (dummy_sentence_length - 1) + [2] + + src_tokens_test = torch.tensor( + [dummy_src_tokens], dtype=torch.long) + src_lengths_test = torch.tensor([dummy_sentence_length]) + prev_output_tokens_test_with_beam = torch.tensor( + [dummy_prev] * args.beam, dtype=torch.long) + bsz = 1 + new_order = torch.arange(bsz).view(-1, 1).repeat(1, + args.beam).view(-1).long() + if args.latcpu: + model.cpu() + logger.info('Measuring model latency on CPU for dataset generation...') + elif args.latgpu: + model.cuda() + src_tokens_test = src_tokens_test + src_lengths_test = src_lengths_test + prev_output_tokens_test_with_beam = prev_output_tokens_test_with_beam + logger.info('Measuring model latency on GPU for dataset generation...') + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + model.set_sample_config(config) + + model.eval() + + with torch.no_grad(): + + # dry runs + for _ in range(15): + encoder_out_test = model.encoder( + src_tokens=src_tokens_test, src_lengths=src_lengths_test) + + encoder_latencies = [] + logger.info('[DyNAS-T] Measuring encoder for dataset generation...') + for _ in range(args.latiter): + if args.latgpu: + start = time.time() + elif args.latcpu: + start = time.time() + + 
model.encoder(src_tokens=src_tokens_test, + src_lengths=src_lengths_test) + + if args.latgpu: + end = time.time() + encoder_latencies.append((end - start) * 1000) + elif args.latcpu: + end = time.time() + encoder_latencies.append((end - start) * 1000) + + encoder_latencies.sort() + encoder_latencies = encoder_latencies[int( + args.latiter * 0.1): -max(1, int(args.latiter * 0.1))] + logger.info( + f'[DyNAS-T] Encoder latency for dataset generation: Mean: ' + '{np.mean(encoder_latencies)} ms; Std: {np.std(encoder_latencies)} ms' + ) + + encoder_out_test_with_beam = model.encoder.reorder_encoder_out( + encoder_out_test, new_order) + + # dry runs + for _ in range(15): + model.decoder(prev_output_tokens=prev_output_tokens_test_with_beam, + encoder_out=encoder_out_test_with_beam) + + # decoder is more complicated because we need to deal with incremental states and auto regressive things + decoder_iterations_dict = {'iwslt': 23, 'wmt': 30} + + decoder_iterations = decoder_iterations_dict['wmt'] + decoder_latencies = [] + + logger.info('[DyNAS-T] Measuring decoder for dataset generation...') + for _ in range(args.latiter): + if args.latgpu: + start = time.time() + elif args.latcpu: + start = time.time() + incre_states = {} + for k_regressive in range(decoder_iterations): + model.decoder(prev_output_tokens=prev_output_tokens_test_with_beam[:, :k_regressive + 1], + encoder_out=encoder_out_test_with_beam, incremental_state=incre_states) + if args.latgpu: + end = time.time() + decoder_latencies.append((end - start) * 1000) + + elif args.latcpu: + end = time.time() + decoder_latencies.append((end - start) * 1000) + + # only use the 10% to 90% latencies to avoid outliers + decoder_latencies.sort() + decoder_latencies = decoder_latencies[int( + args.latiter * 0.1): -max(1, int(args.latiter * 0.1))] + + logger.info( + f'[DyNAS-T] Decoder latency for dataset generation: Mean: ' + '{np.mean(decoder_latencies)} ms; \t Std: {np.std(decoder_latencies)} ms' + ) + + lat_mean = 
def compute_macs(config, dataset_path):
    """Count the MACs of a sampled sub-network of the Transformer super-network.

    Args:
        config (dict): Sub-network configuration, forwarded unchanged to
            ``TransformerSuperNetwork.set_sample_config``.
        dataset_path (str): Dataset location handed to the fairseq argument parser
            and stored in ``args.data``.

    Returns:
        The value returned by ``torchprofile.profile_macs`` for one dummy source
        sentence and one dummy target sentence of 30 tokens each.
    """
    parser = fairseq.options.get_generation_parser()
    args = fairseq.options.parse_args_and_arch(parser, [dataset_path])

    args.data = dataset_path
    args.beam = 5
    args.remove_bpe = '@@ '
    args.gen_subset = 'test'
    args.lenpen = 0.6
    args.source_lang = 'en'
    args.target_lang = 'de'
    args.batch_size = 128
    fairseq.utils.import_user_module(args)
    args.latgpu = False
    args.latcpu = True
    args.latiter = 100

    # Initialize CUDA and distributed training
    cuda_requested = torch.cuda.is_available() and not args.cpu
    if cuda_requested:
        torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Load dataset splits
    task = fairseq.tasks.setup_task(args)
    task.load_dataset(args.gen_subset)

    # Load model
    logger.info('[DyNAS-T] loading model(s) from {}'.format(args.path))
    model = TransformerSuperNetwork(task)

    # specify the length of the dummy input for profile
    # for iwslt, the average length is 23, for wmt, that is 30
    sentence_lengths = {'iwslt': 23, 'wmt': 30}
    sentence_length = sentence_lengths['wmt']

    source_ids = [2] + [7] * (sentence_length - 1)
    previous_ids = [7] * (sentence_length - 1) + [2]

    # Positional inputs of TransformerSuperNetwork.forward:
    # (src_tokens, src_lengths, prev_output_token)
    profile_inputs = (
        torch.tensor([source_ids], dtype=torch.long),
        torch.tensor([sentence_length]),
        torch.tensor([previous_ids], dtype=torch.long),
    )

    model.eval()
    model.profile(mode=True)
    model.set_sample_config(config)
    macs = torchprofile.profile_macs(model, args=profile_inputs)
    model.profile(mode=False)

    return macs
class TransformerSuperNetwork(fairseq.models.BaseFairseqModel):
    """Transformer super-network from which smaller sub-networks are sampled.

    Based on the Transformer model from `"Attention Is All You Need" (Vaswani, et al, 2017)`.
    The super-network is always built with the same maximum sizes (embedding
    dimension 640, 6 encoder and 6 decoder layers, 8 attention heads, FFN
    dimension 3072). The encoder and decoder share one token embedding built from
    the source dictionary.

    Args:
        task: fairseq task; must expose ``source_dictionary`` and
            ``target_dictionary``.
    """

    def __init__(self, task):
        """Build the shared embedding, the encoder and the decoder."""
        super().__init__()

        src_dict, tgt_dict = task.source_dictionary, task.target_dictionary
        encoder_config = {'encoder_embed_dim': 640,
                          'encoder_layers': 6,
                          'encoder_attention_heads': 8,
                          'encoder_ffn_embed_dim': 3072,
                          'encoder_embed_path': None}

        decoder_config = {'decoder_embed_dim': 640,
                          'decoder_layers': 6,
                          'decoder_attention_heads': 8,
                          'decoder_ffn_embed_dim': 3072}

        encoder_embed_tokens = self.build_embedding(
            src_dict, encoder_config['encoder_embed_dim'], encoder_config['encoder_embed_path']
        )
        # the decoder reuses the encoder's embedding module
        decoder_embed_tokens = encoder_embed_tokens
        self.share_decoder_input_output_embed = True

        self.encoder = TransformerEncoder(
            encoder_config, src_dict, encoder_embed_tokens)
        self.decoder = TransformerDecoder(
            decoder_config, tgt_dict, decoder_embed_tokens)

    def build_embedding(self, dictionary, embed_dim, path=None):
        """Create the token embedding for ``dictionary``.

        Args:
            dictionary: vocabulary; ``len(dictionary)`` gives the number of
                embeddings and ``dictionary.pad()`` the padding index.
            embed_dim (int): embedding dimension.
            path (str, optional): when truthy, embeddings parsed from this path are
                loaded into the new module.

        Returns:
            The embedding module.
        """
        utils = fairseq.utils

        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        emb = Embedding(num_embeddings, embed_dim, padding_idx)
        # if provided, load from preloaded dictionaries
        if path:
            embed_dict = utils.parse_embedding(path)
            utils.load_embedding(embed_dict, dictionary, emb)
        return emb

    def profile(self, mode=True):
        """Call ``profile(mode)`` on every sub-module that defines it."""
        for module in self.modules():
            # modules() also yields this model itself; skip it to avoid infinite recursion
            if module is not self and hasattr(module, 'profile'):
                module.profile(mode)

    def get_sampled_params_numel(self, config):
        """Return the parameter count of the sub-network described by ``config``.

        Activates ``config`` and sums ``calc_sampled_param_num()`` over all modules
        that provide it. Modules under ``encoder.<attr>.<idx>`` or
        ``decoder.<attr>.<idx>`` whose index is at or beyond the sampled layer
        count are not counted.

        Args:
            config (dict): sub-network configuration; ``encoder_layer_num`` and
                ``decoder_layer_num`` are read from its ``'encoder'`` and
                ``'decoder'`` entries.

        Returns:
            int: total number of sampled parameters.
        """
        self.set_sample_config(config)
        active_layers = {
            'encoder': config['encoder']['encoder_layer_num'],
            'decoder': config['decoder']['decoder_layer_num'],
        }

        total = 0
        for name, module in self.named_modules():
            if not hasattr(module, 'calc_sampled_param_num'):
                continue

            # The layer index is the third component of names such as
            # 'encoder.layers.3.fc1'. It is parsed with int() rather than eval(),
            # and names without a numeric third component are always counted.
            parts = name.split('.')
            limit = active_layers.get(parts[0])
            if (
                limit is not None
                and len(parts) > 2
                and parts[2].isdigit()
                and int(parts[2]) >= limit
            ):
                continue

            total += module.calc_sampled_param_num()
        return total

    def set_sample_config(self, config):
        """Activate the sub-network described by ``config`` in encoder and decoder."""
        logger.info('[DyNAS-T] Setting active configuration to {}'.format(config))
        self.encoder.set_sample_config(config)
        self.decoder.set_sample_config(config)

    def forward(self, src_tokens, src_lengths, prev_output_token):
        """Encode ``src_tokens`` and decode with ``prev_output_token``.

        Returns:
            The decoder's output.
        """
        encoder_output = self.encoder.forward(src_tokens, src_lengths)
        output = self.decoder(prev_output_token, encoder_output)
        return output
class TransformerEncoder(fairseq.models.FairseqEncoder):
    """Elastic Transformer encoder made of a stack of :class:`TransformerEncoderLayer`.

    The encoder is created at its largest ("super") size. ``set_sample_config``
    selects the active sub-network that ``forward`` then runs.

    Args:
        encoder_config (dict): super-network sizes; the keys read here are
            ``encoder_embed_dim``, ``encoder_ffn_embed_dim``, ``encoder_layers``
            and ``encoder_attention_heads``
        dictionary (~fairseq.data.Dictionary): encoding dictionary
        embed_tokens: input embedding; must expose ``padding_idx`` and
            ``set_sample_config``
    """

    def __init__(self, encoder_config, dictionary, embed_tokens):
        """Build the encoder at its largest size."""
        super().__init__(dictionary)

        depth = encoder_config['encoder_layers']

        # the configs of super arch (per-layer settings hold one entry per layer)
        self.super_embed_dim = encoder_config['encoder_embed_dim']
        self.super_ffn_embed_dim = [encoder_config['encoder_ffn_embed_dim']] * depth
        self.super_layer_num = depth
        self.super_self_attention_heads = [encoder_config['encoder_attention_heads']] * depth

        self.super_dropout = 0.3
        self.super_activation_dropout = 0

        self.super_embed_scale = math.sqrt(self.super_embed_dim)

        # the configs of current sampled arch, filled in by set_sample_config
        self.sample_embed_dim = None
        self.sample_ffn_embed_dim = None
        self.sample_layer_num = None
        self.sample_self_attention_heads = None

        self.sample_dropout = None
        self.sample_activation_dropout = None

        self.sample_embed_scale = None

        self.register_buffer('version', torch.Tensor([3]))

        self.padding_idx = embed_tokens.padding_idx
        self.max_source_positions = DEFAULT_MAX_SOURCE_POSITIONS

        self.embed_tokens = embed_tokens

        self.embed_positions = fairseq.modules.PositionalEmbedding(
            self.max_source_positions, self.super_embed_dim, self.padding_idx,
            learned=False,
        )

        encoder_layers = [
            TransformerEncoderLayer(encoder_config, layer_idx=idx)
            for idx in range(self.super_layer_num)
        ]
        self.layers = nn.ModuleList([])
        self.layers.extend(encoder_layers)

        # no layer norm is applied after the stack of layers
        self.layer_norm = None

        self.vocab_original_scaling = False

    def set_sample_config(self, config: dict):
        """Activate the encoder part of the sub-network described by ``config``.

        Args:
            config (dict): sub-network configuration; ``config['encoder']`` must
                provide ``encoder_embed_dim``, ``encoder_ffn_embed_dim``,
                ``encoder_layer_num`` and ``encoder_self_attention_heads``.
        """
        sampled = config['encoder']

        self.sample_embed_dim = sampled['encoder_embed_dim']
        # Caution: this is a list for all layers
        self.sample_ffn_embed_dim = sampled['encoder_ffn_embed_dim']
        self.sample_layer_num = sampled['encoder_layer_num']
        # Caution: this is a list for all layers
        self.sample_self_attention_heads = sampled['encoder_self_attention_heads']

        self.sample_dropout = calc_dropout(
            self.super_dropout, self.sample_embed_dim, self.super_embed_dim)
        self.sample_activation_dropout = calc_dropout(
            self.super_activation_dropout, self.sample_embed_dim, self.super_embed_dim)

        if self.vocab_original_scaling:
            self.sample_embed_scale = self.super_embed_scale
        else:
            self.sample_embed_scale = math.sqrt(self.sample_embed_dim)

        self.embed_tokens.set_sample_config(
            sample_embed_dim=self.sample_embed_dim, part='encoder')

        if self.layer_norm is not None:
            self.layer_norm.set_sample_config(
                sample_embed_dim=self.sample_embed_dim)

        for idx, layer in enumerate(self.layers):
            if idx >= self.sample_layer_num:
                # layers beyond the sampled depth become identity layers
                layer.set_sample_config(is_identity_layer=True)
                continue

            layer.set_sample_config(
                is_identity_layer=False,
                sample_embed_dim=self.sample_embed_dim,
                sample_ffn_embed_dim_this_layer=self.sample_ffn_embed_dim[idx],
                sample_self_attention_heads_this_layer=self.sample_self_attention_heads[idx],
                sample_dropout=self.sample_dropout,
                sample_activation_dropout=self.sample_activation_dropout,
            )

    def forward(self, src_tokens, src_lengths):
        """Encode a batch of source sentences.

        Args:
            src_tokens (LongTensor): tokens in the source language of shape
                `(batch, src_len)`
            src_lengths (torch.LongTensor): lengths of each source sentence of
                shape `(batch)`; not read by this method

        Returns:
            dict:
                - **encoder_out** (Tensor): the last encoder layer's output of
                  shape `(src_len, batch, embed_dim)`
                - **encoder_out_all** (list): the output of every encoder layer
                - **encoder_padding_mask** (ByteTensor): the positions of
                  padding elements of shape `(batch, src_len)`, or ``None``
                  when the batch contains no padding
        """
        # embed tokens and positions
        x = self.sample_embed_scale * self.embed_tokens(src_tokens, part='encoder')
        if self.embed_positions is not None:
            positions = self.embed_positions(src_tokens)
            # sample the positional embedding and add
            x += positions[..., :self.sample_embed_dim]
        x = F.dropout(x, p=self.sample_dropout, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # compute padding mask; drop it entirely when nothing is padded
        padding_mask = src_tokens.eq(self.padding_idx)
        if not padding_mask.any():
            padding_mask = None

        # encoder layers
        per_layer_outputs = []
        for layer in self.layers:
            x = layer(x, padding_mask)
            per_layer_outputs.append(x)

        if self.layer_norm is not None:
            x = self.layer_norm(x)

        return {
            'encoder_out': x,
            'encoder_out_all': per_layer_outputs,
            'encoder_padding_mask': padding_mask,
        }

    def reorder_encoder_out(self, encoder_out, new_order):
        """Reorder encoder output according to *new_order*.

        The dictionary passed in is updated in place and returned.

        Args:
            encoder_out: output from the ``forward()`` method
            new_order (LongTensor): desired order

        Returns:
            *encoder_out* rearranged according to *new_order*
        """
        final_out = encoder_out['encoder_out']
        if final_out is not None:
            encoder_out['encoder_out'] = final_out.index_select(1, new_order)

        padding_mask = encoder_out['encoder_padding_mask']
        if padding_mask is not None:
            encoder_out['encoder_padding_mask'] = padding_mask.index_select(0, new_order)

        # need to reorder each layer of output
        if 'encoder_out_all' in encoder_out:
            encoder_out['encoder_out_all'] = [
                layer_out.index_select(1, new_order)
                for layer_out in encoder_out['encoder_out_all']
            ]

        return encoder_out

    def max_positions(self):
        """Maximum input length supported by the encoder."""
        if self.embed_positions is None:
            return self.max_source_positions
        return min(self.max_source_positions, self.embed_positions.max_positions())

    def upgrade_state_dict_named(self, state_dict, name):
        """Upgrade a (possibly old) state dict for new versions of fairseq."""
        if isinstance(self.embed_positions, fairseq.modules.SinusoidalPositionalEmbedding):
            weights_key = '{}.embed_positions.weights'.format(name)
            if weights_key in state_dict:
                del state_dict[weights_key]
            float_tensor_key = '{}.embed_positions._float_tensor'.format(name)
            state_dict[float_tensor_key] = torch.FloatTensor(1)

        # update layer norms
        for idx, layer in enumerate(self.layers):
            layer.upgrade_state_dict_named(
                state_dict, "{}.layers.{}".format(name, idx))

        version_key = '{}.version'.format(name)
        stored_version = state_dict.get(version_key, torch.Tensor([1]))
        if fairseq.utils.item(stored_version[0]) < 2:
            # earlier checkpoints did not normalize after the stack of layers
            self.layer_norm = None
            self.normalize = False
            state_dict[version_key] = torch.Tensor([1])
        return state_dict
class TransformerDecoder(fairseq.models.FairseqIncrementalDecoder):
    """Elastic Transformer decoder made of a stack of :class:`TransformerDecoderLayer`.

    The decoder is created at its largest ("super") size. ``set_sample_config``
    selects the active sub-network that ``forward`` then runs.

    Args:
        decoder_config (dict): super-network sizes; the keys read here are
            ``decoder_embed_dim``, ``decoder_ffn_embed_dim``, ``decoder_layers``
            and ``decoder_attention_heads``
        dictionary (~fairseq.data.Dictionary): decoding dictionary
        embed_tokens: embedding shared between input and output projection; must
            expose ``padding_idx``, ``set_sample_config`` and ``sampled_weight``
        no_encoder_attn (bool, optional): forwarded to every decoder layer
            (default: False).
    """

    def __init__(self, decoder_config, dictionary, embed_tokens, no_encoder_attn=False):
        """Build the decoder at its largest size."""
        super().__init__(dictionary)

        num_layers = decoder_config['decoder_layers']
        num_heads = decoder_config['decoder_attention_heads']

        # the configs of super arch; per-layer settings hold exactly one entry
        # per layer, matching the encoder
        self.super_embed_dim = decoder_config['decoder_embed_dim']
        self.super_ffn_embed_dim = [decoder_config['decoder_ffn_embed_dim']] * num_layers
        self.super_layer_num = num_layers
        self.super_self_attention_heads = [num_heads] * num_layers
        self.super_ende_attention_heads = [num_heads] * num_layers
        self.super_arbitrary_ende_attn = [-1] * num_layers

        self.super_dropout = 0.3
        self.super_activation_dropout = 0.0

        self.super_embed_scale = math.sqrt(self.super_embed_dim)

        # the configs of current sampled arch, filled in by set_sample_config
        self.sample_embed_dim = None
        self.sample_encoder_embed_dim = None
        self.sample_ffn_embed_dim = None
        self.sample_layer_num = None
        self.sample_self_attention_heads = None
        self.sample_ende_attention_heads = None
        self.sample_arbitrary_ende_attn = None

        self.sample_dropout = None
        self.sample_activation_dropout = None

        self.sample_embed_scale = None

        self.register_buffer('version', torch.Tensor([3]))

        self.share_input_output_embed = True

        self.output_embed_dim = decoder_config['decoder_embed_dim']

        padding_idx = embed_tokens.padding_idx
        self.max_target_positions = DEFAULT_MAX_TARGET_POSITIONS

        self.embed_tokens = embed_tokens

        self.embed_positions = fairseq.modules.PositionalEmbedding(
            self.max_target_positions, self.super_embed_dim, padding_idx,
            learned=False,
        )

        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerDecoderLayer(
                decoder_config, layer_idx=i, no_encoder_attn=no_encoder_attn)
            for i in range(self.super_layer_num)
        ])

        self.adaptive_softmax = None

        self.project_out_dim = Linear(self.super_embed_dim, self.output_embed_dim, bias=False) \
            if self.super_embed_dim != self.output_embed_dim else None

        if not self.share_input_output_embed:
            self.embed_out = nn.Parameter(torch.Tensor(
                len(dictionary), self.output_embed_dim))
            nn.init.normal_(self.embed_out, mean=0,
                            std=self.output_embed_dim ** -0.5)

        self.layer_norm = None
        self.get_attn = False

        self.vocab_original_scaling = False

    def set_sample_config(self, config: dict):
        """Activate the decoder part of the sub-network described by ``config``.

        Args:
            config (dict): sub-network configuration; ``config['decoder']`` must
                provide ``decoder_embed_dim``, ``decoder_ffn_embed_dim``,
                ``decoder_self_attention_heads``, ``decoder_ende_attention_heads``,
                ``decoder_arbitrary_ende_attn`` and ``decoder_layer_num``, and
                ``config['encoder']`` must provide ``encoder_embed_dim``.
        """
        sampled = config['decoder']

        self.sample_embed_dim = sampled['decoder_embed_dim']
        self.sample_encoder_embed_dim = config['encoder']['encoder_embed_dim']

        # Caution: these three are lists for all layers
        self.sample_ffn_embed_dim = sampled['decoder_ffn_embed_dim']
        self.sample_self_attention_heads = sampled['decoder_self_attention_heads']
        self.sample_ende_attention_heads = sampled['decoder_ende_attention_heads']

        self.sample_arbitrary_ende_attn = sampled['decoder_arbitrary_ende_attn']

        self.sample_layer_num = sampled['decoder_layer_num']

        self.sample_dropout = calc_dropout(
            self.super_dropout, self.sample_embed_dim, self.super_embed_dim)
        self.sample_activation_dropout = calc_dropout(
            self.super_activation_dropout, self.sample_embed_dim, self.super_embed_dim)

        self.sample_embed_scale = math.sqrt(
            self.sample_embed_dim) if not self.vocab_original_scaling else self.super_embed_scale

        self.embed_tokens.set_sample_config(
            sample_embed_dim=self.sample_embed_dim, part='decoder')

        if self.layer_norm is not None:
            self.layer_norm.set_sample_config(
                sample_embed_dim=self.sample_embed_dim)

        for i, layer in enumerate(self.layers):
            if i < self.sample_layer_num:
                layer.set_sample_config(
                    is_identity_layer=False,
                    sample_embed_dim=self.sample_embed_dim,
                    sample_encoder_embed_dim=self.sample_encoder_embed_dim,
                    sample_ffn_embed_dim_this_layer=self.sample_ffn_embed_dim[i],
                    sample_self_attention_heads_this_layer=self.sample_self_attention_heads[i],
                    sample_ende_attention_heads_this_layer=self.sample_ende_attention_heads[i],
                    sample_dropout=self.sample_dropout,
                    sample_activation_dropout=self.sample_activation_dropout,
                )
            else:
                # layers beyond the sampled depth become identity layers
                layer.set_sample_config(is_identity_layer=True)

    def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused):
        """Forward pass.

        Args:
            prev_output_tokens (LongTensor): previous decoder outputs of shape
                `(batch, tgt_len)`, for teacher forcing
            encoder_out (dict, optional): output of the encoder's ``forward()``,
                used for encoder-side attention
            incremental_state (dict): dictionary used for storing state during
                :ref:`Incremental decoding`

        Returns:
            tuple:
                - the decoder's output of shape `(batch, tgt_len, vocab)`
                - a dictionary with any model-specific outputs
        """
        x, extra = self.extract_features(
            prev_output_tokens, encoder_out, incremental_state)
        x = self.output_layer(x)
        return x, extra

    def _encoder_feed(self, layer_idx, encoder_out):
        """Pick the encoder states and padding mask that one decoder layer attends to.

        The layer's ``sample_arbitrary_ende_attn`` value selects the source: ``-1``
        uses only the last encoder layer, ``1`` additionally concatenates the
        second-to-last layer and ``2`` the third-to-last as well. Layers beyond the
        sampled depth always use only the last encoder layer.

        Returns:
            tuple: ``(encoder_out_feed, encoder_padding_mask_feed)``; both are
            ``None`` when ``encoder_out`` is ``None``.

        Raises:
            NotImplementedError: for any other ``sample_arbitrary_ende_attn`` value.
        """
        if encoder_out is None:
            return None, None

        # The depth check comes first so sample_arbitrary_ende_attn is never
        # indexed for layers beyond the sampled depth.
        if layer_idx >= self.sample_layer_num:
            mode = -1
        else:
            mode = self.sample_arbitrary_ende_attn[layer_idx]

        if mode == -1:
            extra_layers = 0
        elif mode in (1, 2):
            extra_layers = mode
        else:
            raise NotImplementedError(
                "arbitrary_ende_attn should in [-1, 1, 2]")

        last_out = encoder_out['encoder_out']
        padding_mask = encoder_out['encoder_padding_mask']
        if extra_layers == 0:
            return last_out, padding_mask

        # concat the last output with the preceding `extra_layers` layer outputs
        # along the time dimension
        earlier = [encoder_out['encoder_out_all'][-k] for k in range(2, extra_layers + 2)]
        out_feed = torch.cat([last_out] + earlier, dim=0)

        mask_feed = None
        if padding_mask is not None:
            # every concatenated block comes from the same source sentence, so the
            # same padding mask is repeated once per block
            mask_feed = torch.cat([padding_mask] * (extra_layers + 1), dim=1)
        return out_feed, mask_feed

    def extract_features(self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused):
        """Similar to *forward* but only return features.

        Returns:
            tuple:
                - the decoder's features of shape `(batch, tgt_len, embed_dim)`
                - a dictionary with ``'attn'`` (the last layer's attention, or the
                  list of all layers' attention when ``self.get_attn`` is set) and
                  ``'inner_states'`` (the input and the output of every layer)
        """
        # embed positions
        positions = self.embed_positions(
            prev_output_tokens,
            incremental_state=incremental_state,
        ) if self.embed_positions is not None else None

        if positions is not None:
            positions = positions[..., :self.sample_embed_dim]

        if incremental_state is not None:
            # only take the last token in to the decoder
            prev_output_tokens = prev_output_tokens[:, -1:]
            if positions is not None:
                positions = positions[:, -1:]

        # embed tokens and positions
        x = self.sample_embed_scale * \
            self.embed_tokens(prev_output_tokens, part='decoder')

        if positions is not None:
            x += positions
        x = F.dropout(x, p=self.sample_dropout, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)
        attn = None
        attns = []
        inner_states = [x]

        # decoder layers
        for i, layer in enumerate(self.layers):
            encoder_out_feed, encoder_padding_mask_feed = self._encoder_feed(i, encoder_out)

            x, attn = layer(
                x,
                encoder_out_feed,
                encoder_padding_mask_feed,
                incremental_state,
                # the future mask is only needed when the whole target is fed at once
                self_attn_mask=self.buffered_future_mask(
                    x) if incremental_state is None else None,
            )
            inner_states.append(x)
            attns.append(attn)

        if self.layer_norm is not None:
            x = self.layer_norm(x)  # pylint: disable=not-callable

        # T x B x C -> B x T x C
        x = x.transpose(0, 1)

        if self.project_out_dim is not None:
            x = self.project_out_dim(x)
        if not self.get_attn:
            attns = attns[-1]
        return x, {'attn': attns, 'inner_states': inner_states}

    def output_layer(self, features, **kwargs):
        """Project features to the vocabulary size."""
        if self.adaptive_softmax is not None:
            return features

        # project back to size of vocabulary
        if self.share_input_output_embed:
            return F.linear(features, self.embed_tokens.sampled_weight('decoder'))
        return F.linear(features, self.embed_out[:, :self.sample_embed_dim])

    def max_positions(self):
        """Maximum output length supported by the decoder."""
        if self.embed_positions is None:
            return self.max_target_positions
        return min(self.max_target_positions, self.embed_positions.max_positions())

    def buffered_future_mask(self, tensor):
        """Return a cached ``(dim, dim)`` future mask, where ``dim = tensor.size(0)``.

        The cached mask is rebuilt when it is missing, lives on a different device
        than ``tensor``, or is smaller than ``dim``.
        """
        dim = tensor.size(0)
        cached = getattr(self, '_future_mask', None)
        if cached is None or cached.device != tensor.device or cached.size(0) < dim:
            cached = torch.triu(
                fairseq.utils.fill_with_neg_inf(tensor.new(dim, dim)), 1)
            self._future_mask = cached
        return cached[:dim, :dim]

    def upgrade_state_dict_named(self, state_dict, name):
        """Upgrade a (possibly old) state dict for new versions of fairseq."""
        utils = fairseq.utils
        if isinstance(self.embed_positions, fairseq.modules.SinusoidalPositionalEmbedding):
            weights_key = '{}.embed_positions.weights'.format(name)
            if weights_key in state_dict:
                del state_dict[weights_key]
            state_dict['{}.embed_positions._float_tensor'.format(
                name)] = torch.FloatTensor(1)

        # update layer norms: `layer_norms.<n>` entries are renamed to named attributes
        layer_norm_map = {
            '0': 'self_attn_layer_norm',
            '1': 'encoder_attn_layer_norm',
            '2': 'final_layer_norm'
        }
        for i in range(len(self.layers)):
            for old, new in layer_norm_map.items():
                for m in ('weight', 'bias'):
                    k = '{}.layers.{}.layer_norms.{}.{}'.format(
                        name, i, old, m)
                    if k in state_dict:
                        state_dict['{}.layers.{}.{}.{}'.format(
                            name, i, new, m)] = state_dict[k]
                        del state_dict[k]

        version_key = '{}.version'.format(name)
        if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) <= 2:
            # earlier checkpoints did not normalize after the stack of layers
            self.layer_norm = None
            self.normalize = False
            state_dict[version_key] = torch.Tensor([1])

        return state_dict
class TransformerEncoderLayer(nn.Module):
    """Elastic encoder layer block.

    Each operation (multi-head attention or FFN) is postprocessed with
    `dropout -> add residual -> layernorm`, because ``normalize_before`` is fixed
    to ``False``. The layer is built at its largest ("super") size and
    ``set_sample_config`` selects the active widths, or turns the layer into an
    identity mapping.

    Args:
        encoder_config (dict): super-network sizes; the keys read here are
            ``encoder_embed_dim``, ``encoder_ffn_embed_dim`` and
            ``encoder_attention_heads``
        layer_idx (int): position of the layer in the stack; not read by this class
    """

    def __init__(self, encoder_config, layer_idx):
        """Build the layer at its largest size."""
        super().__init__()

        utils = fairseq.utils

        # the configs of super arch
        self.super_embed_dim = encoder_config['encoder_embed_dim']
        self.super_ffn_embed_dim_this_layer = encoder_config['encoder_ffn_embed_dim']
        self.super_self_attention_heads_this_layer = encoder_config['encoder_attention_heads']

        self.super_dropout = 0.3
        self.super_activation_dropout = 0

        # the configs of current sampled arch, filled in by set_sample_config
        self.sample_embed_dim = None
        self.sample_ffn_embed_dim_this_layer = None
        self.sample_self_attention_heads_this_layer = None

        self.sample_dropout = None
        self.sample_activation_dropout = None

        self.is_identity_layer = None

        self.qkv_dim = 512

        self.self_attn = MultiheadAttentionSuper(
            super_embed_dim=self.super_embed_dim, num_heads=self.super_self_attention_heads_this_layer,
            is_encoder=True, dropout=0.1, self_attention=True, qkv_dim=self.qkv_dim,
        )

        self.self_attn_layer_norm = LayerNormSuper(self.super_embed_dim)
        self.dropout = 0.1
        self.activation_fn = utils.get_activation_fn(
            activation='relu'
        )
        self.normalize_before = False

        self.fc1 = LinearSuper(super_in_dim=self.super_embed_dim, super_out_dim=self.super_ffn_embed_dim_this_layer,
                               uniform_=None, non_linear='relu')
        self.fc2 = LinearSuper(super_in_dim=self.super_ffn_embed_dim_this_layer,
                               super_out_dim=self.super_embed_dim, uniform_=None, non_linear='linear')
        self.final_layer_norm = LayerNormSuper(self.super_embed_dim)

    def set_sample_config(
        self,
        is_identity_layer,
        sample_embed_dim=None,
        sample_ffn_embed_dim_this_layer=None,
        sample_self_attention_heads_this_layer=None,
        sample_dropout=None,
        sample_activation_dropout=None,
    ):
        """Select the active widths of this layer.

        Args:
            is_identity_layer (bool): when true the layer passes its input through
                unchanged and all other arguments are ignored
            sample_embed_dim (int): active embedding dimension
            sample_ffn_embed_dim_this_layer (int): active FFN dimension
            sample_self_attention_heads_this_layer (int): active number of heads
            sample_dropout (float): dropout applied after the FFN
            sample_activation_dropout (float): dropout applied after the activation
        """
        if is_identity_layer:
            self.is_identity_layer = True
            return

        self.is_identity_layer = False

        self.sample_embed_dim = sample_embed_dim
        self.sample_ffn_embed_dim_this_layer = sample_ffn_embed_dim_this_layer
        self.sample_self_attention_heads_this_layer = sample_self_attention_heads_this_layer

        self.sample_dropout = sample_dropout
        self.sample_activation_dropout = sample_activation_dropout

        self.self_attn_layer_norm.set_sample_config(
            sample_embed_dim=self.sample_embed_dim)

        self.self_attn.set_sample_config(sample_q_embed_dim=self.sample_embed_dim,
                                         sample_attention_heads=self.sample_self_attention_heads_this_layer)

        self.fc1.set_sample_config(
            sample_in_dim=self.sample_embed_dim, sample_out_dim=self.sample_ffn_embed_dim_this_layer)
        self.fc2.set_sample_config(
            sample_in_dim=self.sample_ffn_embed_dim_this_layer, sample_out_dim=self.sample_embed_dim)

        self.final_layer_norm.set_sample_config(
            sample_embed_dim=self.sample_embed_dim)

    def upgrade_state_dict_named(self, state_dict, name):
        """Renames keys in state dict.

        Rename layer norm states from `...layer_norms.0.weight` to
        `...self_attn_layer_norm.weight` and `...layer_norms.1.weight` to
        `...final_layer_norm.weight`
        """
        layer_norm_map = {
            '0': 'self_attn_layer_norm',
            '1': 'final_layer_norm'
        }
        for old, new in layer_norm_map.items():
            for m in ('weight', 'bias'):
                k = '{}.layer_norms.{}.{}'.format(name, old, m)
                if k in state_dict:
                    state_dict[
                        '{}.{}.{}'.format(name, new, m)
                    ] = state_dict[k]
                    del state_dict[k]

    @staticmethod
    def _additive_mask(attn_mask):
        """Turn a binary mask (non-zero = masked out) into an additive one.

        Masked positions become -1e8 and all other positions stay 0. -inf is not
        used because in some edge cases the attention weight (before softmax) of a
        padded query element would become -inf, which results in NaN.
        """
        # masked_fill requires a boolean mask in current PyTorch
        return attn_mask.masked_fill(attn_mask.bool(), -1e8)

    def forward(self, x, encoder_padding_mask, attn_mask=None):
        """Forward pass.

        Args:
            x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
            encoder_padding_mask (ByteTensor): binary ByteTensor of shape
                `(batch, src_len)` where padding elements are indicated by ``1``.
            attn_mask (ByteTensor): binary tensor of shape (T_tgt, T_src), where
                T_tgt is the length of query, while T_src is the length of key,
                though here both query and key is x here,
                attn_mask[t_tgt, t_src] = 1 means when calculating embedding
                for t_tgt, t_src is excluded (or masked out), =0 means it is
                included in attention

        Returns:
            encoded output of shape `(seq_len, batch, embed_dim)`
        """
        if self.is_identity_layer:
            return x
        residual = x
        x = self.maybe_layer_norm(self.self_attn_layer_norm, x, before=True)
        if attn_mask is not None:
            # NOTE(review): the converted mask is not passed to self_attn below, so
            # attn_mask currently has no effect on the output - confirm whether
            # MultiheadAttentionSuper is meant to receive it.
            # TODO: to formally solve this problem, we need to change fairseq's
            # MultiheadAttention. We will do this later on.
            attn_mask = self._additive_mask(attn_mask)
        x, _ = self.self_attn(query=x, key=x, value=x,
                              key_padding_mask=encoder_padding_mask)
        x = F.dropout(x, p=self.dropout, training=self.training)
        # add the residual in place to the first residual.size(0) time steps
        x[:residual.size(0), :, :] = residual + x[:residual.size(0), :, :]
        x = self.maybe_layer_norm(self.self_attn_layer_norm, x, after=True)

        residual = x
        x = self.maybe_layer_norm(self.final_layer_norm, x, before=True)
        x = self.activation_fn(self.fc1(x))
        x = F.dropout(x, p=self.sample_activation_dropout,
                      training=self.training)
        x = self.fc2(x)
        x = F.dropout(x, p=self.sample_dropout, training=self.training)
        x = residual + x
        x = self.maybe_layer_norm(self.final_layer_norm, x, after=True)
        return x

    def maybe_layer_norm(self, layer_norm, x, before=False, after=False):
        """Apply ``layer_norm`` to ``x`` at the position chosen by ``normalize_before``.

        Exactly one of ``before`` / ``after`` must be true. The norm is applied at
        the "before" call site when ``normalize_before`` is set and at the "after"
        call site otherwise; at the other call site ``x`` is returned unchanged.
        """
        assert before ^ after
        if after ^ self.normalize_before:
            return layer_norm(x)
        return x
+ attn_mask (ByteTensor): binary tensor of shape (T_tgt, T_src), where + T_tgt is the length of query, while T_src is the length of key, + though here both query and key is x here, + attn_mask[t_tgt, t_src] = 1 means when calculating embedding + for t_tgt, t_src is excluded (or masked out), =0 means it is + included in attention + + Returns: + encoded output of shape `(seq_len, batch, embed_dim)` + """ + if self.is_identity_layer: + return x + residual = x + x = self.maybe_layer_norm(self.self_attn_layer_norm, x, before=True) + if attn_mask is not None: + attn_mask = attn_mask.masked_fill(attn_mask.byte(), -1e8) + # anything in original attn_mask = 1, becomes -1e8 + # anything in original attn_mask = 0, becomes 0 + # Note that we cannot use -inf here, because at some edge cases, + # the attention weight (before softmax) for some padded element in query + # will become -inf, which results in NaN in model parameters + # TODO: to formally solve this problem, we need to change fairseq's + # MultiheadAttention. We will do this later on. + x, _ = self.self_attn(query=x, key=x, value=x, + key_padding_mask=encoder_padding_mask) + x = F.dropout(x, p=self.dropout, training=self.training) + x[:residual.size(0), :, :] = residual + x[:residual.size(0), :, :] + x = self.maybe_layer_norm(self.self_attn_layer_norm, x, after=True) + + residual = x + x = self.maybe_layer_norm(self.final_layer_norm, x, before=True) + x = self.activation_fn(self.fc1(x)) + x = F.dropout(x, p=self.sample_activation_dropout, + training=self.training) + x = self.fc2(x) + x = F.dropout(x, p=self.sample_dropout, training=self.training) + x = residual + x + x = self.maybe_layer_norm(self.final_layer_norm, x, after=True) + return x + + def maybe_layer_norm(self, layer_norm, x, before=False, after=False): #noqa: D102 + assert before ^ after + if after ^ self.normalize_before: + return layer_norm(x) + else: + return x + + +class TransformerDecoderLayer(nn.Module): + """Decoder layer block. 
+ + In the original paper each operation (multi-head attention, encoder + attention or FFN) is postprocessed with: `dropout -> add residual -> + layernorm`. In the tensor2tensor code they suggest that learning is more + robust when preprocessing each layer with layernorm and postprocessing with: + `dropout -> add residual`. We default to the approach in the paper, but the + tensor2tensor approach can be enabled by setting + *args.decoder_normalize_before* to ``True``. + + Args: + args (argparse.Namespace): parsed command-line arguments + no_encoder_attn (bool, optional): whether to attend to encoder outputs + (default: False). + """ + + def __init__( + self, + decoder_config, + layer_idx, + no_encoder_attn=False, + add_bias_kv=False, + add_zero_attn=False, + ): #noqa: D107 + super().__init__() + + utils = fairseq.utils + + # the configs of super arch + self.super_embed_dim = decoder_config['decoder_embed_dim'] + self.super_encoder_embed_dim = decoder_config['decoder_embed_dim'] + self.super_ffn_embed_dim_this_layer = decoder_config['decoder_ffn_embed_dim'] + self.super_self_attention_heads_this_layer = decoder_config['decoder_attention_heads'] + self.super_ende_attention_heads_this_layer = decoder_config['decoder_attention_heads'] + + self.super_dropout = 0.3 + self.super_activation_dropout = 0 + + # the configs of current sampled arch + self.sample_embed_dim = None + self.sample_encoder_embed_dim = None + self.sample_ffn_embed_dim_this_layer = None + self.sample_self_attention_heads_this_layer = None + self.sample_ende_attention_heads_this_layer = None + self.sample_dropout = None + self.sample_activation_dropout = None + self.is_identity_layer = None + self.qkv_dim = 512 + self.layer_idx = layer_idx + + self.self_attn = MultiheadAttentionSuper( + is_encoder=False, + super_embed_dim=self.super_embed_dim, + num_heads=self.super_self_attention_heads_this_layer, + dropout=0.1, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + self_attention=True, + 
qkv_dim=self.qkv_dim + ) + self.activation_fn = utils.get_activation_fn( + activation='relu' + ) + self.normalize_before = False + + # use layerNorm rather than FusedLayerNorm for exporting. + # char_inputs can be used to determint this. + # TODO remove this once we update apex with the fix + + self.self_attn_layer_norm = LayerNormSuper(self.super_embed_dim) + + if no_encoder_attn: + self.encoder_attn = None + self.encoder_attn_layer_norm = None + else: + self.encoder_attn = MultiheadAttentionSuper( + super_embed_dim=self.super_embed_dim, + num_heads=self.super_ende_attention_heads_this_layer, + is_encoder=False, + super_kdim=self.super_encoder_embed_dim, + super_vdim=self.super_encoder_embed_dim, + dropout=0.1, + encoder_decoder_attention=True, + qkv_dim=self.qkv_dim + ) + self.encoder_attn_layer_norm = LayerNormSuper(self.super_embed_dim) + + self.fc1 = LinearSuper(super_in_dim=self.super_embed_dim, super_out_dim=self.super_ffn_embed_dim_this_layer, + uniform_=None, non_linear='relu') + self.fc2 = LinearSuper(super_in_dim=self.super_ffn_embed_dim_this_layer, super_out_dim=self.super_embed_dim, + uniform_=None, non_linear='linear') + + self.final_layer_norm = LayerNormSuper(self.super_embed_dim) + self.need_attn = True + + self.onnx_trace = False + + def set_sample_config(self, + is_identity_layer, + sample_embed_dim=None, + sample_encoder_embed_dim=None, + sample_ffn_embed_dim_this_layer=None, + sample_self_attention_heads_this_layer=None, + sample_ende_attention_heads_this_layer=None, + sample_dropout=None, + sample_activation_dropout=None, + ): #noqa: D102 + + if is_identity_layer: + self.is_identity_layer = True + return + + self.is_identity_layer = False + + self.sample_embed_dim = sample_embed_dim + self.sample_encoder_embed_dim = sample_encoder_embed_dim + self.sample_ffn_embed_dim_this_layer = sample_ffn_embed_dim_this_layer + self.sample_self_attention_heads_this_layer = sample_self_attention_heads_this_layer + self.sample_ende_attention_heads_this_layer 
= sample_ende_attention_heads_this_layer + + self.sample_dropout = sample_dropout + self.sample_activation_dropout = sample_activation_dropout + + self.self_attn_layer_norm.set_sample_config( + sample_embed_dim=self.sample_embed_dim) + self.encoder_attn_layer_norm.set_sample_config( + sample_embed_dim=self.sample_embed_dim) + + self.self_attn.set_sample_config(sample_q_embed_dim=self.sample_embed_dim, + sample_attention_heads=self.sample_self_attention_heads_this_layer) + self.encoder_attn.set_sample_config( + sample_q_embed_dim=self.sample_embed_dim, + sample_kv_embed_dim=self.sample_encoder_embed_dim, + sample_attention_heads=self.sample_ende_attention_heads_this_layer, + ) + + self.fc1.set_sample_config( + sample_in_dim=self.sample_embed_dim, sample_out_dim=self.sample_ffn_embed_dim_this_layer) + self.fc2.set_sample_config( + sample_in_dim=self.sample_ffn_embed_dim_this_layer, sample_out_dim=self.sample_embed_dim) + + self.final_layer_norm.set_sample_config( + sample_embed_dim=self.sample_embed_dim) + + def prepare_for_onnx_export_(self): #noqa: D102 + self.onnx_trace = True + + def forward( + self, + x, + encoder_out=None, + encoder_padding_mask=None, + incremental_state=None, + prev_self_attn_state=None, + prev_attn_state=None, + self_attn_mask=None, + self_attn_padding_mask=None, + ): + """Forward pass. + + Args: + x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_padding_mask (ByteTensor): binary ByteTensor of shape + `(batch, src_len)` where padding elements are indicated by ``1``. 
+ + Returns: + encoded output of shape `(seq_len, batch, embed_dim)` + """ + if self.is_identity_layer: + return x, None + + residual = x + x = self.maybe_layer_norm(self.self_attn_layer_norm, x, before=True) + if prev_self_attn_state is not None: + if incremental_state is None: + incremental_state = {} + prev_key, prev_value = prev_self_attn_state + saved_state = {"prev_key": prev_key, "prev_value": prev_value} + self.self_attn._set_input_buffer(incremental_state, saved_state) + x, attn = self.self_attn( + query=x, + key=x, + value=x, + key_padding_mask=self_attn_padding_mask, + incremental_state=incremental_state, + need_weights=False, + attn_mask=self_attn_mask, + ) + x = F.dropout(x, p=self.sample_dropout, training=self.training) + x = residual + x + x = self.maybe_layer_norm(self.self_attn_layer_norm, x, after=True) + + if self.encoder_attn is not None: + residual = x + x = self.maybe_layer_norm( + self.encoder_attn_layer_norm, x, before=True) + if prev_attn_state is not None: + if incremental_state is None: + incremental_state = {} + prev_key, prev_value = prev_attn_state + saved_state = {"prev_key": prev_key, "prev_value": prev_value} + self.encoder_attn._set_input_buffer( + incremental_state, saved_state) + x, attn = self.encoder_attn( + query=x, + key=encoder_out, + value=encoder_out, + key_padding_mask=encoder_padding_mask, + incremental_state=incremental_state, + static_kv=True, + need_weights=(not self.training and self.need_attn), + ) + x = F.dropout(x, p=self.sample_dropout, training=self.training) + x = residual + x + x = self.maybe_layer_norm( + self.encoder_attn_layer_norm, x, after=True) + + residual = x + x = self.maybe_layer_norm(self.final_layer_norm, x, before=True) + x = self.activation_fn(self.fc1(x)) + x = F.dropout(x, p=self.sample_activation_dropout, + training=self.training) + x = self.fc2(x) + x = F.dropout(x, p=self.sample_dropout, training=self.training) + x = residual + x + x = self.maybe_layer_norm(self.final_layer_norm, x, 
after=True) + if self.onnx_trace and incremental_state is not None: + saved_state = self.self_attn._get_input_buffer(incremental_state) + self_attn_state = saved_state["prev_key"], saved_state["prev_value"] + return x, attn, self_attn_state + return x, attn + + def maybe_layer_norm(self, layer_norm, x, before=False, after=False): #noqa: D102 + assert before ^ after + if after ^ self.normalize_before: + return layer_norm(x) + else: + return x + + def make_generation_fast_(self, need_attn=False, **kwargs): #noqa: D102 + self.need_attn = need_attn + + +def calc_dropout(dropout, sample_embed_dim, super_embed_dim): #noqa: D103 + return dropout * 1.0 * sample_embed_dim / super_embed_dim + + +def Embedding(num_embeddings, embedding_dim, padding_idx): #noqa: D103 + return EmbeddingSuper(num_embeddings, embedding_dim, padding_idx=padding_idx) + + +def Linear(in_features, out_features, bias=True, uniform_=None, non_linear='linear'): #noqa: D103 + m = nn.Linear(in_features, out_features, bias) + nn.init.xavier_uniform_(m.weight) if uniform_ is None else uniform_( #noqa: D103 + m.weight, non_linear=non_linear) + if bias: + nn.init.constant_(m.bias, 0.) + return m diff --git a/neural_compressor/experimental/nas/nas.py b/neural_compressor/experimental/nas/nas.py index f6580f52a72..21e88aa3563 100644 --- a/neural_compressor/experimental/nas/nas.py +++ b/neural_compressor/experimental/nas/nas.py @@ -56,13 +56,13 @@ def __new__(self, conf_fname_or_obj, *args, **kwargs): elif isinstance(conf_fname_or_obj, Config): self.conf = NASConfig() self.conf.map_pyconfig_to_cfg(conf_fname_or_obj) - else: # pragma: no cover + else: # pragma: no cover raise NotImplementedError( "Please provide a str path to the config file." 
) assert self.conf.usr_cfg.nas is not None, "nas section must be set" if isinstance(self.conf.usr_cfg.nas.approach, str) and \ - self.conf.usr_cfg.nas.approach.lower() in NASMethods: + self.conf.usr_cfg.nas.approach.lower() in NASMethods: method = self.conf.usr_cfg.nas.approach.lower() else: logger.warning( @@ -127,29 +127,38 @@ def search(self, res_save_path=None): ) ) model_arch_paras = self.select_model_arch() - logger.info("Model architecture {} proposed.".format(model_arch_paras)) + logger.info( + "Model architecture {} proposed.".format(model_arch_paras)) model = self._model_builder(model_arch_paras) model_paras = self.count_model_parameters(model) logger.info( - "***** Number of model parameters: {:.2f}M *****".format(model_paras / 10**6) + "***** Number of model parameters: {:.2f}M *****".format( + model_paras / 10**6) ) - self.model_paras_num[tuple(model_arch_paras.values())] = model_paras + self.model_paras_num[tuple( + model_arch_paras.values())] = model_paras if tuple(model_arch_paras.values()) in self.search_results: - logger.info("Skip evaluated model architecture {}.".format(model_arch_paras)) + logger.info( + "Skip evaluated model architecture {}.".format(model_arch_paras)) continue if tuple(model_arch_paras.values()) in self.resumed_search_results: logger.info( - "Find previous results of model architecture: {}.".format(model_arch_paras) + "Find previous results of model architecture: {}.".format( + model_arch_paras) ) - metrics = self.resumed_search_results[tuple(model_arch_paras.values())] + metrics = self.resumed_search_results[tuple( + model_arch_paras.values())] else: - logger.info("Assessing model architecture: {}.".format(model_arch_paras)) + logger.info( + "Assessing model architecture: {}.".format(model_arch_paras)) metrics = self.estimate(model) logger.info( - "Metrics of model architecture {} is {}.".format(model_arch_paras, metrics) + "Metrics of model architecture {} is {}.".format( + model_arch_paras, metrics) ) 
self.search_results[tuple(model_arch_paras.values())] = metrics - self._search_algorithm.get_feedback(sum(self.metrics_conversion(metrics))) + self._search_algorithm.get_feedback( + sum(self.metrics_conversion(metrics))) self.dump_search_results( os.path.join(save_path, 'Trial_{}_results.txt'.format(i+1)) ) @@ -158,9 +167,12 @@ def search(self, res_save_path=None): if model_arch_vec not in self.search_results: self.search_results[model_arch_vec] = \ self.resumed_search_results[model_arch_vec] - model = self._model_builder(self.params_vec2params_dict(model_arch_vec)) - self.model_paras_num[model_arch_vec] = self.count_model_parameters(model) - self.dump_search_results(os.path.join(save_path, 'Final_results.txt'.format(i+1))) + model = self._model_builder( + self.params_vec2params_dict(model_arch_vec)) + self.model_paras_num[model_arch_vec] = self.count_model_parameters( + model) + self.dump_search_results(os.path.join( + save_path, 'Final_results.txt'.format(i+1))) self.find_best_model_archs() logger.info( "{fix} Found {n} best model architectures {fix}".format( @@ -168,10 +180,11 @@ def search(self, res_save_path=None): ) ) for i, model_arch in enumerate(self.best_model_archs): - logger.info("Best model architecture {}: {}".format(i+1, model_arch)) + logger.info( + "Best model architecture {}: {}".format(i+1, model_arch)) return self.best_model_archs - def estimate(self, model): # pragma: no cover + def estimate(self, model): # pragma: no cover """Estimate performance of the model. Depends on specific NAS algorithm. 
Returns: @@ -188,7 +201,8 @@ def count_model_parameters(self, model): if isinstance(model, torch.nn.Module): return sum(p.numel() for p in model.parameters()) else: - raise NotImplementedError("Only support torch model now.") # pragma: no cover + raise NotImplementedError( + "Only support torch model now.") # pragma: no cover def load_search_results(self, path): """Load previous search results if exist.""" @@ -196,11 +210,13 @@ def load_search_results(self, path): lastest_results_record = os.path.join(path, 'lastest_results.npy') if not os.path.exists(path) or not os.path.exists(lastest_results_record): return - self.resumed_search_results = np.load(lastest_results_record, allow_pickle=True).item() + self.resumed_search_results = np.load( + lastest_results_record, allow_pickle=True).item() os.makedirs(os.path.join(path, 'previous_results'), exist_ok=True) for f in os.listdir(path): if os.path.isfile(os.path.join(path, f)): - shutil.move(os.path.join(path, f), os.path.join(path, 'previous_results', f)) + shutil.move(os.path.join(path, f), os.path.join( + path, 'previous_results', f)) logger.info("Loaded previous results.") def dump_search_results(self, path): @@ -209,23 +225,24 @@ def dump_search_results(self, path): np.save(lastest_results_record, self.search_results, allow_pickle=True) write_contents = '=' * 30 + ' All Search Results ' + '=' * 30 + '\n\n' for model_arch_vec in self.search_results: - tmp = ','.join(['{}_{}'.format(k, v) \ - for k, v in zip(self.search_space_keys, model_arch_vec)]) + tmp = ','.join(['{}_{}'.format(k, v) + for k, v in zip(self.search_space_keys, model_arch_vec)]) write_contents += '{}: {} Paras: {}M\n'.format( tmp, self.search_results[model_arch_vec], self.model_paras_num[model_arch_vec] / 10**6 ) - write_contents += '\n\n\n' + '=' * 30 + ' Best Search Results ' + '=' * 30 + '\n\n' + write_contents += '\n\n\n' + '=' * 30 + \ + ' Best Search Results ' + '=' * 30 + '\n\n' self.find_best_model_archs() for i, model_arch in 
enumerate(self.best_model_archs): model_arch_vec = tuple(model_arch.values()) - tmp = ','.join(['{}_{}'.format(k, v) \ - for k, v in zip(self.search_space_keys, model_arch_vec)]) + tmp = ','.join(['{}_{}'.format(k, v) + for k, v in zip(self.search_space_keys, model_arch_vec)]) write_contents += \ '{}. {}: {} Paras: {}M\n'.format( i+1, tmp, self.search_results[model_arch_vec], self.model_paras_num[model_arch_vec] / 10**6 - ) + ) with open(path, mode='w') as f: f.write(write_contents) @@ -239,7 +256,7 @@ def params_vec2params_dict(self, paras_vec): """ assert len(paras_vec) == len(self.search_space_keys), \ "Length of paras_vec and search_space_keys should be the same." - return {k:v for k, v in zip(self.search_space_keys, paras_vec)} + return {k: v for k, v in zip(self.search_space_keys, paras_vec)} def find_best_model_archs(self): """Find the best model architectures. @@ -248,10 +265,11 @@ def find_best_model_archs(self): """ assert len(self.search_results) > 0, "Zero result in search_results." model_arches = list(self.search_results.keys()) - metrics = [self.metrics_conversion(self.search_results[ma]) for ma in model_arches] + metrics = [self.metrics_conversion( + self.search_results[ma]) for ma in model_arches] pareto_front_indices = find_pareto_front(metrics) - self.best_model_archs = [self.params_vec2params_dict(model_arches[i]) \ - for i in pareto_front_indices] + self.best_model_archs = [self.params_vec2params_dict(model_arches[i]) + for i in pareto_front_indices] def metrics_conversion(self, metrics): """Convert the metrics to specific format. @@ -268,11 +286,11 @@ def metrics_conversion(self, metrics): "Keys of metrics not match with metrics in the configuration." 
metrics = list(metrics.values()) if self.higher_is_better is None: - self.higher_is_better = [True,] * len(metrics) - logger.warning("higher_is_better not set in the configuration, " + \ - "set it to all True for every metric entry by default.") - converted_metrics = [metric if higher_is_better else -metric \ - for metric, higher_is_better in zip(metrics, self.higher_is_better)] + self.higher_is_better = [True, ] * len(metrics) + logger.warning("higher_is_better not set in the configuration, " + + "set it to all True for every metric entry by default.") + converted_metrics = [metric if higher_is_better else -metric + for metric, higher_is_better in zip(metrics, self.higher_is_better)] return converted_metrics def init_search_cfg(self, config): @@ -301,18 +319,21 @@ def init_search_cfg(self, config): if self.search_cfg.higher_is_better else None self.seed = self.search_cfg.seed self.max_trials = self.search_cfg.max_trials \ - if self.search_cfg.max_trials is not None else 3 # set default 3 for max_trials + if self.search_cfg.max_trials is not None else 3 # set default 3 for max_trials self.search_algorithm_type = self.search_cfg.search_algorithm \ if self.search_cfg.search_algorithm else None if not self.search_algorithm_type: - self._search_algorithm = BayesianOptimizationSearcher(self.search_space, self.seed) + self._search_algorithm = BayesianOptimizationSearcher( + self.search_space, self.seed) elif self.search_algorithm_type.lower() == 'grid': self._search_algorithm = GridSearcher(self.search_space) elif self.search_algorithm_type.lower() == 'random': - self._search_algorithm = RandomSearcher(self.search_space, self.seed) + self._search_algorithm = RandomSearcher( + self.search_space, self.seed) elif self.search_algorithm_type.lower() == 'bo': - self._search_algorithm = BayesianOptimizationSearcher(self.search_space, self.seed) - else: # pragma: no cover + self._search_algorithm = BayesianOptimizationSearcher( + self.search_space, self.seed) + else: # pragma: 
no cover logger.warning( 'Please be aware that \'{}\' is not a built-in search algorithm.'.format( self.search_algorithm_type @@ -322,7 +343,7 @@ def init_search_cfg(self, config): @property def search_space(self): """Getter of the search space. - + Returns: The search space. """ @@ -336,7 +357,7 @@ def search_space(self, search_space): @property def search_algorithm(self): """Getter of the search algorithm. - + Returns: The search algorithm. """ @@ -350,7 +371,7 @@ def search_algorithm(self, search_algorithm): @property def model_builder(self): """Getter of the model builder. - + Returns: The model builder. """ @@ -363,4 +384,4 @@ def model_builder(self, model_builder): def __repr__(self): """Class representation.""" - return 'Base Class of NAS' # pragma: no cover \ No newline at end of file + return 'Base Class of NAS' # pragma: no cover diff --git a/neural_compressor/experimental/nas/nas_utils.py b/neural_compressor/experimental/nas/nas_utils.py index d68556cafcd..72fe884c38b 100644 --- a/neural_compressor/experimental/nas/nas_utils.py +++ b/neural_compressor/experimental/nas/nas_utils.py @@ -35,6 +35,7 @@ def nas_registry(nas_method): cls: The class of register. """ assert isinstance(nas_method, str), "Expect nas_method to be a string." 
+ def decorator(cls): NASMethods[nas_method.lower()] = cls return cls @@ -82,4 +83,4 @@ def find_pareto_front(metrics): pareto_front_point_indices = pareto_front_point_indices[nondominated_points] metrics = metrics[nondominated_points] next_point_idx = np.sum(nondominated_points[:next_point_idx+1]) - return pareto_front_point_indices \ No newline at end of file + return pareto_front_point_indices diff --git a/neural_compressor/experimental/nas/search_algorithms.py b/neural_compressor/experimental/nas/search_algorithms.py index bf1c804c289..72ef8a0c9c2 100644 --- a/neural_compressor/experimental/nas/search_algorithms.py +++ b/neural_compressor/experimental/nas/search_algorithms.py @@ -38,7 +38,8 @@ def __init__(self, search_space) -> None: self.search_space_keys = sorted(search_space.keys()) for k in self.search_space_keys: assert isinstance(self.search_space[k], (list, tuple)), \ - "Value of key \'{}\' must be a list or tuple to specify choices".format(k) + "Value of key \'{}\' must be a list or tuple to specify choices".format( + k) def suggest(self): """Suggest the model architecture.""" @@ -129,8 +130,10 @@ class BayesianOptimizationSearcher(Searcher): def __init__(self, search_space, seed=42) -> None: """Initialize the attributes.""" super(BayesianOptimizationSearcher, self).__init__(search_space) - idx_search_space = {k: (0, len(search_space[k])-1) for k in self.search_space_keys} - self.bo_agent = BayesianOptimization(idx_search_space, random_seed=seed) + idx_search_space = { + k: (0, len(search_space[k])-1) for k in self.search_space_keys} + self.bo_agent = BayesianOptimization( + idx_search_space, random_seed=seed) self.last_param_indices = None def suggest(self): @@ -149,7 +152,7 @@ def get_feedback(self, metric): "to get parameters and the input metric is corresponding to this parameters." 
try: self.bo_agent._space.register(self.last_param_indices, metric) - except KeyError: # pragma: no cover + except KeyError: # pragma: no cover logger.debug("Find registered params, skip it.") pass self.last_param_indices = None @@ -161,4 +164,4 @@ def indices2params_vec(self, indices): # keep ind within the index range of self.search_space[key] ind = int(min(max(round(ind), 0), len(self.search_space[key])-1)) res.append(self.search_space[key][ind]) - return res \ No newline at end of file + return res diff --git a/test/nas/test_nas.py b/test/nas/test_nas.py index cdf00275d5e..10673939388 100644 --- a/test/nas/test_nas.py +++ b/test/nas/test_nas.py @@ -1,15 +1,17 @@ -from multiprocessing.spawn import import_main_path import os import shutil import unittest +from pathlib import Path + import numpy as np import torch from neural_compressor.conf.config import NASConfig from neural_compressor.data import Datasets -from neural_compressor.experimental import common, NAS -from neural_compressor.experimental.data.dataloaders.pytorch_dataloader import PyTorchDataLoader -from neural_compressor.experimental.nas.dynas import DyNAS +from neural_compressor.experimental import NAS, common +from neural_compressor.experimental.data.dataloaders.pytorch_dataloader import \ + PyTorchDataLoader + def build_fake_yaml(approach=None, search_algorithm=None, metrics=['acc']): fake_yaml = """ @@ -197,6 +199,8 @@ def test_dynas(self): config.dynas.batch_size = 64 nas_agent = NAS(config) best_model_archs = nas_agent.search() + self.assertTrue(len(best_model_archs) > 0) + nas_agent.acc_predictor.get_parameters() nas_agent.acc_predictor.save('tmp.pickle') nas_agent.acc_predictor.load('tmp.pickle') @@ -206,11 +210,19 @@ def test_dynas(self): nas_agent.runner_validate.measure_latency(subnet_cfg) nas_agent.validation_interface.clear_csv() os.remove('tmp.pickle') - from neural_compressor.experimental.nas.dynast.dynas_utils import TorchVisionReference - reference = 
TorchVisionReference('ofa_resnet50_ofa_mbv3', dataset_path=None, batch_size=1) - reference.validate_macs() - reference.measure_latency() - self.assertTrue(len(best_model_archs) > 0) + + def test_vision_reference(self): + from neural_compressor.experimental.nas.dynast.dynas_utils import \ + TorchVisionReference + reference = TorchVisionReference('ofa_mbv3', dataset_path=None, batch_size=1) + macs = reference.validate_macs() + + self.assertEqual(macs, 217234208) + + reference.measure_latency( + warmup_steps=1, + measure_steps=1, + ) if __name__ == "__main__":