support multi-node pruning in Tensorflow (#218)

Spycsh · web-flow · commit 1bee4df68550 · 2022-09-15T15:33:16.000+08:00
diff --git a/examples/optimization/tensorflow/huggingface/text-classification/pruning/README.md b/examples/optimization/tensorflow/huggingface/text-classification/pruning/README.md
@@ -26,4 +26,46 @@ bash run_tuning.sh  --topology=topology
 ```
 bash run_benchmark.sh --topology=topology --mode=benchmark
 ```
-topology is "distilbert_base_sst2"
+topology is "distilbert_base_sst2"
+
+
+### Multi-node usage
+
+We also support distributed data-parallel training across multiple nodes for pruning.
+
+The default strategy is TensorFlow's `MultiWorkerMirroredStrategy`. With `task_type` set to "worker", the following extra parameters must be passed to the script:
+
+* `worker`: a comma-separated string of your worker addresses (`host:port`), with no spaces between the entries
+
+* `task_index`: the rank of the node: 0 on the chief (leader) node and 1, 2, 3, ... on the follower nodes
+
+### Multi-node example
+
+* On leader node
+
+```
+bash run_tuning.sh --topology=distilbert_base_sst2 --worker="localhost:12345,localhost:23456" --task_index=0
+```
+
+which is roughly equivalent to
+
+```
+python run_glue.py \
+    --model_name_or_path distilbert-base-uncased-finetuned-sst-2-english \
+    --task_name sst2 \
+    --prune \
+    --do_train \
+    --do_eval \
+    --output_dir ./tmp/sst2_output \
+    --overwrite_output_dir \
+    --worker "localhost:12345,localhost:23456" \
+    --task_index 0
+```
+
+* On follower node
+
+```
+bash run_tuning.sh --topology=distilbert_base_sst2 --worker="localhost:12345,localhost:23456" --task_index=1
+```
+
+Please replace the worker address list with the addresses of your own nodes.
diff --git a/examples/optimization/tensorflow/huggingface/text-classification/pruning/run_glue.py b/examples/optimization/tensorflow/huggingface/text-classification/pruning/run_glue.py
diff --git a/examples/optimization/tensorflow/huggingface/text-classification/pruning/run_tuning.sh b/examples/optimization/tensorflow/huggingface/text-classification/pruning/run_tuning.sh
@@ -25,9 +25,15 @@ function init_params {
       --input_model=*)
           input_model=$(echo $var |cut -f2 -d=)
       ;;
-       --output_model=*)
-           tuned_checkpoint=$(echo $var |cut -f2 -d=)
-       ;;
+      --output_model=*)
+          tuned_checkpoint=$(echo $var |cut -f2 -d=)
+      ;;
+      --worker=*)
+          worker=$(echo $var |cut -f2 -d=)
+      ;;
+      --task_index=*)
+          task_index=$(echo $var |cut -f2 -d=)
+      ;;
       *)
           echo "Error: No such parameter: ${var}"
           exit 1
@@ -40,24 +46,43 @@ function init_params {
 # run_tuning
 function run_tuning {
     extra_cmd=''
-    batch_size=16
+    batch_size=64
     if [ "${topology}" = "distilbert_base_sst2" ]; then
         TASK_NAME='sst2'
         model_name_or_path=distilbert-base-uncased-finetuned-sst-2-english
     fi
 
-    python -u ./run_glue.py \
-        --model_name_or_path ${model_name_or_path} \
-        --task_name ${TASK_NAME} \
-        --target_sparsity_ratio 0.1 \
-        --prune \
-        --do_eval \
-        --do_train \
-        --per_device_eval_batch_size ${batch_size} \
-        --output_dir ${tuned_checkpoint} \
-        --overwrite_output_dir \
-        --overwrite_cache \
-        ${extra_cmd}
+    if [ "${worker}" = "" ]
+    then
+        python -u ./run_glue.py \
+            --model_name_or_path ${model_name_or_path} \
+            --task_name ${TASK_NAME} \
+            --target_sparsity_ratio 0.1 \
+            --prune \
+            --do_eval \
+            --do_train \
+            --per_device_train_batch_size ${batch_size} \
+            --per_device_eval_batch_size ${batch_size} \
+            --output_dir ${tuned_checkpoint} \
+            --overwrite_output_dir \
+            --overwrite_cache
+    else
+        python -u ./run_glue.py \
+            --model_name_or_path ${model_name_or_path} \
+            --task_name ${TASK_NAME} \
+            --target_sparsity_ratio 0.1 \
+            --prune \
+            --do_eval \
+            --do_train \
+            --per_device_train_batch_size ${batch_size} \
+            --per_device_eval_batch_size ${batch_size} \
+            --output_dir ${tuned_checkpoint} \
+            --overwrite_output_dir \
+            --overwrite_cache \
+            --worker "${worker}" \
+            --task_index ${task_index} \
+            ${extra_cmd}
+    fi
 }
 
 main "$@"
diff --git a/nlp_toolkit/optimization/config.py b/nlp_toolkit/optimization/config.py
@@ -363,7 +363,7 @@ def framework(self):
 
     @framework.setter
     def framework(self, framework):
-        assert framework.lower() in ["pytorch", "pytorch_fx"], \
+        assert framework.lower() in ["pytorch", "pytorch_fx", "tensorflow"], \
             "framework: {} is not support!".format(framework)
         self.inc_config.usr_cfg.model.framework = framework.lower()
 
diff --git a/nlp_toolkit/optimization/optimizer_tf.py b/nlp_toolkit/optimization/optimizer_tf.py
@@ -34,7 +34,7 @@
 from transformers import PreTrainedModel
 from transformers.training_args_tf import TFTrainingArguments
 from typing import Callable, Optional, List
-from .utils.utility_tf import TFDataloader, TMPPATH
+from .utils.utility_tf import TFDataloader, TMPPATH, get_filepath
 
 tf = LazyImport("tensorflow")
 logger = logging.getLogger(__name__)
@@ -50,6 +50,8 @@ def __init__(
         compute_metrics: Optional[Callable] = None,
         criterion = None,
         optimizer = None,
+        task_type = None,
+        task_id = None,
     ):
         """
         Args:
@@ -78,11 +80,14 @@ def __init__(
         self.compute_metrics = compute_metrics
         self.args = args
         self.optimizer = optimizer
+        self.task_type = task_type
+        self.task_id = task_id
         self.criterion = criterion if criterion is not None else \
             self.model.loss if hasattr(self.model, "loss") else None
-        self.model.save_pretrained(TMPPATH, saved_model=True)
+        self.model.save_pretrained(get_filepath(TMPPATH, self.task_type, self.task_id), saved_model=True)
         _, self.input_names, self.output_names = saved_model_session(
-            os.path.join(TMPPATH,"saved_model/1"), input_tensor_names=[], output_tensor_names=[])
+            os.path.join(get_filepath(TMPPATH, self.task_type, self.task_id), "saved_model/1"), input_tensor_names=[],
+             output_tensor_names=[])
         self.eval_distributed = False
 
     @property
@@ -298,7 +303,8 @@ def init_quantizer(
         self.metrics = self.quant_config.metrics
 
         quantizer = Quantization(self.quant_config.inc_config)
-        quantizer.model = common.Model(os.path.join(TMPPATH,"saved_model/1"), modelType="saved_model")
+        quantizer.model = common.Model(
+            os.path.join(get_filepath(TMPPATH, self.task_type, self.task_id),"saved_model/1"), modelType="saved_model")
 
         self.quantizer = quantizer
         return quantizer
@@ -325,8 +331,7 @@ def _inc_quantize(
                                                                batch_size=self.args.per_device_eval_batch_size)
             else:   # pragma: no cover
                 assert False, "Please pass calibration dataset to TFNoTrainerOptimizer.calib_dataloader"
-        elif self.quant_config.approach == QuantizationMode.QUANTIZATIONAWARETRAINING.value:
-            # pragma: no cover
+        elif self.quant_config.approach == QuantizationMode.QUANTIZATIONAWARETRAINING.value:   # pragma: no cover
             assert False, \
                 "Unsupport quantization aware training for tensorflow framework"
 
@@ -369,7 +374,7 @@ def init_pruner(
             "please pass a instance of PruningConfig to trainer.prune!"
 
         pruner = Pruning(self.pruning_config.inc_config)
-        pruner.model = os.path.join(TMPPATH,"saved_model/1")
+        pruner.model = os.path.join(get_filepath(TMPPATH, self.task_type, self.task_id),"saved_model/1")
         pruner.model.model_type = "saved_model"
 
         self.pruner = pruner
@@ -416,7 +421,11 @@ def prune(
 
         opt_model = self.pruner.fit()
 
-        return self.model
+        opt_model.save(self.args.output_dir)
+        logger.info(
+            "pruned model have saved to {}".format(self.args.output_dir)
+        )
+        return opt_model.model
 
     def init_distiller(
         self,
@@ -506,4 +515,4 @@ def on_train_batch_end(self, batch, logs=None):
                         callbacks=[PruningCb()])
 
         self.pruner.model._sess = None
-        input_model.save_pretrained(TMPPATH, saved_model=True)
+        input_model.save_pretrained(get_filepath(TMPPATH, self.task_type, self.task_id), saved_model=True)
diff --git a/nlp_toolkit/optimization/utils/metrics.py b/nlp_toolkit/optimization/utils/metrics.py
@@ -1,3 +1,20 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 class Metric(object):
     def __init__(self, name: str, greater_is_better: bool = True, is_relative: bool = True,
                  criterion: float = 0.01, weight_ratio: float = None):
diff --git a/nlp_toolkit/optimization/utils/objectives.py b/nlp_toolkit/optimization/utils/objectives.py
@@ -1,3 +1,20 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 class Objective(object):
     def __init__(self, name: str, greater_is_better: bool = True, weight_ratio: float = None):
         self.name = name
diff --git a/nlp_toolkit/optimization/utils/utility.py b/nlp_toolkit/optimization/utils/utility.py
@@ -1,3 +1,20 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import importlib
 import os
 from neural_compressor.utils.utility import LazyImport
diff --git a/nlp_toolkit/optimization/utils/utility_tf.py b/nlp_toolkit/optimization/utils/utility_tf.py
@@ -1,5 +1,23 @@
-from collections import OrderedDict, UserDict
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
+from collections import OrderedDict, UserDict
+import json
+import os
 
 TMPPATH = "tmp"
 class TFDataloader(object):
@@ -31,4 +49,28 @@ def __iter__(self):
                 labels = [label.numpy() for label in labels]
             else:
                 labels = labels.numpy()
-            yield inputs, labels
+            yield inputs, labels
+
+
+def distributed_init(worker_addresses, type='worker', index=0):
+    tf_config = {
+        'cluster': {
+            'worker': worker_addresses
+        },
+        'task': {'type': type, 'index': index}
+    }
+    os.environ['TF_CONFIG'] = json.dumps(tf_config)
+
+def _is_chief(task_type, task_id):
+    # here only consider the case in which TF_CONFIG task_type is set as worker
+    # and task_id=0 represents the chief
+    return (task_type == 'worker' and task_id == 0)
+
+# Return the model folder for this process: the base path on a single node, otherwise a per-task subfolder ('chief' or 'worker_<task_id>')
+def get_filepath(base_dirpath, task_type, task_id):
+    if task_type is None:    # single node
+        return base_dirpath
+    elif _is_chief(task_type, task_id):
+        return os.path.join(base_dirpath, 'chief')
+    else:
+        return os.path.join(base_dirpath, 'worker_' + str(task_id))
diff --git a/tests/test_tf_pruning.py b/tests/test_tf_pruning.py
@@ -1,3 +1,4 @@
+from nlp_toolkit.optimization.utils.utility_tf import get_filepath
 import numpy as np
 import os
 import shutil
@@ -74,6 +75,19 @@ def tearDownClass(self):
         shutil.rmtree('./quantized_model', ignore_errors=True)
 
     def test_tf_model_quant(self):
+        # check whether it is possible to set distributed environment
+        # only for coverage currently
+        from nlp_toolkit.optimization.utils.utility_tf import distributed_init
+        distributed_init(["localhost:12345","localhost:23456"], "worker", 0)
+        self.assertTrue(os.environ['TF_CONFIG'] != None)
+        del os.environ['TF_CONFIG']
+        # check whether filepath can be set correctly if using distributed environment
+        # only for coverage currently
+        from nlp_toolkit.optimization.utils.utility_tf import get_filepath
+        self.assertTrue(type(get_filepath("dummy", "worker", 0)) == str)
+        self.assertTrue(type(get_filepath("dummy", "worker", 1)) == str)
+        self.assertTrue(get_filepath("dummy", "worker", 0) != get_filepath("dummy", "worker", 1))
+
         metric = load_metric("glue", "sst2")
         def compute_metrics(preds, label_ids):
             preds = preds["logits"]
@@ -99,12 +113,10 @@ def compute_metrics(preds, label_ids):
             epochs=int(1), pruner_config=pruner_config, metrics=tune_metric
         )
         p_model = self.optimizer.prune(pruning_config=pruning_conf)
-        p_model.save_pretrained(self.args.output_dir, saved_model=True)
-        loaded_model = tf.saved_model.load(os.path.join(self.args.output_dir, "saved_model/1"))
-
+        loaded_model = tf.saved_model.load(self.args.output_dir)
         p_model = self.optimizer.prune(pruning_config=pruning_conf,
-                                        train_dataset=self.dummy_dataset,
-                                        eval_dataset=self.dummy_dataset,)
+                                train_dataset=self.dummy_dataset,
+                                eval_dataset=self.dummy_dataset,)
 
         def eval_func(model):
             return 1