-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Closed
Description
System Information
- Framework (e.g. TensorFlow) / Algorithm (e.g. KMeans): Tensorflow
- Framework Version: 1.8
- Python Version: 2
- CPU or GPU: GPU
- Python SDK Version: 1.5.1
- Are you using a custom image: No
Describe the problem
I created a distributed training job which trains a transfer learning model using VGG16.
The job succeeds if I don't load pre-trained weights when creating the VGG16 backbone model.
backend = tf.keras.applications.vgg16.VGG16(weights=None,
include_top=False,
input_shape=(224, 224, 3))
However, an exception is thrown when I try to load pre-trained weights, as done in the following code snippet:
backend = tf.keras.applications.vgg16.VGG16(weights='imagenet',
include_top=False,
input_shape=(224, 224, 3))
Note that, for non-distributed training, loading pre-trained weights would not cause any exception.
Minimal repro / logs
InvalidArgumentError (see above for traceback): Cannot assign a device for operation 'block5_conv3/bias': Operation was explicitly assigned to /job:ps/task:0 but available devices are [ /job:localhost/replica:0/task:0/device:CPU:0 ]. Make sure the device specification refers to a valid device.
#011 [[Node: block5_conv3/bias = VariableV2[_class=["loc:@block5_conv3/bias"], container="", dtype=DT_FLOAT, shape=[512], shared_name="", _device="/job:ps/task:0"]()]]
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/container_support/training.py", line 36, in start
fw.train()
File "/usr/local/lib/python2.7/dist-packages/tf_container/train_entry_point.py", line 164, in train
train_wrapper.train()
File "/usr/local/lib/python2.7/dist-packages/tf_container/trainer.py", line 73, in train
tf.estimator.train_and_evaluate(estimator=estimator, train_spec=train_spec, eval_spec=eval_spec)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/training.py", line 439, in train_and_evaluate
executor.run()
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/training.py", line 546, in run
getattr(self, task_to_run)()
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/training.py", line 601, in run_master
self._start_distributed_training(saving_listeners=saving_listeners)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/training.py", line 739, in _start_distributed_training
saving_listeners=saving_listeners)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 363, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 843, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 856, in _train_model_default
features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 831, in _call_model_fn
model_fn_results = self._model_fn(features=features, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/tf_container/trainer.py", line 108, in _model_fn
return self.customer_script.model_fn(features, labels, mode, params)
File "/opt/ml/code/keras_distributed_transfer_learning.py", line 14, in model_fn
input_shape=(224, 224, 3))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/keras/_impl/keras/applications/vgg16.py", line 225, in VGG16
model.load_weights(weights_path)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/keras/_impl/keras/engine/network.py", line 1190, in load_weights
saving.load_weights_from_hdf5_group(f, self.layers)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/keras/_impl/keras/engine/saving.py", line 719, in load_weights_from_hdf5_group
K.batch_set_value(weight_value_tuples)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/keras/_impl/keras/backend.py", line 2707, in batch_set_value
get_session().run(assign_ops, feed_dict=feed_dict)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/keras/_impl/keras/backend.py", line 442, in get_session
_initialize_variables(session)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/keras/_impl/keras/backend.py", line 666, in _initialize_variables
[variables_module.is_variable_initialized(v) for v in candidate_vars])
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 900, in run
run_metadata_ptr)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1135, in _run
feed_dict_tensor, options, run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1316, in _do_run
run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1335, in _do_call
raise type(e)(node_def, op, message)
- Exact command to reproduce:
code for creating the job
import sagemaker
from sagemaker.tensorflow import TensorFlow
from sagemaker.session import s3_input
from sagemaker import get_execution_role

# Launch a 2-node distributed SageMaker TensorFlow training job.
sagemaker_session = sagemaker.Session()
role = get_execution_role()

# Step budgets for the job; passed to the estimator below.
# (Original re-hardcoded 100/10 in the constructor call, leaving these
# variables unused — use them so changing one place changes the job.)
training_steps = 100
evaluation_steps = 10

estimator = TensorFlow(
    entry_point='keras_distributed_transfer_learning.py',
    source_dir='./',
    role=role,
    training_steps=training_steps,
    evaluation_steps=evaluation_steps,
    train_instance_count=2,          # >1 instance => distributed (parameter-server) mode
    train_instance_type='ml.p2.xlarge',
    input_mode='File')

input_dataset = s3_input('s3://xxxx/cats_and_dogs/')
estimator.fit(input_dataset)
keras_distributed_transfer_learning.py
import tensorflow as tf
from tensorflow.python.estimator.model_fn import ModeKeys as Modes
INPUT_TENSOR_NAME = "input_1"
NUM_CLASSES = 2
BATCH_SIZE = 10
def model_fn(features, labels, mode, params):
    """The model_fn argument for creating an Estimator.

    Args:
        features: dict with key INPUT_TENSOR_NAME holding a float32 image
            batch tensor (assumed NHWC 224x224x3 — matches VGG16 input_shape).
        labels: one-hot label tensor of shape [batch, NUM_CLASSES]
            (required by softmax_cross_entropy below); unused in PREDICT.
        mode: one of Modes.TRAIN / Modes.EVAL / Modes.PREDICT.
        params: hyperparameters dict (unused here).

    Returns:
        tf.estimator.EstimatorSpec for the requested mode.
    """
    # NOTE(review): loading weights='imagenet' here is what triggers the
    # reported device-placement crash under distributed training — Keras'
    # get_session() tries to initialize variables that the Estimator pinned
    # to /job:ps devices. Kept as-is since it is the behavior under report.
    base_model = tf.keras.applications.vgg16.VGG16(weights='imagenet',
                                                   include_top=False,
                                                   input_shape=(224, 224, 3))
    x = tf.keras.layers.Flatten()(base_model.output)
    # Emit raw logits (no activation): tf.losses.softmax_cross_entropy applies
    # softmax internally, so feeding it softmax output (as the original did
    # with activation='softmax') would apply softmax twice.
    x = tf.keras.layers.Dense(NUM_CLASSES)(x)
    model = tf.keras.models.Model(inputs=base_model.input, outputs=x)

    image = tf.keras.layers.Input(tensor=features[INPUT_TENSOR_NAME])
    # Single forward pass; original recomputed logits in separate, partially
    # overlapping branches (and used `mode in (Modes.TRAIN)`, which is
    # substring membership on a string, not a tuple test).
    logits = model(image, training=(mode == Modes.TRAIN))

    if mode == Modes.PREDICT:
        predicted_indices = tf.argmax(input=logits, axis=1)
        probabilities = tf.nn.softmax(logits, name='softmax_tensor')
        predictions = {
            'classes': predicted_indices,
            'probabilities': probabilities
        }
        export_outputs = {
            'predictions': tf.estimator.export.PredictOutput(predictions)
        }
        return tf.estimator.EstimatorSpec(
            mode, predictions=predictions, export_outputs=export_outputs)

    # TRAIN and EVAL share the loss; computed once instead of per-branch.
    loss = tf.losses.softmax_cross_entropy(onehot_labels=labels, logits=logits)
    tf.summary.scalar('OptimizeLoss', loss)

    if mode == Modes.TRAIN:
        global_step = tf.train.get_or_create_global_step()
        optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
        train_op = optimizer.minimize(loss, global_step=global_step)
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

    # EVAL: accuracy against the argmax of the one-hot labels.
    predicted_indices = tf.argmax(input=logits, axis=1)
    eval_metric_ops = {
        'accuracy': tf.metrics.accuracy(tf.argmax(labels, 1), predicted_indices)
    }
    return tf.estimator.EstimatorSpec(
        mode, loss=loss, eval_metric_ops=eval_metric_ops)
def _input_fn(training_dir, input_shape, batch_size):
    """Build a (features-dict, labels) pair streaming images from a directory.

    Wraps a Keras directory iterator in tf.data so the Estimator can pull
    batches of shape [None, H, W, 3] with one-hot labels [None, NUM_CLASSES].
    """
    flow = tf.keras.preprocessing.image.ImageDataGenerator().flow_from_directory(
        training_dir, target_size=input_shape, batch_size=batch_size)
    out_types = (tf.float32, tf.float32)
    out_shapes = (
        tf.TensorShape([None, input_shape[0], input_shape[1], 3]),
        tf.TensorShape([None, NUM_CLASSES]),
    )
    # lambda defers iteration so from_generator re-reads the same flow object.
    dataset = tf.data.Dataset.from_generator(lambda: flow, out_types, out_shapes)
    images, labels = dataset.make_one_shot_iterator().get_next()
    return {INPUT_TENSOR_NAME: images}, labels
def train_input_fn(training_dir, hyperparameters):
    """Input fn for training: images under <training_dir>/train/."""
    train_dir = training_dir + '/train/'
    return _input_fn(train_dir, (224, 224), BATCH_SIZE)
def eval_input_fn(training_dir, hyperparameters):
    """Input fn for evaluation: images under <training_dir>/test/."""
    test_dir = training_dir + '/test/'
    return _input_fn(test_dir, (224, 224), BATCH_SIZE)
def serving_input_fn(hyperparameters):
    """Serving-time receiver: a raw float32 image placeholder (NHWC 224x224x3)."""
    image_placeholder = tf.placeholder(tf.float32, [None, 224, 224, 3])
    inputs = {INPUT_TENSOR_NAME: image_placeholder}
    return tf.estimator.export.ServingInputReceiver(inputs, inputs)
jepatti, WuyangLI and dimosr
Metadata
Metadata
Assignees
Labels
No labels