From 85f392f3b69604c7ff2da75e67b4d5c12b9ae7e5 Mon Sep 17 00:00:00 2001 From: Patrick Ng Date: Mon, 29 Apr 2019 16:18:09 -0400 Subject: [PATCH] Add document embedding support to Object2Vec algorithm --- CHANGELOG.md | 4 ++++ src/sagemaker/amazon/object2vec.py | 35 ++++++++++++++++++++++++++++++ tests/integ/test_object2vec.py | 4 ++++ tests/unit/test_object2vec.py | 15 +++++++++++-- 4 files changed, 56 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 75dc5b62e2..7195d78936 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +## v1.18.19 (2019-04-30) + +* feature: Add document embedding support to Object2Vec algorithm. + ## v1.18.18 (2019-04-29) ### Bug fixes and other changes diff --git a/src/sagemaker/amazon/object2vec.py b/src/sagemaker/amazon/object2vec.py index a15cb382f3..06dc965000 100644 --- a/src/sagemaker/amazon/object2vec.py +++ b/src/sagemaker/amazon/object2vec.py @@ -21,6 +21,19 @@ from sagemaker.vpc_utils import VPC_CONFIG_DEFAULT +def _list_check_subset(valid_super_list): + valid_superset = set(valid_super_list) + + def validate(value): + if not isinstance(value, str): + return False + + val_list = [s.strip() for s in value.split(',')] + return set(val_list).issubset(valid_superset) + + return validate + + class Object2Vec(AmazonAlgorithmEstimatorBase): repo_name = 'object2vec' @@ -57,6 +70,14 @@ class Object2Vec(AmazonAlgorithmEstimatorBase): 'One of "adagrad", "adam", "rmsprop", "sgd", "adadelta"', str) learning_rate = hp('learning_rate', (ge(1e-06), le(1.0)), 'A float in [1e-06, 1.0]', float) + + negative_sampling_rate = hp('negative_sampling_rate', (ge(0), le(100)), 'An integer in [0, 100]', int) + comparator_list = hp('comparator_list', _list_check_subset(["hadamard", "concat", "abs_diff"]), + 'Comma-separated of hadamard, concat, abs_diff. E.g. "hadamard,abs_diff"', str) + tied_token_embedding_weight = hp('tied_token_embedding_weight', (), 'Either True or False', bool) + token_embedding_storage_type = hp('token_embedding_storage_type', isin("dense", "row_sparse"), + 'One of "dense", "row_sparse"', str) + enc0_network = hp('enc0_network', isin("hcnn", "bilstm", "pooled_embedding"), 'One of "hcnn", "bilstm", "pooled_embedding"', str) enc1_network = hp('enc1_network', isin("hcnn", "bilstm", "pooled_embedding", "enc0"), @@ -104,6 +125,10 @@ def __init__(self, role, train_instance_count, train_instance_type, output_layer=None, optimizer=None, learning_rate=None, + negative_sampling_rate=None, + comparator_list=None, + tied_token_embedding_weight=None, + token_embedding_storage_type=None, enc0_network=None, enc1_network=None, enc0_cnn_filter_width=None, @@ -164,6 +189,10 @@ def __init__(self, role, train_instance_count, train_instance_type, output_layer(str): Optional. Type of output layer optimizer(str): Optional. Type of optimizer for training learning_rate(float): Optional. Learning rate for SGD training + negative_sampling_rate(int): Optional. Negative sampling rate + comparator_list(str): Optional. Customization of comparator operator + tied_token_embedding_weight(bool): Optional. Tying of token embedding layer weight + token_embedding_storage_type(str): Optional. Type of token embedding storage enc0_network(str): Optional. Network model of encoder "enc0" enc1_network(str): Optional. Network model of encoder "enc1" enc0_cnn_filter_width(int): Optional. CNN filter width @@ -197,6 +226,12 @@ def __init__(self, role, train_instance_count, train_instance_type, self.output_layer = output_layer self.optimizer = optimizer self.learning_rate = learning_rate + + self.negative_sampling_rate = negative_sampling_rate + self.comparator_list = comparator_list + self.tied_token_embedding_weight = tied_token_embedding_weight + self.token_embedding_storage_type = token_embedding_storage_type + self.enc0_network = enc0_network self.enc1_network = enc1_network self.enc0_cnn_filter_width = enc0_cnn_filter_width diff --git a/tests/integ/test_object2vec.py b/tests/integ/test_object2vec.py index 8bc452f414..5795159585 100644 --- a/tests/integ/test_object2vec.py +++ b/tests/integ/test_object2vec.py @@ -43,6 +43,10 @@ def test_object2vec(sagemaker_session): enc0_vocab_size=45000, enc_dim=16, num_classes=3, + negative_sampling_rate=0, + comparator_list='hadamard,concat,abs_diff', + tied_token_embedding_weight=False, + token_embedding_storage_type='dense', sagemaker_session=sagemaker_session) record_set = prepare_record_set_from_local_files(data_path, object2vec.data_location, diff --git a/tests/unit/test_object2vec.py b/tests/unit/test_object2vec.py index 8d8e557014..0d5936ecef 100644 --- a/tests/unit/test_object2vec.py +++ b/tests/unit/test_object2vec.py @@ -111,6 +111,10 @@ def test_all_hyperparameters(sagemaker_session): output_layer='softmax', optimizer='adam', learning_rate=0.0001, + negative_sampling_rate=1, + comparator_list='hadamard, abs_diff', + tied_token_embedding_weight=True, + token_embedding_storage_type='row_sparse', enc0_network='bilstm', enc1_network='hcnn', enc0_cnn_filter_width=3, @@ -161,7 +165,11 @@ def test_required_hyper_parameters_value(sagemaker_session, required_hyper_param ('optimizer', 0), ('enc0_cnn_filter_width', 'string'), ('weight_decay', 'string'), - ('learning_rate', 'string') + ('learning_rate', 'string'), + ('negative_sampling_rate', 'some_string'), + ('comparator_list', 0), + ('comparator_list', ['foobar']), + ('token_embedding_storage_type', 123), ]) def test_optional_hyper_parameters_type(sagemaker_session, optional_hyper_parameters, value): with pytest.raises(ValueError): @@ -182,7 +190,10 @@ def test_optional_hyper_parameters_type(sagemaker_session, optional_hyper_parame ('weight_decay', 200000), ('enc0_cnn_filter_width', 2000), ('learning_rate', 0), - ('learning_rate', 2) + ('learning_rate', 2), + ('negative_sampling_rate', -1), + ('comparator_list', 'hadamard,foobar'), + ('token_embedding_storage_type', 'foobar'), ]) def test_optional_hyper_parameters_value(sagemaker_session, optional_hyper_parameters, value): with pytest.raises(ValueError):