|
1 | 1 | import os |
2 | 2 | from collections import OrderedDict |
| 3 | +from unittest.mock import patch |
3 | 4 |
|
4 | 5 | import torch |
5 | 6 | from torchtext import transforms |
6 | | -from torchtext.transforms import RegexTokenizer |
| 7 | +from torchtext.transforms import MaskTransform, RegexTokenizer |
7 | 8 | from torchtext.vocab import vocab |
8 | 9 |
|
9 | 10 | from .common.assets import get_asset_path |
@@ -750,3 +751,119 @@ def test_regex_tokenizer_save_load(self) -> None: |
750 | 751 | loaded_tokenizer = torch.jit.load(save_path) |
751 | 752 | results = loaded_tokenizer(self.test_sample) |
752 | 753 | self.assertEqual(results, self.ref_results) |
| 754 | + |
| 755 | + |
class TestMaskTransform(TorchtextTestCase):
    """Tests for MaskTransform.

    Testing under these assumed conditions:

    Vocab maps the following tokens to the following ids:
        ['a', 'b', 'c', 'd', '[PAD]', '[MASK]', '[BOS]'] -> [0, 1, 2, 3, 4, 5, 6]

    The sample token sequences are:
        [["[BOS]", "a", "b", "c", "d"],
         ["[BOS]", "a", "b", "[PAD]", "[PAD]"]]
    """

    sample_token_ids = torch.tensor([[6, 0, 1, 2, 3], [6, 0, 1, 4, 4]])

    vocab_len = 7
    pad_idx = 4
    mask_idx = 5
    bos_idx = 6

    @staticmethod
    def _patched_probs(mask_mask_prob, rand_mask_prob):
        # Context manager pinning MaskTransform's class-level replacement
        # probabilities so each scenario below is deterministic.
        return patch.multiple(
            "torchtext.transforms.MaskTransform",
            mask_mask_prob=mask_mask_prob,
            rand_mask_prob=rand_mask_prob,
        )

    @nested_params([0.0, 1.0])
    def test_mask_transform_probs(self, test_mask_prob):
        # We pass (vocab_len - 1) into MaskTransform to test masking with a random token.
        # This modifies the distribution from which token ids are randomly selected such
        # that the largest token id available for selection is 1 less than the actual
        # largest token id in our vocab, which we've assigned to the [BOS] token. This
        # allows us to test random replacement by ensuring that when the first token
        # ([BOS]) in the first sample sequence is selected for random replacement, we
        # know with certainty the token it is replaced with is different from [BOS].
        # In practice, however, the actual vocab length should be provided as the input
        # parameter so that random replacement selects from all possible tokens.
        transform = MaskTransform(
            self.vocab_len - 1, self.mask_idx, self.bos_idx, self.pad_idx, mask_bos=False, mask_prob=test_mask_prob
        )

        if test_mask_prob == 0.0:
            # mask_prob = 0: only the first token of the first sample sequence is
            # expected to be chosen for replacement.

            # mask_mask_prob, rand_mask_prob = 0, 0 -> no tokens should change
            with self._patched_probs(0.0, 0.0):
                masked, _, _ = transform(self.sample_token_ids)
                self.assertEqual(self.sample_token_ids, masked)

            # mask_mask_prob, rand_mask_prob = 0, 1 -> every token selected for
            # replacement is swapped for a random token id
            with self._patched_probs(0.0, 1.0):
                masked, _, _ = transform(self.sample_token_ids)

                # first token in first sequence should be different
                self.assertNotEqual(masked[0, 0], self.sample_token_ids[0, 0])
                # replaced token id should still be in vocab, not including [BOS]
                assert masked[0, 0] in range(self.vocab_len - 1)

                # every token other than the first of the first sequence is untouched
                self.assertEqual(self.sample_token_ids[0, 1:], masked[0, 1:])
                self.assertEqual(self.sample_token_ids[1], masked[1])

            # mask_mask_prob, rand_mask_prob = 1, 0 -> every token selected for
            # replacement becomes [MASK]
            with self._patched_probs(1.0, 0.0):
                masked, _, _ = transform(self.sample_token_ids)
                self.assertEqual(torch.tensor([[5, 0, 1, 2, 3], [6, 0, 1, 4, 4]]), masked)

        elif test_mask_prob == 1.0:
            # mask_prob = 1: every token that is neither [BOS] nor [PAD] is chosen for
            # replacement (under the default condition mask_bos=False).

            # mask_mask_prob, rand_mask_prob = 0, 0 -> no tokens should change
            with self._patched_probs(0.0, 0.0):
                masked, _, _ = transform(self.sample_token_ids)
                self.assertEqual(self.sample_token_ids, masked)

            # mask_mask_prob, rand_mask_prob = 0, 1 -> all selected tokens become random
            # token ids. A randomly drawn id may coincide with the original, but we know
            # deterministically that [BOS] and [PAD] positions remain unchanged.
            with self._patched_probs(0.0, 1.0):
                masked, _, _ = transform(self.sample_token_ids)
                self.assertEqual(masked[:, 0], 6 * torch.ones_like(masked[:, 0]))
                self.assertEqual(masked[1, 3:], 4 * torch.ones_like(masked[1, 3:]))

            # mask_mask_prob, rand_mask_prob = 1, 0 -> all selected tokens become [MASK]
            with self._patched_probs(1.0, 0.0):
                masked, _, _ = transform(self.sample_token_ids)
                self.assertEqual(torch.tensor([[6, 5, 5, 5, 5], [6, 5, 5, 4, 4]]), masked)

    def test_mask_transform_mask_bos(self) -> None:
        # MaskTransform has a boolean parameter mask_bos indicating whether [BOS]
        # tokens are eligible for replacement. The tests above run with the default
        # mask_bos=False; here we exercise mask_bos=True.
        transform = MaskTransform(
            self.vocab_len - 1, self.mask_idx, self.bos_idx, self.pad_idx, mask_bos=True, mask_prob=1.0
        )

        # mask_mask_prob, rand_mask_prob = 1, 0 -> all selected tokens (now including
        # [BOS]) become [MASK]
        with self._patched_probs(1.0, 0.0):
            masked, _, _ = transform(self.sample_token_ids)
            self.assertEqual(torch.tensor([[5, 5, 5, 5, 5], [5, 5, 5, 4, 4]]), masked)
0 commit comments