From b019861b79132e36849e5ffa0f8148165f8925e7 Mon Sep 17 00:00:00 2001 From: pmabbo13 Date: Thu, 28 Jul 2022 22:08:05 +0000 Subject: [PATCH 1/8] tutorial on using t5 model for text summarization --- examples/tutorials/cnndm_summarization.ipynb | 385 +++++++++++++++++++ 1 file changed, 385 insertions(+) create mode 100644 examples/tutorials/cnndm_summarization.ipynb diff --git a/examples/tutorials/cnndm_summarization.ipynb b/examples/tutorials/cnndm_summarization.ipynb new file mode 100644 index 0000000000..a26026038c --- /dev/null +++ b/examples/tutorials/cnndm_summarization.ipynb @@ -0,0 +1,385 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "44cc6050", + "metadata": {}, + "source": [ + "# CNNDM TEXT SUMMARIZATION WITH T5-BASE MODEL\n", + "\n", + "**Author**: Pendo Abbo\n", + "\n", + "### Overview\n", + "\n", + "This tutorial demonstrates how to use a pre-trained T5 Model for text summarization on the CNN-DailyMail dataset. We will demonstrate how to use the torchtext library to:\n", + "\n", + "1. build a text pre-processing pipeline for a T5 model\n", + "2. read in the CNN-DM dataset and pre-process the text\n", + "3. instantiate a pre-trained t5 model with base configuration, and perform text summarization on input text" + ] + }, + { + "cell_type": "markdown", + "id": "75675cbc", + "metadata": {}, + "source": [ + "### Common Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a58a9095", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "\n", + "DEVICE = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "16e9cb58", + "metadata": {}, + "source": [ + "### Data Transformation\n", + "\n", + "The T5 model does not work with raw text. Instead, it requires the text to be transformed into numerical form in order to perform training and inference. 
The following transformations are required for the T5 model:\n", + "\n", + "1. Tokenize text\n", + "2. Convert tokens into (integer) IDs\n", + "3. Truncate the sequences to a specified maximum length\n", + "4. Add end-of-sequence (EOS) and padding token IDs\n", + "\n", + "T5 uses a sentencepiece model for text tokenization. Below, we use a pre-trained sentencepiece model to build the text pre-processing pipeline using torchtext's `T5Transform`. Note that the transform supports both batched and non-batched text input (i.e. one can either pass a single sentence or a list of sentences), however the T5 model expects the input to be batched." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3191de8a", + "metadata": {}, + "outputs": [], + "source": [ + "from torchtext.prototype.models import T5Transform\n", + "\n", + "padding_idx = 0\n", + "eos_idx = 1\n", + "max_seq_len = 512\n", + "t5_sp_model_path = r\"https://download.pytorch.org/models/text/t5_tokenizer_base.model\"\n", + "\n", + "\n", + "transform = T5Transform(\n", + " sp_model_path=t5_sp_model_path,\n", + " max_seq_len=max_seq_len,\n", + " eos_idx=eos_idx,\n", + " padding_idx=padding_idx,\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "id": "35d3f4dd", + "metadata": {}, + "source": [ + "Alternatively, we can also use the transform shipped with the pre-trained models that does all of the above out-of-the-box" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "095f4549", + "metadata": {}, + "outputs": [], + "source": [ + "from torchtext.prototype.models import T5_BASE_GENERATION\n", + "\n", + "transform = T5_BASE_GENERATION.transform()\n" + ] + }, + { + "cell_type": "markdown", + "id": "6d8f9438", + "metadata": {}, + "source": [ + "### Dataset\n", + "\n", + "torchtext provides several standard NLP datasets. For a complete list, refer to the documentation at https://pytorch.org/text/stable/datasets.html. 
These datasets are built using composable torchdata datapipes and hence support standard flow-control and mapping/transformation using user defined functions and transforms. Below, we demonstrate how to pre-process the CNNDM dataset to include the prefix necessary for the model to indentify the task it is performing.\n", + "\n", + "The CNNDM dataset has a train, validation, and test split. Below we demo on the test split." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b000eb71", + "metadata": {}, + "outputs": [], + "source": [ + "from functools import partial\n", + "from torch.utils.data import DataLoader\n", + "from torchtext.datasets.cnndm import CNNDM\n", + "\n", + "batch_size = 5\n", + "test_datapipe = CNNDM(split=\"test\")\n", + "task = 'summarize'\n", + "\n", + "def apply_prefix(task, x):\n", + " return f'{task}: ' + x[0], x[1]\n", + "\n", + "test_datapipe = test_datapipe.map(partial(apply_prefix, task))\n", + "test_datapipe = test_datapipe.batch(batch_size)\n", + "test_datapipe = test_datapipe.rows2columnar([\"article\", \"abstract\"])\n", + "test_dataloader = DataLoader(test_datapipe, batch_size=None)\n" + ] + }, + { + "cell_type": "markdown", + "id": "67792cae", + "metadata": {}, + "source": [ + "Alternately we can also use batched API (i.e apply the prefix on the whole batch)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "738d3a51", + "metadata": {}, + "outputs": [], + "source": [ + "def batch_prefix(task, x):\n", + " return {\n", + " \"article\": [f'{task}: ' + y for y in x[\"article\"]],\n", + " \"abstract\": x[\"abstract\"]\n", + " }\n", + "\n", + "batch_size = 5\n", + "test_datapipe = CNNDM(split=\"test\")\n", + "task = 'summarize'\n", + "\n", + "test_datapipe = test_datapipe.batch(batch_size).rows2columnar([\"article\", \"abstract\"])\n", + "test_datapipe = test_datapipe.map(partial(batch_prefix, task))\n", + "test_dataloader = DataLoader(test_datapipe, batch_size=None)\n" + ] + }, + { + "cell_type": 
"markdown", + "id": "a8771e00", + "metadata": {}, + "source": [ + "### Model Preparation\n", + "\n", + "torchtext provides SOTA pre-trained models that can be used directly for NLP tasks or fine-tuned on downstream tasks. Below we use the pre-trained T5 model with standard base architecture to perform text summarization. For additional details on available pre-trained models, please refer to documentation at https://pytorch.org/text/main/models.html" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54b6446a", + "metadata": {}, + "outputs": [], + "source": [ + "t5_base = T5_BASE_GENERATION\n", + "transform = t5_base.transform()\n", + "model = t5_base.get_model()\n", + "model.to(DEVICE)\n" + ] + }, + { + "cell_type": "markdown", + "id": "568b54bd", + "metadata": {}, + "source": [ + "### Sequence Generator\n", + "\n", + "We can define a sequence generator to produce an output sequence based on the input sequence provided. This calls on the model's encoder and decoder, and iteratively expands the decoded sequences until the end-of-sequence token is generated for all sequences in the batch. The `generate` method shown below uses a greedy search (i.e. expands the sequence based on the most probable next word)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "1153c985", + "metadata": {}, + "outputs": [], + "source": [ + "from torch import Tensor\n", + "from torchtext.prototype.models import T5Model\n", + "\n", + "def generate(\n", + " encoder_tokens: Tensor,\n", + " eos_idx: int,\n", + " model: T5Model,\n", + " ) -> Tensor:\n", + " \n", + " # pass tokens through encoder\n", + " encoder_padding_mask = encoder_tokens.eq(model.padding_idx)\n", + " encoder_embeddings = model.dropout1(model.token_embeddings(encoder_tokens))\n", + " encoder_output = model.encoder(encoder_embeddings, tgt_key_padding_mask=encoder_padding_mask)[0]\n", + "\n", + " encoder_output = model.norm1(encoder_output)\n", + " encoder_output = model.dropout2(encoder_output)\n", + " \n", + " # initialize decoder input sequence; T5 uses padding index as starter index to decoder sequence\n", + " decoder_tokens = torch.ones((encoder_tokens.size(0), 1), dtype=torch.long) * model.padding_idx\n", + " \n", + " # mask to keep track of sequences for which the decoder has not produced an end-of-sequence token yet\n", + " incomplete_sentences = torch.ones((encoder_tokens.size(0), 1), dtype=torch.long)\n", + "\n", + " # iteratively generate output sequence until all sequences in the batch have generated the end-of-sequence token\n", + " for step in range(model.config.max_seq_len):\n", + " \n", + " # causal mask and padding mask for decoder sequence\n", + " tgt_len = decoder_tokens.shape[1]\n", + " decoder_mask = torch.triu(torch.ones((tgt_len, tgt_len), dtype=torch.float64), diagonal=1).bool()\n", + " decoder_padding_mask = decoder_tokens.eq(model.padding_idx)\n", + " \n", + " # T5 implemention uses padding idx to start sequence. 
Want to ignore this when masking\n", + " decoder_padding_mask[:, 0] = False\n", + " \n", + " # pass decoder sequence through decoder\n", + " decoder_embeddings = model.dropout3(model.token_embeddings(decoder_tokens))\n", + " decoder_output = model.decoder(\n", + " decoder_embeddings,\n", + " memory=encoder_output,\n", + " tgt_mask=decoder_mask,\n", + " tgt_key_padding_mask=decoder_padding_mask,\n", + " memory_key_padding_mask=encoder_padding_mask,\n", + " )[0]\n", + "\n", + " decoder_output = model.norm2(decoder_output)\n", + " decoder_output = model.dropout4(decoder_output)\n", + " decoder_output = decoder_output * (model.config.embedding_dim ** -0.5)\n", + " decoder_output = model.lm_head(decoder_output)\n", + " \n", + " # greedy search for next token to add to sequence\n", + " probs = F.log_softmax(decoder_output[:,-1], dim=-1)\n", + " _, next_token = torch.topk(decoder_output[:,-1], 1)\n", + " \n", + " # ignore next tokens for sentences that are already complete\n", + " next_token *= incomplete_sentences\n", + " \n", + " # update incomplete_sentences to remove those that were just ended\n", + " incomplete_sentences = incomplete_sentences - (next_token == eos_idx).long()\n", + " \n", + " # update decoder sequences to include new tokens\n", + " decoder_tokens = torch.cat((decoder_tokens, next_token), 1)\n", + " \n", + " # early stop if all sentences have been ended\n", + " if (incomplete_sentences == 0).all():\n", + " break\n", + "\n", + " return decoder_tokens\n" + ] + }, + { + "cell_type": "markdown", + "id": "9c28b6da", + "metadata": {}, + "source": [ + "### Generate Summaries\n", + "\n", + "Finally we put all of the components together the generate summaries on the first batch of articles in the CNNDM test set." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "e243aae2", + "metadata": {}, + "outputs": [], + "source": [ + "batch = next(iter(test_dataloader))\n", + "input_text = batch[\"article\"]\n", + "model_input = transform(input_text)\n", + "model_output = generate(\n", + " model=model,\n", + " encoder_tokens=model_input,\n", + " eos_idx=eos_idx\n", + ")\n", + "output_text = transform.decode(model_output.tolist())\n", + "target = batch[\"abstract\"]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "499ae599", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Example 1:\n", + "\n", + "prediction: the Palestinians officially become the 123rd member of the international criminal court . the move gives the court jurisdiction over alleged crimes committed in the occupied Palestinian territory . the ICC opened a preliminary examination into the situation in the occupied territories .\n", + "\n", + "target: Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June . Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .\n", + "\n", + "\n", + "Example 2:\n", + "\n", + "prediction: a stray pooch in Washington state has used up at least three of her own after being hit by a car . the dog staggers to a nearby farm, dirt-covered and emaciated, where she is found . she suffered a dislocated jaw, leg injuries and a caved-in sinus cavity .\n", + "\n", + "target: Theia, a bully breed mix, was apparently hit by a car, whacked with a hammer and buried in a field . \"She's a true miracle dog and she deserves a good life,\" says Sara Mellado, who is looking for a home for Theia .\n", + "\n", + "\n", + "Example 3:\n", + "\n", + "prediction: mohammad Javad Zarif is the foreign minister of the country . he has been a key figure in securing a breakthrough in nuclear talks . 
he has been a hero in the international community .\n", + "\n", + "target: Mohammad Javad Zarif has spent more time with John Kerry than any other foreign minister . He once participated in a takeover of the Iranian Consulate in San Francisco . The Iranian foreign minister tweets in English .\n", + "\n", + "\n", + "Example 4:\n", + "\n", + "prediction: five americans were monitored for three weeks after being exposed to Ebola . one of the five had a heart-related issue on Saturday and has been discharged . none of the patients developed the deadly virus .\n", + "\n", + "target: 17 Americans were exposed to the Ebola virus while in Sierra Leone in March . Another person was diagnosed with the disease and taken to hospital in Maryland . National Institutes of Health says the patient is in fair condition after weeks of treatment .\n", + "\n", + "\n", + "Example 5:\n", + "\n", + "prediction: the student was identified during an investigation by campus police and the office of student affairs . he admitted to placing the noose on the tree early Wednesday morning .\n", + "\n", + "target: Student is no longer on Duke University campus and will face disciplinary review . School officials identified student during investigation and the person admitted to hanging the noose, Duke says . 
The noose, made of rope, was discovered on campus about 2 a.m.\n", + "\n", + "\n" + ] + } + ], + "source": [ + "for i in range(batch_size):\n", + " \n", + " print(f\"Example {i+1}:\\n\")\n", + " print(f\"prediction: {output_text[i]}\\n\")\n", + " print(f\"target: {target[i]}\\n\\n\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 0ea8b1b3a184d87f8a004a3a5c49d181c57f1791 Mon Sep 17 00:00:00 2001 From: pmabbo13 Date: Fri, 29 Jul 2022 20:15:32 +0000 Subject: [PATCH 2/8] formatting tutorial as python script --- examples/tutorials/cnndm_summarization.ipynb | 385 ------------------- examples/tutorials/cnndm_summarization.py | 306 +++++++++++++++ 2 files changed, 306 insertions(+), 385 deletions(-) delete mode 100644 examples/tutorials/cnndm_summarization.ipynb create mode 100644 examples/tutorials/cnndm_summarization.py diff --git a/examples/tutorials/cnndm_summarization.ipynb b/examples/tutorials/cnndm_summarization.ipynb deleted file mode 100644 index a26026038c..0000000000 --- a/examples/tutorials/cnndm_summarization.ipynb +++ /dev/null @@ -1,385 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "44cc6050", - "metadata": {}, - "source": [ - "# CNNDM TEXT SUMMARIZATION WITH T5-BASE MODEL\n", - "\n", - "**Author**: Pendo Abbo\n", - "\n", - "### Overview\n", - "\n", - "This tutorial demonstrates how to use a pre-trained T5 Model for text summarization on the CNN-DailyMail dataset. We will demonstrate how to use the torchtext library to:\n", - "\n", - "1. build a text pre-processing pipeline for a T5 model\n", - "2. 
read in the CNN-DM dataset and pre-process the text\n", - "3. instantiate a pre-trained t5 model with base configuration, and perform text summarization on input text" - ] - }, - { - "cell_type": "markdown", - "id": "75675cbc", - "metadata": {}, - "source": [ - "### Common Imports" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a58a9095", - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "import torch.nn as nn\n", - "import torch.nn.functional as F\n", - "\n", - "DEVICE = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n" - ] - }, - { - "cell_type": "markdown", - "id": "16e9cb58", - "metadata": {}, - "source": [ - "### Data Transformation\n", - "\n", - "The T5 model does not work with raw text. Instead, it requires the text to be transformed into numerical form in order to perform training and inference. The following transformations are required for the T5 model:\n", - "\n", - "1. Tokenize text\n", - "2. Convert tokens into (integer) IDs\n", - "3. Truncate the sequences to a specified maximum length\n", - "4. Add end-of-sequence (EOS) and padding token IDs\n", - "\n", - "T5 uses a sentencepiece model for text tokenization. Below, we use a pre-trained sentencepiece model to build the text pre-processing pipeline using torchtext's `T5Transform`. Note that the transform supports both batched and non-batched text input (i.e. one can either pass a single sentence or a list of sentences), however the T5 model expects the input to be batched." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "3191de8a", - "metadata": {}, - "outputs": [], - "source": [ - "from torchtext.prototype.models import T5Transform\n", - "\n", - "padding_idx = 0\n", - "eos_idx = 1\n", - "max_seq_len = 512\n", - "t5_sp_model_path = r\"https://download.pytorch.org/models/text/t5_tokenizer_base.model\"\n", - "\n", - "\n", - "transform = T5Transform(\n", - " sp_model_path=t5_sp_model_path,\n", - " max_seq_len=max_seq_len,\n", - " eos_idx=eos_idx,\n", - " padding_idx=padding_idx,\n", - ")\n" - ] - }, - { - "cell_type": "markdown", - "id": "35d3f4dd", - "metadata": {}, - "source": [ - "Alternatively, we can also use the transform shipped with the pre-trained models that does all of the above out-of-the-box" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "095f4549", - "metadata": {}, - "outputs": [], - "source": [ - "from torchtext.prototype.models import T5_BASE_GENERATION\n", - "\n", - "transform = T5_BASE_GENERATION.transform()\n" - ] - }, - { - "cell_type": "markdown", - "id": "6d8f9438", - "metadata": {}, - "source": [ - "### Dataset\n", - "\n", - "torchtext provides several standard NLP datasets. For a complete list, refer to the documentation at https://pytorch.org/text/stable/datasets.html. These datasets are built using composable torchdata datapipes and hence support standard flow-control and mapping/transformation using user defined functions and transforms. Below, we demonstrate how to pre-process the CNNDM dataset to include the prefix necessary for the model to indentify the task it is performing.\n", - "\n", - "The CNNDM dataset has a train, validation, and test split. Below we demo on the test split." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "b000eb71", - "metadata": {}, - "outputs": [], - "source": [ - "from functools import partial\n", - "from torch.utils.data import DataLoader\n", - "from torchtext.datasets.cnndm import CNNDM\n", - "\n", - "batch_size = 5\n", - "test_datapipe = CNNDM(split=\"test\")\n", - "task = 'summarize'\n", - "\n", - "def apply_prefix(task, x):\n", - " return f'{task}: ' + x[0], x[1]\n", - "\n", - "test_datapipe = test_datapipe.map(partial(apply_prefix, task))\n", - "test_datapipe = test_datapipe.batch(batch_size)\n", - "test_datapipe = test_datapipe.rows2columnar([\"article\", \"abstract\"])\n", - "test_dataloader = DataLoader(test_datapipe, batch_size=None)\n" - ] - }, - { - "cell_type": "markdown", - "id": "67792cae", - "metadata": {}, - "source": [ - "Alternately we can also use batched API (i.e apply the prefix on the whole batch)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "738d3a51", - "metadata": {}, - "outputs": [], - "source": [ - "def batch_prefix(task, x):\n", - " return {\n", - " \"article\": [f'{task}: ' + y for y in x[\"article\"]],\n", - " \"abstract\": x[\"abstract\"]\n", - " }\n", - "\n", - "batch_size = 5\n", - "test_datapipe = CNNDM(split=\"test\")\n", - "task = 'summarize'\n", - "\n", - "test_datapipe = test_datapipe.batch(batch_size).rows2columnar([\"article\", \"abstract\"])\n", - "test_datapipe = test_datapipe.map(partial(batch_prefix, task))\n", - "test_dataloader = DataLoader(test_datapipe, batch_size=None)\n" - ] - }, - { - "cell_type": "markdown", - "id": "a8771e00", - "metadata": {}, - "source": [ - "### Model Preparation\n", - "\n", - "torchtext provides SOTA pre-trained models that can be used directly for NLP tasks or fine-tuned on downstream tasks. Below we use the pre-trained T5 model with standard base architecture to perform text summarization. 
For additional details on available pre-trained models, please refer to documentation at https://pytorch.org/text/main/models.html" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "54b6446a", - "metadata": {}, - "outputs": [], - "source": [ - "t5_base = T5_BASE_GENERATION\n", - "transform = t5_base.transform()\n", - "model = t5_base.get_model()\n", - "model.to(DEVICE)\n" - ] - }, - { - "cell_type": "markdown", - "id": "568b54bd", - "metadata": {}, - "source": [ - "### Sequence Generator\n", - "\n", - "We can define a sequence generator to produce an output sequence based on the input sequence provided. This calls on the model's encoder and decoder, and iteratively expands the decoded sequences until the end-of-sequence token is generated for all sequences in the batch. The `generate` method shown below uses a greedy search (i.e. expands the sequence based on the most probable next word)." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "1153c985", - "metadata": {}, - "outputs": [], - "source": [ - "from torch import Tensor\n", - "from torchtext.prototype.models import T5Model\n", - "\n", - "def generate(\n", - " encoder_tokens: Tensor,\n", - " eos_idx: int,\n", - " model: T5Model,\n", - " ) -> Tensor:\n", - " \n", - " # pass tokens through encoder\n", - " encoder_padding_mask = encoder_tokens.eq(model.padding_idx)\n", - " encoder_embeddings = model.dropout1(model.token_embeddings(encoder_tokens))\n", - " encoder_output = model.encoder(encoder_embeddings, tgt_key_padding_mask=encoder_padding_mask)[0]\n", - "\n", - " encoder_output = model.norm1(encoder_output)\n", - " encoder_output = model.dropout2(encoder_output)\n", - " \n", - " # initialize decoder input sequence; T5 uses padding index as starter index to decoder sequence\n", - " decoder_tokens = torch.ones((encoder_tokens.size(0), 1), dtype=torch.long) * model.padding_idx\n", - " \n", - " # mask to keep track of sequences for which the decoder has not produced an 
end-of-sequence token yet\n", - " incomplete_sentences = torch.ones((encoder_tokens.size(0), 1), dtype=torch.long)\n", - "\n", - " # iteratively generate output sequence until all sequences in the batch have generated the end-of-sequence token\n", - " for step in range(model.config.max_seq_len):\n", - " \n", - " # causal mask and padding mask for decoder sequence\n", - " tgt_len = decoder_tokens.shape[1]\n", - " decoder_mask = torch.triu(torch.ones((tgt_len, tgt_len), dtype=torch.float64), diagonal=1).bool()\n", - " decoder_padding_mask = decoder_tokens.eq(model.padding_idx)\n", - " \n", - " # T5 implemention uses padding idx to start sequence. Want to ignore this when masking\n", - " decoder_padding_mask[:, 0] = False\n", - " \n", - " # pass decoder sequence through decoder\n", - " decoder_embeddings = model.dropout3(model.token_embeddings(decoder_tokens))\n", - " decoder_output = model.decoder(\n", - " decoder_embeddings,\n", - " memory=encoder_output,\n", - " tgt_mask=decoder_mask,\n", - " tgt_key_padding_mask=decoder_padding_mask,\n", - " memory_key_padding_mask=encoder_padding_mask,\n", - " )[0]\n", - "\n", - " decoder_output = model.norm2(decoder_output)\n", - " decoder_output = model.dropout4(decoder_output)\n", - " decoder_output = decoder_output * (model.config.embedding_dim ** -0.5)\n", - " decoder_output = model.lm_head(decoder_output)\n", - " \n", - " # greedy search for next token to add to sequence\n", - " probs = F.log_softmax(decoder_output[:,-1], dim=-1)\n", - " _, next_token = torch.topk(decoder_output[:,-1], 1)\n", - " \n", - " # ignore next tokens for sentences that are already complete\n", - " next_token *= incomplete_sentences\n", - " \n", - " # update incomplete_sentences to remove those that were just ended\n", - " incomplete_sentences = incomplete_sentences - (next_token == eos_idx).long()\n", - " \n", - " # update decoder sequences to include new tokens\n", - " decoder_tokens = torch.cat((decoder_tokens, next_token), 1)\n", - " \n", - " # 
early stop if all sentences have been ended\n", - " if (incomplete_sentences == 0).all():\n", - " break\n", - "\n", - " return decoder_tokens\n" - ] - }, - { - "cell_type": "markdown", - "id": "9c28b6da", - "metadata": {}, - "source": [ - "### Generate Summaries\n", - "\n", - "Finally we put all of the components together the generate summaries on the first batch of articles in the CNNDM test set." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "e243aae2", - "metadata": {}, - "outputs": [], - "source": [ - "batch = next(iter(test_dataloader))\n", - "input_text = batch[\"article\"]\n", - "model_input = transform(input_text)\n", - "model_output = generate(\n", - " model=model,\n", - " encoder_tokens=model_input,\n", - " eos_idx=eos_idx\n", - ")\n", - "output_text = transform.decode(model_output.tolist())\n", - "target = batch[\"abstract\"]\n" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "499ae599", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Example 1:\n", - "\n", - "prediction: the Palestinians officially become the 123rd member of the international criminal court . the move gives the court jurisdiction over alleged crimes committed in the occupied Palestinian territory . the ICC opened a preliminary examination into the situation in the occupied territories .\n", - "\n", - "target: Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June . Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .\n", - "\n", - "\n", - "Example 2:\n", - "\n", - "prediction: a stray pooch in Washington state has used up at least three of her own after being hit by a car . the dog staggers to a nearby farm, dirt-covered and emaciated, where she is found . 
she suffered a dislocated jaw, leg injuries and a caved-in sinus cavity .\n", - "\n", - "target: Theia, a bully breed mix, was apparently hit by a car, whacked with a hammer and buried in a field . \"She's a true miracle dog and she deserves a good life,\" says Sara Mellado, who is looking for a home for Theia .\n", - "\n", - "\n", - "Example 3:\n", - "\n", - "prediction: mohammad Javad Zarif is the foreign minister of the country . he has been a key figure in securing a breakthrough in nuclear talks . he has been a hero in the international community .\n", - "\n", - "target: Mohammad Javad Zarif has spent more time with John Kerry than any other foreign minister . He once participated in a takeover of the Iranian Consulate in San Francisco . The Iranian foreign minister tweets in English .\n", - "\n", - "\n", - "Example 4:\n", - "\n", - "prediction: five americans were monitored for three weeks after being exposed to Ebola . one of the five had a heart-related issue on Saturday and has been discharged . none of the patients developed the deadly virus .\n", - "\n", - "target: 17 Americans were exposed to the Ebola virus while in Sierra Leone in March . Another person was diagnosed with the disease and taken to hospital in Maryland . National Institutes of Health says the patient is in fair condition after weeks of treatment .\n", - "\n", - "\n", - "Example 5:\n", - "\n", - "prediction: the student was identified during an investigation by campus police and the office of student affairs . he admitted to placing the noose on the tree early Wednesday morning .\n", - "\n", - "target: Student is no longer on Duke University campus and will face disciplinary review . School officials identified student during investigation and the person admitted to hanging the noose, Duke says . 
The noose, made of rope, was discovered on campus about 2 a.m.\n", - "\n", - "\n" - ] - } - ], - "source": [ - "for i in range(batch_size):\n", - " \n", - " print(f\"Example {i+1}:\\n\")\n", - " print(f\"prediction: {output_text[i]}\\n\")\n", - " print(f\"target: {target[i]}\\n\\n\")\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/tutorials/cnndm_summarization.py b/examples/tutorials/cnndm_summarization.py new file mode 100644 index 0000000000..9628f80816 --- /dev/null +++ b/examples/tutorials/cnndm_summarization.py @@ -0,0 +1,306 @@ +""" +CNNDM Text Summarization with T5-Base model +======================================================= + +**Author**: `Pendo Abbo `__ + +""" + +###################################################################### +# Overview +# -------- +# +# This tutorial demonstrates how to use a pre-trained T5 Model for text summarization on the CNN-DailyMail dataset. +# We will demonstrate how to use the torchtext library to: +# +# 1. Build a text pre-processing pipeline for a T5 model +# 2. Read in the CNNDM dataset and pre-process the text +# 3.
Instantiate a pre-trained T5 model with base configuration, and perform text summarization on input text +# +# + +###################################################################### +# Common imports +# -------------- +import torch +import torch.nn as nn +import torch.nn.functional as F + +DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + +####################################################################### +# Data Transformation +# ------------------- +# +# The T5 model does not work with raw text. Instead, it requires the text to be transformed into numerical form +# in order to perform training and inference. The following transformations are required for the T5 model: +# +# 1. Tokenize text +# 2. Convert tokens into (integer) IDs +# 3. Truncate the sequences to a specified maximum length +# 4. Add end-of-sequence (EOS) and padding token IDs +# +# T5 uses a SentencePiece model for text tokenization. Below, we use a pre-trained SentencePiece model to build +# the text pre-processing pipeline using torchtext's `T5Transform`. Note that the transform supports both +# batched and non-batched text input (i.e. one can either pass a single sentence or a list of sentences), however +# the T5 model expects the input to be batched.
+ +from torchtext.prototype.models import T5Transform + +padding_idx = 0 +eos_idx = 1 +max_seq_len = 512 +t5_sp_model_path = r"https://download.pytorch.org/models/text/t5_tokenizer_base.model" + +transform = T5Transform( + sp_model_path=t5_sp_model_path, + max_seq_len=max_seq_len, + eos_idx=eos_idx, + padding_idx=padding_idx, +) + +####################################################################### +# Alternatively, we can also use the transform shipped with the pre-trained models that does all of the above out-of-the-box +# +# :: +# +# from torchtext.prototype.models import T5_BASE_GENERATION +# transform = T5_BASE_GENERATION.transform() +# + +####################################################################### +# Dataset +# ------- +# torchtext provides several standard NLP datasets. For a complete list, refer to the documentation at https://pytorch.org/text/stable/datasets.html. +# These datasets are built using composable torchdata datapipes and hence support standard flow-control and mapping/transformation +# using user-defined functions and transforms. Below, we demonstrate how to pre-process the CNNDM dataset to include the prefix necessary +# for the model to identify the task it is performing. +# +# The CNNDM dataset has a train, validation, and test split. Below we demo on the test split. +# +# .. note:: +# Using datapipes is still currently subject to a few caveats. If you wish +# to extend this example to include shuffling, multi-processing, or +# distributed learning, please see :ref:`this note ` +# for further instructions.
+ +from functools import partial +from torch.utils.data import DataLoader +from torchtext.datasets.cnndm import CNNDM + +batch_size = 5 +test_datapipe = CNNDM(split="test") +task = 'summarize' + +def apply_prefix(task, x): + return f'{task}: ' + x[0], x[1] + +test_datapipe = test_datapipe.map(partial(apply_prefix, task)) +test_datapipe = test_datapipe.batch(batch_size) +test_datapipe = test_datapipe.rows2columnar(["article", "abstract"]) +test_dataloader = DataLoader(test_datapipe, batch_size=None) + +####################################################################### +# Alternately we can also use batched API (i.e apply the prefix on the whole batch) +# +# :: +# +# def batch_prefix(task, x): +# return { +# "article": [f'{task}: ' + y for y in x["article"]], +# "abstract": x["abstract"] +# } +# +# batch_size = 5 +# test_datapipe = CNNDM(split="test") +# task = 'summarize' +# +# test_datapipe = test_datapipe.batch(batch_size).rows2columnar(["article", "abstract"]) +# test_datapipe = test_datapipe.map(partial(batch_prefix, task)) +# test_dataloader = DataLoader(test_datapipe, batch_size=None) +# + +###################################################################### +# Model Preparation +# ----------------- +# +# torchtext provides SOTA pre-trained models that can be used directly for NLP tasks or fine-tuned on downstream tasks. Below +# we use the pre-trained T5 model with standard base architecture to perform text summarization. For additional details on +# available pre-trained models, please refer to documentation at https://pytorch.org/text/main/models.html +# +# + +t5_base = T5_BASE_GENERATION +transform = t5_base.transform() +model = t5_base.get_model() +model.to(DEVICE) + + +####################################################################### +# Sequence Generator +# ------------------ +# +# We can define a sequence generator to produce an output sequence based on the input sequence provided. 
This calls on the +# model's encoder and decoder, and iteratively expands the decoded sequences until the end-of-sequence token is generated +# for all sequences in the batch. The `greedy_generator` method shown below uses a greedy search (i.e. expands the sequence +# based on the most probable next word). +# + +from torch import Tensor +from torchtext.prototype.models import T5Model + +def greedy_generator( + encoder_tokens: Tensor, + eos_idx: int, + model: T5Model, + ) -> Tensor: + + # pass tokens through encoder + encoder_padding_mask = encoder_tokens.eq(model.padding_idx) + encoder_embeddings = model.dropout1(model.token_embeddings(encoder_tokens)) + encoder_output = model.encoder(encoder_embeddings, tgt_key_padding_mask=encoder_padding_mask)[0] + + encoder_output = model.norm1(encoder_output) + encoder_output = model.dropout2(encoder_output) + + # initialize decoder input sequence; T5 uses padding index as starter index to decoder sequence + decoder_tokens = torch.ones((encoder_tokens.size(0), 1), dtype=torch.long) * model.padding_idx + + # mask to keep track of sequences for which the decoder has not produced an end-of-sequence token yet + incomplete_sentences = torch.ones((encoder_tokens.size(0), 1), dtype=torch.long) + + # iteratively generate output sequence until all sequences in the batch have generated the end-of-sequence token + for step in range(model.config.max_seq_len): + + # causal mask and padding mask for decoder sequence + tgt_len = decoder_tokens.shape[1] + decoder_mask = torch.triu(torch.ones((tgt_len, tgt_len), dtype=torch.float64), diagonal=1).bool() + decoder_padding_mask = decoder_tokens.eq(model.padding_idx) + + # T5 implemention uses padding idx to start sequence. 
Want to ignore this when masking + decoder_padding_mask[:, 0] = False + + # pass decoder sequence through decoder + decoder_embeddings = model.dropout3(model.token_embeddings(decoder_tokens)) + decoder_output = model.decoder( + decoder_embeddings, + memory=encoder_output, + tgt_mask=decoder_mask, + tgt_key_padding_mask=decoder_padding_mask, + memory_key_padding_mask=encoder_padding_mask, + )[0] + + decoder_output = model.norm2(decoder_output) + decoder_output = model.dropout4(decoder_output) + decoder_output = decoder_output * (model.config.embedding_dim ** -0.5) + decoder_output = model.lm_head(decoder_output) + + # greedy search for next token to add to sequence + probs = F.log_softmax(decoder_output[:,-1], dim=-1) + _, next_token = torch.topk(decoder_output[:,-1], 1) + + # ignore next tokens for sentences that are already complete + next_token *= incomplete_sentences + + # update incomplete_sentences to remove those that were just ended + incomplete_sentences = incomplete_sentences - (next_token == eos_idx).long() + + # update decoder sequences to include new tokens + decoder_tokens = torch.cat((decoder_tokens, next_token), 1) + + # early stop if all sentences have been ended + if (incomplete_sentences == 0).all(): + break + + return decoder_tokens + + +####################################################################### +# Generate Summaries +# ------------------ +# +# Finally we put all of the components together to generate summaries on the first batch of articles in the CNNDM test set. 
+# + +batch = next(iter(test_dataloader)) +input_text = batch["article"] +model_input = transform(input_text) + +model_output = greedy_generator( + model=model, + encoder_tokens=model_input, + eos_idx=eos_idx +) +output_text = transform.decode(model_output.tolist()) + +for i in range(batch_size): + + print(f"Example {i+1}:\n") + print(f"greedy prediction: {output_text[i]}\n") + print(f"target: {target[i]}\n\n") + + +####################################################################### +# Output +# ------ +# +# :: +# +# Example 1: +# +# prediction: the Palestinians officially become the 123rd member of the international +# criminal court . the move gives the court jurisdiction over alleged crimes committed +# in the occupied Palestinian territory . the ICC opened a preliminary examination into +# the situation in the occupied territories . +# +# target: Membership gives the ICC jurisdiction over alleged crimes committed in +# Palestinian territories since last June . Israel and the United States opposed the +# move, which could open the door to war crimes investigations against Israelis . +# +# +# Example 2: +# +# prediction: a stray pooch in Washington state has used up at least three of her own +# after being hit by a car . the dog staggers to a nearby farm, dirt-covered and +# emaciated, where she is found . she suffered a dislocated jaw, leg injuries and a +# caved-in sinus cavity . +# +# target: Theia, a bully breed mix, was apparently hit by a car, whacked with a hammer +# and buried in a field . "She's a true miracle dog and she deserves a good life," says +# Sara Mellado, who is looking for a home for Theia . +# +# +# Example 3: +# +# prediction: mohammad Javad Zarif is the foreign minister of the country . he has been +# a key figure in securing a breakthrough in nuclear talks . he has been a hero in the +# international community . +# +# target: Mohammad Javad Zarif has spent more time with John Kerry than any other +# foreign minister . 
He once participated in a takeover of the Iranian Consulate in San +# Francisco . The Iranian foreign minister tweets in English . +# +# +# Example 4: +# +# prediction: five americans were monitored for three weeks after being exposed to +# Ebola . one of the five had a heart-related issue on Saturday and has been discharged . +# none of the patients developed the deadly virus . +# +# target: 17 Americans were exposed to the Ebola virus while in Sierra Leone in March . +# Another person was diagnosed with the disease and taken to hospital in Maryland . +# National Institutes of Health says the patient is in fair condition after weeks of +# treatment . +# +# +# Example 5: +# +# prediction: the student was identified during an investigation by campus police and +# the office of student affairs . he admitted to placing the noose on the tree early +# Wednesday morning . +# +# target: Student is no longer on Duke University campus and will face disciplinary +# review . School officials identified student during investigation and the person +# admitted to hanging the noose, Duke says . The noose, made of rope, was discovered on +# campus about 2 a.m. 
+# From 0ce630b7f09259b3beb600ec93c48e13f677d139 Mon Sep 17 00:00:00 2001 From: pmabbo13 Date: Fri, 29 Jul 2022 16:31:38 -0400 Subject: [PATCH 3/8] pre-commit --- examples/tutorials/cnndm_summarization.py | 151 +++++++++++----------- 1 file changed, 75 insertions(+), 76 deletions(-) diff --git a/examples/tutorials/cnndm_summarization.py b/examples/tutorials/cnndm_summarization.py index 9628f80816..31aba3e06f 100644 --- a/examples/tutorials/cnndm_summarization.py +++ b/examples/tutorials/cnndm_summarization.py @@ -23,7 +23,6 @@ # Common imports # -------------- import torch -import torch.nn as nn import torch.nn.functional as F DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") @@ -53,10 +52,10 @@ t5_sp_model_path = r"https://download.pytorch.org/models/text/t5_tokenizer_base.model" transform = T5Transform( - sp_model_path=t5_sp_model_path, - max_seq_len=max_seq_len, - eos_idx=eos_idx, - padding_idx=padding_idx, + sp_model_path=t5_sp_model_path, + max_seq_len=max_seq_len, + eos_idx=eos_idx, + padding_idx=padding_idx, ) ####################################################################### @@ -85,15 +84,18 @@ # for further instructions. 
from functools import partial + from torch.utils.data import DataLoader from torchtext.datasets.cnndm import CNNDM batch_size = 5 test_datapipe = CNNDM(split="test") -task = 'summarize' +task = "summarize" + def apply_prefix(task, x): - return f'{task}: ' + x[0], x[1] + return f"{task}: " + x[0], x[1] + test_datapipe = test_datapipe.map(partial(apply_prefix, task)) test_datapipe = test_datapipe.batch(batch_size) @@ -149,70 +151,71 @@ def apply_prefix(task, x): from torch import Tensor from torchtext.prototype.models import T5Model + def greedy_generator( - encoder_tokens: Tensor, - eos_idx: int, - model: T5Model, - ) -> Tensor: - - # pass tokens through encoder - encoder_padding_mask = encoder_tokens.eq(model.padding_idx) - encoder_embeddings = model.dropout1(model.token_embeddings(encoder_tokens)) - encoder_output = model.encoder(encoder_embeddings, tgt_key_padding_mask=encoder_padding_mask)[0] - - encoder_output = model.norm1(encoder_output) - encoder_output = model.dropout2(encoder_output) - - # initialize decoder input sequence; T5 uses padding index as starter index to decoder sequence - decoder_tokens = torch.ones((encoder_tokens.size(0), 1), dtype=torch.long) * model.padding_idx - - # mask to keep track of sequences for which the decoder has not produced an end-of-sequence token yet - incomplete_sentences = torch.ones((encoder_tokens.size(0), 1), dtype=torch.long) - - # iteratively generate output sequence until all sequences in the batch have generated the end-of-sequence token - for step in range(model.config.max_seq_len): - - # causal mask and padding mask for decoder sequence - tgt_len = decoder_tokens.shape[1] - decoder_mask = torch.triu(torch.ones((tgt_len, tgt_len), dtype=torch.float64), diagonal=1).bool() - decoder_padding_mask = decoder_tokens.eq(model.padding_idx) - - # T5 implemention uses padding idx to start sequence. 
Want to ignore this when masking - decoder_padding_mask[:, 0] = False - - # pass decoder sequence through decoder - decoder_embeddings = model.dropout3(model.token_embeddings(decoder_tokens)) - decoder_output = model.decoder( - decoder_embeddings, - memory=encoder_output, - tgt_mask=decoder_mask, - tgt_key_padding_mask=decoder_padding_mask, - memory_key_padding_mask=encoder_padding_mask, - )[0] - - decoder_output = model.norm2(decoder_output) - decoder_output = model.dropout4(decoder_output) - decoder_output = decoder_output * (model.config.embedding_dim ** -0.5) - decoder_output = model.lm_head(decoder_output) - - # greedy search for next token to add to sequence - probs = F.log_softmax(decoder_output[:,-1], dim=-1) - _, next_token = torch.topk(decoder_output[:,-1], 1) - - # ignore next tokens for sentences that are already complete - next_token *= incomplete_sentences - - # update incomplete_sentences to remove those that were just ended - incomplete_sentences = incomplete_sentences - (next_token == eos_idx).long() - - # update decoder sequences to include new tokens - decoder_tokens = torch.cat((decoder_tokens, next_token), 1) - - # early stop if all sentences have been ended - if (incomplete_sentences == 0).all(): - break - - return decoder_tokens + encoder_tokens: Tensor, + eos_idx: int, + model: T5Model, +) -> Tensor: + + # pass tokens through encoder + encoder_padding_mask = encoder_tokens.eq(model.padding_idx) + encoder_embeddings = model.dropout1(model.token_embeddings(encoder_tokens)) + encoder_output = model.encoder(encoder_embeddings, tgt_key_padding_mask=encoder_padding_mask)[0] + + encoder_output = model.norm1(encoder_output) + encoder_output = model.dropout2(encoder_output) + + # initialize decoder input sequence; T5 uses padding index as starter index to decoder sequence + decoder_tokens = torch.ones((encoder_tokens.size(0), 1), dtype=torch.long) * model.padding_idx + + # mask to keep track of sequences for which the decoder has not produced an 
end-of-sequence token yet + incomplete_sentences = torch.ones((encoder_tokens.size(0), 1), dtype=torch.long) + + # iteratively generate output sequence until all sequences in the batch have generated the end-of-sequence token + for step in range(model.config.max_seq_len): + + # causal mask and padding mask for decoder sequence + tgt_len = decoder_tokens.shape[1] + decoder_mask = torch.triu(torch.ones((tgt_len, tgt_len), dtype=torch.float64), diagonal=1).bool() + decoder_padding_mask = decoder_tokens.eq(model.padding_idx) + + # T5 implemention uses padding idx to start sequence. Want to ignore this when masking + decoder_padding_mask[:, 0] = False + + # pass decoder sequence through decoder + decoder_embeddings = model.dropout3(model.token_embeddings(decoder_tokens)) + decoder_output = model.decoder( + decoder_embeddings, + memory=encoder_output, + tgt_mask=decoder_mask, + tgt_key_padding_mask=decoder_padding_mask, + memory_key_padding_mask=encoder_padding_mask, + )[0] + + decoder_output = model.norm2(decoder_output) + decoder_output = model.dropout4(decoder_output) + decoder_output = decoder_output * (model.config.embedding_dim ** -0.5) + decoder_output = model.lm_head(decoder_output) + + # greedy search for next token to add to sequence + probs = F.log_softmax(decoder_output[:, -1], dim=-1) + _, next_token = torch.topk(probs, 1) + + # ignore next tokens for sentences that are already complete + next_token *= incomplete_sentences + + # update incomplete_sentences to remove those that were just ended + incomplete_sentences = incomplete_sentences - (next_token == eos_idx).long() + + # update decoder sequences to include new tokens + decoder_tokens = torch.cat((decoder_tokens, next_token), 1) + + # early stop if all sentences have been ended + if (incomplete_sentences == 0).all(): + break + + return decoder_tokens ####################################################################### @@ -226,15 +229,11 @@ def greedy_generator( input_text = batch["article"] 
model_input = transform(input_text) -model_output = greedy_generator( - model=model, - encoder_tokens=model_input, - eos_idx=eos_idx -) +model_output = greedy_generator(model=model, encoder_tokens=model_input, eos_idx=eos_idx) output_text = transform.decode(model_output.tolist()) for i in range(batch_size): - + print(f"Example {i+1}:\n") print(f"greedy prediction: {output_text[i]}\n") print(f"target: {target[i]}\n\n") From 03315f8c4083a92074791c2cb44fb73e255f772a Mon Sep 17 00:00:00 2001 From: pmabbo13 Date: Mon, 1 Aug 2022 11:10:33 -0400 Subject: [PATCH 4/8] correcting indentation errors --- examples/tutorials/cnndm_summarization.py | 77 +++++++++++------------ 1 file changed, 38 insertions(+), 39 deletions(-) diff --git a/examples/tutorials/cnndm_summarization.py b/examples/tutorials/cnndm_summarization.py index 31aba3e06f..1947b719bb 100644 --- a/examples/tutorials/cnndm_summarization.py +++ b/examples/tutorials/cnndm_summarization.py @@ -245,61 +245,60 @@ def greedy_generator( # # :: # -# Example 1: +# Example 1: # -# prediction: the Palestinians officially become the 123rd member of the international -# criminal court . the move gives the court jurisdiction over alleged crimes committed -# in the occupied Palestinian territory . the ICC opened a preliminary examination into -# the situation in the occupied territories . +# prediction: the Palestinians officially become the 123rd member of the international +# criminal court . the move gives the court jurisdiction over alleged crimes committed +# in the occupied Palestinian territory . the ICC opened a preliminary examination into +# the situation in the occupied territories . # -# target: Membership gives the ICC jurisdiction over alleged crimes committed in -# Palestinian territories since last June . Israel and the United States opposed the -# move, which could open the door to war crimes investigations against Israelis . 
+# target: Membership gives the ICC jurisdiction over alleged crimes committed in +# Palestinian territories since last June . Israel and the United States opposed the +# move, which could open the door to war crimes investigations against Israelis . # # -# Example 2: +# Example 2: # -# prediction: a stray pooch in Washington state has used up at least three of her own +# prediction: a stray pooch in Washington state has used up at least three of her own # after being hit by a car . the dog staggers to a nearby farm, dirt-covered and -# emaciated, where she is found . she suffered a dislocated jaw, leg injuries and a -# caved-in sinus cavity . +# emaciated, where she is found . she suffered a dislocated jaw, leg injuries and a +# caved-in sinus cavity . # -# target: Theia, a bully breed mix, was apparently hit by a car, whacked with a hammer -# and buried in a field . "She's a true miracle dog and she deserves a good life," says -# Sara Mellado, who is looking for a home for Theia . +# target: Theia, a bully breed mix, was apparently hit by a car, whacked with a hammer +# and buried in a field . "She's a true miracle dog and she deserves a good life," says +# Sara Mellado, who is looking for a home for Theia . # # -# Example 3: +# Example 3: # -# prediction: mohammad Javad Zarif is the foreign minister of the country . he has been -# a key figure in securing a breakthrough in nuclear talks . he has been a hero in the -# international community . +# prediction: mohammad Javad Zarif is the foreign minister of the country . he has been +# a key figure in securing a breakthrough in nuclear talks . he has been a hero in the +# international community . # -# target: Mohammad Javad Zarif has spent more time with John Kerry than any other -# foreign minister . He once participated in a takeover of the Iranian Consulate in San -# Francisco . The Iranian foreign minister tweets in English . 
+# target: Mohammad Javad Zarif has spent more time with John Kerry than any other +# foreign minister . He once participated in a takeover of the Iranian Consulate in San +# Francisco . The Iranian foreign minister tweets in English . # # -# Example 4: +# Example 4: # -# prediction: five americans were monitored for three weeks after being exposed to -# Ebola . one of the five had a heart-related issue on Saturday and has been discharged . -# none of the patients developed the deadly virus . +# prediction: five americans were monitored for three weeks after being exposed to +# Ebola . one of the five had a heart-related issue on Saturday and has been discharged . +# none of the patients developed the deadly virus . # -# target: 17 Americans were exposed to the Ebola virus while in Sierra Leone in March . -# Another person was diagnosed with the disease and taken to hospital in Maryland . -# National Institutes of Health says the patient is in fair condition after weeks of -# treatment . +# target: 17 Americans were exposed to the Ebola virus while in Sierra Leone in March . +# Another person was diagnosed with the disease and taken to hospital in Maryland . +# National Institutes of Health says the patient is in fair condition after weeks of +# treatment . # # -# Example 5: +# Example 5: # -# prediction: the student was identified during an investigation by campus police and -# the office of student affairs . he admitted to placing the noose on the tree early -# Wednesday morning . -# -# target: Student is no longer on Duke University campus and will face disciplinary -# review . School officials identified student during investigation and the person -# admitted to hanging the noose, Duke says . The noose, made of rope, was discovered on -# campus about 2 a.m. +# prediction: the student was identified during an investigation by campus police and +# the office of student affairs . he admitted to placing the noose on the tree early +# Wednesday morning . 
# +# target: Student is no longer on Duke University campus and will face disciplinary +# review . School officials identified student during investigation and the person +# admitted to hanging the noose, Duke says . The noose, made of rope, was discovered on +# campus about 2 a.m. From 88579cac27a26419cc46f15b7e71eaffb12286f3 Mon Sep 17 00:00:00 2001 From: pmabbo13 Date: Mon, 1 Aug 2022 11:21:14 -0400 Subject: [PATCH 5/8] add missing variables --- examples/tutorials/cnndm_summarization.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/tutorials/cnndm_summarization.py b/examples/tutorials/cnndm_summarization.py index 1947b719bb..3fb117d343 100644 --- a/examples/tutorials/cnndm_summarization.py +++ b/examples/tutorials/cnndm_summarization.py @@ -131,6 +131,8 @@ def apply_prefix(task, x): # available pre-trained models, please refer to documentation at https://pytorch.org/text/main/models.html # # +from torchtext.prototype.models import T5_BASE_GENERATION + t5_base = T5_BASE_GENERATION transform = t5_base.transform() @@ -228,6 +230,7 @@ def greedy_generator( batch = next(iter(test_dataloader)) input_text = batch["article"] model_input = transform(input_text) +target = batch["abstract"] model_output = greedy_generator(model=model, encoder_tokens=model_input, eos_idx=eos_idx) output_text = transform.decode(model_output.tolist()) From 6f5796ffa5dad7d1c810b5b18f46cb54016bad2d Mon Sep 17 00:00:00 2001 From: pmabbo13 Date: Mon, 1 Aug 2022 11:24:54 -0400 Subject: [PATCH 6/8] add cnndm_summarization to index.rst --- docs/source/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/index.rst b/docs/source/index.rst index 5cfe626650..9ebc235d57 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -51,6 +51,7 @@ Getting Started :caption: Getting Started tutorials/sst2_classification_non_distributed + tutorials/cnndm_summarization .. 
automodule:: torchtext From cfc8de5ee48f387f9922b87fff0c9d964d1bd4c0 Mon Sep 17 00:00:00 2001 From: pmabbo13 Date: Mon, 1 Aug 2022 13:28:06 -0400 Subject: [PATCH 7/8] correcting formatting errors --- examples/tutorials/cnndm_summarization.py | 28 ++++++++++++----------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/examples/tutorials/cnndm_summarization.py b/examples/tutorials/cnndm_summarization.py index 3fb117d343..4c96343ed6 100644 --- a/examples/tutorials/cnndm_summarization.py +++ b/examples/tutorials/cnndm_summarization.py @@ -12,7 +12,7 @@ # # This tutorial demonstrates how to use a pre-trained T5 Model for text summarization on the CNN-DailyMail dataset. # We will demonstrate how to use the torchtext library to: - +# # 1. Build a text pre-processing pipeline for a T5 model # 2. Read in the CNNDM dataset and pre-process the text # 3. Instantiate a pre-trained T5 model with base configuration, and perform text summarization on input text @@ -27,22 +27,24 @@ DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + ####################################################################### # Data Transformation # ------------------- # # The T5 model does not work with raw text. Instead, it requires the text to be transformed into numerical form # in order to perform training and inference. The following transformations are required for the T5 model: - +# # 1. Tokenize text # 2. Convert tokens into (integer) IDs # 3. Truncate the sequences to a specified maximum length # 4. Add end-of-sequence (EOS) and padding token IDs - +# # T5 uses a SentencePiece model for text tokenization. Below, we use a pre-trained SentencePiece model to build -# the text pre-processing pipeline using torchtext's `T5Transform`. Note that the transform supports both +# the text pre-processing pipeline using torchtext's T5Transform. Note that the transform supports both # batched and non-batched text input (i.e. 
one can either pass a single sentence or a list of sentences), however # the T5 model expects the input to be batched. +# from torchtext.prototype.models import T5Transform @@ -74,7 +76,7 @@ # These datasets are built using composable torchdata datapipes and hence support standard flow-control and mapping/transformation # using user defined functions and transforms. Below, we demonstrate how to pre-process the CNNDM dataset to include the prefix necessary # for the model to identify the task it is performing. - +# # The CNNDM dataset has a train, validation, and test split. Below we demo on the test split. # # .. note:: @@ -113,13 +115,13 @@ def apply_prefix(task, x): # "abstract": x["abstract"] # } # -# batch_size = 5 -# test_datapipe = CNNDM(split="test") -# task = 'summarize' +# batch_size = 5 +# test_datapipe = CNNDM(split="test") +# task = 'summarize' # -# test_datapipe = test_datapipe.batch(batch_size).rows2columnar(["article", "abstract"]) -# test_datapipe = test_datapipe.map(partial(batch_prefix, task)) -# test_dataloader = DataLoader(test_datapipe, batch_size=None) +# test_datapipe = test_datapipe.batch(batch_size).rows2columnar(["article", "abstract"]) +# test_datapipe = test_datapipe.map(partial(batch_prefix, task)) +# test_dataloader = DataLoader(test_datapipe, batch_size=None) # ###################################################################### @@ -127,7 +129,7 @@ def apply_prefix(task, x): # ----------------- # # torchtext provides SOTA pre-trained models that can be used directly for NLP tasks or fine-tuned on downstream tasks. Below -# we use the pre-trained T5 model with standard base architecture to perform text summarization. For additional details on +# we use the pre-trained T5 model with standard base configuration to perform text summarization. 
For additional details on # available pre-trained models, please refer to documentation at https://pytorch.org/text/main/models.html # # @@ -238,7 +240,7 @@ def greedy_generator( for i in range(batch_size): print(f"Example {i+1}:\n") - print(f"greedy prediction: {output_text[i]}\n") + print(f"prediction: {output_text[i]}\n") print(f"target: {target[i]}\n\n") From 608ccb4934de8961ddef9c1229898d4a0c24549b Mon Sep 17 00:00:00 2001 From: pmabbo13 Date: Tue, 2 Aug 2022 10:46:56 -0400 Subject: [PATCH 8/8] setting model.eval() --- examples/tutorials/cnndm_summarization.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/tutorials/cnndm_summarization.py b/examples/tutorials/cnndm_summarization.py index 4c96343ed6..9bc06b06e1 100644 --- a/examples/tutorials/cnndm_summarization.py +++ b/examples/tutorials/cnndm_summarization.py @@ -51,7 +51,7 @@ padding_idx = 0 eos_idx = 1 max_seq_len = 512 -t5_sp_model_path = r"https://download.pytorch.org/models/text/t5_tokenizer_base.model" +t5_sp_model_path = "https://download.pytorch.org/models/text/t5_tokenizer_base.model" transform = T5Transform( sp_model_path=t5_sp_model_path, @@ -139,6 +139,7 @@ def apply_prefix(task, x): t5_base = T5_BASE_GENERATION transform = t5_base.transform() model = t5_base.get_model() +model.eval() model.to(DEVICE)