diff --git a/.github/workflows/translation.yml b/.github/workflows/translation.yml new file mode 100644 index 0000000..a5978f5 --- /dev/null +++ b/.github/workflows/translation.yml @@ -0,0 +1,39 @@ +name: Translation + +on: + pull_request: + branches: + - master # this can be main + paths: + - "translation/**" + +jobs: + translation: + runs-on: ubuntu-latest + defaults: + run: + working-directory: ./translation + steps: + - name: Checkout + uses: actions/checkout@v2 + with: + ref: ${{ github.ref }} + - name: Build container + run: | + docker build --tag translation:latest . + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-1 + - name: Push2ECR + id: ecr + uses: jwalton/gh-ecr-push@v1 + with: + access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + region: us-east-1 + image: translation:latest + - name: Update lambda with image + run: aws lambda update-function-code --function-name translation --image-uri 968911158010.dkr.ecr.us-east-1.amazonaws.com/translation:latest \ No newline at end of file diff --git a/.gitignore b/.gitignore index b6e4761..1774af1 100644 --- a/.gitignore +++ b/.gitignore @@ -127,3 +127,5 @@ dmypy.json # Pyre type checker .pyre/ + +.DS_Store \ No newline at end of file diff --git a/translation/Dockerfile b/translation/Dockerfile new file mode 100644 index 0000000..fad6ac1 --- /dev/null +++ b/translation/Dockerfile @@ -0,0 +1,32 @@ +FROM amazon/aws-lambda-python + +RUN mkdir /cache +RUN mkdir /cache/easynmt +RUN mkdir /cache/easynmt/opus-mt_part +RUN mkdir /cache/transformers +RUN mkdir /cache/torch + +ENV EASYNMT_CACHE=/cache/easynmt +ENV TRANSFORMERS_CACHE=/cache/transformers +ENV TRANSFORMERS_VERBOSITY=error +ENV TORCH_CACHE=/cache/torch +ENV NLTK_DATA=/tmp + +RUN yum -y install gcc-c++ + +COPY requirements.txt 
def lambda_handler(event, context):
    """
    AWS Lambda entry point of the translation service.
    :param event: Invocation payload. Its keys are expanded as keyword arguments of translate_records (e.g. target_lang, records, source_lang)
    :param context: Lambda context object. Not used
    :return: The dictionary returned by translate_records
    """
    # No try/except here: the previous handler only caught and re-raised, so
    # letting errors propagate directly is equivalent and avoids an unused
    # exception variable.
    return translate_records(**event)
def translate_records(target_lang: str, records: List[dict], source_lang: Optional[str] = '', beam_size: Optional[int] = 5, perform_sentence_splitting: Optional[bool] = True):
    """
    Translates the records to the given target language.
    :param target_lang: Target language
    :param records: Records that should be translated. Each record must contain a "text" key; all other keys are copied to the result unchanged. The input records are not modified
    :param source_lang: Language of text. Optional, if None, empty or only whitespace: Automatic language detection
    :param beam_size: Beam size. Optional
    :param perform_sentence_splitting: Split longer documents into individual sentences for translation. Optional
    :return: Returns a dict with "target_lang", "records" and "translation_time" (seconds). Each returned record carries "translated" and "source_lang" instead of "text"
    :raises KeyError: If a record has no "text" key
    """
    # NOTE: the EASYNMT_MAX_TEXT_LEN / EASYNMT_MAX_BEAM_SIZE limits are not
    # enforced by this function.

    # None, '' and whitespace-only all mean "let the model detect the language".
    # Checking truthiness first avoids calling .strip() on None.
    if not source_lang or not source_lang.strip():
        source_lang = None

    start_time = time.time()
    output = {"target_lang": target_lang}

    # Nothing to translate: return the usual shape without invoking the model.
    if not records:
        output["records"] = []
        output["translation_time"] = time.time() - start_time
        return output

    texts = [record["text"] for record in records]

    # Detection is always run because the per-record "source_lang" of the
    # response is taken from it, even when source_lang was supplied.
    detected_langs = model.language_detection(texts)

    translations = model.translate(
        texts,
        target_lang=target_lang,
        source_lang=source_lang,
        beam_size=beam_size,
        perform_sentence_splitting=perform_sentence_splitting,
        batch_size=int(os.getenv('EASYNMT_BATCH_SIZE', 16)),
    )

    translated_records = []
    for translation, detected_lang, record in zip(translations, detected_langs, records):
        # Build a fresh dict so the caller's records stay untouched.
        result = {key: value for key, value in record.items() if key != "text"}
        if translation == record["text"]:
            # An unchanged text is treated as already being in the target
            # language, so no translation is reported for it.
            result["translated"] = ""
            result["source_lang"] = target_lang
        else:
            result["translated"] = translation
            result["source_lang"] = detected_lang
        translated_records.append(result)

    output["records"] = translated_records
    output["translation_time"] = time.time() - start_time
    return output
def test_response():
    """
    End-to-end check: translating the sample request yields the expected records.
    The timing field is removed before comparing because it differs on every run.
    """
    import copy

    # Pass a deep copy so the module-level fixture is never altered by the
    # call and the test can be run repeatedly in the same process.
    response = translate_records(**copy.deepcopy(requests))
    del response["translation_time"]
    assert expected_response == response