Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions .github/workflows/translation.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Workflow: build the translation container image, push it to ECR and update
# the "translation" Lambda function to use the pushed image.
name: Translation

# Triggered by pull requests that target master and touch files under translation/.
# NOTE(review): the steps below push an image and update the Lambda function from a
# pull_request event, i.e. before the change is merged — confirm this is intended.
on:
pull_request:
branches:
- master # this can be main
paths:
- "translation/**"

jobs:
translation:
runs-on: ubuntu-latest
# All `run` steps execute inside ./translation (the directory holding the Dockerfile).
defaults:
run:
working-directory: ./translation
steps:
- name: Checkout
uses: actions/checkout@v2
with:
ref: ${{ github.ref }}
# Build the image locally; the build itself runs the test suite (see the Dockerfile).
- name: Build container
run: |
docker build --tag translation:latest .
# Presumably makes the credentials available to the `aws` CLI call in the last step.
- name: Configure AWS Credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: us-east-1
# Going by the action name, this pushes the locally built translation:latest image
# to ECR in us-east-1 — verify the exact behavior against the action's documentation.
- name: Push2ECR
id: ecr
uses: jwalton/gh-ecr-push@v1
with:
access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
region: us-east-1
image: translation:latest
# Point the Lambda function "translation" at the :latest image in ECR.
# The registry (account id and region) is hard-coded in the image URI.
- name: Update lambda with image
run: aws lambda update-function-code --function-name translation --image-uri 968911158010.dkr.ecr.us-east-1.amazonaws.com/translation:latest
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -127,3 +127,5 @@ dmypy.json

# Pyre type checker
.pyre/

.DS_Store
32 changes: 32 additions & 0 deletions translation/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Container image for the translation Lambda function.
FROM amazon/aws-lambda-python

# Cache directories for the downloaded models, created in a single layer.
RUN mkdir -p /cache/easynmt/opus-mt_part /cache/transformers /cache/torch

ENV EASYNMT_CACHE=/cache/easynmt
ENV TRANSFORMERS_CACHE=/cache/transformers
ENV TRANSFORMERS_VERBOSITY=error
ENV TORCH_CACHE=/cache/torch
ENV NLTK_DATA=/tmp

RUN yum -y install gcc-c++

COPY requirements.txt requirements.txt
RUN pip install torch==1.8+cpu -f https://download.pytorch.org/whl/torch_stable.html --no-cache-dir
RUN pip install -r requirements.txt --no-cache-dir \
    && python -m nltk.downloader 'punkt'

COPY ./ ./

# Run the test cases; this also saves the transformer model in the container.
# The chmod happens in the same RUN so the cached model files are not stored a
# second time by a permissions-only layer. `-R` on /cache/easynmt already
# covers /cache/easynmt/opus-mt_part.
RUN pip install pytest --no-cache-dir \
    && pytest tests -s -vv \
    && chmod -R 0777 /cache/easynmt /cache/transformers /cache/torch

CMD [ "lambda/main.lambda_handler"]
2 changes: 2 additions & 0 deletions translation/README.MD
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
## Translation service
Serverless translation service built on AWS Lambda, EasyNMT, Transformers, and the Helsinki-NLP OPUS-MT models.
Empty file added translation/lambda/__init__.py
Empty file.
8 changes: 8 additions & 0 deletions translation/lambda/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from src.easy_nmt import translate_records


def lambda_handler(event, context):
    """AWS Lambda entry point: translate the records carried by ``event``.

    :param event: Invocation payload; its keys are forwarded to
        ``translate_records`` as keyword arguments.
    :param context: Lambda context object (not used).
    :return: The value returned by ``translate_records``.
    """
    # No try/except here: any exception raised by translate_records
    # propagates to the caller unchanged.
    return translate_records(**event)
1 change: 1 addition & 0 deletions translation/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
easynmt==2.0.1
Empty file added translation/src/__init__.py
Empty file.
58 changes: 58 additions & 0 deletions translation/src/easy_nmt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import json
import os
import time
import warnings
from typing import List, Optional

warnings.filterwarnings("ignore")

from easynmt import EasyNMT

# The model is configured through the environment: EASYNMT_MODEL names the
# model (default 'opus-mt') and EASYNMT_MODEL_ARGS is a JSON object of extra
# keyword arguments for the EasyNMT constructor (default: empty).
model_name = os.environ.get('EASYNMT_MODEL', 'opus-mt')
model_args = json.loads(os.environ.get('EASYNMT_MODEL_ARGS', '{}'))
print(f"Load model: {model_name}")
# Created once at import time and reused by every call in this module.
model = EasyNMT(model_name, load_translator=True, **model_args)


def translate_records(
    target_lang: str,
    records: List[dict],
    source_lang: Optional[str] = '',
    beam_size: Optional[int] = 5,
    perform_sentence_splitting: Optional[bool] = True,
) -> dict:
    """
    Translates the text of every record to the given target language.

    :param target_lang: Target language
    :param records: Records to translate. Each record must contain a "text" key; any other keys
        are carried over to the output unchanged. The input dicts are not modified.
    :param source_lang: Language of the texts. Optional; None, empty or whitespace-only is passed
        to the model as None
    :param beam_size: Beam size. Optional
    :param perform_sentence_splitting: Split longer documents into individual sentences for translation. Optional
    :return: Dict with "target_lang", "records" (one entry per input record, without "text" and
        with "translated" and "source_lang" added) and "translation_time" (elapsed seconds)
    :raises KeyError: If a record has no "text" key
    """
    # NOTE: no limit on text length or beam size is enforced here
    # (EASYNMT_MAX_TEXT_LEN / EASYNMT_MAX_BEAM_SIZE are not consulted).

    # Treat None the same as an empty string instead of failing on None.strip().
    if source_lang is None or not source_lang.strip():
        source_lang = None

    # Monotonic clock: the elapsed time cannot be skewed by system clock changes.
    start_time = time.perf_counter()

    texts = [record["text"] for record in records]
    detected_langs = model.language_detection(texts)
    translations = model.translate(
        texts,
        target_lang=target_lang,
        source_lang=source_lang,
        beam_size=beam_size,
        perform_sentence_splitting=perform_sentence_splitting,
        batch_size=int(os.getenv('EASYNMT_BATCH_SIZE', 16)),
    )

    translated_records = []
    for translation, detected_lang, record in zip(translations, detected_langs, records):
        # Work on a copy without "text" so the caller's records stay intact
        # and the same payload can be translated again.
        result = {key: value for key, value in record.items() if key != "text"}
        if translation == record["text"]:
            # Output identical to the input: the record is reported as already
            # being in the target language, with an empty translation.
            result["translated"] = ""
            result["source_lang"] = target_lang
        else:
            result["translated"] = translation
            result["source_lang"] = detected_lang
        translated_records.append(result)

    return {
        "target_lang": target_lang,
        "records": translated_records,
        "translation_time": time.perf_counter() - start_time,
    }
Empty file added translation/tests/__init__.py
Empty file.
37 changes: 37 additions & 0 deletions translation/tests/test_translation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from src.easy_nmt import translate_records

# Sample payload: keyword arguments for translate_records with one Spanish,
# one English and one French record.
requests = dict(
    records=[
        dict(
            id="11",
            text="Nunca volveré a bajar su app se roba tu información bancaria y se autoriza una suscripción que nunca solicitas",
        ),
        dict(
            id="12",
            text="I will never download your app will steal your bank information and authorizes a subscription you never request",
        ),
        dict(
            id="13",
            text="Je ne téléchargerai jamais votre application volera",
        ),
    ],
    target_lang="en",
    source_lang="ROMANCE",
)

# Expected result for the payload above, without the "translation_time" entry.
expected_response = dict(
    target_lang="en",
    records=[
        dict(
            id="11",
            translated="I'll never download your app again, steal your bank information and authorize a subscription that you never ask for.",
            source_lang="es",
        ),
        dict(id="12", translated="", source_lang="en"),
        dict(
            id="13",
            translated="I will never download your app will fly",
            source_lang="fr",
        ),
    ],
)



def test_response():
    """translate_records must reproduce expected_response for the sample request."""
    import copy

    # Pass a deep copy so the shared module-level fixture is never altered by
    # the call and the test stays repeatable.
    response = translate_records(**copy.deepcopy(requests))
    # The elapsed time differs between runs, so drop it before comparing.
    del response["translation_time"]
    assert expected_response == response