From 5638fa11f1cb485c45218be80ac63565a86c0209 Mon Sep 17 00:00:00 2001 From: Manuel Date: Sat, 24 Apr 2021 00:10:45 +0200 Subject: [PATCH 1/2] Adding pandarallel library Pandarallel allows executing the pandas apply method in parallel, which makes data preprocessing faster and easier. This is very helpful in kernel-only competitions. See: https://github.com/nalepae/pandarallel --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index 21c8e975..6bd15db6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -427,6 +427,7 @@ RUN pip install flashtext && \ pip install jax==0.2.12 jaxlib==0.1.64 && \ # ipympl adds interactive widget support for matplotlib pip install ipympl==0.7.0 && \ + pip install pandarallel && \ /tmp/clean-layer.sh # Download base easyocr models. From 8b379085e99d46e6efcf50dcb8eda4197311bbbc Mon Sep 17 00:00:00 2001 From: Manuel Date: Wed, 28 Apr 2021 09:34:56 +0200 Subject: [PATCH 2/2] Adding test to pandarallel Adding a simple test --- tests/test_pandarralel.py | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 tests/test_pandarralel.py diff --git a/tests/test_pandarralel.py b/tests/test_pandarralel.py new file mode 100644 index 00000000..fe74b0e1 --- /dev/null +++ b/tests/test_pandarralel.py @@ -0,0 +1,11 @@ +import unittest + +import pandas as pd +from pandarallel import pandarallel + +pandarallel.initialize() + +class TestPandarallel(unittest.TestCase): + def test_pandarallel(self): + data = pd.read_csv("/input/tests/data/train.csv") + data['label_converted'] = data['label'].parallel_apply(lambda x: x+1)