diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..b64c7455 --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# Environments +.env +.venv +env/ +venv/ +ENV/ \ No newline at end of file diff --git a/docs/2_scripting.md b/docs/2_scripting.md index 6bceb0ce..c4226524 100644 --- a/docs/2_scripting.md +++ b/docs/2_scripting.md @@ -6,11 +6,19 @@ Python is the primary scripting language used in this project. To ensure consist To use the scripts in the scripts directory, be sure you have the dependencies installed: +```sh +pip install mysql-connector-python +pip install future +pip install pysword +pip install PyYAML +pip install pyarrow +pip install pymysql ``` -pip install mysql.connector -pip install past.builtins -pip install pysword.modules -pip install yaml + +You may also install all PyPI dependencies from the root project directory with: + +```sh +python -m pip install -r requirements.txt ``` ### Scripts Breakdown @@ -59,6 +67,10 @@ pip install yaml - **Description**: Generates MySQL SQL dump files for Bible translations. Each translation is processed and output as an SQL dump file. - **Usage**: Run the script to create SQL dump files for each translation. +#### `generate_parquet.py` +- **Description**: Generates Parquet files for Bible translations. Each translation is processed and output as a .parquet file. +- **Usage**: Run the script to create .parquet files for each translation. + #### `generate_psql.py` - **Description**: Generates PostgreSQL SQL dump files for Bible translations. Each translation is processed and output as an SQL dump file. - **Usage**: Run the script to create SQL dump files for each translation. diff --git a/docs/4_adding_texts.md b/docs/4_adding_texts.md index f5aca96f..73a17917 100644 --- a/docs/4_adding_texts.md +++ b/docs/4_adding_texts.md @@ -5,11 +5,19 @@ unique issues to overcome in conversion. 
Be sure you have the dependencies for this project installed: -``` +```sh pip install mysql-connector-python pip install future pip install pysword -pip install pyyaml +pip install PyYAML +pip install pyarrow +pip install pymysql +``` + +You may also install all PyPI dependencies from the root project directory with: + +```sh +python -m pip install -r requirements.txt ``` ## Step One: Find the translation... diff --git a/formats/parquet/ACV.parquet b/formats/parquet/ACV.parquet new file mode 100644 index 00000000..54dcee92 Binary files /dev/null and b/formats/parquet/ACV.parquet differ diff --git a/formats/parquet/AKJV.parquet b/formats/parquet/AKJV.parquet new file mode 100644 index 00000000..2475a4b5 Binary files /dev/null and b/formats/parquet/AKJV.parquet differ diff --git a/formats/parquet/ASV.parquet b/formats/parquet/ASV.parquet new file mode 100644 index 00000000..04aa442a Binary files /dev/null and b/formats/parquet/ASV.parquet differ diff --git a/formats/parquet/Alb.parquet b/formats/parquet/Alb.parquet new file mode 100644 index 00000000..844abe73 Binary files /dev/null and b/formats/parquet/Alb.parquet differ diff --git a/formats/parquet/Anderson.parquet b/formats/parquet/Anderson.parquet new file mode 100644 index 00000000..af16563c Binary files /dev/null and b/formats/parquet/Anderson.parquet differ diff --git a/formats/parquet/ArmEastern.parquet b/formats/parquet/ArmEastern.parquet new file mode 100644 index 00000000..74c362fb Binary files /dev/null and b/formats/parquet/ArmEastern.parquet differ diff --git a/formats/parquet/BBE.parquet b/formats/parquet/BBE.parquet new file mode 100644 index 00000000..0764f012 Binary files /dev/null and b/formats/parquet/BBE.parquet differ diff --git a/formats/parquet/BSB.parquet b/formats/parquet/BSB.parquet new file mode 100644 index 00000000..14b1bffb Binary files /dev/null and b/formats/parquet/BSB.parquet differ diff --git a/formats/parquet/BeaMRK.parquet b/formats/parquet/BeaMRK.parquet new file mode 100644 
index 00000000..bf4d0db1 Binary files /dev/null and b/formats/parquet/BeaMRK.parquet differ diff --git a/formats/parquet/BurJudson.parquet b/formats/parquet/BurJudson.parquet new file mode 100644 index 00000000..087fdd37 Binary files /dev/null and b/formats/parquet/BurJudson.parquet differ diff --git a/formats/parquet/Byz.parquet b/formats/parquet/Byz.parquet new file mode 100644 index 00000000..47a40605 Binary files /dev/null and b/formats/parquet/Byz.parquet differ diff --git a/formats/parquet/CPDV.parquet b/formats/parquet/CPDV.parquet new file mode 100644 index 00000000..d660bac6 Binary files /dev/null and b/formats/parquet/CPDV.parquet differ diff --git a/formats/parquet/CSlElizabeth.parquet b/formats/parquet/CSlElizabeth.parquet new file mode 100644 index 00000000..38465032 Binary files /dev/null and b/formats/parquet/CSlElizabeth.parquet differ diff --git a/formats/parquet/CebPinadayag.parquet b/formats/parquet/CebPinadayag.parquet new file mode 100644 index 00000000..55cbe027 Binary files /dev/null and b/formats/parquet/CebPinadayag.parquet differ diff --git a/formats/parquet/Che1860.parquet b/formats/parquet/Che1860.parquet new file mode 100644 index 00000000..5bc47343 Binary files /dev/null and b/formats/parquet/Che1860.parquet differ diff --git a/formats/parquet/ChiSB.parquet b/formats/parquet/ChiSB.parquet new file mode 100644 index 00000000..a0ebb229 Binary files /dev/null and b/formats/parquet/ChiSB.parquet differ diff --git a/formats/parquet/ChiUn.parquet b/formats/parquet/ChiUn.parquet new file mode 100644 index 00000000..578da8a5 Binary files /dev/null and b/formats/parquet/ChiUn.parquet differ diff --git a/formats/parquet/ChiUnL.parquet b/formats/parquet/ChiUnL.parquet new file mode 100644 index 00000000..e027e0af Binary files /dev/null and b/formats/parquet/ChiUnL.parquet differ diff --git a/formats/parquet/CopSahBible2.parquet b/formats/parquet/CopSahBible2.parquet new file mode 100644 index 00000000..04754323 Binary files /dev/null and 
b/formats/parquet/CopSahBible2.parquet differ diff --git a/formats/parquet/CroSaric.parquet b/formats/parquet/CroSaric.parquet new file mode 100644 index 00000000..c30e5caf Binary files /dev/null and b/formats/parquet/CroSaric.parquet differ diff --git a/formats/parquet/CzeBKR.parquet b/formats/parquet/CzeBKR.parquet new file mode 100644 index 00000000..e9b12732 Binary files /dev/null and b/formats/parquet/CzeBKR.parquet differ diff --git a/formats/parquet/CzeCSP.parquet b/formats/parquet/CzeCSP.parquet new file mode 100644 index 00000000..9588e48d Binary files /dev/null and b/formats/parquet/CzeCSP.parquet differ diff --git a/formats/parquet/DRC.parquet b/formats/parquet/DRC.parquet new file mode 100644 index 00000000..f4d601c1 Binary files /dev/null and b/formats/parquet/DRC.parquet differ diff --git a/formats/parquet/DaOT1871NT1907.parquet b/formats/parquet/DaOT1871NT1907.parquet new file mode 100644 index 00000000..db9db6ca Binary files /dev/null and b/formats/parquet/DaOT1871NT1907.parquet differ diff --git a/formats/parquet/Darby.parquet b/formats/parquet/Darby.parquet new file mode 100644 index 00000000..e644052c Binary files /dev/null and b/formats/parquet/Darby.parquet differ diff --git a/formats/parquet/DutSVV.parquet b/formats/parquet/DutSVV.parquet new file mode 100644 index 00000000..190d91d5 Binary files /dev/null and b/formats/parquet/DutSVV.parquet differ diff --git a/formats/parquet/DutSVVA.parquet b/formats/parquet/DutSVVA.parquet new file mode 100644 index 00000000..d187b726 Binary files /dev/null and b/formats/parquet/DutSVVA.parquet differ diff --git a/formats/parquet/Esperanto.parquet b/formats/parquet/Esperanto.parquet new file mode 100644 index 00000000..0065637b Binary files /dev/null and b/formats/parquet/Esperanto.parquet differ diff --git a/formats/parquet/Est.parquet b/formats/parquet/Est.parquet new file mode 100644 index 00000000..ce911eed Binary files /dev/null and b/formats/parquet/Est.parquet differ diff --git 
a/formats/parquet/FinBiblia.parquet b/formats/parquet/FinBiblia.parquet new file mode 100644 index 00000000..c9a0ad9c Binary files /dev/null and b/formats/parquet/FinBiblia.parquet differ diff --git a/formats/parquet/FinPR.parquet b/formats/parquet/FinPR.parquet new file mode 100644 index 00000000..4712ae52 Binary files /dev/null and b/formats/parquet/FinPR.parquet differ diff --git a/formats/parquet/FinSTLK2017.parquet b/formats/parquet/FinSTLK2017.parquet new file mode 100644 index 00000000..ac9a8cae Binary files /dev/null and b/formats/parquet/FinSTLK2017.parquet differ diff --git a/formats/parquet/FreBBB.parquet b/formats/parquet/FreBBB.parquet new file mode 100644 index 00000000..36201120 Binary files /dev/null and b/formats/parquet/FreBBB.parquet differ diff --git a/formats/parquet/FreBDM1744.parquet b/formats/parquet/FreBDM1744.parquet new file mode 100644 index 00000000..6d2d1802 Binary files /dev/null and b/formats/parquet/FreBDM1744.parquet differ diff --git a/formats/parquet/FreCrampon.parquet b/formats/parquet/FreCrampon.parquet new file mode 100644 index 00000000..0d914396 Binary files /dev/null and b/formats/parquet/FreCrampon.parquet differ diff --git a/formats/parquet/FreGeneve1669.parquet b/formats/parquet/FreGeneve1669.parquet new file mode 100644 index 00000000..503b9083 Binary files /dev/null and b/formats/parquet/FreGeneve1669.parquet differ diff --git a/formats/parquet/FreJND.parquet b/formats/parquet/FreJND.parquet new file mode 100644 index 00000000..3043dffd Binary files /dev/null and b/formats/parquet/FreJND.parquet differ diff --git a/formats/parquet/FreLXX.parquet b/formats/parquet/FreLXX.parquet new file mode 100644 index 00000000..c2c5740d Binary files /dev/null and b/formats/parquet/FreLXX.parquet differ diff --git a/formats/parquet/FreLXXGiguet.parquet b/formats/parquet/FreLXXGiguet.parquet new file mode 100644 index 00000000..7d569960 Binary files /dev/null and b/formats/parquet/FreLXXGiguet.parquet differ diff --git 
a/formats/parquet/FreOltramare1874.parquet b/formats/parquet/FreOltramare1874.parquet new file mode 100644 index 00000000..edb4cbe1 Binary files /dev/null and b/formats/parquet/FreOltramare1874.parquet differ diff --git a/formats/parquet/FrePGR.parquet b/formats/parquet/FrePGR.parquet new file mode 100644 index 00000000..c2d45798 Binary files /dev/null and b/formats/parquet/FrePGR.parquet differ diff --git a/formats/parquet/FreStapfer1889.parquet b/formats/parquet/FreStapfer1889.parquet new file mode 100644 index 00000000..ea52f59c Binary files /dev/null and b/formats/parquet/FreStapfer1889.parquet differ diff --git a/formats/parquet/FreSynodale1921.parquet b/formats/parquet/FreSynodale1921.parquet new file mode 100644 index 00000000..d8193014 Binary files /dev/null and b/formats/parquet/FreSynodale1921.parquet differ diff --git a/formats/parquet/Geneva1599.parquet b/formats/parquet/Geneva1599.parquet new file mode 100644 index 00000000..e6d81843 Binary files /dev/null and b/formats/parquet/Geneva1599.parquet differ diff --git a/formats/parquet/GerAlbrecht.parquet b/formats/parquet/GerAlbrecht.parquet new file mode 100644 index 00000000..d70101fe Binary files /dev/null and b/formats/parquet/GerAlbrecht.parquet differ diff --git a/formats/parquet/GerBoLut.parquet b/formats/parquet/GerBoLut.parquet new file mode 100644 index 00000000..a593d1ac Binary files /dev/null and b/formats/parquet/GerBoLut.parquet differ diff --git a/formats/parquet/GerElb1871.parquet b/formats/parquet/GerElb1871.parquet new file mode 100644 index 00000000..5ed6b17a Binary files /dev/null and b/formats/parquet/GerElb1871.parquet differ diff --git a/formats/parquet/GerElb1905.parquet b/formats/parquet/GerElb1905.parquet new file mode 100644 index 00000000..eb8cf740 Binary files /dev/null and b/formats/parquet/GerElb1905.parquet differ diff --git a/formats/parquet/GerGruenewald.parquet b/formats/parquet/GerGruenewald.parquet new file mode 100644 index 00000000..02ca21f9 Binary files /dev/null 
and b/formats/parquet/GerGruenewald.parquet differ diff --git a/formats/parquet/GerLeoNA28.parquet b/formats/parquet/GerLeoNA28.parquet new file mode 100644 index 00000000..eafe562b Binary files /dev/null and b/formats/parquet/GerLeoNA28.parquet differ diff --git a/formats/parquet/GerMenge.parquet b/formats/parquet/GerMenge.parquet new file mode 100644 index 00000000..02e309c6 Binary files /dev/null and b/formats/parquet/GerMenge.parquet differ diff --git a/formats/parquet/GerOffBiSt.parquet b/formats/parquet/GerOffBiSt.parquet new file mode 100644 index 00000000..65611adb Binary files /dev/null and b/formats/parquet/GerOffBiSt.parquet differ diff --git a/formats/parquet/GerSch.parquet b/formats/parquet/GerSch.parquet new file mode 100644 index 00000000..37861d95 Binary files /dev/null and b/formats/parquet/GerSch.parquet differ diff --git a/formats/parquet/GerTafel.parquet b/formats/parquet/GerTafel.parquet new file mode 100644 index 00000000..4879ac51 Binary files /dev/null and b/formats/parquet/GerTafel.parquet differ diff --git a/formats/parquet/GerTextbibel.parquet b/formats/parquet/GerTextbibel.parquet new file mode 100644 index 00000000..8a002d9c Binary files /dev/null and b/formats/parquet/GerTextbibel.parquet differ diff --git a/formats/parquet/GerZurcher.parquet b/formats/parquet/GerZurcher.parquet new file mode 100644 index 00000000..00683f46 Binary files /dev/null and b/formats/parquet/GerZurcher.parquet differ diff --git a/formats/parquet/GreVamvas.parquet b/formats/parquet/GreVamvas.parquet new file mode 100644 index 00000000..ca727ab5 Binary files /dev/null and b/formats/parquet/GreVamvas.parquet differ diff --git a/formats/parquet/Haitian.parquet b/formats/parquet/Haitian.parquet new file mode 100644 index 00000000..9d6039e0 Binary files /dev/null and b/formats/parquet/Haitian.parquet differ diff --git a/formats/parquet/Haweis.parquet b/formats/parquet/Haweis.parquet new file mode 100644 index 00000000..9f885e62 Binary files /dev/null and 
b/formats/parquet/Haweis.parquet differ diff --git a/formats/parquet/HebModern.parquet b/formats/parquet/HebModern.parquet new file mode 100644 index 00000000..5daa0846 Binary files /dev/null and b/formats/parquet/HebModern.parquet differ diff --git a/formats/parquet/HunKar.parquet b/formats/parquet/HunKar.parquet new file mode 100644 index 00000000..a101d3c9 Binary files /dev/null and b/formats/parquet/HunKar.parquet differ diff --git a/formats/parquet/JPS.parquet b/formats/parquet/JPS.parquet new file mode 100644 index 00000000..55b5e5f2 Binary files /dev/null and b/formats/parquet/JPS.parquet differ diff --git a/formats/parquet/JapBungo.parquet b/formats/parquet/JapBungo.parquet new file mode 100644 index 00000000..d2f8d55f Binary files /dev/null and b/formats/parquet/JapBungo.parquet differ diff --git a/formats/parquet/JapDenmo.parquet b/formats/parquet/JapDenmo.parquet new file mode 100644 index 00000000..06ded341 Binary files /dev/null and b/formats/parquet/JapDenmo.parquet differ diff --git a/formats/parquet/JapKougo.parquet b/formats/parquet/JapKougo.parquet new file mode 100644 index 00000000..66340db0 Binary files /dev/null and b/formats/parquet/JapKougo.parquet differ diff --git a/formats/parquet/Jubilee2000.parquet b/formats/parquet/Jubilee2000.parquet new file mode 100644 index 00000000..2946265d Binary files /dev/null and b/formats/parquet/Jubilee2000.parquet differ diff --git a/formats/parquet/KJV.parquet b/formats/parquet/KJV.parquet new file mode 100644 index 00000000..7e67c228 Binary files /dev/null and b/formats/parquet/KJV.parquet differ diff --git a/formats/parquet/KJVA.parquet b/formats/parquet/KJVA.parquet new file mode 100644 index 00000000..5200489d Binary files /dev/null and b/formats/parquet/KJVA.parquet differ diff --git a/formats/parquet/KJVPCE.parquet b/formats/parquet/KJVPCE.parquet new file mode 100644 index 00000000..210ae252 Binary files /dev/null and b/formats/parquet/KJVPCE.parquet differ diff --git a/formats/parquet/KLV.parquet 
b/formats/parquet/KLV.parquet new file mode 100644 index 00000000..e63f2c74 Binary files /dev/null and b/formats/parquet/KLV.parquet differ diff --git a/formats/parquet/KorHKJV.parquet b/formats/parquet/KorHKJV.parquet new file mode 100644 index 00000000..96f880f1 Binary files /dev/null and b/formats/parquet/KorHKJV.parquet differ diff --git a/formats/parquet/KorRV.parquet b/formats/parquet/KorRV.parquet new file mode 100644 index 00000000..5044aaa0 Binary files /dev/null and b/formats/parquet/KorRV.parquet differ diff --git a/formats/parquet/LEB.parquet b/formats/parquet/LEB.parquet new file mode 100644 index 00000000..35f2c1a9 Binary files /dev/null and b/formats/parquet/LEB.parquet differ diff --git a/formats/parquet/LITV.parquet b/formats/parquet/LITV.parquet new file mode 100644 index 00000000..28022d4f Binary files /dev/null and b/formats/parquet/LITV.parquet differ diff --git a/formats/parquet/LvGluck8.parquet b/formats/parquet/LvGluck8.parquet new file mode 100644 index 00000000..31f8bbe4 Binary files /dev/null and b/formats/parquet/LvGluck8.parquet differ diff --git a/formats/parquet/MKJV.parquet b/formats/parquet/MKJV.parquet new file mode 100644 index 00000000..649a6d89 Binary files /dev/null and b/formats/parquet/MKJV.parquet differ diff --git a/formats/parquet/Mal1910.parquet b/formats/parquet/Mal1910.parquet new file mode 100644 index 00000000..1e5f3d66 Binary files /dev/null and b/formats/parquet/Mal1910.parquet differ diff --git a/formats/parquet/ManxGaelic.parquet b/formats/parquet/ManxGaelic.parquet new file mode 100644 index 00000000..3c765769 Binary files /dev/null and b/formats/parquet/ManxGaelic.parquet differ diff --git a/formats/parquet/Maori.parquet b/formats/parquet/Maori.parquet new file mode 100644 index 00000000..fe85cff5 Binary files /dev/null and b/formats/parquet/Maori.parquet differ diff --git a/formats/parquet/MapM.parquet b/formats/parquet/MapM.parquet new file mode 100644 index 00000000..09cce7bc Binary files /dev/null and 
b/formats/parquet/MapM.parquet differ diff --git a/formats/parquet/Mg1865.parquet b/formats/parquet/Mg1865.parquet new file mode 100644 index 00000000..07a7d161 Binary files /dev/null and b/formats/parquet/Mg1865.parquet differ diff --git a/formats/parquet/NHEB.parquet b/formats/parquet/NHEB.parquet new file mode 100644 index 00000000..084afc26 Binary files /dev/null and b/formats/parquet/NHEB.parquet differ diff --git a/formats/parquet/NHEBJE.parquet b/formats/parquet/NHEBJE.parquet new file mode 100644 index 00000000..c51d2bb4 Binary files /dev/null and b/formats/parquet/NHEBJE.parquet differ diff --git a/formats/parquet/NHEBME.parquet b/formats/parquet/NHEBME.parquet new file mode 100644 index 00000000..b4eed763 Binary files /dev/null and b/formats/parquet/NHEBME.parquet differ diff --git a/formats/parquet/NlCanisius1939.parquet b/formats/parquet/NlCanisius1939.parquet new file mode 100644 index 00000000..7f2678b1 Binary files /dev/null and b/formats/parquet/NlCanisius1939.parquet differ diff --git a/formats/parquet/NorSMB.parquet b/formats/parquet/NorSMB.parquet new file mode 100644 index 00000000..e4127373 Binary files /dev/null and b/formats/parquet/NorSMB.parquet differ diff --git a/formats/parquet/Norsk.parquet b/formats/parquet/Norsk.parquet new file mode 100644 index 00000000..8ff43691 Binary files /dev/null and b/formats/parquet/Norsk.parquet differ diff --git a/formats/parquet/Noyes.parquet b/formats/parquet/Noyes.parquet new file mode 100644 index 00000000..fb908aea Binary files /dev/null and b/formats/parquet/Noyes.parquet differ diff --git a/formats/parquet/OEB.parquet b/formats/parquet/OEB.parquet new file mode 100644 index 00000000..af9cc4ac Binary files /dev/null and b/formats/parquet/OEB.parquet differ diff --git a/formats/parquet/OEBcth.parquet b/formats/parquet/OEBcth.parquet new file mode 100644 index 00000000..00dd2e81 Binary files /dev/null and b/formats/parquet/OEBcth.parquet differ diff --git a/formats/parquet/Peshitta.parquet 
b/formats/parquet/Peshitta.parquet new file mode 100644 index 00000000..b214ef0a Binary files /dev/null and b/formats/parquet/Peshitta.parquet differ diff --git a/formats/parquet/PohnOld.parquet b/formats/parquet/PohnOld.parquet new file mode 100644 index 00000000..12dc6bbf Binary files /dev/null and b/formats/parquet/PohnOld.parquet differ diff --git a/formats/parquet/PolGdanska.parquet b/formats/parquet/PolGdanska.parquet new file mode 100644 index 00000000..a890a44e Binary files /dev/null and b/formats/parquet/PolGdanska.parquet differ diff --git a/formats/parquet/PolUGdanska.parquet b/formats/parquet/PolUGdanska.parquet new file mode 100644 index 00000000..afe48f89 Binary files /dev/null and b/formats/parquet/PolUGdanska.parquet differ diff --git a/formats/parquet/PorBLivre.parquet b/formats/parquet/PorBLivre.parquet new file mode 100644 index 00000000..8f44343e Binary files /dev/null and b/formats/parquet/PorBLivre.parquet differ diff --git a/formats/parquet/PorBLivreTR.parquet b/formats/parquet/PorBLivreTR.parquet new file mode 100644 index 00000000..d40c1140 Binary files /dev/null and b/formats/parquet/PorBLivreTR.parquet differ diff --git a/formats/parquet/PorNVA.parquet b/formats/parquet/PorNVA.parquet new file mode 100644 index 00000000..02df3e07 Binary files /dev/null and b/formats/parquet/PorNVA.parquet differ diff --git a/formats/parquet/RLT.parquet b/formats/parquet/RLT.parquet new file mode 100644 index 00000000..f902f640 Binary files /dev/null and b/formats/parquet/RLT.parquet differ diff --git a/formats/parquet/RNKJV.parquet b/formats/parquet/RNKJV.parquet new file mode 100644 index 00000000..09e0b4d7 Binary files /dev/null and b/formats/parquet/RNKJV.parquet differ diff --git a/formats/parquet/RWebster.parquet b/formats/parquet/RWebster.parquet new file mode 100644 index 00000000..2559eeea Binary files /dev/null and b/formats/parquet/RWebster.parquet differ diff --git a/formats/parquet/Rotherham.parquet b/formats/parquet/Rotherham.parquet new file 
mode 100644 index 00000000..8ad216c8 Binary files /dev/null and b/formats/parquet/Rotherham.parquet differ diff --git a/formats/parquet/RusMakarij.parquet b/formats/parquet/RusMakarij.parquet new file mode 100644 index 00000000..6dc78da9 Binary files /dev/null and b/formats/parquet/RusMakarij.parquet differ diff --git a/formats/parquet/RusSynodal.parquet b/formats/parquet/RusSynodal.parquet new file mode 100644 index 00000000..c48efdb1 Binary files /dev/null and b/formats/parquet/RusSynodal.parquet differ diff --git a/formats/parquet/SP.parquet b/formats/parquet/SP.parquet new file mode 100644 index 00000000..038481f7 Binary files /dev/null and b/formats/parquet/SP.parquet differ diff --git a/formats/parquet/SloChraska.parquet b/formats/parquet/SloChraska.parquet new file mode 100644 index 00000000..4b11101b Binary files /dev/null and b/formats/parquet/SloChraska.parquet differ diff --git a/formats/parquet/SloKJV.parquet b/formats/parquet/SloKJV.parquet new file mode 100644 index 00000000..f209e497 Binary files /dev/null and b/formats/parquet/SloKJV.parquet differ diff --git a/formats/parquet/SloOjacano.parquet b/formats/parquet/SloOjacano.parquet new file mode 100644 index 00000000..ffff0e07 Binary files /dev/null and b/formats/parquet/SloOjacano.parquet differ diff --git a/formats/parquet/SloStritar.parquet b/formats/parquet/SloStritar.parquet new file mode 100644 index 00000000..1f2049a6 Binary files /dev/null and b/formats/parquet/SloStritar.parquet differ diff --git a/formats/parquet/SpaPlatense.parquet b/formats/parquet/SpaPlatense.parquet new file mode 100644 index 00000000..e4a900d1 Binary files /dev/null and b/formats/parquet/SpaPlatense.parquet differ diff --git a/formats/parquet/SpaRV.parquet b/formats/parquet/SpaRV.parquet new file mode 100644 index 00000000..fbba54ed Binary files /dev/null and b/formats/parquet/SpaRV.parquet differ diff --git a/formats/parquet/SpaRV1865.parquet b/formats/parquet/SpaRV1865.parquet new file mode 100644 index 
00000000..56e26921 Binary files /dev/null and b/formats/parquet/SpaRV1865.parquet differ diff --git a/formats/parquet/SpaRVG.parquet b/formats/parquet/SpaRVG.parquet new file mode 100644 index 00000000..0e49dba0 Binary files /dev/null and b/formats/parquet/SpaRVG.parquet differ diff --git a/formats/parquet/SrKDEkavski.parquet b/formats/parquet/SrKDEkavski.parquet new file mode 100644 index 00000000..5d2937cd Binary files /dev/null and b/formats/parquet/SrKDEkavski.parquet differ diff --git a/formats/parquet/SrKDIjekav.parquet b/formats/parquet/SrKDIjekav.parquet new file mode 100644 index 00000000..e8bf35c8 Binary files /dev/null and b/formats/parquet/SrKDIjekav.parquet differ diff --git a/formats/parquet/StatResGNT.parquet b/formats/parquet/StatResGNT.parquet new file mode 100644 index 00000000..01231803 Binary files /dev/null and b/formats/parquet/StatResGNT.parquet differ diff --git a/formats/parquet/Swe1917.parquet b/formats/parquet/Swe1917.parquet new file mode 100644 index 00000000..7e77d348 Binary files /dev/null and b/formats/parquet/Swe1917.parquet differ diff --git a/formats/parquet/SweKarlXII.parquet b/formats/parquet/SweKarlXII.parquet new file mode 100644 index 00000000..0bdf1588 Binary files /dev/null and b/formats/parquet/SweKarlXII.parquet differ diff --git a/formats/parquet/SweKarlXII1873.parquet b/formats/parquet/SweKarlXII1873.parquet new file mode 100644 index 00000000..9fd9b878 Binary files /dev/null and b/formats/parquet/SweKarlXII1873.parquet differ diff --git a/formats/parquet/TR.parquet b/formats/parquet/TR.parquet new file mode 100644 index 00000000..caeb43b3 Binary files /dev/null and b/formats/parquet/TR.parquet differ diff --git a/formats/parquet/TagAngBiblia.parquet b/formats/parquet/TagAngBiblia.parquet new file mode 100644 index 00000000..484d223b Binary files /dev/null and b/formats/parquet/TagAngBiblia.parquet differ diff --git a/formats/parquet/Tausug.parquet b/formats/parquet/Tausug.parquet new file mode 100644 index 
00000000..29a2aec8 Binary files /dev/null and b/formats/parquet/Tausug.parquet differ diff --git a/formats/parquet/ThaiKJV.parquet b/formats/parquet/ThaiKJV.parquet new file mode 100644 index 00000000..647631a0 Binary files /dev/null and b/formats/parquet/ThaiKJV.parquet differ diff --git a/formats/parquet/TpiKJPB.parquet b/formats/parquet/TpiKJPB.parquet new file mode 100644 index 00000000..a586766f Binary files /dev/null and b/formats/parquet/TpiKJPB.parquet differ diff --git a/formats/parquet/Twenty.parquet b/formats/parquet/Twenty.parquet new file mode 100644 index 00000000..b07491d8 Binary files /dev/null and b/formats/parquet/Twenty.parquet differ diff --git a/formats/parquet/Tyndale.parquet b/formats/parquet/Tyndale.parquet new file mode 100644 index 00000000..a70493fb Binary files /dev/null and b/formats/parquet/Tyndale.parquet differ diff --git a/formats/parquet/UKJV.parquet b/formats/parquet/UKJV.parquet new file mode 100644 index 00000000..ec6eb8e6 Binary files /dev/null and b/formats/parquet/UKJV.parquet differ diff --git a/formats/parquet/UkrOgienko.parquet b/formats/parquet/UkrOgienko.parquet new file mode 100644 index 00000000..3ebe44e0 Binary files /dev/null and b/formats/parquet/UkrOgienko.parquet differ diff --git a/formats/parquet/Viet.parquet b/formats/parquet/Viet.parquet new file mode 100644 index 00000000..ac3838d6 Binary files /dev/null and b/formats/parquet/Viet.parquet differ diff --git a/formats/parquet/VulgClementine.parquet b/formats/parquet/VulgClementine.parquet new file mode 100644 index 00000000..71478ad5 Binary files /dev/null and b/formats/parquet/VulgClementine.parquet differ diff --git a/formats/parquet/VulgConte.parquet b/formats/parquet/VulgConte.parquet new file mode 100644 index 00000000..1021fbd0 Binary files /dev/null and b/formats/parquet/VulgConte.parquet differ diff --git a/formats/parquet/VulgHetzenauer.parquet b/formats/parquet/VulgHetzenauer.parquet new file mode 100644 index 00000000..70672897 Binary files /dev/null 
and b/formats/parquet/VulgHetzenauer.parquet differ diff --git a/formats/parquet/VulgSistine.parquet b/formats/parquet/VulgSistine.parquet new file mode 100644 index 00000000..26807de1 Binary files /dev/null and b/formats/parquet/VulgSistine.parquet differ diff --git a/formats/parquet/Vulgate.parquet b/formats/parquet/Vulgate.parquet new file mode 100644 index 00000000..9a39c4f7 Binary files /dev/null and b/formats/parquet/Vulgate.parquet differ diff --git a/formats/parquet/WLC.parquet b/formats/parquet/WLC.parquet new file mode 100644 index 00000000..ca4e846c Binary files /dev/null and b/formats/parquet/WLC.parquet differ diff --git a/formats/parquet/Webster.parquet b/formats/parquet/Webster.parquet new file mode 100644 index 00000000..44d4af67 Binary files /dev/null and b/formats/parquet/Webster.parquet differ diff --git a/formats/parquet/Wulfila.parquet b/formats/parquet/Wulfila.parquet new file mode 100644 index 00000000..b89ac134 Binary files /dev/null and b/formats/parquet/Wulfila.parquet differ diff --git a/formats/parquet/Wycliffe.parquet b/formats/parquet/Wycliffe.parquet new file mode 100644 index 00000000..ecf16771 Binary files /dev/null and b/formats/parquet/Wycliffe.parquet differ diff --git a/formats/parquet/YLT.parquet b/formats/parquet/YLT.parquet new file mode 100644 index 00000000..c7300167 Binary files /dev/null and b/formats/parquet/YLT.parquet differ diff --git a/formats/parquet/sml_BL_2008.parquet b/formats/parquet/sml_BL_2008.parquet new file mode 100644 index 00000000..ad25c11b Binary files /dev/null and b/formats/parquet/sml_BL_2008.parquet differ diff --git a/formats/parquet/vlsJoNT.parquet b/formats/parquet/vlsJoNT.parquet new file mode 100644 index 00000000..0a195535 Binary files /dev/null and b/formats/parquet/vlsJoNT.parquet differ diff --git a/generators/parquet/parquet_generator.py b/generators/parquet/parquet_generator.py new file mode 100644 index 00000000..5d0e9fa3 --- /dev/null +++ b/generators/parquet/parquet_generator.py @@ 
class ParquetGenerator(BaseGenerator):
    """Export one translation as a flat, verse-per-row Parquet file.

    Each row carries five columns: ``book_index``, ``book_name``,
    ``chapter``, ``verse`` and ``text``.
    """

    def __init__(self, source_dir, format_dir) -> None:
        super().__init__(source_dir, format_dir)

    def generate(self, language: str, translation: str) -> None:
        """Write ``<format_dir>/parquet/<translation>.parquet``.

        The translation is read with ``self.load_json`` and passed through
        ``self.prepare_data``; both are expected to come from the base
        class (not visible here -- confirm their contract there).

        Args:
            language: Language identifier forwarded to ``load_json``.
            translation: Translation identifier; also the output file stem.
        """
        bible = self.prepare_data(self.load_json(language, translation))

        def verse_records():
            # book_index is the 1-based position of the book in the list.
            for position, book in enumerate(bible.get("books", []), start=1):
                title = book.get("name")
                for chapter in book.get("chapters", []):
                    chapter_number = chapter.get("chapter")
                    for entry in chapter.get("verses", []):
                        yield {
                            "book_index": position,
                            "book_name": title,
                            # A verse-level chapter wins; otherwise fall
                            # back to the enclosing chapter's number.
                            "chapter": entry.get("chapter", chapter_number),
                            "verse": entry.get("verse"),
                            "text": entry.get("text"),
                        }

        columns = pa.schema([
            ("book_index", pa.int32()),
            ("book_name", pa.string()),
            ("chapter", pa.int32()),
            ("verse", pa.int32()),
            ("text", pa.string()),
        ])
        table = pa.Table.from_pylist(list(verse_records()), schema=columns)

        # The directory is created only after the table was built, so a
        # failed conversion leaves nothing behind on disk.
        target_dir = os.path.join(self.format_dir, "parquet")
        os.makedirs(target_dir, exist_ok=True)
        target_file = os.path.join(target_dir, f"{translation}.parquet")

        pq.write_table(table, target_file, compression="snappy")
        print(f"Parquet file for {translation} written to {target_file}")
Each translation is processed and output as an SQL dump file. - **Usage**: Run the script to create SQL dump files for each translation. +- **generate_parquet.py** + - **Description**: Generate Parquet files for Bible translations. Each translation is processed and output as a .parquet file. + - **Usage**: Run the script to create .parquet files for each translation. + - **generate_sqlite.py** - **Description**: Generates SQLite database files for Bible translations. Each translation is processed and output as an SQLite database file. - **Usage**: Run the script to create SQLite database files for each translation. diff --git a/scripts/generate_all_versions.py b/scripts/generate_all_versions.py index 66bdd242..5ef2ff73 100644 --- a/scripts/generate_all_versions.py +++ b/scripts/generate_all_versions.py @@ -12,6 +12,8 @@ from generators.text.plaintext_generator import TextGenerator from generators.text.yaml_generator import YAMLGenerator from generators.text.markdown_generator import MDGenerator +from generators.parquet.parquet_generator import ParquetGenerator + def create_format_directories(format_directory): formats = ['sql', 'sqlite', 'csv', 'txt', 'json', 'yaml', 'md'] @@ -34,7 +36,7 @@ def generate_all_versions(): # Generate all formats for each language and translation for language in languages: language_path = os.path.join(source_directory, language) - + # List all translations for the current language translations = [d for d in os.listdir(language_path) if os.path.isdir(os.path.join(language_path, d))] @@ -70,6 +72,10 @@ def generate_all_versions(): md_generator = MDGenerator(source_directory, format_directory) md_generator.generate(language, translation) + # Generate Parquet format + parquet_generator = ParquetGenerator(source_directory, format_directory) + parquet_generator.generate(language, translation) + print(f"Completed generating formats for {translation} in {language}") except Exception as e: print(f"Error generating formats for {translation} in 
def list_options(options, prompt):
    """Print a numbered menu and return the option the user selects.

    The user is asked again until they enter a whole number between 1 and
    ``len(options)``.  Without this check, ``0`` or a negative number would
    silently pick an entry from the end of the list (negative indexing),
    and any other bad input would abort the script with a traceback.

    Args:
        options: Sequence of choices to display, numbered starting at 1.
        prompt: Text passed to ``input()`` when asking for the number.

    Returns:
        The element of ``options`` chosen by the user.

    Raises:
        ValueError: If ``options`` is empty, since no valid choice exists.
    """
    if not options:
        raise ValueError("No options available to choose from.")

    for number, option in enumerate(options, 1):
        print(f"{number}. {option}")

    while True:
        answer = input(prompt)
        try:
            number = int(answer)
        except ValueError:
            print(f"Please enter a number between 1 and {len(options)}.")
            continue
        if 1 <= number <= len(options):
            return options[number - 1]
        print(f"Please enter a number between 1 and {len(options)}.")


def main():
    """Interactively pick a language and translation, then export Parquet.

    ``sources`` and ``formats`` are resolved relative to the parent of the
    directory containing this script.  Languages are the sub-directories of
    ``sources`` (excluding ``extras``); translations are the
    sub-directories of the chosen language.
    """
    # Set base directories relative to the script location
    base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
    source_directory = os.path.join(base_dir, 'sources')
    format_directory = os.path.join(base_dir, 'formats')

    # Step 1: Select Language
    languages = [
        d for d in os.listdir(source_directory)
        if os.path.isdir(os.path.join(source_directory, d)) and d != "extras"
    ]
    print("Choose your language:")
    language = list_options(
        languages,
        "Enter the number corresponding to your language: "
    )

    # Step 2: Select Translation
    translations = [
        d for d in os.listdir(os.path.join(source_directory, language))
        if os.path.isdir(os.path.join(source_directory, language, d))
    ]
    print(f"Choose your translation for {language}:")
    translation = list_options(
        translations,
        "Enter the number corresponding to your translation: "
    )

    # Step 3: Generate Parquet
    parquet_generator = ParquetGenerator(source_directory, format_directory)
    parquet_generator.generate(language, translation)


if __name__ == "__main__":
    main()