From c16bda7bbccbb9f6e10cae0269ca32c112cc53ec Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Thu, 19 Dec 2024 09:21:55 +0100 Subject: [PATCH 1/8] #730 Add code page CP1146. --- .../parser/encoding/codepage/CodePage.scala | 1 + .../encoding/codepage/CodePage1145.scala | 5 +- .../encoding/codepage/CodePage1146.scala | 55 +++++++++++++++++++ .../parser/decoders/StringDecodersSpec.scala | 11 ++++ .../codepage/CodePageSingleByteSpec.scala | 5 ++ 5 files changed, 75 insertions(+), 2 deletions(-) create mode 100644 cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1146.scala diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala index 2c0d9b8a2..6667f1ee9 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala @@ -63,6 +63,7 @@ object CodePage extends Logging { case "cp1140" => new CodePage1140 case "cp1141" => new CodePage1141 case "cp1145" => new CodePage1145 + case "cp1146" => new CodePage1146 case "cp1148" => new CodePage1148 case "cp1364" => new CodePage1364 case "cp1388" => new CodePage1388 diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1145.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1145.scala index fcb86ed8e..36568bc36 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1145.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1145.scala @@ -17,7 +17,8 @@ package za.co.absa.cobrix.cobol.parser.encoding.codepage /** - * EBCDIC code page 284 is used to represent characters of Spain and Latin America. 
+ * EBCDIC code page 1145 is used to represent characters of Spain and Latin America + * with € at the position of the international currency symbol ¤. */ class CodePage1145 extends SingleByteCodePage(CodePage1145.ebcdicToAsciiMapping) { override def codePageShortName: String = "cp1145" @@ -27,7 +28,7 @@ object CodePage1145 { val ebcdicToAsciiMapping: Array[Char] = { import EbcdicNonPrintable._ - /* This is the EBCDIC Code Page 1145 which is the euro currency update of code page CCSID 284 + /* This is the EBCDIC Code Page 1145 which is the euro currency update of code page CCSID 284 from https://en.wikibooks.org/wiki/Character_Encodings/Code_Tables/EBCDIC/EBCDIC_284 */ val ebcdic2ascii: Array[Char] = { // Non-printable characters map used: http://www.pacsys.com/asciitab.htm diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1146.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1146.scala new file mode 100644 index 000000000..0dce6b10a --- /dev/null +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1146.scala @@ -0,0 +1,55 @@ +/* + * Copyright 2018 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.cobrix.cobol.parser.encoding.codepage + +/** + * EBCDIC code page 1146 is used to represent characters of the United Kingdom + * with € at the position of the international currency symbol ¤. 
+ */ +class CodePage1146 extends SingleByteCodePage(CodePage1146.ebcdicToAsciiMapping) { + override def codePageShortName: String = "cp1146" +} + +object CodePage1146 { + val ebcdicToAsciiMapping: Array[Char] = { + import EbcdicNonPrintable._ + + /* This is the EBCDIC Code Page 1146 which is the euro currency update of code page CCSID 285 + from https://en.wikibooks.org/wiki/Character_Encodings/Code_Tables/EBCDIC/EBCDIC_285 */ + val ebcdic2ascii: Array[Char] = { + // Non-printable characters map used: http://www.pacsys.com/asciitab.htm + Array[Char]( + c00, c01, c02, c03, spc, c09, spc, del, spc, spc, spc, c0b, c0c, ccr, c0e, c0f, // 0 - 15 + c10, c11, c12, c13, spc, nel, c08, spc, c18, c19, spc, spc, c1c, c1d, c1e, c1f, // 16 - 31 + spc, spc, spc, spc, spc, clf, c17, c1b, spc, spc, spc, spc, spc, c05, c06, c07, // 32 - 47 + spc, spc, c16, spc, spc, spc, spc, c04, spc, spc, spc, spc, c14, c15, spc, c1a, // 48 - 63 + ' ', rsp, 'â', 'ä', 'à', 'á', 'ã', 'å', 'ç', 'ñ', '$', '.', '<', '(', '+', '|', // 64 - 79 + '&', 'é', 'ê', 'ë', 'è', 'í', 'î', 'ï', 'ì', 'ß', '!', '£', '*', ')', ';', '¬', // 80 - 95 + '-', '/', 'Â', 'Ä', 'À', 'Á', 'Ã', 'Å', 'Ç', 'Ñ', '¦', ',', '%', '_', '>', '?', // 96 - 111 + 'ø', 'É', 'Ê', 'Ë', 'È', 'Í', 'Î', 'Ï', 'Ì', '`', ':', '#', '@', qts, '=', qtd, // 112 - 127 + 'Ø', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', '«', '»', 'ð', 'ý', 'þ', '±', // 128 - 143 + '°', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 'ª', 'º', 'æ', '¸', 'Æ', '€', // 144 - 159 + 'µ', '¯', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '¡', '¿', 'Ð', 'Ý', 'Þ', '®', // 160 - 175 + '¢', '[', '¥', '·', '©', '§', '¶', '¼', '½', '¾', '^', ']', '~', '¨', '´', '×', // 176 - 191 + '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', shy, 'ô', 'ö', 'ò', 'ó', 'õ', // 192 - 207 + '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', '¹', 'û', 'ü', 'ù', 'ú', 'ÿ', // 208 - 223 + bsh, '÷', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '²', 'Ô', 'Ö', 'Ò', 'Ó', 'Õ', // 224 - 239 + '0', '1', '2', '3', '4', '5', 
'6', '7', '8', '9', '³', 'Û', 'Ü', 'Ù', 'Ú', spc) // 240 - 255 + } + ebcdic2ascii + } +} diff --git a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala index 150658e0a..2c7917be6 100644 --- a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala +++ b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala @@ -285,6 +285,17 @@ class StringDecodersSpec extends AnyWordSpec { assert(actual == expected) } + "decode a CP1146 string special characters" in { + val expected = " $£¯¢[^~ä#|üܬ§¦!ߢ$Ö{}æö¯å@ÆØÅÄÉ€ " + val bytes = Array(0x40, 0x4A, 0x5B, 0xA1, 0xB0, 0xB1, 0xBA, 0xBC, 0x43, 0x7B, 0x4F, 0xDC, 0xFC, 0x5F, 0xB5, 0x6A, 0x5A, 0x59, + 0xB0, 0x4A, 0xEC, 0xC0, 0xD0, 0x9C, 0xCC, 0xA1, 0x47, 0x7C, 0x9E, 0x80, 0x67, 0x63, + 0x71, 0x9F, 0x40).map(_.toByte) + + val actual = decodeEbcdicString(bytes, KeepAll, new CodePage1146, improvedNullDetection = false) + + assert(actual == expected) + } + "decode a CP1148 string special characters" in { val expected = "âäàáãåçñ[.<(+!&éêëèíîïìß]$*);^-/ÂÄÀÁÃÅÇѦ,%_>?øÉÊËÈÍÎÏÌ`:#@'=\"Øabcdefghi«»ðýþ±°jklmnopqrªºæ¸Æ€µ~stuvwxyz¡¿ÐÝÞ®¢£¥·©§¶¼½¾¬|¯¨´×{ABCDEFGHI\u00ADôöòóõ}JKLMNOPQR¹ûüùúÿ\\÷STUVWXYZ²ÔÖÒÓÕ0123456789³ÛÜÙÚ" val bytes = Array( diff --git a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageSingleByteSpec.scala b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageSingleByteSpec.scala index 4d1d933df..4be8be0d7 100644 --- a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageSingleByteSpec.scala +++ b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageSingleByteSpec.scala @@ -132,6 +132,11 @@ class CodePageSingleByteSpec extends AnyFunSuite { assert(codePage.codePageShortName == "cp1145") } + 
test("Ensure codepage 'cp1146' gives the associated CodePage") { + val codePage = CodePage.getCodePageByName("cp1146") + assert(codePage.codePageShortName == "cp1146") + } + test("Ensure codepage 'cp1148' gives the associated CodePage") { val codePage = CodePage.getCodePageByName("cp1148") assert(codePage.codePageShortName == "cp1148") From bd4bb170611be2443ebb97738b71a680b2fc56c5 Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Thu, 19 Dec 2024 11:32:47 +0100 Subject: [PATCH 2/8] #730 Add CP274 (Belgium). --- README.md | 1 + .../parser/encoding/codepage/CodePage.scala | 1 + .../encoding/codepage/CodePage274.scala | 54 +++++++++++++++++++ .../parser/decoders/StringDecodersSpec.scala | 11 ++++ .../codepage/CodePageSingleByteSpec.scala | 5 ++ 5 files changed, 72 insertions(+) create mode 100644 cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage274.scala diff --git a/README.md b/README.md index 53167d512..54cf5bc36 100644 --- a/README.md +++ b/README.md @@ -1625,6 +1625,7 @@ The output looks like this: | .option("ebcdic_code_page", "common") | Common | (Default) Only characters common across EBCDIC code pages are decoded. | | .option("ebcdic_code_page", "cp037") | EBCDIC 037 | Australia, Brazil, Canada, New Zealand, Portugal, South Africa, USA. | | .option("ebcdic_code_page", "cp273") | EBCDIC 273 | Germany, Austria. | +| .option("ebcdic_code_page", "cp274") | EBCDIC 274 | Belgium. | | .option("ebcdic_code_page", "cp275") | EBCDIC 275 | Brazil. | | .option("ebcdic_code_page", "cp277") | EBCDIC 277 | Denmark and Norway. | | .option("ebcdic_code_page", "cp278") | EBCDIC 278 | Finland and Sweden. 
| diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala index 6667f1ee9..35bbbe7c3 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala @@ -46,6 +46,7 @@ object CodePage extends Logging { case "cp037_extended" => new CodePage037Ext case "cp00300" => new CodePage300 // This is the same as cp300 case "cp273" => new CodePage273 + case "cp274" => new CodePage274 case "cp275" => new CodePage275 case "cp277" => new CodePage277 case "cp278" => new CodePage278 diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage274.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage274.scala new file mode 100644 index 000000000..8cfc9706c --- /dev/null +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage274.scala @@ -0,0 +1,54 @@ +/* + * Copyright 2018 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.cobrix.cobol.parser.encoding.codepage + +/** + * EBCDIC code page 274. Belgium. 
+ */ +class CodePage274 extends SingleByteCodePage(CodePage274.ebcdicToAsciiMapping) { + override def codePageShortName: String = "cp274" +} + +object CodePage274 { + val ebcdicToAsciiMapping: Array[Char] = { + import EbcdicNonPrintable._ + + /* This is the EBCDIC Code Page 274 to ASCII conversion table + from https://en.wikibooks.org/wiki/Character_Encodings/Code_Tables/EBCDIC/EBCDIC_274 */ + val ebcdic2ascii: Array[Char] = { + // Non-printable characters map used: http://www.pacsys.com/asciitab.htm + Array[Char]( + c00, c01, c02, c03, spc, c09, spc, del, spc, spc, spc, c0b, c0c, ccr, c0e, c0f, // 0 - 15 + c10, c11, c12, c13, spc, nel, c08, spc, c18, c19, spc, spc, c1c, c1d, c1e, c1f, // 16 - 31 + spc, spc, spc, spc, spc, clf, c17, c1b, spc, spc, spc, spc, spc, c05, c06, c07, // 32 - 47 + spc, spc, c16, spc, spc, spc, spc, c04, spc, spc, spc, spc, c14, c15, spc, c1a, // 48 - 63 + ' ', rsp, 'â', 'ä', '@', 'á', 'ã', 'å', bsh, 'ñ', '[', '.', '<', '(', '+', '!', // 64 - 79 + '&', '{', 'ê', 'ë', '}', 'í', 'î', 'ï', 'ì', 'ß', ']', '$', '*', ')', ';', '^', // 80 - 95 + '-', '/', 'Â', 'Ä', 'À', 'Á', 'Ã', 'Å', 'Ç', 'Ñ', 'ù', ',', '%', '_', '>', '?', // 96 - 111 + 'ø', 'É', 'Ê', 'Ë', 'È', 'Í', 'Î', 'Ï', 'Ì', '`', ':', '#', 'à', qts, '=', qtd, // 112 - 127 + 'Ø', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', '«', '»', 'ð', 'ý', 'þ', '±', // 128 - 143 + '°', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 'ª', 'º', 'æ', '¸', 'Æ', '¤', // 144 - 159 + 'µ', '¨', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '¡', '¿', 'Ð', 'Ý', 'Þ', '®', // 160 - 175 + '¢', '£', '¥', '·', '©', '§', '¶', '¼', '½', '¾', '¬', '|', '¯', '~', '´', '×', // 176 - 191 + 'é', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', shy, 'ô', 'ö', 'ò', 'ó', 'õ', // 192 - 207 + 'è', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', '¹', 'û', 'ü', '¦', 'ú', 'ÿ', // 208 - 223 + 'ç', '÷', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '²', 'Ô', 'Ö', 'Ò', 'Ó', 'Õ', // 224 - 239 + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '³', 'Û', 'Ü', 
'Ù', 'Ú', spc) // 240 - 255 + } + ebcdic2ascii + } +} diff --git a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala index 2c7917be6..c47dc0c42 100644 --- a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala +++ b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala @@ -105,6 +105,17 @@ class StringDecodersSpec extends AnyWordSpec { assert(actual == expected) } + "decode a CP274 string special characters" in { + val expected = " æÄ!üÜ^Æö]ߢ§Øäèéø¨åÖ#àÅ[ç¤Ç " + val bytes = Array(0x40, 0x9C, 0x63, 0x4F, 0xDC, 0xFC, 0x5F, 0x9E, 0xCC, 0x5A, 0x59, + 0xB0, 0xB5, 0x80, 0x43, 0xD0, 0xC0, 0x70, 0xA1, 0x47, 0xEC, 0x7B, 0x7C, 0x67, 0x4A, + 0xE0, 0x9F, 0x68, 0x40).map(_.toByte) + + val actual = decodeEbcdicString(bytes, KeepAll, new CodePage274, improvedNullDetection = false) + + assert(actual == expected) + } + "decode a CP275 string special characters" in { val expected = " æÄ!üÜ^Æö$ߢ§Øäéõø~åÖÕÃÅÉ\\¤] " val bytes = Array(0x40, 0x9C, 0x63, 0x4F, 0xDC, 0xFC, 0x5F, 0x9E, 0xCC, 0x5A, 0x59, diff --git a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageSingleByteSpec.scala b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageSingleByteSpec.scala index 4be8be0d7..3fe3253c6 100644 --- a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageSingleByteSpec.scala +++ b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageSingleByteSpec.scala @@ -47,6 +47,11 @@ class CodePageSingleByteSpec extends AnyFunSuite { assert(codePage.codePageShortName == "cp273") } + test("Ensure codepage 'cp274' gives the associated CodePage") { + val codePage = CodePage.getCodePageByName("cp274") + assert(codePage.codePageShortName == "cp274") + } + test("Ensure 
codepage 'cp275' gives the associated CodePage") { val codePage = CodePage.getCodePageByName("cp275") assert(codePage.codePageShortName == "cp275") From 0ff28fd2aac49c56c201ce447f3785e16bd07878 Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Tue, 7 Jan 2025 09:56:14 +0100 Subject: [PATCH 3/8] =?UTF-8?q?Add=20CP1142=20code=20page=20(Denmark=20and?= =?UTF-8?q?=20Norway)=20which=20is=20the=20same=20as=20CP277=20with=205A?= =?UTF-8?q?=20replaced=20with=20the=20"=E2=82=AC"=20(euro)=20character.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 1 + .../parser/encoding/codepage/CodePage.scala | 1 + .../encoding/codepage/CodePage1142.scala | 57 +++++++++++++++++++ .../parser/decoders/StringDecodersSpec.scala | 22 +++++++ .../codepage/CodePageSingleByteSpec.scala | 5 ++ 5 files changed, 86 insertions(+) create mode 100644 cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1142.scala diff --git a/README.md b/README.md index 54cf5bc36..399f79920 100644 --- a/README.md +++ b/README.md @@ -1642,6 +1642,7 @@ The output looks like this: | .option("ebcdic_code_page", "cp1047") | EBCDIC 1047 | A code page containing all of the Latin-1/Open System characters. | | .option("ebcdic_code_page", "cp1140") | EBCDIC 1140 | Same as code page 037 with € at the position of the international currency symbol ¤. | | .option("ebcdic_code_page", "cp1141") | EBCDIC 1141 | Same as code page 273 with € at the position of the international currency symbol ¤. | +| .option("ebcdic_code_page", "cp1142") | EBCDIC 1142 | Same as code page 277 with € at the position of the international currency symbol ¤. | | .option("ebcdic_code_page", "cp1145") | EBCDIC 1145 | Same as code page 284 with € at the position of the international currency symbol ¤. | | .option("ebcdic_code_page", "cp1148") | EBCDIC 1148 | Same as code page 500 with € at the position of the international currency symbol ¤. 
| | .option("ebcdic_code_page", "cp1364") | EBCDIC 1364 | Double-byte code page CCSID-1364, Korean. | diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala index 35bbbe7c3..5ff82c26a 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala @@ -63,6 +63,7 @@ object CodePage extends Logging { case "cp1047" => new CodePage1047 case "cp1140" => new CodePage1140 case "cp1141" => new CodePage1141 + case "cp1142" => new CodePage1142 case "cp1145" => new CodePage1145 case "cp1146" => new CodePage1146 case "cp1148" => new CodePage1148 diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1142.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1142.scala new file mode 100644 index 000000000..f3c449771 --- /dev/null +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1142.scala @@ -0,0 +1,57 @@ +/* + * Copyright 2018 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.cobrix.cobol.parser.encoding.codepage + +/** + * EBCDIC code page 1142. Denmark and Norway. 
+ * + * It corresponds to code page 277 and only differs from it in position 5A, where the euro sign € is located instead + * of the international currency symbol ¤. + */ +class CodePage1142 extends SingleByteCodePage(CodePage1142.ebcdicToAsciiMapping) { + override def codePageShortName: String = "cp1142" +} + +object CodePage1142 { + val ebcdicToAsciiMapping: Array[Char] = { + import EbcdicNonPrintable._ + + /* This is the EBCDIC Code Page 1142 to ASCII conversion table + from https://en.wikibooks.org/wiki/Character_Encodings/Code_Tables/EBCDIC/EBCDIC_277 */ + val ebcdic2ascii: Array[Char] = { + // Non-printable characters map used: http://www.pacsys.com/asciitab.htm + Array[Char]( + c00, c01, c02, c03, spc, c09, spc, del, spc, spc, spc, c0b, c0c, ccr, c0e, c0f, // 0 - 15 + c10, c11, c12, c13, spc, nel, c08, spc, c18, c19, spc, spc, c1c, c1d, c1e, c1f, // 16 - 31 + spc, spc, spc, spc, spc, clf, c17, c1b, spc, spc, spc, spc, spc, c05, c06, c07, // 32 - 47 + spc, spc, c16, spc, spc, spc, spc, c04, spc, spc, spc, spc, c14, c15, spc, c1a, // 48 - 63 + ' ', rsp, 'â', 'ä', 'à', 'á', 'ã', '}', 'ç', 'ñ', '#', '.', '<', '(', '+', '!', // 64 - 79 + '&', 'é', 'ê', 'ë', 'è', 'í', 'î', 'ï', 'ì', 'ß', '€', 'Å', '*', ')', ';', '^', // 80 - 95 + '-', '/', 'Â', 'Ä', 'À', 'Á', 'Ã', '$', 'Ç', 'Ñ', 'ø', ',', '%', '_', '>', '?', // 96 - 111 + '¦', 'É', 'Ê', 'Ë', 'È', 'Í', 'Î', 'Ï', 'Ì', '`', ':', 'Æ', 'Ø', qts, '=', qtd, // 112 - 127 + '@', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', '«', '»', 'ð', 'ý', 'þ', '±', // 128 - 143 + '°', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 'ª', 'º', '{', '¸', '[', ']', // 144 - 159 + 'µ', 'ü', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '¡', '¿', 'Ð', 'Ý', 'Þ', '®', // 160 - 175 + '¢', '£', '¥', '·', '©', '§', '¶', '¼', '½', '¾', '¬', '|', '¯', '¨', '´', '×', // 176 - 191 + 'æ', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', shy, 'ô', 'ö', 'ò', 'ó', 'õ', // 192 - 207 + 'å', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', '¹', 'û', '~', 'ù', 'ú', 'ÿ', // 
208 - 223 + bsh, '÷', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '²', 'Ô', 'Ö', 'Ò', 'Ó', 'Õ', // 224 - 239 + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '³', 'Û', 'Ü', 'Ù', 'Ú', spc) // 240 - 255 + } + ebcdic2ascii + } +} diff --git a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala index c47dc0c42..c20a432f6 100644 --- a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala +++ b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala @@ -285,6 +285,28 @@ class StringDecodersSpec extends AnyWordSpec { assert(actual == expected) } + "decode a CP1142 string example" in { + val expected = "âäàáã}çñ#.<(+!&éêëèíîïì߀Å*);^-/ÂÄÀÁÃ$ÇÑø,%_>?¦ÉÊËÈÍÎÏÌ`:ÆØ'=\"@abcdefghi«»ðýþ±°jklmnopqrªº{¸[]µüstuvwxyz¡¿ÐÝÞ®¢£¥·©§¶¼½¾¬|¯¨´×æABCDEFGHI\u00ADôöòóõåJKLMNOPQR¹û~ùúÿ\\÷STUVWXYZ²ÔÖÒÓÕ0123456789³ÛÜÙÚ" + val bytes = Array( + 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F, + 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, + 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF, + 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, + 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, 
+ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE + ).map(_.toByte) + + val actual = decodeEbcdicString(bytes, KeepAll, new CodePage1142, improvedNullDetection = false) + + assert(actual == expected) + } + "decode a CP1145 string special characters" in { val expected = " äÑ|üܬ§ñ]ߢ[Ö{}æö¨å@ÆØÅÄÉ€ " val bytes = Array(0x40, 0x43, 0x7B, 0x4F, 0xDC, 0xFC, 0x5F, 0xB5, 0x6A, 0x5A, 0x59, diff --git a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageSingleByteSpec.scala b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageSingleByteSpec.scala index 3fe3253c6..ae93375f7 100644 --- a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageSingleByteSpec.scala +++ b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageSingleByteSpec.scala @@ -132,6 +132,11 @@ class CodePageSingleByteSpec extends AnyFunSuite { assert(codePage.codePageShortName == "cp1141") } + test("Ensure codepage 'cp1142' gives the associated CodePage") { + val codePage = CodePage.getCodePageByName("cp1142") + assert(codePage.codePageShortName == "cp1142") + } + test("Ensure codepage 'cp1145' gives the associated CodePage") { val codePage = CodePage.getCodePageByName("cp1145") assert(codePage.codePageShortName == "cp1145") From c4a32291f7011fe1392a6ade68114da6e4be7851 Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Wed, 8 Jan 2025 09:05:53 +0100 Subject: [PATCH 4/8] =?UTF-8?q?Add=20CP1143=20code=20page=20(Finland=20and?= =?UTF-8?q?=20Sweden)=20which=20is=20the=20same=20as=20CP278=20with=205A?= =?UTF-8?q?=20replaced=20with=20the=20"=E2=82=AC"=20(euro)=20character.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 1 + .../parser/encoding/codepage/CodePage.scala | 1 + 
.../encoding/codepage/CodePage1143.scala | 57 +++++++++++++++++++ .../parser/decoders/StringDecodersSpec.scala | 22 +++++++ .../codepage/CodePageSingleByteSpec.scala | 5 ++ 5 files changed, 86 insertions(+) create mode 100644 cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1143.scala diff --git a/README.md b/README.md index 399f79920..3b02246fa 100644 --- a/README.md +++ b/README.md @@ -1643,6 +1643,7 @@ The output looks like this: | .option("ebcdic_code_page", "cp1140") | EBCDIC 1140 | Same as code page 037 with € at the position of the international currency symbol ¤. | | .option("ebcdic_code_page", "cp1141") | EBCDIC 1141 | Same as code page 273 with € at the position of the international currency symbol ¤. | | .option("ebcdic_code_page", "cp1142") | EBCDIC 1142 | Same as code page 277 with € at the position of the international currency symbol ¤. | +| .option("ebcdic_code_page", "cp1143") | EBCDIC 1143 | Same as code page 278 with € at the position of the international currency symbol ¤. | | .option("ebcdic_code_page", "cp1145") | EBCDIC 1145 | Same as code page 284 with € at the position of the international currency symbol ¤. | | .option("ebcdic_code_page", "cp1148") | EBCDIC 1148 | Same as code page 500 with € at the position of the international currency symbol ¤. | | .option("ebcdic_code_page", "cp1364") | EBCDIC 1364 | Double-byte code page CCSID-1364, Korean. 
| diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala index 5ff82c26a..3432da412 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala @@ -64,6 +64,7 @@ object CodePage extends Logging { case "cp1140" => new CodePage1140 case "cp1141" => new CodePage1141 case "cp1142" => new CodePage1142 + case "cp1143" => new CodePage1143 case "cp1145" => new CodePage1145 case "cp1146" => new CodePage1146 case "cp1148" => new CodePage1148 diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1143.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1143.scala new file mode 100644 index 000000000..27b92ee29 --- /dev/null +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1143.scala @@ -0,0 +1,57 @@ +/* + * Copyright 2018 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.cobrix.cobol.parser.encoding.codepage + +/** + * EBCDIC code page 1143. Finland and Sweden. 
+ * + * It corresponds to code page 278 and only differs from it in position 5A, where the euro sign € is located instead + * of the international currency symbol ¤. + */ +class CodePage1143 extends SingleByteCodePage(CodePage1143.ebcdicToAsciiMapping) { + override def codePageShortName: String = "cp1143" +} + +object CodePage1143 { + val ebcdicToAsciiMapping: Array[Char] = { + import EbcdicNonPrintable._ + + /* This is the EBCDIC Code Page 1143 to ASCII conversion table + from https://en.wikibooks.org/wiki/Character_Encodings/Code_Tables/EBCDIC/EBCDIC_278 */ + val ebcdic2ascii: Array[Char] = { + // Non-printable characters map used: http://www.pacsys.com/asciitab.htm + Array[Char]( + c00, c01, c02, c03, spc, c09, spc, del, spc, spc, spc, c0b, c0c, ccr, c0e, c0f, // 0 - 15 + c10, c11, c12, c13, spc, nel, c08, spc, c18, c19, spc, spc, c1c, c1d, c1e, c1f, // 16 - 31 + spc, spc, spc, spc, spc, clf, c17, c1b, spc, spc, spc, spc, spc, c05, c06, c07, // 32 - 47 + spc, spc, c16, spc, spc, spc, spc, c04, spc, spc, spc, spc, c14, c15, spc, c1a, // 48 - 63 + ' ', rsp, 'â', '{', 'à', 'á', 'ã', '}', 'ç', 'ñ', '§', '.', '<', '(', '+', '!', // 64 - 79 + '&', '`', 'ê', 'ë', 'è', 'í', 'î', 'ï', 'ì', 'ß', '€', 'Å', '*', ')', ';', '^', // 80 - 95 + '-', '/', 'Â', '#', 'À', 'Á', 'Ã', '$', 'Ç', 'Ñ', 'ö', ',', '%', '_', '>', '?', // 96 - 111 + 'ø', bsh, 'Ê', 'Ë', 'È', 'Í', 'Î', 'Ï', 'Ì', 'é', ':', 'Ä', 'Ö', qts, '=', qtd, // 112 - 127 + 'Ø', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', '«', '»', 'ð', 'ý', 'þ', '±', // 128 - 143 + '°', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 'ª', 'º', 'æ', '¸', 'Æ', ']', // 144 - 159 + 'µ', 'ü', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '¡', '¿', 'Ð', 'Ý', 'Þ', '®', // 160 - 175 + '¢', '£', '¥', '·', '©', '[', '¶', '¼', '½', '¾', '¬', '|', '¯', '¨', '´', '×', // 176 - 191 + 'ä', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', shy, 'ô', '¦', 'ò', 'ó', 'õ', // 192 - 207 + 'å', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', '¹', 'û', '~', 'ù', 'ú', 'ÿ', // 
208 - 223 + 'É', '÷', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '²', 'Ô', '@', 'Ò', 'Ó', 'Õ', // 224 - 239 + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '³', 'Û', 'Ü', 'Ù', 'Ú', spc) // 240 - 255 + } + ebcdic2ascii + } +} diff --git a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala index c20a432f6..78023ba0f 100644 --- a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala +++ b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala @@ -307,6 +307,28 @@ class StringDecodersSpec extends AnyWordSpec { assert(actual == expected) } + "decode a CP1143 string example" in { + val expected = "â{àáã}çñ§.<(+!&`êëèíîïì߀Å*);^-/Â#ÀÁÃ$ÇÑö,%_>?ø\\ÊËÈÍÎÏÌé:ÄÖ'=\"Øabcdefghi«»ðýþ±°jklmnopqrªºæ¸Æ]µüstuvwxyz¡¿ÐÝÞ®¢£¥·©[¶¼½¾¬|¯¨´×äABCDEFGHI\u00ADô¦òóõåJKLMNOPQR¹û~ùúÿÉ÷STUVWXYZ²Ô@ÒÓÕ0123456789³ÛÜÙÚ" + val bytes = Array( + 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F, + 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, + 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF, + 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, + 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, 
+ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE + ).map(_.toByte) + + val actual = decodeEbcdicString(bytes, KeepAll, new CodePage1143, improvedNullDetection = false) + + assert(actual == expected) + } + "decode a CP1145 string special characters" in { val expected = " äÑ|üܬ§ñ]ߢ[Ö{}æö¨å@ÆØÅÄÉ€ " val bytes = Array(0x40, 0x43, 0x7B, 0x4F, 0xDC, 0xFC, 0x5F, 0xB5, 0x6A, 0x5A, 0x59, diff --git a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageSingleByteSpec.scala b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageSingleByteSpec.scala index ae93375f7..4b2aac2bb 100644 --- a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageSingleByteSpec.scala +++ b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageSingleByteSpec.scala @@ -137,6 +137,11 @@ class CodePageSingleByteSpec extends AnyFunSuite { assert(codePage.codePageShortName == "cp1142") } + test("Ensure codepage 'cp1143' gives the associated CodePage") { + val codePage = CodePage.getCodePageByName("cp1143") + assert(codePage.codePageShortName == "cp1143") + } + test("Ensure codepage 'cp1145' gives the associated CodePage") { val codePage = CodePage.getCodePageByName("cp1145") assert(codePage.codePageShortName == "cp1145") From e6907086e450e8e68a02f43f885d9e0b066f2ee5 Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Wed, 8 Jan 2025 09:45:10 +0100 Subject: [PATCH 5/8] Update scodec dependencies to latest versions. 
--- project/Dependencies.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/project/Dependencies.scala b/project/Dependencies.scala index cb7449204..17b4e8299 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -18,8 +18,7 @@ import sbt._ object Dependencies { private val guavaVersion = "15.0" - private val scodecBitsVersion = "1.1.4" - private val scodecCoreVersion = "1.11.4" + private val scodecCoreVersion = "1.11.10" private val antlrValue = "4.8" private val slf4jVersion = "1.7.25" private val jacksonVersion = "2.13.0" From 2f9309af59c7366a7408801d79da9778f7d2cc33 Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Tue, 14 Jan 2025 11:09:55 +0100 Subject: [PATCH 6/8] =?UTF-8?q?Add=20CP1144=20code=20page=20(Italy)=20whic?= =?UTF-8?q?h=20is=20same=20as=20CP280=20with=209F=20is=20replaced=20with?= =?UTF-8?q?=20the=20"=E2=82=AC"=20(euro)=20character.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 1 + .../parser/encoding/codepage/CodePage.scala | 1 + .../encoding/codepage/CodePage1144.scala | 57 +++++++++++++++++++ .../parser/decoders/StringDecodersSpec.scala | 22 +++++++ .../codepage/CodePageSingleByteSpec.scala | 5 ++ .../cobol/source/index/IndexBuilder.scala | 1 - 6 files changed, 86 insertions(+), 1 deletion(-) create mode 100644 cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1144.scala diff --git a/README.md b/README.md index 3b02246fa..191132864 100644 --- a/README.md +++ b/README.md @@ -1644,6 +1644,7 @@ The output looks like this: | .option("ebcdic_code_page", "cp1141") | EBCDIC 1141 | Same as code page 273 with € at the position of the international currency symbol ¤. | | .option("ebcdic_code_page", "cp1142") | EBCDIC 1142 | Same as code page 277 with € at the position of the international currency symbol ¤.
| | .option("ebcdic_code_page", "cp1143") | EBCDIC 1143 | Same as code page 278 with € at the position of the international currency symbol ¤. | +| .option("ebcdic_code_page", "cp1144") | EBCDIC 1144 | Same as code page 280 with € at the position of the international currency symbol ¤. | | .option("ebcdic_code_page", "cp1145") | EBCDIC 1145 | Same as code page 284 with € at the position of the international currency symbol ¤. | | .option("ebcdic_code_page", "cp1148") | EBCDIC 1148 | Same as code page 500 with € at the position of the international currency symbol ¤. | | .option("ebcdic_code_page", "cp1364") | EBCDIC 1364 | Double-byte code page CCSID-1364, Korean. | diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala index 3432da412..fbb0622c9 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala @@ -65,6 +65,7 @@ object CodePage extends Logging { case "cp1141" => new CodePage1141 case "cp1142" => new CodePage1142 case "cp1143" => new CodePage1143 + case "cp1144" => new CodePage1144 case "cp1145" => new CodePage1145 case "cp1146" => new CodePage1146 case "cp1148" => new CodePage1148 diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1144.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1144.scala new file mode 100644 index 000000000..31de8e161 --- /dev/null +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1144.scala @@ -0,0 +1,57 @@ +/* + * Copyright 2018 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.cobrix.cobol.parser.encoding.codepage + +/** + * EBCDIC code page 1144. Italy. + * + * It corresponds to code page 280 and only differs from it in position 9F, where the euro sign € is located instead + * of the international currency symbol ¤. + */ +class CodePage1144 extends SingleByteCodePage(CodePage1144.ebcdicToAsciiMapping) { + override def codePageShortName: String = "cp1144" +} + +object CodePage1144 { + val ebcdicToAsciiMapping: Array[Char] = { + import EbcdicNonPrintable._ + + /* This is the EBCDIC Code Page 1144 to ASCII conversion table + from https://en.wikibooks.org/wiki/Character_Encodings/Code_Tables/EBCDIC/EBCDIC_280 */ + val ebcdic2ascii: Array[Char] = { + // Non-printable characters map used: http://www.pacsys.com/asciitab.htm + Array[Char]( + c00, c01, c02, c03, spc, c09, spc, del, spc, spc, spc, c0b, c0c, ccr, c0e, c0f, // 0 - 15 + c10, c11, c12, c13, spc, nel, c08, spc, c18, c19, spc, spc, c1c, c1d, c1e, c1f, // 16 - 31 + spc, spc, spc, spc, spc, clf, c17, c1b, spc, spc, spc, spc, spc, c05, c06, c07, // 32 - 47 + spc, spc, c16, spc, spc, spc, spc, c04, spc, spc, spc, spc, c14, c15, spc, c1a, // 48 - 63 + ' ', rsp, 'â', 'ä', '{', 'á', 'ã', 'å', bsh, 'ñ', '°', '.', '<', '(', '+', '!', // 64 - 79 + '&', ']', 'ê', 'ë', '}', 'í', 'î', 'ï', '~', 'ß', 'é', '$', '*', ')', ';', '^', // 80 - 95 + '-', '/', 'Â', 'Ä', 'À', 'Á', 'Ã', 'Å', 'Ç', 'Ñ', 'ò', ',', '%', '_', '>', '?', // 96 - 111 + 'ø', 'É', 'Ê', 'Ë', 'È', 'Í', 'Î', 'Ï', 'Ì', 'ù', ':', '£', '§', qts, '=', qtd, // 112 - 127 + 'Ø', 'a', 'b', 'c', 'd', 
'e', 'f', 'g', 'h', 'i', '«', '»', 'ð', 'ý', 'þ', '±', // 128 - 143 + '[', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 'ª', 'º', 'æ', '¸', 'Æ', '€', // 144 - 159 + 'µ', 'ì', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '¡', '¿', 'Ð', 'Ý', 'Þ', '®', // 160 - 175 + '¢', '#', '¥', '·', '©', '@', '¶', '¼', '½', '¾', '¬', '|', '¯', '¨', '´', '×', // 176 - 191 + 'à', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', shy, 'ô', 'ö', '¦', 'ó', 'õ', // 192 - 207 + 'è', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', '¹', 'û', 'ü', '`', 'ú', 'ÿ', // 208 - 223 + 'ç', '÷', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '²', 'Ô', 'Ö', 'Ò', 'Ó', 'Õ', // 224 - 239 + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '³', 'Û', 'Ü', 'Ù', 'Ú', spc) // 240 - 255 + } + ebcdic2ascii + } +} diff --git a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala index 78023ba0f..449f52540 100644 --- a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala +++ b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala @@ -329,6 +329,28 @@ class StringDecodersSpec extends AnyWordSpec { assert(actual == expected) } + "decode a CP1144 string example" in { + val expected = "âä{áãå\\ñ°.<(+!&]êë}íîï~ßé$*);^-/ÂÄÀÁÃÅÇÑò,%_>?øÉÊËÈÍÎÏÌù:£§'=\"Øabcdefghi«»ðýþ±[jklmnopqrªºæ¸Æ€µìstuvwxyz¡¿ÐÝÞ®¢#¥·©@¶¼½¾¬|¯¨´×àABCDEFGHI\u00ADôö¦óõèJKLMNOPQR¹ûü`úÿç÷STUVWXYZ²ÔÖÒÓÕ0123456789³ÛÜÙÚ" + val bytes = Array( + 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 
0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F, + 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, + 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF, + 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, + 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE + ).map(_.toByte) + + val actual = decodeEbcdicString(bytes, KeepAll, new CodePage1144, improvedNullDetection = false) + + assert(actual == expected) + } + "decode a CP1145 string special characters" in { val expected = " äÑ|üܬ§ñ]ߢ[Ö{}æö¨å@ÆØÅÄÉ€ " val bytes = Array(0x40, 0x43, 0x7B, 0x4F, 0xDC, 0xFC, 0x5F, 0xB5, 0x6A, 0x5A, 0x59, diff --git a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageSingleByteSpec.scala b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageSingleByteSpec.scala index 4b2aac2bb..485b1a5c9 100644 --- a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageSingleByteSpec.scala +++ b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageSingleByteSpec.scala @@ -142,6 +142,11 @@ class CodePageSingleByteSpec extends AnyFunSuite { assert(codePage.codePageShortName == "cp1143") } + test("Ensure codepage 'cp1144' gives the associated CodePage") { + val codePage = CodePage.getCodePageByName("cp1144") + assert(codePage.codePageShortName == "cp1144") + } + test("Ensure codepage 'cp1145' gives the associated CodePage") { val codePage = CodePage.getCodePageByName("cp1145") assert(codePage.codePageShortName == 
"cp1145") diff --git a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/index/IndexBuilder.scala b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/index/IndexBuilder.scala index 45667a7a5..413a0f359 100644 --- a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/index/IndexBuilder.scala +++ b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/index/IndexBuilder.scala @@ -56,7 +56,6 @@ private[source] object IndexBuilder extends Logging { buildIndexForVarLenReader(filesList, reader, sqlContext) case _ => buildIndexForFullFiles(filesList, sqlContext) - case _ => null } } From 42cdebb309b7837c9ca5ccb10fb9440544f6e672 Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Wed, 15 Jan 2025 16:03:46 +0100 Subject: [PATCH 7/8] =?UTF-8?q?Add=20CP1147=20code=20page=20(France)=20whi?= =?UTF-8?q?ch=20is=20same=20as=20CP297=20with=209F=20is=20replaced=20with?= =?UTF-8?q?=20the=20"=E2=82=AC"=20(euro)=20character.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 1 + .../parser/encoding/codepage/CodePage.scala | 1 + .../encoding/codepage/CodePage1147.scala | 55 +++++++++++++++++++ .../parser/decoders/StringDecodersSpec.scala | 11 ++++ .../codepage/CodePageSingleByteSpec.scala | 5 ++ project/Dependencies.scala | 2 +- 6 files changed, 74 insertions(+), 1 deletion(-) create mode 100644 cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1147.scala diff --git a/README.md b/README.md index 191132864..dc6da74ba 100644 --- a/README.md +++ b/README.md @@ -1646,6 +1646,7 @@ The output looks like this: | .option("ebcdic_code_page", "cp1143") | EBCDIC 1143 | Same as code page 278 with € at the position of the international currency symbol ¤. | | .option("ebcdic_code_page", "cp1144") | EBCDIC 1144 | Same as code page 280 with € at the position of the international currency symbol ¤. 
| | .option("ebcdic_code_page", "cp1145") | EBCDIC 1145 | Same as code page 284 with € at the position of the international currency symbol ¤. | +| .option("ebcdic_code_page", "cp1147") | EBCDIC 1147 | Same as code page 297 with € at the position of the international currency symbol ¤. | | .option("ebcdic_code_page", "cp1148") | EBCDIC 1148 | Same as code page 500 with € at the position of the international currency symbol ¤. | | .option("ebcdic_code_page", "cp1364") | EBCDIC 1364 | Double-byte code page CCSID-1364, Korean. | | .option("ebcdic_code_page", "cp1388") | EBCDIC 1388 | Double-byte code page CCSID-1388, Simplified Chinese. | diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala index fbb0622c9..043c61acf 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala @@ -68,6 +68,7 @@ object CodePage extends Logging { case "cp1144" => new CodePage1144 case "cp1145" => new CodePage1145 case "cp1146" => new CodePage1146 + case "cp1147" => new CodePage1147 case "cp1148" => new CodePage1148 case "cp1364" => new CodePage1364 case "cp1388" => new CodePage1388 diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1147.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1147.scala new file mode 100644 index 000000000..3e3d33c0c --- /dev/null +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1147.scala @@ -0,0 +1,55 @@ +/* + * Copyright 2018 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.cobrix.cobol.parser.encoding.codepage + +/** + * EBCDIC code page 1147 of France is the same as 297 + * with € at the position of the international currency symbol ¤. + */ +class CodePage1147 extends SingleByteCodePage(CodePage1147.ebcdicToAsciiMapping) { + override def codePageShortName: String = "cp1147" +} + +object CodePage1147 { + val ebcdicToAsciiMapping: Array[Char] = { + import EbcdicNonPrintable._ + + /* This is the EBCDIC Code Page 1147 to ASCII conversion table + from https://en.wikibooks.org/wiki/Character_Encodings/Code_Tables/EBCDIC/EBCDIC_297 */ + val ebcdic2ascii: Array[Char] = { + // Non-printable characters map used: http://www.pacsys.com/asciitab.htm + Array[Char]( + c00, c01, c02, c03, spc, c09, spc, del, spc, spc, spc, c0b, c0c, ccr, c0e, c0f, // 0 - 15 + c10, c11, c12, c13, spc, nel, c08, spc, c18, c19, spc, spc, c1c, c1d, c1e, c1f, // 16 - 31 + spc, spc, spc, spc, spc, clf, c17, c1b, spc, spc, spc, spc, spc, c05, c06, c07, // 32 - 47 + spc, spc, c16, spc, spc, spc, spc, c04, spc, spc, spc, spc, c14, c15, spc, c1a, // 48 - 63 + ' ', rsp, 'â', 'ä', '@', 'á', 'ã', 'å', bsh, 'ñ', '°', '.', '<', '(', '+', '!', // 64 - 79 + '&', '{', 'ê', 'ë', '}', 'í', 'î', 'ï', 'ì', 'ß', '§', '$', '*', ')', ';', '^', // 80 - 95 + '-', '/', 'Â', 'Ä', 'À', 'Á', 'Ã', 'Å', 'Ç', 'Ñ', 'ù', ',', '%', '_', '>', '?', // 96 - 111 + 'ø', 'É', 'Ê', 'Ë', 'È', 'Í', 'Î', 'Ï', 'Ì', 'µ', ':', '£', 'à', qts, '=', qtd, // 112 - 127 + 'Ø', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', '«', '»', 'ð', 'ý', 'þ', '±', // 128 - 143 + '[', 'j',
'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 'ª', 'º', 'æ', '¸', 'Æ', '€', // 144 - 159 + '`', '¨', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '¡', '¿', 'Ð', 'Ý', 'Þ', '®', // 160 - 175 + '¢', '#', '¥', '·', '©', ']', '¶', '¼', '½', '¾', '¬', '|', '¯', '~', '´', '×', // 176 - 191 + 'é', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', shy, 'ô', 'ö', 'ò', 'ó', 'õ', // 192 - 207 + 'è', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', '¹', 'û', 'ü', '¦', 'ú', 'ÿ', // 208 - 223 + 'ç', '÷', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '²', 'Ô', 'Ö', 'Ò', 'Ó', 'Õ', // 224 - 239 + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '³', 'Û', 'Ü', 'Ù', 'Ú', spc) // 240 - 255 + } + ebcdic2ascii + } +} diff --git a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala index 449f52540..87ef58abf 100644 --- a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala +++ b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala @@ -373,6 +373,17 @@ class StringDecodersSpec extends AnyWordSpec { assert(actual == expected) } + "decode a CP1147 string special characters" in { + val expected = " °$¨¢#¬¯ä£!üÜ^]ù§ß¢°Öéèæö¨åàÆØÅÄÉ€ " + val bytes = Array(0x40, 0x4A, 0x5B, 0xA1, 0xB0, 0xB1, 0xBA, 0xBC, 0x43, 0x7B, 0x4F, 0xDC, 0xFC, 0x5F, 0xB5, 0x6A, 0x5A, 0x59, + 0xB0, 0x4A, 0xEC, 0xC0, 0xD0, 0x9C, 0xCC, 0xA1, 0x47, 0x7C, 0x9E, 0x80, 0x67, 0x63, + 0x71, 0x9F, 0x40).map(_.toByte) + + val actual = decodeEbcdicString(bytes, KeepAll, new CodePage1147, improvedNullDetection = false) + + assert(actual == expected) + } + "decode a CP1148 string special characters" in { val expected = "âäàáãåçñ[.<(+!&éêëèíîïìß]$*);^-/ÂÄÀÁÃÅÇѦ,%_>?øÉÊËÈÍÎÏÌ`:#@'=\"Øabcdefghi«»ðýþ±°jklmnopqrªºæ¸Æ€µ~stuvwxyz¡¿ÐÝÞ®¢£¥·©§¶¼½¾¬|¯¨´×{ABCDEFGHI\u00ADôöòóõ}JKLMNOPQR¹ûüùúÿ\\÷STUVWXYZ²ÔÖÒÓÕ0123456789³ÛÜÙÚ" val bytes = Array( diff --git 
a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageSingleByteSpec.scala b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageSingleByteSpec.scala index 485b1a5c9..6368b075c 100644 --- a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageSingleByteSpec.scala +++ b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageSingleByteSpec.scala @@ -157,6 +157,11 @@ class CodePageSingleByteSpec extends AnyFunSuite { assert(codePage.codePageShortName == "cp1146") } + test("Ensure codepage 'cp1147' gives the associated CodePage") { + val codePage = CodePage.getCodePageByName("cp1147") + assert(codePage.codePageShortName == "cp1147") + } + test("Ensure codepage 'cp1148' gives the associated CodePage") { val codePage = CodePage.getCodePageByName("cp1148") assert(codePage.codePageShortName == "cp1148") diff --git a/project/Dependencies.scala b/project/Dependencies.scala index 17b4e8299..85c18ffa1 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -18,7 +18,7 @@ import sbt._ object Dependencies { private val guavaVersion = "15.0" - private val scodecCoreVersion = "1.11.10" + private val scodecCoreVersion = "1.11.4" private val antlrValue = "4.8" private val slf4jVersion = "1.7.25" private val jacksonVersion = "2.13.0" From bee7dfbe5b680dc3f0fa09ab9202c3c95bb0785a Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Thu, 16 Jan 2025 14:11:47 +0100 Subject: [PATCH 8/8] =?UTF-8?q?Add=20CP1160=20code=20page=20(Thai)=20which?= =?UTF-8?q?=20is=20same=20as=20CP838=20with=20FE=20is=20replaced=20with=20?= =?UTF-8?q?the=20"=E2=82=AC"=20(euro)=20character.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 1 + .../parser/encoding/codepage/CodePage.scala | 1 + .../encoding/codepage/CodePage1160.scala | 72 +++++++++++++++++++ .../encoding/codepage/CodePage838.scala | 8 +-- 
.../parser/decoders/StringDecodersSpec.scala | 20 ++++++ .../codepage/CodePageSingleByteSpec.scala | 5 ++ 6 files changed, 101 insertions(+), 6 deletions(-) create mode 100644 cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1160.scala diff --git a/README.md b/README.md index dc6da74ba..5294ca2a4 100644 --- a/README.md +++ b/README.md @@ -1648,6 +1648,7 @@ The output looks like this: | .option("ebcdic_code_page", "cp1145") | EBCDIC 1145 | Same as code page 284 with € at the position of the international currency symbol ¤. | | .option("ebcdic_code_page", "cp1147") | EBCDIC 1147 | Same as code page 297 with € at the position of the international currency symbol ¤. | | .option("ebcdic_code_page", "cp1148") | EBCDIC 1148 | Same as code page 500 with € at the position of the international currency symbol ¤. | +| .option("ebcdic_code_page", "cp1160") | EBCDIC 1160 | Same as code page 838 with € at the position 0xFE. | | .option("ebcdic_code_page", "cp1364") | EBCDIC 1364 | Double-byte code page CCSID-1364, Korean. | | .option("ebcdic_code_page", "cp1388") | EBCDIC 1388 | Double-byte code page CCSID-1388, Simplified Chinese. 
| diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala index 043c61acf..0e4a65892 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala @@ -70,6 +70,7 @@ object CodePage extends Logging { case "cp1146" => new CodePage1146 case "cp1147" => new CodePage1147 case "cp1148" => new CodePage1148 + case "cp1160" => new CodePage1160 case "cp1364" => new CodePage1364 case "cp1388" => new CodePage1388 case codePage => throw new IllegalArgumentException(s"The code page '$codePage' is not one of the builtin EBCDIC code pages.") diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1160.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1160.scala new file mode 100644 index 000000000..19e38cd14 --- /dev/null +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1160.scala @@ -0,0 +1,72 @@ +/* + * Copyright 2018 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package za.co.absa.cobrix.cobol.parser.encoding.codepage + +/** + * EBCDIC code page 1160 with support for Thai script used in IBM mainframes which is same as 838 + * with € at the position 0xFE. + */ +class CodePage1160 extends SingleByteCodePage(CodePage1160.ebcdicToAsciiMapping) { + override def codePageShortName: String = "cp1160" +} + +object CodePage1160 { + val ebcdicToAsciiMapping: Array[Char] = { + import EbcdicNonPrintable._ + + /* This is the EBCDIC Code Page 1160 to ASCII conversion table + from https://en.wikibooks.org/wiki/Character_Encodings/Code_Tables/EBCDIC/EBCDIC_838 */ + val ebcdic2ascii: Array[Char] = { + val c01 = '\u0E48' + val c02 = '\u0E4E' + val c03 = '\u0E31' + val c04 = '\u0E34' + val c05 = '\u0E49' + val c06 = '\u0E35' + val c07 = '\u0E36' + val c08 = '\u0E37' + val c09 = '\u0E38' + val c10 = '\u0E39' + val c11 = '\u0E3A' + val c12 = '\u0E47' + val c13 = '\u0E48' + val c14 = '\u0E49' + val c15 = '\u0E4A' + val c16 = '\u0E4B' + val c18 = '\u0E4D' + + Array[Char]( + spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, ccr, spc, spc, // 0 - 15 + spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, // 16 - 31 + spc, spc, spc, spc, spc, clf, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, // 32 - 47 + spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, // 48 - 63 + spc, spc, 'ก', 'ข', 'ฃ', 'ค', 'ฅ', 'ฆ', 'ง', '[', '¢', '.', '<', '(', '+', '|', // 64 - 79 + '&', c01, 'จ', 'ฉ', 'ช', 'ซ', 'ฌ', 'ญ', 'ฎ', ']', '!', '$', '*', ')', ';', '¬', // 80 - 95 + '-', '/', 'ฏ', 'ฐ', 'ฑ', 'ฒ', 'ณ', 'ด', 'ต', '^', '¦', ',', '%', '_', '>', '?', // 96 - 111 + '฿', c02, 'ถ', 'ท', 'ธ', 'น', 'บ', 'ป', 'ผ', '`', ':', '#', '@', qts, '=', qtd, // 112 - 127 + '๏', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'ฝ', 'พ', 'ฟ', 'ภ', 'ม', 'ย', // 128 - 143 + '๚', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 'ร', 'ฤ', 'ล', 'ฦ', 'ว', 'ศ', // 144 - 159 + '๛', '~', 's', 't', 'u', 'v', 'w', 'x', 'y', 
'z', 'ษ', 'ส', 'ห', 'ฬ', 'อ', 'ฮ', // 160 - 175 + '๐', '๑', '๒', '๓', '๔', '๕', '๖', '๗', '๘', '๙', 'ฯ', 'ะ', c03, 'า', 'ำ', c04, // 176 - 191 + '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', c05, c06, c07, c08, c09, c10, // 192 - 207 + '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', c11, 'เ', 'แ', 'โ', 'ใ', 'ไ', // 208 - 223 + bsh, c15, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'ๅ', 'ๆ', c12, c13, c14, c15, // 224 - 239 + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', c16, '\u0E4C', c18, c16, '€', spc) // 240 - 255; 0xFB is U+0E4C (a bare c17 would be the imported control char) + } + ebcdic2ascii + } +} diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage838.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage838.scala index e06524325..9bfd5cfb3 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage838.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage838.scala @@ -25,15 +25,11 @@ class CodePage838 extends SingleByteCodePage(CodePage838.ebcdicToAsciiMapping) { object CodePage838 { val ebcdicToAsciiMapping: Array[Char] = { + import EbcdicNonPrintable._ + /* This is the EBCDIC Code Page 838 to ASCII conversion table with non-printable characters mapping from https://en.everybodywiki.com/EBCDIC_838 */ val ebcdic2ascii: Array[Char] = { - val clf = '\r' - val ccr = '\n' - val spc = ' ' - val qts = '\'' - val qtd = '\"' - val bsh = '\\' val c01 = '\u0E48' val c02 = '\u0E4E' val c03 = '\u0E31' diff --git a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala index 87ef58abf..613b12de4 100644 --- a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala +++ b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala @@ -241,6 +241,16 @@ class
StringDecodersSpec extends AnyWordSpec { assert(actual == expected) } + "decode a CP838 string special characters" in { + val expected = " ¢$~๐๑ฯัข#|แํ¬๕¦!]๐¢็{}ลึ~ฆ@ว๏ดฐ " + val bytes = Array(0x40, 0x4A, 0x5B, 0xA1, 0xB0, 0xB1, 0xBA, 0xBC, 0x43, 0x7B, 0x4F, 0xDC, 0xFC, 0x5F, 0xB5, 0x6A, 0x5A, 0x59, + 0xB0, 0x4A, 0xEC, 0xC0, 0xD0, 0x9C, 0xCC, 0xA1, 0x47, 0x7C, 0x9E, 0x80, 0x67, 0x63, 0x40).map(_.toByte) + + val actual = decodeEbcdicString(bytes, KeepAll, new CodePage838, improvedNullDetection = false) + + assert(actual == expected) + } + "decode a CP1140 string special characters" in { val expected = "âäàáãåçñ¢.<(+|&éêëèíîïìß!$*);¬-/ÂÄÀÁÃÅÇѦ,%_>?øÉÊËÈÍÎÏÌ`:#@'=\"Øabcdefghi«»ðýþ±°jklmnopqrªºæ¸Æ€µ~stuvwxyz¡¿ÐÝÞ®^£¥·©§¶¼½¾[]¯¨´×{ABCDEFGHI\u00ADôöòóõ}JKLMNOPQR¹ûüùúÿ\\÷STUVWXYZ²ÔÖÒÓÕ0123456789³ÛÜÙÚ" val bytes = Array( @@ -405,6 +415,16 @@ class StringDecodersSpec extends AnyWordSpec { assert(actual == expected) } + + "decode a CP1160 string special characters" in { + val expected = " ¢$~๐๑ฯัข#|แํ¬๕¦!]๐¢็{}ลึ~ฆ@ว๏ดฐ€ " + val bytes = Array(0x40, 0x4A, 0x5B, 0xA1, 0xB0, 0xB1, 0xBA, 0xBC, 0x43, 0x7B, 0x4F, 0xDC, 0xFC, 0x5F, 0xB5, 0x6A, 0x5A, 0x59, + 0xB0, 0x4A, 0xEC, 0xC0, 0xD0, 0x9C, 0xCC, 0xA1, 0x47, 0x7C, 0x9E, 0x80, 0x67, 0x63, 0xFE, 0x40).map(_.toByte) + + val actual = decodeEbcdicString(bytes, KeepAll, new CodePage1160, improvedNullDetection = false) + + assert(actual == expected) + } } } diff --git a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageSingleByteSpec.scala b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageSingleByteSpec.scala index 6368b075c..b637ee4f2 100644 --- a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageSingleByteSpec.scala +++ b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageSingleByteSpec.scala @@ -167,6 +167,11 @@ class CodePageSingleByteSpec extends AnyFunSuite { assert(codePage.codePageShortName 
== "cp1148") } + test("Ensure codepage 'cp1160' gives the associated CodePage") { + val codePage = CodePage.getCodePageByName("cp1160") + assert(codePage.codePageShortName == "cp1160") + } + test("Ensure codepage 'cp1364' gives the associated CodePage") { val codePage = CodePage.getCodePageByName("cp1364") assert(codePage.codePageShortName == "cp1364")