-
Notifications
You must be signed in to change notification settings - Fork 6.2k
8360459: UNICODE_CASE and character class with non-ASCII range does not match ASCII char #26285
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
Closed
Changes from all commits
Commits
Show all changes
6 commits
Select commit
Hold shift + click to select a range
640d7a6
8360459: UNICODE_CASE and character class with non-ASCII range does n…
xuemingshen-oracle 735bd72
update to address the review comments
xuemingshen-oracle e18d266
update to address the review comments
xuemingshen-oracle c2afc42
update and add more test cases, and fix a test failure
xuemingshen-oracle b85f581
improve the lookup logic and test case for +00df
xuemingshen-oracle a090888
update to fix the typo
xuemingshen-oracle File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
73 changes: 73 additions & 0 deletions
73
make/jdk/src/classes/build/tools/generatecharacter/CaseFolding.java
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,73 @@ | ||
| /* | ||
| * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. | ||
| * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. | ||
| * | ||
| * This code is free software; you can redistribute it and/or modify it | ||
| * under the terms of the GNU General Public License version 2 only, as | ||
| * published by the Free Software Foundation. Oracle designates this | ||
| * particular file as subject to the "Classpath" exception as provided | ||
| * by Oracle in the LICENSE file that accompanied this code. | ||
| * | ||
| * This code is distributed in the hope that it will be useful, but WITHOUT | ||
| * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | ||
| * version 2 for more details (a copy is included in the LICENSE file that | ||
| * accompanied this code). | ||
| * | ||
| * You should have received a copy of the GNU General Public License version | ||
| * 2 along with this work; if not, write to the Free Software Foundation, | ||
| * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. | ||
| * | ||
| * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA | ||
| * or visit www.oracle.com if you need additional information or have any | ||
| * questions. | ||
| */ | ||
|
|
||
| package build.tools.generatecharacter; | ||
|
|
||
| import java.io.IOException; | ||
| import java.nio.file.Files; | ||
| import java.nio.file.Paths; | ||
| import java.nio.file.StandardOpenOption; | ||
| import java.util.stream.Collectors; | ||
| import java.util.stream.Stream; | ||
|
|
||
/**
 * Build-time tool that generates a Java source file from a template and the
 * Unicode {@code CaseFolding.txt} data file.
 * <p>
 * The tool selects the case folding entries whose status is {@code C},
 * {@code T} or {@code S}, keeps only those whose folded code point does not
 * map back to the source code point through {@code Character.toUpperCase} or
 * {@code Character.toLowerCase}, renders each of them as an
 * {@code entry(0x..., 0x...)} expression, and substitutes the resulting list
 * for the template line that contains the {@code %%%Entries} marker.
 */
public class CaseFolding {

    /**
     * Runs the generator.
     *
     * @param args exactly three paths: the template file, the Unicode
     *             {@code CaseFolding.txt} file, and the Java source file to
     *             generate (created, or truncated if it already exists)
     * @throws Throwable if reading the inputs or writing the output fails
     */
    public static void main(String[] args) throws Throwable {
        if (args.length != 3) {
            System.err.println("Usage: java CaseFolding TemplateFile CaseFolding.txt CaseFolding.java");
            System.exit(1);
        }
        var templateFile = Paths.get(args[0]);
        var caseFoldingTxt = Paths.get(args[1]);
        var genSrcFile = Paths.get(args[2]);
        var supportedTypes = "^.*; [CTS]; .*$";

        String caseFoldingEntries;
        // Files.lines keeps the file open until the stream is closed, so the
        // stream has to be managed by try-with-resources.
        try (Stream<String> dataLines = Files.lines(caseFoldingTxt)) {
            caseFoldingEntries = dataLines
                .filter(line -> !line.startsWith("#") && line.matches(supportedTypes))
                .map(line -> {
                    // columns: code point; status; folded code point; # name
                    String[] cols = line.split("; ");
                    return new String[] {cols[0], cols[1], cols[2]};
                })
                .filter(cols -> {
                    // keep only entries whose folding does not map back to the original char
                    var cp1 = Integer.parseInt(cols[0], 16);
                    var cp2 = Integer.parseInt(cols[2], 16);
                    return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1;
                })
                .map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2]))
                .collect(Collectors.joining(",\n", "", ""));
        }

        // hack, hack, hack! The filter above does not pick up 0131 (U+0131 upper-cases
        // back to U+0049, so the line below is treated as a round trip). Add the
        // mapping manually to support the 'I's.
        //   0049; T; 0131; # LATIN CAPITAL LETTER I
        final String dotlessIEntry = String.format(" entry(0x%04x, 0x%04x),\n", 0x0131, 0x49);
        final String allEntries = dotlessIEntry + caseFoldingEntries;

        // Generate the .java file: copy the template, replacing the marker line.
        try (Stream<String> templateLines = Files.lines(templateFile)) {
            Files.write(
                genSrcFile,
                templateLines
                    .map(line -> line.contains("%%%Entries") ? allEntries : line)
                    .collect(Collectors.toList()),
                StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
        }
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
116 changes: 116 additions & 0 deletions
116
src/java.base/share/classes/jdk/internal/util/regex/CaseFolding.java.template
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,116 @@ | ||
| /* | ||
| * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. | ||
| * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. | ||
| * | ||
| * This code is free software; you can redistribute it and/or modify it | ||
| * under the terms of the GNU General Public License version 2 only, as | ||
| * published by the Free Software Foundation. Oracle designates this | ||
| * particular file as subject to the "Classpath" exception as provided | ||
| * by Oracle in the LICENSE file that accompanied this code. | ||
| * | ||
| * This code is distributed in the hope that it will be useful, but WITHOUT | ||
| * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | ||
| * version 2 for more details (a copy is included in the LICENSE file that | ||
| * accompanied this code). | ||
| * | ||
| * You should have received a copy of the GNU General Public License version | ||
| * 2 along with this work; if not, write to the Free Software Foundation, | ||
| * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. | ||
| * | ||
| * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA | ||
| * or visit www.oracle.com if you need additional information or have any | ||
| * questions. | ||
| */ | ||
|
|
||
| package jdk.internal.util.regex; | ||
|
|
||
| import java.util.Arrays; | ||
| import java.util.Map; | ||
| import java.util.Objects; | ||
|
|
||
| import static java.util.Map.entry; | ||
|
|
||
/**
 * Lookup of simple case folding mappings that are used to close regex
 * character class ranges for case-insensitive matching.
 */
public final class CaseFolding {

    // Maps a code point to its simple case folding; the entries are filled in
    // when this template is expanded at build time.
    private static final Map<Integer, Integer> expanded_case_map = Map.ofEntries(
%%%Entries
    );

    // Keys of the map above in ascending order. The iteration order of an
    // unmodifiable map is unspecified, so the keys are sorted once here to
    // make range scans deterministic and to let them stop early.
    private static final int[] expanded_case_cps = expanded_case_map.keySet()
        .stream()
        .mapToInt(Integer::intValue)
        .sorted()
        .toArray();

    private CaseFolding() {}

    /**
     * Returns an expansion set to "close" a given regex Unicode character class range for
     * case-insensitive matching, according to the
     * <a href="https://www.unicode.org/reports/tr18/#Simple_Loose_Matches">Simple Loose Matches</a>
     * rule defined in Unicode Technical Standard #18: Unicode Regular Expressions.
     * <p>
     * To conform with Level 1 of UTS #18, specifically RL1.5: Simple Loose Matches, simple case
     * folding must be applied to literals and (optionally) to character classes. When applied to
     * character classes, each character class is expected to be closed under simple case folding.
     * See the standard for the detailed explanation and example of "closed".
     * <p>
     * RL1.5 states: To meet this requirement, an implementation that supports case-insensitive
     * matching should
     * <ol>
     * <li>Provide at least the simple, default Unicode case-insensitive matching, and</li>
     * <li>Specify which character properties or constructs are closed under the matching.</li>
     * </ol>
     * <p>
     * In the {@code Pattern} implementation, five types of constructs may be affected by
     * case-insensitive matching: back-refs, string slices (sequences), singles, families
     * (char-properties) and class ranges. Singles and families may appear independently or
     * within a class.
     * <p>
     * For loose/case-insensitive matching, the back-refs, slices and singles apply
     * {@code toUpperCase} and {@code toLowerCase} to both the pattern and the input string.
     * This effectively "closes" the class for matching.
     * <p>
     * The families/char-properties are not "closed" and should remain unchanged. This is
     * acceptable per RL1.5, if their behavior is clearly specified.
     * <p>
     * This method addresses that requirement for the "range" construct within a character class
     * by computing the additional characters that should be included to close the range under
     * simple case folding:
     * <p>
     * For each character in the input range {@code [start, end]} (inclusive), if the character
     * has a simple case folding mapping in Unicode's CaseFolding.txt, the mapping is not a
     * round-trip map, and the mapped character is not already in the range, then that mapped
     * character (typically lowercase) is added to the expansion set.
     * <p>
     * This allows the regex character class "range" implementation to use the returned expansion
     * set to support additional case-insensitive matching, without duplicating characters already
     * covered by the existing regex range implementation. The expectation is that the matching is
     * done using both the uppercase and lowercase forms of the input character, for example
     *
     * <pre>{@code
     *
     *     ch -> inRange(lower, Character.toUpperCase(ch), upper) ||
     *           inRange(lower, Character.toLowerCase(ch), upper) ||
     *           additionalClosingCharacters.contains(Character.toUpperCase(ch)) ||
     *           additionalClosingCharacters.contains(Character.toLowerCase(ch))
     * }</pre>
     *
     * <p>
     * @spec https://www.unicode.org/reports/tr18/#Simple_Loose_Matches
     * @param start the starting code point of the character range
     * @param end the ending code point of the character range
     * @return an {@code int[]} containing all the simple case equivalents of characters in the
     *         range, excluding those already in the range; empty if there are none
     */
    public static int[] getClassRangeClosingCharacters(int start, int end) {
        int[] expanded = new int[expanded_case_cps.length];
        int off = 0;
        for (int cp : expanded_case_cps) {
            if (cp > end) {
                // the code points are sorted, nothing further can be in range
                break;
            }
            if (cp < start) {
                continue;
            }
            int folding = expanded_case_map.get(cp);
            // only report foldings the range does not already cover
            if (folding < start || folding > end) {
                expanded[off++] = folding;
            }
        }
        return Arrays.copyOf(expanded, off);
    }
}
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.