Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions make/ToolsJdk.gmk
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,9 @@ TOOL_GENERATECACERTS = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_class
# Command line that runs build.tools.generateextraproperties.GenerateExtraProperties
# with $(JAVA_SMALL), using the jdk_tools_classes directory as the class path.
TOOL_GENERATEEXTRAPROPERTIES = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_classes \
build.tools.generateextraproperties.GenerateExtraProperties

# Command line that runs build.tools.generatecharacter.CaseFolding with
# $(JAVA_SMALL), using the jdk_tools_classes directory as the class path.
# The tool's arguments (template, CaseFolding.txt, output file) are supplied
# by the rule that invokes this variable.
TOOL_GENERATECASEFOLDING = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_classes \
build.tools.generatecharacter.CaseFolding

# Command line that runs build.tools.makezipreproducible.MakeZipReproducible
# with $(JAVA_SMALL), using the jdk_tools_classes directory as the class path.
TOOL_MAKEZIPREPRODUCIBLE = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_classes \
build.tools.makezipreproducible.MakeZipReproducible

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
/*
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/

package build.tools.generatecharacter;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
 * Build tool that generates a {@code CaseFolding.java} source file from a
 * template and Unicode's {@code CaseFolding.txt} data file.
 * <p>
 * The tool keeps the data lines whose status field is {@code C}, {@code T} or
 * {@code S}, drops every mapping whose folded code point already maps back to
 * the original code point via {@link Character#toUpperCase(int)} or
 * {@link Character#toLowerCase(int)}, renders the remaining mappings as
 * {@code entry(0x..., 0x...)} expressions and substitutes them for the
 * template line that contains {@code %%%Entries}.
 */
public class CaseFolding {

    /**
     * Runs the generator.
     *
     * @param args exactly three arguments: the template file, the
     *             {@code CaseFolding.txt} data file and the file to generate;
     *             any other argument count prints a usage message and exits
     *             with status 1
     * @throws Throwable if reading the inputs or writing the output fails, or
     *                   if a data line contains a malformed hexadecimal code point
     */
    public static void main(String[] args) throws Throwable {
        if (args.length != 3) {
            System.err.println("Usage: java CaseFolding TemplateFile CaseFolding.txt CaseFolding.java");
            System.exit(1);
        }
        var templateFile = Paths.get(args[0]);
        var caseFoldingTxt = Paths.get(args[1]);
        var genSrcFile = Paths.get(args[2]);
        var supportedTypes = "^.*; [CTS]; .*$";

        // Files.lines keeps the underlying file open until the stream is closed,
        // so it has to be used inside try-with-resources.
        final String caseFoldingEntries;
        try (Stream<String> dataLines = Files.lines(caseFoldingTxt)) {
            caseFoldingEntries = dataLines
                .filter(line -> !line.startsWith("#") && line.matches(supportedTypes))
                // columns: [0] = code point, [1] = status, [2] = folded code point
                .map(line -> line.split("; "))
                .filter(cols -> {
                    // the folding case doesn't map back to the original char.
                    var cp1 = Integer.parseInt(cols[0], 16);
                    var cp2 = Integer.parseInt(cols[2], 16);
                    return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1;
                })
                .map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2]))
                .collect(Collectors.joining(",\n"));
        }

        // hack, hack, hack! the logic does not pick 0131. just add manually to support 'I's.
        // 0049; T; 0131; # LATIN CAPITAL LETTER I
        final String T_0x0131_0x49 = String.format(" entry(0x%04x, 0x%04x),\n", 0x0131, 0x49);

        // Generate .java file: copy the template, replacing the placeholder line
        // with the manually added entry followed by the computed entries.
        try (Stream<String> templateLines = Files.lines(templateFile)) {
            var generated = templateLines
                .map(line -> line.contains("%%%Entries") ? T_0x0131_0x49 + caseFoldingEntries : line)
                .collect(Collectors.toList());
            Files.write(
                genSrcFile,
                generated,
                StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
        }
    }
}
17 changes: 17 additions & 0 deletions make/modules/java.base/gensrc/GensrcRegex.gmk
Original file line number Diff line number Diff line change
Expand Up @@ -50,5 +50,22 @@ TARGETS += $(GENSRC_INDICCONJUNCTBREAK)

################################################################################

# Generated source file: jdk/internal/util/regex/CaseFolding.java in the
# java.base gensrc output directory.
GENSRC_CASEFOLDING := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/util/regex/CaseFolding.java

# Inputs: the Java source template and the Unicode CaseFolding.txt data file.
CASEFOLDINGTEMP := $(MODULE_SRC)/share/classes/jdk/internal/util/regex/CaseFolding.java.template
CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt

# Regenerate the source whenever the build tools, the template or the data
# file change. The tool is invoked with template, data file and output file,
# in that order.
$(GENSRC_CASEFOLDING): $(BUILD_TOOLS_JDK) $(CASEFOLDINGTEMP) $(CASEFOLDINGTXT)
$(call LogInfo, Generating $@)
$(call MakeTargetDir)
$(TOOL_GENERATECASEFOLDING) \
$(CASEFOLDINGTEMP) \
$(CASEFOLDINGTXT) \
$(GENSRC_CASEFOLDING)

# Register the generated file with this makefile's TARGETS list.
TARGETS += $(GENSRC_CASEFOLDING)

################################################################################

endif # include guard
include MakeIncludeEnd.gmk
32 changes: 28 additions & 4 deletions src/java.base/share/classes/java/util/regex/Pattern.java
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
import java.util.stream.StreamSupport;

import jdk.internal.util.ArraysSupport;
import jdk.internal.util.regex.CaseFolding;
import jdk.internal.util.regex.Grapheme;

/**
Expand Down Expand Up @@ -2915,14 +2916,20 @@ private CharPredicate bitsOrSingle(BitClass bits, int ch) {
toLowerCase(u+212a) ==> u+006B
(6)AngstromSign u+212b
toLowerCase(u+212b) ==> u+00e5
(7) Latin Capital Letter Sharp S u+1e9e, was added in version 5.1
toLowerCase(u+1e9e) ==> u+00df
*/
if (ch < 256 &&
!(has(CASE_INSENSITIVE) && has(UNICODE_CASE) &&
(ch == 0xff || ch == 0xb5 ||
ch == 0x49 || ch == 0x69 || //I and i
ch == 0x53 || ch == 0x73 || //S and s
ch == 0x4b || ch == 0x6b || //K and k
ch == 0xc5 || ch == 0xe5))) { //A+ring
ch == 0xc5 || ch == 0xe5 || //A+ring
// need to force single() to use SingleU specifically for u+00df.
// u+00df <-> u+1e9e, see https://codepoints.net/U+00DF.
// Character.toUpperCase('u+00df') still returns u+00df for now.
ch == 0xdf))) { // Shape S
bits.add(ch, flags0);
return null;
}
Expand All @@ -2939,7 +2946,7 @@ private CharPredicate single(final int ch) {
upper = Character.toUpperCase(ch);
lower = Character.toLowerCase(upper);
// Unicode case insensitive matches
if (upper != lower)
if (upper != lower || ch == 0xDF)
return SingleU(lower);
} else if (ASCII.isAscii(ch)) {
lower = ASCII.toLower(ch);
Expand Down Expand Up @@ -5963,12 +5970,29 @@ static CharPredicate CIRange(int lower, int upper) {
}

/**
 * Returns a predicate for matching the range {@code [lower, upper]} with
 * Unicode case-insensitive semantics.
 * <p>
 * A character matches if it, its uppercase form, or the lowercase form of
 * its uppercase form lies in the range. When
 * {@code CaseFolding.getClassRangeClosingCharacters} reports additional
 * closing characters for the range, a character also matches if its
 * uppercase or lowercase form equals one of them.
 */
static CharPredicate CIRangeU(int lower, int upper) {
    final int[] extras = CaseFolding.getClassRangeClosingCharacters(lower, upper);

    if (extras.length > 0) {
        // The range needs extra characters to be closed under case folding.
        return ch -> {
            if (inRange(lower, ch, upper)) {
                return true;
            }
            final int uc = Character.toUpperCase(ch);
            final int lc = Character.toLowerCase(uc);
            if (inRange(lower, uc, upper) || inRange(lower, lc, upper)) {
                return true;
            }
            for (int i = 0; i < extras.length; i++) {
                final int extra = extras[i];
                if (extra == uc || extra == lc) {
                    return true;
                }
            }
            return false;
        };
    }

    // Nothing to add: only the range checks are required.
    return ch -> {
        if (inRange(lower, ch, upper)) {
            return true;
        }
        final int uc = Character.toUpperCase(ch);
        if (inRange(lower, uc, upper)) {
            return true;
        }
        return inRange(lower, Character.toLowerCase(uc), upper);
    };
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
/*
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/

package jdk.internal.util.regex;

import java.util.Arrays;
import java.util.Map;
import java.util.Objects;

import static java.util.Map.entry;

/**
 * Case folding data used by the regex implementation to close character
 * class ranges under simple case folding. The entries are substituted into
 * this template at build time in place of the placeholder line below.
 */
public final class CaseFolding {

    // code point -> simple case folding, as provided by the generated entries
    private static final Map<Integer, Integer> EXPANDED_CASE_MAP = Map.ofEntries(
%%%Entries
    );

    // the keys of EXPANDED_CASE_MAP, unboxed once for iteration
    private static final int[] EXPANDED_CASE_CPS =
        EXPANDED_CASE_MAP.keySet()
                         .stream()
                         .mapToInt(Integer::intValue)
                         .toArray();

    private CaseFolding() {}

    /**
     * Returns the set of additional characters needed to "close" a regex
     * Unicode character class range for case-insensitive matching, following the
     * <a href="https://www.unicode.org/reports/tr18/#Simple_Loose_Matches">Simple Loose Matches</a>
     * rule of Unicode Technical Standard #18: Unicode Regular Expressions.
     * <p>
     * Level 1 conformance with UTS #18, specifically RL1.5: Simple Loose
     * Matches, requires simple case folding to be applied to literals and
     * (optionally) to character classes. When it is applied to a character
     * class, the class is expected to be closed under simple case folding.
     * See the standard for the detailed explanation and an example of
     * "closed".
     * <p>
     * Per RL1.5, an implementation that supports case-insensitive matching
     * should
     * <ol>
     * <li>Provide at least the simple, default Unicode case-insensitive matching, and</li>
     * <li>Specify which character properties or constructs are closed under the matching.</li>
     * </ol>
     * <p>
     * In the {@code Pattern} implementation, five kinds of constructs may be
     * matched case-insensitively: back-references, string slices
     * (sequences), singles, families (character properties) and class
     * ranges. Singles and families may appear on their own or inside a class.
     * <p>
     * For case-insensitive matching, back-references, slices and singles
     * apply {@code toUpperCase} and {@code toLowerCase} to both the pattern
     * and the input, which effectively closes them for matching.
     * <p>
     * Families/character properties are not closed and remain unchanged.
     * RL1.5 accepts this as long as the behavior is clearly specified.
     * <p>
     * This method covers the "range" construct inside a character class. For
     * each character in {@code [start, end]} (inclusive) that has a simple
     * case folding in Unicode's CaseFolding.txt which is not a round-trip
     * mapping, the folded character (typically lowercase) is added to the
     * result unless it already lies within the range.
     * <p>
     * The range implementation can then use the returned characters to
     * accept additional case-insensitive matches without duplicating what
     * its existing range checks already cover. The expectation is that both
     * the uppercase and the lowercase form of the input character are
     * tested, for example
     *
     * <pre>{@code
     *
     * ch -> inRange(lower, Character.toUpperCase(ch), upper) ||
     *       inRange(lower, Character.toLowerCase(ch), upper) ||
     *       additionalClosingCharacters.contains(Character.toUpperCase(ch)) ||
     *       additionalClosingCharacters.contains(Character.toLowerCase(ch))
     * }</pre>
     *
     * @spec https://www.unicode.org/reports/tr18/#Simple_Loose_Matches
     * @param start the starting code point of the character range
     * @param end the ending code point of the character range
     * @return an {@code int[]} containing the simple case equivalents of the
     *         characters in the range, excluding those already in the range
     */
    public static int[] getClassRangeClosingCharacters(int start, int end) {
        // Worst case every known code point contributes one closing character.
        final int[] closing = new int[EXPANDED_CASE_CPS.length];
        int count = 0;
        for (int i = 0; i < EXPANDED_CASE_CPS.length; i++) {
            final int cp = EXPANDED_CASE_CPS[i];
            if (cp < start || cp > end) {
                continue;   // code point is not part of the range
            }
            final int folded = EXPANDED_CASE_MAP.get(cp);
            if (folded >= start && folded <= end) {
                continue;   // folding is already covered by the range itself
            }
            closing[count++] = folded;
        }
        return Arrays.copyOf(closing, count);
    }
}
Loading