@@ -39,8 +39,21 @@ public final class Utf8 {
3939 private Utf8 () {
4040 }
4141
42+ @ BasedOnJDKFile ("https://github.com/openjdk/jdk/blob/jdk-24+16/src/hotspot/share/utilities/utf8.cpp#L479-L488" )
43+ private static int utf8Size (char c ) {
44+ if ((0x0001 <= c ) && (c <= 0x007F )) {
45+ // ASCII character
46+ return 1 ;
47+ } else if (c <= 0x07FF ) {
48+ return 2 ;
49+ } else {
50+ return 3 ;
51+ }
52+ }
53+
4254 /**
43- * @return the length in bytes of the UTF8 representation of the string
55+ * @return the length as {@code int} in bytes of the UTF8 representation of the string. Might
56+ * return a truncated size if the value does not fit into {@code int} (see JDK-8328877).
4457 */
4558 public static int utf8Length (String string ) {
4659 return utf8Length (string , 0 , string .length ());
@@ -49,24 +62,27 @@ public static int utf8Length(String string) {
4962 /**
5063 * @param beginIndex first index that is part of the region, inclusive
5164 * @param endIndex index at the end of the region, exclusive
52- * @return the length in bytes of the UTF8 representation of the string region
65+ * @return the length as {@code int} in bytes of the UTF8 representation of the string. Might
66+ * return a truncated size if the value does not fit into {@code int} (see JDK-8328877).
5367 */
68+ @ BasedOnJDKFile ("https://github.com/openjdk/jdk/blob/jdk-24+16/src/hotspot/share/utilities/utf8.cpp#L511-L526" )
5469 public static int utf8Length (String s , int beginIndex , int endIndex ) {
5570 if (beginIndex < 0 || endIndex > s .length () || beginIndex > endIndex ) {
5671 throw new StringIndexOutOfBoundsException ();
5772 }
58- int length = 0 ;
59- for (int i = beginIndex ; i < endIndex ; i ++) {
60- final int c = s .charAt (i );
61- if (( c >= 0x0001 ) && ( c <= 0x007F )) {
62- length ++;
63- } else if ( c > 0x07FF ) {
64- length += 3 ;
65- } else {
66- length += 2 ;
73+ long result = 0 ;
74+ for (int index = beginIndex ; index < endIndex ; index ++) {
75+ char c = s .charAt (index );
76+ long sz = utf8Size ( c );
77+ // If the length is > INT_MAX-1 we truncate at a completed
78+ // modified-UTF8 encoding. This allows for +1 to be added
79+ // by the caller for NUL-termination, without overflow.
80+ if ( result + sz > Integer . MAX_VALUE - 1 ) {
81+ break ;
6782 }
83+ result += sz ;
6884 }
69- return length ;
85+ return ( int ) result ;
7086 }
7187
7288 /**
0 commit comments