Skip to content

Commit a20e743

Browse files
committed
[SPARK-9460] Fix prefix generation for UTF8String.
Previously we could get garbage data if the number of bytes was 0, on JVMs that are 4-byte aligned, or when compressed oops is on. Author: Reynold Xin <[email protected]> Closes apache#7789 from rxin/utf8string and squashes the following commits: 86ffa3e [Reynold Xin] Mask out data outside of valid range. 4d647ed [Reynold Xin] Mask out data. c6e8794 [Reynold Xin] [SPARK-9460] Fix prefix generation for UTF8String.
1 parent 6d94bf6 commit a20e743

File tree

2 files changed

+41
-3
lines changed

2 files changed

+41
-3
lines changed

unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,19 @@ public static UTF8String fromBytes(byte[] bytes) {
6565
}
6666
}
6767

68+
/**
69+
* Creates a UTF8String from a byte array, which should be encoded in UTF-8.
70+
*
71+
* Note: `bytes` will be held by the returned UTF8String.
72+
*/
73+
public static UTF8String fromBytes(byte[] bytes, int offset, int numBytes) {
74+
if (bytes != null) {
75+
return new UTF8String(bytes, BYTE_ARRAY_OFFSET + offset, numBytes);
76+
} else {
77+
return null;
78+
}
79+
}
80+
6881
/**
6982
* Creates a UTF8String from a String.
7083
*/
@@ -89,10 +102,10 @@ public static UTF8String blankString(int length) {
89102
return fromBytes(spaces);
90103
}
91104

92-
protected UTF8String(Object base, long offset, int size) {
105+
protected UTF8String(Object base, long offset, int numBytes) {
93106
this.base = base;
94107
this.offset = offset;
95-
this.numBytes = size;
108+
this.numBytes = numBytes;
96109
}
97110

98111
/**
@@ -141,7 +154,24 @@ public int numChars() {
141154
* Returns a 64-bit integer that can be used as the prefix used in sorting.
142155
*/
143156
public long getPrefix() {
144-
long p = PlatformDependent.UNSAFE.getLong(base, offset);
157+
// Since JVMs are either 4-byte aligned or 8-byte aligned, we check the size of the string.
158+
// If size is 0, just return 0.
159+
// If size is between 1 and 4 (inclusive), assume data is 4-byte aligned under the hood and
160+
// use a getInt to fetch the prefix.
161+
// If size is greater than 4, assume we have at least 8 bytes of data to fetch.
162+
// After getting the data, we use a mask to mask out data that is not part of the string.
163+
long p;
164+
if (numBytes >= 8) {
165+
p = PlatformDependent.UNSAFE.getLong(base, offset);
166+
} else if (numBytes > 4) {
167+
p = PlatformDependent.UNSAFE.getLong(base, offset);
168+
p = p & ((1L << numBytes * 8) - 1);
169+
} else if (numBytes > 0) {
170+
p = (long) PlatformDependent.UNSAFE.getInt(base, offset);
171+
p = p & ((1L << numBytes * 8) - 1);
172+
} else {
173+
p = 0;
174+
}
145175
p = java.lang.Long.reverseBytes(p);
146176
return p;
147177
}

unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,14 @@ public void prefix() {
7171
fromString("abbbbbbbbbbbasdf").getPrefix() - fromString("bbbbbbbbbbbbasdf").getPrefix() < 0);
7272
assertTrue(fromString("").getPrefix() - fromString("a").getPrefix() < 0);
7373
assertTrue(fromString("你好").getPrefix() - fromString("世界").getPrefix() > 0);
74+
75+
byte[] buf1 = {1, 2, 3, 4, 5, 6, 7, 8, 9};
76+
byte[] buf2 = {1, 2, 3};
77+
UTF8String str1 = UTF8String.fromBytes(buf1, 0, 3);
78+
UTF8String str2 = UTF8String.fromBytes(buf1, 0, 8);
79+
UTF8String str3 = UTF8String.fromBytes(buf2);
80+
assertTrue(str1.getPrefix() - str2.getPrefix() < 0);
81+
assertEquals(str1.getPrefix(), str3.getPrefix());
7482
}
7583

7684
@Test

0 commit comments

Comments
 (0)