Skip to content

Commit dfdebb3

Browse files
committed
[SPARK-12070][PYSPARK] PySpark implementation of Slicing operator incorrect
1 parent bf0e85a commit dfdebb3

File tree

1 file changed: +20 −0 lines changed

python/pyspark/sql/column.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,10 +259,30 @@ def substr(self, startPos, length):
259259
260260
>>> df.select(df.name.substr(1, 3).alias("col")).collect()
261261
[Row(col=u'Ali'), Row(col=u'Bob')]
262+
>>> df.select(df.name[1:3].alias("col")).collect()
263+
[Row(col=u'Ali'), Row(col=u'Bob')]
264+
>>> df.select(df.name[2:].alias("col")).collect()
265+
[Row(col=u'lice'), Row(col=u'ob')]
262266
"""
263267
if type(startPos) != type(length):
264268
raise TypeError("Can not mix the type")
265269
if isinstance(startPos, (int, long)):
270+
javaMaxInt = 2147483647
271+
if startPos > javaMaxInt:
272+
raise ValueError("startPos is larger than the java max int value "
273+
"which is not supported by pyspark, startPos=" + str(startPos))
274+
275+
if length == sys.maxint:
276+
# length == sys.maxint when using syntax str[1:]
277+
# cut it down to java max int because java api of substr only support int type
278+
# of length
279+
warnings.warn("PySpark's substr only support int type of length, "
280+
"please make sure the length you specify is less than 2147483647")
281+
length = javaMaxInt
282+
elif length > javaMaxInt:
283+
raise ValueError("length is larger than the java max int value "
284+
"which is not supported by pyspark, length=" + str(length))
285+
266286
jc = self._jc.substr(startPos, length)
267287
elif isinstance(startPos, Column):
268288
jc = self._jc.substr(startPos._jc, length._jc)

Comments (0)