@@ -259,10 +259,30 @@ def substr(self, startPos, length):
259259
260260 >>> df.select(df.name.substr(1, 3).alias("col")).collect()
261261 [Row(col=u'Ali'), Row(col=u'Bob')]
262+ >>> df.select(df.name[1:3].alias("col")).collect()
263+ [Row(col=u'Ali'), Row(col=u'Bob')]
264+ >>> df.select(df.name[2:].alias("col")).collect()
265+ [Row(col=u'lice'), Row(col=u'ob')]
262266 """
263267 if type (startPos ) != type (length ):
264268 raise TypeError ("Can not mix the type" )
265269 if isinstance (startPos , (int , long )):
270+ javaMaxInt = 2147483647
271+ if startPos > javaMaxInt :
272+ raise ValueError ("startPos is larger than the java max int value "
273+ "which is not supported by pyspark, startPos=" + str (startPos ))
274+
275+ if length == sys .maxint :
276+ # length == sys.maxint when using syntax str[1:]
277+ # cut it down to java max int because java api of substr only support int type
278+ # of length
279+ warnings .warn ("PySpark's substr only support int type of length, "
280+ "please make sure the length you specify is less than 2147483647" )
281+ length = javaMaxInt
282+ elif length > javaMaxInt :
283+ raise ValueError ("length is larger than the java max int value "
284+ "which is not supported by pyspark, length=" + str (length ))
285+
266286 jc = self ._jc .substr (startPos , length )
267287 elif isinstance (startPos , Column ):
268288 jc = self ._jc .substr (startPos ._jc , length ._jc )
0 commit comments