diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
index e753ed402cdd..b172f38ea22d 100644
--- a/python/pyspark/sql/column.py
+++ b/python/pyspark/sql/column.py
@@ -406,8 +406,14 @@ def substr(self, startPos, length):
         [Row(col=u'Ali'), Row(col=u'Bob')]
         """
         if type(startPos) != type(length):
-            raise TypeError("Can not mix the type")
-        if isinstance(startPos, (int, long)):
+            raise TypeError(
+                "startPos and length must be the same type. "
+                "Got {startPos_t} and {length_t}, respectively."
+                .format(
+                    startPos_t=type(startPos),
+                    length_t=type(length),
+                ))
+        if isinstance(startPos, int):
             jc = self._jc.substr(startPos, length)
         elif isinstance(startPos, Column):
             jc = self._jc.substr(startPos._jc, length._jc)
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index cf2c473a1645..45a3f9e7165f 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -1220,6 +1220,18 @@ def test_rand_functions(self):
         rndn2 = df.select('key', functions.randn(0)).collect()
         self.assertEqual(sorted(rndn1), sorted(rndn2))
 
+    def test_string_functions(self):
+        from pyspark.sql.functions import col, lit
+        df = self.spark.createDataFrame([['nick']], schema=['name'])
+        self.assertRaisesRegexp(
+            TypeError,
+            "must be the same type",
+            lambda: df.select(col('name').substr(0, lit(1))))
+        if sys.version_info.major == 2:
+            self.assertRaises(
+                TypeError,
+                lambda: df.select(col('name').substr(long(0), long(1))))
+
     def test_array_contains_function(self):
         from pyspark.sql.functions import array_contains
 