From 753dbe1743f552fe7b4867d3e4d24cdcc2ca1669 Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Fri, 11 Aug 2017 14:39:59 -0400 Subject: [PATCH 1/4] clarify type error for Column.substr() --- python/pyspark/sql/column.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py index e753ed402cdd..24681cdbb47c 100644 --- a/python/pyspark/sql/column.py +++ b/python/pyspark/sql/column.py @@ -406,7 +406,13 @@ def substr(self, startPos, length): [Row(col=u'Ali'), Row(col=u'Bob')] """ if type(startPos) != type(length): - raise TypeError("Can not mix the type") + raise TypeError( + "startPos and length must be the same type. " + "Got {startPos_t} and {length_t}, respectively." + .format( + startPos_t=type(startPos), + length_t=type(length), + )) if isinstance(startPos, (int, long)): jc = self._jc.substr(startPos, length) elif isinstance(startPos, Column): From ff9b07c1f273a42b899870e0017ea3b7f733797e Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Mon, 14 Aug 2017 14:40:53 -0400 Subject: [PATCH 2/4] long is not supported --- python/pyspark/sql/column.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py index 24681cdbb47c..b172f38ea22d 100644 --- a/python/pyspark/sql/column.py +++ b/python/pyspark/sql/column.py @@ -413,7 +413,7 @@ def substr(self, startPos, length): startPos_t=type(startPos), length_t=type(length), )) - if isinstance(startPos, (int, long)): + if isinstance(startPos, int): jc = self._jc.substr(startPos, length) elif isinstance(startPos, Column): jc = self._jc.substr(startPos._jc, length._jc) From fc1d84f002f5bd66bcad038a5581a05ade8dbc35 Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Mon, 14 Aug 2017 14:41:21 -0400 Subject: [PATCH 3/4] add type tests for substr --- python/pyspark/sql/tests.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index cf2c473a1645..c3d12bbe1dd3 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -1220,6 +1220,13 @@ def test_rand_functions(self): rndn2 = df.select('key', functions.randn(0)).collect() self.assertEqual(sorted(rndn1), sorted(rndn2)) + def test_string_functions(self): + from pyspark.sql.functions import col, lit + df = self.spark.createDataFrame([['nick']], schema=['name']) + self.assertRaises(TypeError, lambda: df.select(col('name').substr(0, lit(1)))) + if sys.version_info.major == 2: + self.assertRaises(TypeError, lambda: df.select(col('name').substr(long(0), long(1)))) + def test_array_contains_function(self): from pyspark.sql.functions import array_contains From a7fea2086b0b61a24d50a740ce2f2dcdc846337b Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Tue, 15 Aug 2017 09:15:42 -0400 Subject: [PATCH 4/4] check substr type error message --- python/pyspark/sql/tests.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index c3d12bbe1dd3..45a3f9e7165f 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -1223,9 +1223,14 @@ def test_rand_functions(self): def test_string_functions(self): from pyspark.sql.functions import col, lit df = self.spark.createDataFrame([['nick']], schema=['name']) - self.assertRaises(TypeError, lambda: df.select(col('name').substr(0, lit(1)))) + self.assertRaisesRegexp( + TypeError, + "must be the same type", + lambda: df.select(col('name').substr(0, lit(1)))) if sys.version_info.major == 2: - self.assertRaises(TypeError, lambda: df.select(col('name').substr(long(0), long(1)))) + self.assertRaises( + TypeError, + lambda: df.select(col('name').substr(long(0), long(1)))) def test_array_contains_function(self): from pyspark.sql.functions import array_contains