Commit 88614dd

liancheng authored and yhuai committed
[SPARK-12624][PYSPARK] Checks row length when converting Java arrays to Python rows
When the actual row length doesn't conform to the specified schema field length, we should give a better error message instead of throwing an unintuitive `ArrayOutOfBoundsException`.

Author: Cheng Lian <[email protected]>

Closes #10886 from liancheng/spark-12624.

(cherry picked from commit 3327fd2)
Signed-off-by: Yin Huai <[email protected]>
1 parent f913f7e commit 88614dd
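
For context, the failure mode this commit improves can be reproduced from the Python side roughly as follows. This is a minimal sketch mirroring the new regression test, assuming an active SparkContext (`sc`) and SQLContext (`sqlCtx`) as in the PySpark test suite: each row carries a single value while the schema declares two fields, so evaluating the DataFrame now fails with the descriptive message introduced here rather than an unintuitive array-index error.

from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Rows with one value each...
rdd = sc.parallelize(range(3)).map(lambda i: Row(a=i))

# ...but a schema that declares two fields.
schema = StructType([
    StructField("a", IntegerType()),
    StructField("b", StringType()),
])

df = sqlCtx.createDataFrame(rdd, schema)

# Evaluating the DataFrame triggers the row-to-schema conversion; with this
# change the error message states that 2 fields are required while only
# 1 value is provided.
df.show()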

File tree

2 files changed: +17 -1 lines changed

python/pyspark/sql/tests.py

Lines changed: 9 additions & 0 deletions
@@ -364,6 +364,15 @@ def test_infer_schema_to_local(self):
         df3 = self.sqlCtx.createDataFrame(rdd, df.schema)
         self.assertEqual(10, df3.count())
 
+    def test_create_dataframe_schema_mismatch(self):
+        input = [Row(a=1)]
+        rdd = self.sc.parallelize(range(3)).map(lambda i: Row(a=i))
+        schema = StructType([StructField("a", IntegerType()), StructField("b", StringType())])
+        df = self.sqlCtx.createDataFrame(rdd, schema)
+        message = ".*Input row doesn't have expected number of values required by the schema.*"
+        with self.assertRaisesRegexp(Exception, message):
+            df.show()
+
     def test_serialize_nested_array_and_map(self):
         d = [Row(l=[Row(a=1, b='s')], d={"key": Row(c=1.0, d="2")})]
         rdd = self.sc.parallelize(d)

sql/core/src/main/scala/org/apache/spark/sql/execution/python.scala

Lines changed: 8 additions & 1 deletion
@@ -220,7 +220,14 @@ object EvaluatePython {
       ArrayBasedMapData(keys, values)
 
     case (c, StructType(fields)) if c.getClass.isArray =>
-      new GenericInternalRow(c.asInstanceOf[Array[_]].zip(fields).map {
+      val array = c.asInstanceOf[Array[_]]
+      if (array.length != fields.length) {
+        throw new IllegalStateException(
+          s"Input row doesn't have expected number of values required by the schema. " +
+            s"${fields.length} fields are required while ${array.length} values are provided."
+        )
+      }
+      new GenericInternalRow(array.zip(fields).map {
        case (e, f) => fromJava(e, f.dataType)
      })
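
The fix itself is a fail-fast length check performed before the incoming values are zipped with the schema fields. As an illustration of the same pattern outside Spark, here is a small Python sketch; the `convert_row` helper is hypothetical and not part of Spark's conversion path.

def convert_row(values, field_names):
    # Validate the value count against the schema up front, so the caller gets
    # a descriptive error instead of a silently truncated zip() or a later
    # index error.
    if len(values) != len(field_names):
        raise ValueError(
            "Input row doesn't have expected number of values required by the schema. "
            "%d fields are required while %d values are provided."
            % (len(field_names), len(values))
        )
    return dict(zip(field_names, values))

# Two schema fields but only one value -> raises ValueError with the message above.
try:
    convert_row([1], ["a", "b"])
except ValueError as e:
    print(e)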
