[SPARK-16700] [PYSPARK] [SQL] create DataFrame from dict/Row with schema #14469
Changes from all commits
In `python/pyspark/sql/session.py`:

```diff
@@ -384,17 +384,15 @@ def _createFromLocal(self, data, schema):

         if schema is None or isinstance(schema, (list, tuple)):
             struct = self._inferSchemaFromList(data)
+            converter = _create_converter(struct)
+            data = map(converter, data)
             if isinstance(schema, (list, tuple)):
                 for i, name in enumerate(schema):
                     struct.fields[i].name = name
                     struct.names[i] = name
             schema = struct

-        elif isinstance(schema, StructType):
-            for row in data:
-                _verify_type(row, schema)
-
-        else:
+        elif not isinstance(schema, StructType):
             raise TypeError("schema should be StructType or list or None, but got: %s" % schema)

         # convert python objects to sql data
```

On `converter = _create_converter(struct)`:

Contributor: Why did we add this here?

Contributor: This

Contributor (Author): This is missed before.
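For context, `_create_converter(struct)` turns each dict record into a tuple ordered by the inferred struct's fields, which is what the inference branch was missing. Below is a minimal sketch of the resulting behavior; the local `SparkSession` setup is illustrative and not part of the patch:

```python
from pyspark.sql import SparkSession

# Illustrative session setup; any active SparkSession works.
spark = SparkSession.builder.master("local[1]").appName("sketch").getOrCreate()

# Dict records: the schema is inferred from the (sorted) dict keys, and the
# added converter maps each dict onto a tuple in that field order.
data = [{"a": 1, "b": "coffee"}, {"a": 2, "b": "tea"}]
df = spark.createDataFrame(data)

print(df.schema.simpleString())  # struct<a:bigint,b:string>
print(df.collect())              # [Row(a=1, b='coffee'), Row(a=2, b='tea')]
```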
```diff
@@ -403,7 +401,7 @@ def _createFromLocal(self, data, schema):

     @since(2.0)
     @ignore_unicode_prefix
-    def createDataFrame(self, data, schema=None, samplingRatio=None):
+    def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=True):
         """
         Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`.
```
```diff
@@ -432,13 +430,11 @@ def createDataFrame(self, data, schema=None, samplingRatio=None):
             ``byte`` instead of ``tinyint`` for :class:`pyspark.sql.types.ByteType`. We can also use
             ``int`` as a short name for ``IntegerType``.
         :param samplingRatio: the sample ratio of rows used for inferring
+        :param verifySchema: verify data types of every row against schema.
         :return: :class:`DataFrame`

-        .. versionchanged:: 2.0
-           The ``schema`` parameter can be a :class:`pyspark.sql.types.DataType` or a
-           datatype string after 2.0. If it's not a
-           :class:`pyspark.sql.types.StructType`, it will be wrapped into a
-           :class:`pyspark.sql.types.StructType` and each record will also be wrapped into a tuple.
+        .. versionchanged:: 2.1
+           Added verifySchema.

         >>> l = [('Alice', 1)]
         >>> spark.createDataFrame(l).collect()
```

On `:param verifySchema:`:

Contributor: +1 on also adding a

Contributor (Author): Added

On the removed `versionchanged:: 2.0` note:

Contributor: Out of interest, why are we removing this note but keeping the other 2.0 change note? Just wondering so that when I'm making my changes for 2.1 I can do the right thing.

Contributor: @davies, I'm also slightly confused by this documentation change, since it looks like the new 2.x behavior of wrapping single-field datatypes into structtypes and values into tuples is preserved by this patch. Could you clarify?

Contributor (Author): This API is new in 2.0 (for SparkSession), so remove them. We could add a change note for verifySchema.
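The wrapping behavior discussed in this thread is easy to observe interactively. A hedged sketch (session setup again illustrative): a non-struct `DataType` schema is wrapped into a single-field struct named `value`, and each record into a one-element tuple:

```python
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.master("local[1]").getOrCreate()

# A bare DataType as the schema is wrapped into struct<value:int>,
# and each scalar record becomes a one-element tuple.
df = spark.createDataFrame([1, 2, 3], IntegerType())

print(df.schema.simpleString())  # struct<value:int>
print(df.collect())              # [Row(value=1), Row(value=2), Row(value=3)]
```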
```diff
@@ -503,17 +499,18 @@ def createDataFrame(self, data, schema=None, samplingRatio=None):
             schema = [str(x) for x in data.columns]
             data = [r.tolist() for r in data.to_records(index=False)]

+        verify_func = _verify_type if verifySchema else lambda _, t: True
         if isinstance(schema, StructType):
             def prepare(obj):
-                _verify_type(obj, schema)
+                verify_func(obj, schema)
                 return obj
         elif isinstance(schema, DataType):
-            datatype = schema
+            dataType = schema
+            schema = StructType().add("value", schema)

             def prepare(obj):
-                _verify_type(obj, datatype)
-                return (obj, )
-            schema = StructType().add("value", datatype)
+                verify_func(obj, dataType)
+                return obj,
         else:
             if isinstance(schema, list):
                 schema = [x.encode('utf-8') if not isinstance(x, str) else x for x in schema]
```
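From the caller's side, the new flag toggles the per-row check that `verify_func` performs above. A hedged sketch, assuming an active `SparkSession`; in the 2.x code line a mismatch surfaces as a `TypeError` raised by `_verify_type`:

```python
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StructType

spark = SparkSession.builder.master("local[1]").getOrCreate()
schema = StructType().add("a", IntegerType())

# verifySchema=False swaps _verify_type for a no-op lambda, so the
# mismatched value is accepted at creation time.
spark.createDataFrame([("not an int",)], schema, verifySchema=False)

# verifySchema=True (the default) checks every row against the schema.
try:
    spark.createDataFrame([("not an int",)], schema, verifySchema=True)
except TypeError as e:
    print("rejected:", e)
```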
In `python/pyspark/sql/tests.py`:

```diff
@@ -411,6 +411,22 @@ def test_infer_schema_to_local(self):
         df3 = self.spark.createDataFrame(rdd, df.schema)
         self.assertEqual(10, df3.count())

+    def test_apply_schema_to_dict_and_rows(self):
+        schema = StructType().add("b", StringType()).add("a", IntegerType())
+        input = [{"a": 1}, {"b": "coffee"}]
+        rdd = self.sc.parallelize(input)
+        for verify in [False, True]:
+            df = self.spark.createDataFrame(input, schema, verifySchema=verify)
+            df2 = self.spark.createDataFrame(rdd, schema, verifySchema=verify)
+            self.assertEqual(df.schema, df2.schema)
+
+            rdd = self.sc.parallelize(range(10)).map(lambda x: Row(a=x, b=None))
+            df3 = self.spark.createDataFrame(rdd, schema, verifySchema=verify)
+            self.assertEqual(10, df3.count())
+            input = [Row(a=x, b=str(x)) for x in range(10)]
+            df4 = self.spark.createDataFrame(input, schema, verifySchema=verify)
+            self.assertEqual(10, df4.count())
+
     def test_create_dataframe_schema_mismatch(self):
         input = [Row(a=1)]
         rdd = self.sc.parallelize(range(3)).map(lambda i: Row(a=i))
```

On `def test_apply_schema_to_dict_and_rows(self):`:

Contributor: Should we also add a test to exercise the

Contributor (Author): Added
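To make concrete what the new test exercises: dict rows may omit schema fields, and the missing keys come back as nulls. A sketch under the same assumptions as the earlier examples:

```python
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType, StructType

spark = SparkSession.builder.master("local[1]").getOrCreate()
schema = StructType().add("b", StringType()).add("a", IntegerType())

# Each dict supplies only one of the two fields; the other comes back null.
df = spark.createDataFrame([{"a": 1}, {"b": "coffee"}], schema)
print(df.collect())  # [Row(b=None, a=1), Row(b='coffee', a=None)]
```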
On the new `versionchanged` note:

Contributor: Maybe say version changed 2.1 for "Added verifySchema"?

Contributor: +1. I wasn't aware of this, but it looks like it's possible to have multiple `versionchanged` directives in the same docstring.
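For illustration only, a hypothetical shape of the docstring with both directives stacked, which Sphinx renders as two separate change notes (this is not the committed text):

```python
def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=True):
    """Creates a :class:`DataFrame` from an :class:`RDD`, a list or a
    :class:`pandas.DataFrame`.

    .. versionchanged:: 2.0
       The ``schema`` parameter can be a :class:`pyspark.sql.types.DataType` or a
       datatype string after 2.0.

    .. versionchanged:: 2.1
       Added verifySchema.
    """
```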