-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-9116] [SQL] [PYSPARK] support Python only UDT in __main__ #7453
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
55bb86e
83d65ac
de986d6
0bcb3ef
316a394
655b8a9
63f52ef
ad528ba
a86e1fc
a9a3c40
dc65f19
793d9b2
4dfd5e1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -350,7 +350,26 @@ def save_global(self, obj, name=None, pack=struct.pack): | |
| if new_override: | ||
| d['__new__'] = obj.__new__ | ||
|
|
||
| self.save_reduce(typ, (obj.__name__, obj.__bases__, d), obj=obj) | ||
| self.save(_load_class) | ||
| self.save_reduce(typ, (obj.__name__, obj.__bases__, {"__doc__": obj.__doc__}), obj=obj) | ||
| d.pop('__doc__', None) | ||
| # handle property and staticmethod | ||
| dd = {} | ||
| for k, v in d.items(): | ||
| if isinstance(v, property): | ||
| k = ('property', k) | ||
| v = (v.fget, v.fset, v.fdel, v.__doc__) | ||
| elif isinstance(v, staticmethod) and hasattr(v, '__func__'): | ||
| k = ('staticmethod', k) | ||
| v = v.__func__ | ||
| elif isinstance(v, classmethod) and hasattr(v, '__func__'): | ||
| k = ('classmethod', k) | ||
| v = v.__func__ | ||
| dd[k] = v | ||
| self.save(dd) | ||
| self.write(pickle.TUPLE2) | ||
| self.write(pickle.REDUCE) | ||
|
|
||
| else: | ||
| raise pickle.PicklingError("Can't pickle %r" % obj) | ||
|
|
||
|
|
@@ -708,6 +727,23 @@ def _make_skel_func(code, closures, base_globals = None): | |
| None, None, closure) | ||
|
|
||
|
|
||
| def _load_class(cls, d): | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Reviewer comment: missing doc |
||
| """ | ||
| Loads additional properties into class `cls`. | ||
| """ | ||
| for k, v in d.items(): | ||
| if isinstance(k, tuple): | ||
| typ, k = k | ||
| if typ == 'property': | ||
| v = property(*v) | ||
| elif typ == 'staticmethod': | ||
| v = staticmethod(v) | ||
| elif typ == 'classmethod': | ||
| v = classmethod(v) | ||
| setattr(cls, k, v) | ||
| return cls | ||
|
|
||
|
|
||
| """Constructors for 3rd party libraries | ||
| Note: These can never be renamed due to client compatibility issues""" | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -277,6 +277,66 @@ def applySchema(self, rdd, schema): | |
|
|
||
| return self.createDataFrame(rdd, schema) | ||
|
|
||
| def _createFromRDD(self, rdd, schema, samplingRatio): | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Reviewer comment: missing doc, especially the return values |
||
| """ | ||
| Create an RDD for DataFrame from an existing RDD, returns the RDD and schema. | ||
| """ | ||
| if schema is None or isinstance(schema, (list, tuple)): | ||
| struct = self._inferSchema(rdd, samplingRatio) | ||
| converter = _create_converter(struct) | ||
| rdd = rdd.map(converter) | ||
| if isinstance(schema, (list, tuple)): | ||
| for i, name in enumerate(schema): | ||
| struct.fields[i].name = name | ||
| struct.names[i] = name | ||
| schema = struct | ||
|
|
||
| elif isinstance(schema, StructType): | ||
| # take the first few rows to verify schema | ||
| rows = rdd.take(10) | ||
| for row in rows: | ||
| _verify_type(row, schema) | ||
|
|
||
| else: | ||
| raise TypeError("schema should be StructType or list or None, but got: %s" % schema) | ||
|
|
||
| # convert python objects to sql data | ||
| rdd = rdd.map(schema.toInternal) | ||
| return rdd, schema | ||
|
|
||
| def _createFromLocal(self, data, schema): | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Reviewer comment: missing doc |
||
| """ | ||
| Create an RDD for DataFrame from an list or pandas.DataFrame, returns | ||
| the RDD and schema. | ||
| """ | ||
| if has_pandas and isinstance(data, pandas.DataFrame): | ||
| if schema is None: | ||
| schema = [str(x) for x in data.columns] | ||
| data = [r.tolist() for r in data.to_records(index=False)] | ||
|
|
||
| # make sure data could consumed multiple times | ||
| if not isinstance(data, list): | ||
| data = list(data) | ||
|
|
||
| if schema is None or isinstance(schema, (list, tuple)): | ||
| struct = self._inferSchemaFromList(data) | ||
| if isinstance(schema, (list, tuple)): | ||
| for i, name in enumerate(schema): | ||
| struct.fields[i].name = name | ||
| struct.names[i] = name | ||
| schema = struct | ||
|
|
||
| elif isinstance(schema, StructType): | ||
| for row in data: | ||
| _verify_type(row, schema) | ||
|
|
||
| else: | ||
| raise TypeError("schema should be StructType or list or None, but got: %s" % schema) | ||
|
|
||
| # convert python objects to sql data | ||
| data = [schema.toInternal(row) for row in data] | ||
| return self._sc.parallelize(data), schema | ||
|
|
||
| @since(1.3) | ||
| @ignore_unicode_prefix | ||
| def createDataFrame(self, data, schema=None, samplingRatio=None): | ||
|
|
@@ -340,49 +400,15 @@ def createDataFrame(self, data, schema=None, samplingRatio=None): | |
| if isinstance(data, DataFrame): | ||
| raise TypeError("data is already a DataFrame") | ||
|
|
||
| if has_pandas and isinstance(data, pandas.DataFrame): | ||
| if schema is None: | ||
| schema = [str(x) for x in data.columns] | ||
| data = [r.tolist() for r in data.to_records(index=False)] | ||
|
|
||
| if not isinstance(data, RDD): | ||
| if not isinstance(data, list): | ||
| data = list(data) | ||
| try: | ||
| # data could be list, tuple, generator ... | ||
| rdd = self._sc.parallelize(data) | ||
| except Exception: | ||
| raise TypeError("cannot create an RDD from type: %s" % type(data)) | ||
| if isinstance(data, RDD): | ||
| rdd, schema = self._createFromRDD(data, schema, samplingRatio) | ||
| else: | ||
| rdd = data | ||
|
|
||
| if schema is None or isinstance(schema, (list, tuple)): | ||
| if isinstance(data, RDD): | ||
| struct = self._inferSchema(rdd, samplingRatio) | ||
| else: | ||
| struct = self._inferSchemaFromList(data) | ||
| if isinstance(schema, (list, tuple)): | ||
| for i, name in enumerate(schema): | ||
| struct.fields[i].name = name | ||
| schema = struct | ||
| converter = _create_converter(schema) | ||
| rdd = rdd.map(converter) | ||
|
|
||
| elif isinstance(schema, StructType): | ||
| # take the first few rows to verify schema | ||
| rows = rdd.take(10) | ||
| for row in rows: | ||
| _verify_type(row, schema) | ||
|
|
||
| else: | ||
| raise TypeError("schema should be StructType or list or None") | ||
|
|
||
| # convert python objects to sql data | ||
| rdd = rdd.map(schema.toInternal) | ||
|
|
||
| rdd, schema = self._createFromLocal(data, schema) | ||
| jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd()) | ||
| df = self._ssql_ctx.applySchemaToPythonRDD(jrdd.rdd(), schema.json()) | ||
| return DataFrame(df, self) | ||
| jdf = self._ssql_ctx.applySchemaToPythonRDD(jrdd.rdd(), schema.json()) | ||
| df = DataFrame(jdf, self) | ||
| df._schema = schema | ||
| return df | ||
|
|
||
| @since(1.3) | ||
| def registerDataFrameAsTable(self, df, tableName): | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks like PyLint is complaining about a possibly undefined loop variable `v` at this line. If this isn't a legitimate error, then we can just add a comment to bypass that warning here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@JoshRosen I think it's annoying to let PyLint report warnings as errors. Should we only fail on real errors?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good point; let's see if we can update the configuration to do that.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I tried passing `-E` to pylint, but it seems even worse: lots of false-negative errors.