Commit 58dee20

Author: Davies Liu (committed)
Message: clean up
Parent: eb15631

File tree: 2 files changed, 51 additions and 76 deletions

python/pyspark/sql.py (35 additions, 75 deletions)
```diff
@@ -2077,9 +2077,9 @@ def dtypes(self):
         """Return all column names and their data types as a list.
 
         >>> df.dtypes
-        [(u'age', 'IntegerType'), (u'name', 'StringType')]
+        [('age', 'integer'), ('name', 'string')]
         """
-        return [(f.name, str(f.dataType)) for f in self.schema().fields]
+        return [(str(f.name), f.dataType.jsonValue()) for f in self.schema().fields]
 
     @property
     def columns(self):
```
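After this change, `dtypes` reports simple type names taken from the schema's JSON representation instead of the Scala class names (`'integer'` rather than `'IntegerType'`). A minimal sketch of the new behavior, assuming the module's doctest DataFrame `df` with an integer `age` and a string `name` column:

```python
# Hypothetical session illustrating the new output format.
>>> df.dtypes
[('age', 'integer'), ('name', 'string')]
>>> df.schema().fields[0].dataType.jsonValue()  # primitive types serialize to plain names
'integer'
```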
```diff
@@ -2263,18 +2263,6 @@ def subtract(self, other):
         """
         return DataFrame(getattr(self._jdf, "except")(other._jdf), self.sql_ctx)
 
-    def sample(self, withReplacement, fraction, seed=None):
-        """ Return a new DataFrame by sampling a fraction of rows.
-
-        >>> df.sample(False, 0.5, 10).collect()
-        [Row(age=2, name=u'Alice')]
-        """
-        if seed is None:
-            jdf = self._jdf.sample(withReplacement, fraction)
-        else:
-            jdf = self._jdf.sample(withReplacement, fraction, seed)
-        return DataFrame(jdf, self.sql_ctx)
-
     def addColumn(self, colName, col):
         """ Return a new :class:`DataFrame` by adding a column.
 
```
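A side note on the surviving context line: `subtract` must reach the JVM method through `getattr` because `except` is a reserved word in Python, so `self._jdf.except(...)` would not even parse. A self-contained sketch of the pattern (the `JDF` stub is illustrative, not the real Py4J gateway object):

```python
# `except` is a Python keyword: attribute syntax is a SyntaxError,
# but getattr can reach a method of that name on a JVM proxy.
class JDF(object):
    pass

jdf = JDF()
setattr(jdf, "except", lambda other: "result of except(%r)" % other)
print(getattr(jdf, "except")("other_df"))  # jdf.except(...) would not compile
```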
```diff
@@ -2376,28 +2364,6 @@ def sum(self):
         group."""
 
 
-SCALA_METHOD_MAPPINGS = {
-    '=': '$eq',
-    '>': '$greater',
-    '<': '$less',
-    '+': '$plus',
-    '-': '$minus',
-    '*': '$times',
-    '/': '$div',
-    '!': '$bang',
-    '@': '$at',
-    '#': '$hash',
-    '%': '$percent',
-    '^': '$up',
-    '&': '$amp',
-    '~': '$tilde',
-    '?': '$qmark',
-    '|': '$bar',
-    '\\': '$bslash',
-    ':': '$colon',
-}
-
-
 def _create_column_from_literal(literal):
     sc = SparkContext._active_spark_context
     return sc._jvm.Dsl.lit(literal)
```
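For context on the deleted table: Scala permits symbolic method names, which the compiler encodes into JVM-legal identifiers character by character (`+` becomes `$plus`, `===` becomes `$eq$eq$eq`). The mapping let PySpark translate an operator's Scala spelling into the name actually present on the JVM class. A runnable miniature of the removed translation, matching its old doctests:

```python
# Sketch of the removed mangling (a subset of the full table).
SCALA_METHOD_MAPPINGS = {'=': '$eq', '>': '$greater', '+': '$plus'}

def _scalaMethod(name):
    # Characters without a mapping (letters, digits) pass through.
    return ''.join(SCALA_METHOD_MAPPINGS.get(c, c) for c in name)

assert _scalaMethod('+') == '$plus'
assert _scalaMethod('>=') == '$greater$eq'
assert _scalaMethod('cast') == 'cast'
```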
```diff
@@ -2416,23 +2382,18 @@ def _to_java_column(col):
     return jcol
 
 
-def _scalaMethod(name):
-    """ Translate operators into methodName in Scala
-
-    >>> _scalaMethod('+')
-    '$plus'
-    >>> _scalaMethod('>=')
-    '$greater$eq'
-    >>> _scalaMethod('cast')
-    'cast'
-    """
-    return ''.join(SCALA_METHOD_MAPPINGS.get(c, c) for c in name)
-
-
 def _unary_op(name, doc="unary operator"):
     """ Create a method for given unary operator """
     def _(self):
-        jc = getattr(self._jc, _scalaMethod(name))()
+        jc = getattr(self._jc, name)()
+        return Column(jc, self.sql_ctx)
+    _.__doc__ = doc
+    return _
+
+
+def _dsl_op(name, doc=''):
+    def _(self):
+        jc = getattr(self._sc._jvm.Dsl, name)(self._jc)
         return Column(jc, self.sql_ctx)
     _.__doc__ = doc
     return _
```
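The replacement strategy is visible in the new right-hand sides: operators now call plain, Java-friendly method names directly on the JVM column, and the new `_dsl_op` covers the cases (`negate`, `not`) that Scala exposes only as `unary_-`/`unary_!` on the column, by calling static functions on the `Dsl` object instead. The factories themselves are closures over a method name; a self-contained miniature of the pattern (`Num` is a stand-in, not the real Py4J column):

```python
# Closure-factory pattern in miniature: bake a method name into a
# generated special method, then dispatch by getattr at call time.
def _bin_op_sketch(name, doc="binary operator"):
    def _(self, other):
        return getattr(self, name)(other)
    _.__doc__ = doc
    return _

class Num(object):
    def __init__(self, v):
        self.v = v
    def plus(self, other):            # plays the role of the JVM method
        return Num(self.v + other.v)
    __add__ = _bin_op_sketch("plus")

assert (Num(40) + Num(2)).v == 42
```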
```diff
@@ -2443,7 +2404,7 @@ def _bin_op(name, doc="binary operator"):
     """
     def _(self, other):
         jc = other._jc if isinstance(other, Column) else other
-        njc = getattr(self._jc, _scalaMethod(name))(jc)
+        njc = getattr(self._jc, name)(jc)
         return Column(njc, self.sql_ctx)
     _.__doc__ = doc
     return _
```
```diff
@@ -2454,7 +2415,7 @@ def _reverse_op(name, doc="binary operator"):
     """
    def _(self, other):
        jother = _create_column_from_literal(other)
-        jc = getattr(jother, _scalaMethod(name))(self._jc)
+        jc = getattr(jother, name)(self._jc)
        return Column(jc, self.sql_ctx)
    _.__doc__ = doc
    return _
```
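`_reverse_op` exists for the reflected forms such as `1 - df.age`: Python first tries `int.__sub__`, gets `NotImplemented`, then calls the column's `__rsub__`, which lifts the literal to a JVM column and invokes the named method on it, preserving operand order. A miniature of that dispatch (stubs, not the real classes):

```python
# Reflected dispatch in miniature: the literal is lifted first, then
# the named method runs on the lifted side, so `1 - col` means 1 - col.
def _reverse_op_sketch(name):
    def _(self, other):
        lifted = Lit(other)  # stand-in for _create_column_from_literal
        return getattr(lifted, name)(self)
    return _

class Lit(object):
    def __init__(self, v):
        self.v = v
    def minus(self, col):
        return self.v - col.v

class Col(object):
    def __init__(self, v):
        self.v = v
    __rsub__ = _reverse_op_sketch("minus")

assert 1 - Col(3) == -2
```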
```diff
@@ -2481,34 +2442,33 @@ def __init__(self, jc, sql_ctx=None):
         super(Column, self).__init__(jc, sql_ctx)
 
     # arithmetic operators
-    __neg__ = _unary_op("unary_-")
-    __add__ = _bin_op("+")
-    __sub__ = _bin_op("-")
-    __mul__ = _bin_op("*")
-    __div__ = _bin_op("/")
-    __mod__ = _bin_op("%")
-    __radd__ = _bin_op("+")
-    __rsub__ = _reverse_op("-")
-    __rmul__ = _bin_op("*")
-    __rdiv__ = _reverse_op("/")
-    __rmod__ = _reverse_op("%")
-    __abs__ = _unary_op("abs")
+    __neg__ = _dsl_op("negate")
+    __add__ = _bin_op("plus")
+    __sub__ = _bin_op("minus")
+    __mul__ = _bin_op("multiply")
+    __div__ = _bin_op("divide")
+    __mod__ = _bin_op("mod")
+    __radd__ = _bin_op("plus")
+    __rsub__ = _reverse_op("minus")
+    __rmul__ = _bin_op("multiply")
+    __rdiv__ = _reverse_op("divide")
+    __rmod__ = _reverse_op("mod")
 
     # logistic operators
-    __eq__ = _bin_op("===")
-    __ne__ = _bin_op("!==")
-    __lt__ = _bin_op("<")
-    __le__ = _bin_op("<=")
-    __ge__ = _bin_op(">=")
-    __gt__ = _bin_op(">")
+    __eq__ = _bin_op("equalTo")
+    __ne__ = _bin_op("notEqual")
+    __lt__ = _bin_op("lt")
+    __le__ = _bin_op("leq")
+    __ge__ = _bin_op("geq")
+    __gt__ = _bin_op("gt")
 
     # `and`, `or`, `not` cannot be overloaded in Python,
     # so use bitwise operators as boolean operators
-    __and__ = _bin_op('&&')
-    __or__ = _bin_op('||')
-    __invert__ = _unary_op('unary_!')
-    __rand__ = _bin_op("&&")
-    __ror__ = _bin_op("||")
+    __and__ = _bin_op('and')
+    __or__ = _bin_op('or')
+    __invert__ = _dsl_op('not')
+    __rand__ = _bin_op("and")
+    __ror__ = _bin_op("or")
 
     # container operators
     __contains__ = _bin_op("contains")
```
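Taken together, the table now pairs every Python special method with an explicit named method on the Scala `Column` (or a static `Dsl` function for the unary cases), which is what made the mangling machinery removable. Hypothetical usage, assuming the module's doctest DataFrame:

```python
# Each operator resolves to the named JVM method from the table above.
df.filter((df.age >= 2) & (df.name == u'Alice'))  # geq, and, equalTo
df.filter(df.name != u'Bob')                      # notEqual, added in Column.scala below
```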

sql/core/src/main/scala/org/apache/spark/sql/Column.scala (16 additions, 1 deletion)
```diff
@@ -128,7 +128,6 @@ trait Column extends DataFrame {
    */
   def unary_! : Column = exprToColumn(Not(expr))
 
-
   /**
    * Equality test.
    * {{{
```
```diff
@@ -173,6 +172,22 @@ trait Column extends DataFrame {
       Not(EqualTo(expr, o.expr))
     }
 
+  /**
+   * Inequality test.
+   * {{{
+   *   // Scala:
+   *   df.select( df("colA") !== df("colB") )
+   *   df.select( !(df("colA") === df("colB")) )
+   *
+   *   // Java:
+   *   import static org.apache.spark.sql.Dsl.*;
+   *   df.filter( col("colA").notEqual(col("colB")) );
+   * }}}
+   */
+  def notEqual(other: Any): Column = constructColumn(other) { o =>
+    Not(EqualTo(expr, o.expr))
+  }
+
   /**
    * Greater than.
    * {{{
```
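This addition is the JVM-side half of the Python change above: `__ne__ = _bin_op("notEqual")` needs a plain Java-callable name, since the existing symbolic `!==` could previously only be reached through its mangled form. A sketch of what the Python wrapper now invokes (hypothetical session; `df` is the pyspark doctest DataFrame):

```python
# What Python's __ne__ now calls on the JVM column.
jc = df.name._jc                   # Py4J handle to the Scala Column
njc = jc.notEqual(u'Bob')          # was: getattr(jc, '$bang$eq$eq')(u'Bob')
```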
