@@ -238,6 +238,22 @@ def printSchema(self):
238238 """
239239 print (self ._jdf .schema ().treeString ())
240240
241+ def explain (self , extended = False ):
242+ """
243+ Prints the plans (logical and physical) to the console for
244+ debugging purposes.
245+
246+ If `extended` is False, only the physical plan is printed.
247+ """
248+ self ._jdf .explain (extended )
249+
250+ def isLocal (self ):
251+ """
252+ Returns True if the `collect` and `take` methods can be run locally
253+ (without any Spark executors).
254+ """
255+ return self ._jdf .isLocal ()
256+
241257 def show (self ):
242258 """
243259 Print the first 20 rows.
@@ -247,14 +263,12 @@ def show(self):
247263 2 Alice
248264 5 Bob
249265 >>> df
250- age name
251- 2 Alice
252- 5 Bob
266+ DataFrame[age: int, name: string]
253267 """
254- print ( self )
268+ print self . _jdf . showString (). encode ( 'utf8' , 'ignore' )
255269
256270 def __repr__ (self ):
257- return self . _jdf . showString ( )
271+ return "DataFrame[%s]" % ( ", " . join ( "%s: %s" % c for c in self . dtypes ) )
258272
259273 def count (self ):
260274 """Return the number of elements in this RDD.
@@ -336,13 +350,40 @@ def mapPartitions(self, f, preservesPartitioning=False):
336350 """
337351 Return a new RDD by applying a function to each partition.
338352
353+ It's a shorthand for df.rdd.mapPartitions()
354+
339355 >>> rdd = sc.parallelize([1, 2, 3, 4], 4)
340356 >>> def f(iterator): yield 1
341357 >>> rdd.mapPartitions(f).sum()
342358 4
343359 """
344360 return self .rdd .mapPartitions (f , preservesPartitioning )
345361
362+ def foreach (self , f ):
363+ """
364+ Applies a function to all rows of this DataFrame.
365+
366+ It's a shorthand for df.rdd.foreach()
367+
368+ >>> def f(person):
369+ ... print person.name
370+ >>> df.foreach(f)
371+ """
372+ return self .rdd .foreach (f )
373+
374+ def foreachPartition (self , f ):
375+ """
376+ Applies a function to each partition of this DataFrame.
377+
378+ It's a shorthand for df.rdd.foreachPartition()
379+
380+ >>> def f(people):
381+ ... for person in people:
382+ ... print person.name
383+ >>> df.foreachPartition(f)
384+ """
385+ return self .rdd .foreachPartition (f )
386+
346387 def cache (self ):
347388 """ Persist with the default storage level (C{MEMORY_ONLY_SER}).
348389 """
@@ -377,8 +418,13 @@ def repartition(self, numPartitions):
377418 """ Return a new :class:`DataFrame` that has exactly `numPartitions`
378419 partitions.
379420 """
380- rdd = self ._jdf .repartition (numPartitions , None )
381- return DataFrame (rdd , self .sql_ctx )
421+ return DataFrame (self ._jdf .repartition (numPartitions , None ), self .sql_ctx )
422+
423+ def distinct (self ):
424+ """
425+ Return a new :class:`DataFrame` containing the distinct rows in this DataFrame.
426+ """
427+ return DataFrame (self ._jdf .distinct (), self .sql_ctx )
382428
383429 def sample (self , withReplacement , fraction , seed = None ):
384430 """
@@ -957,10 +1003,7 @@ def cast(self, dataType):
9571003 return Column (jc , self .sql_ctx )
9581004
9591005 def __repr__ (self ):
960- if self ._jdf .isComputable ():
961- return self ._jdf .samples ()
962- else :
963- return 'Column<%s>' % self ._jdf .toString ()
1006+ return 'Column<%s>' % self ._jdf .toString ().encode ('utf8' )
9641007
9651008 def toPandas (self ):
9661009 """
0 commit comments