32 | 32 | PairDeserializer |
33 | 33 | from pyspark.storagelevel import StorageLevel |
34 | 34 | from pyspark import rdd |
35 | | -from pyspark.rdd import RDD, SchemaRDD |
| 35 | +from pyspark.rdd import RDD |
36 | 36 |
37 | 37 | from py4j.java_collections import ListConverter |
38 | | -from py4j.protocol import Py4JError |
39 | 38 |
40 | 39 |
41 | 40 | class SparkContext(object): |
@@ -175,8 +174,6 @@ def _ensure_initialized(cls, instance=None, gateway=None): |
175 | 174 | SparkContext._gateway = gateway or launch_gateway() |
176 | 175 | SparkContext._jvm = SparkContext._gateway.jvm |
177 | 176 | SparkContext._writeToFile = SparkContext._jvm.PythonRDD.writeToFile |
178 | | - SparkContext._pythonToJavaMap = SparkContext._jvm.PythonRDD.pythonToJavaMap |
179 | | - SparkContext._javaToPython = SparkContext._jvm.PythonRDD.javaToPython |
180 | 177 |
181 | 178 | if instance: |
182 | 179 | if SparkContext._active_spark_context and SparkContext._active_spark_context != instance: |
@@ -463,225 +460,6 @@ def sparkUser(self): |
463 | 460 | """ |
464 | 461 | return self._jsc.sc().sparkUser() |
465 | 462 |
466 | | -class SQLContext: |
467 | | - """ |
468 | | - Main entry point for SparkSQL functionality. A SQLContext can be used to create L{SchemaRDD}s, |
469 | | - register L{SchemaRDD}s as tables, execute SQL over tables, cache tables, and read Parquet files. |
470 | | - """ |
471 | | - |
472 | | - def __init__(self, sparkContext): |
473 | | - """ |
474 | | - Create a new SQLContext. |
475 | | -
476 | | - @param sparkContext: The SparkContext to wrap. |
477 | | -
478 | | - >>> from pyspark.context import SQLContext |
479 | | - >>> sqlCtx = SQLContext(sc) |
480 | | -
481 | | - >>> rdd = sc.parallelize([{"field1" : 1, "field2" : "row1"}, |
482 | | - ... {"field1" : 2, "field2": "row2"}, {"field1" : 3, "field2": "row3"}]) |
483 | | -
484 | | - >>> srdd = sqlCtx.inferSchema(rdd) |
485 | | - >>> sqlCtx.inferSchema(srdd) # doctest: +IGNORE_EXCEPTION_DETAIL |
486 | | - Traceback (most recent call last): |
487 | | - ... |
488 | | - ValueError:... |
489 | | -
490 | | - >>> bad_rdd = sc.parallelize([1,2,3]) |
491 | | - >>> sqlCtx.inferSchema(bad_rdd) # doctest: +IGNORE_EXCEPTION_DETAIL |
492 | | - Traceback (most recent call last): |
493 | | - ... |
494 | | - ValueError:... |
495 | | -
496 | | - >>> allTypes = sc.parallelize([{"int" : 1, "string" : "string", "double" : 1.0, "long": 1L, |
497 | | - ... "boolean" : True}]) |
498 | | - >>> srdd = sqlCtx.inferSchema(allTypes).map(lambda x: (x.int, x.string, x.double, x.long, |
499 | | - ... x.boolean)) |
500 | | - >>> srdd.collect()[0] |
501 | | - (1, u'string', 1.0, 1, True) |
502 | | - """ |
503 | | - self._sc = sparkContext |
504 | | - self._jsc = self._sc._jsc |
505 | | - self._jvm = self._sc._jvm |
506 | | - |
507 | | - @property |
508 | | - def _ssql_ctx(self): |
509 | | - """ |
510 | | - Accessor for the JVM SparkSQL context. Subclasses can override this property to provide |
511 | | - their own JVM contexts. |
512 | | - """ |
513 | | - if not hasattr(self, '_scala_SQLContext'): |
514 | | - self._scala_SQLContext = self._jvm.SQLContext(self._jsc.sc()) |
515 | | - return self._scala_SQLContext |
516 | | - |
517 | | - def inferSchema(self, rdd): |
518 | | - """ |
519 | | - Infer and apply a schema to an RDD of L{dict}s. We peek at the first row of the RDD to |
520 | | - determine the field names and types, and then use that to extract all the dictionaries. |
521 | | -
522 | | - >>> from pyspark.context import SQLContext |
523 | | - >>> sqlCtx = SQLContext(sc) |
524 | | - >>> rdd = sc.parallelize([{"field1" : 1, "field2" : "row1"}, |
525 | | - ... {"field1" : 2, "field2": "row2"}, {"field1" : 3, "field2": "row3"}]) |
526 | | - >>> srdd = sqlCtx.inferSchema(rdd) |
527 | | - >>> srdd.collect() == [{"field1" : 1, "field2" : "row1"}, {"field1" : 2, "field2": "row2"}, |
528 | | - ... {"field1" : 3, "field2": "row3"}] |
529 | | - True |
530 | | - """ |
531 | | - if (rdd.__class__ is SchemaRDD): |
532 | | - raise ValueError("Cannot apply schema to %s" % SchemaRDD.__name__) |
533 | | - elif not isinstance(rdd.first(), dict): |
534 | | - raise ValueError("Only RDDs with dictionaries can be converted to %s: %s" % |
535 | | - (SchemaRDD.__name__, rdd.first())) |
536 | | - |
537 | | - jrdd = self._sc._pythonToJavaMap(rdd._jrdd) |
538 | | - srdd = self._ssql_ctx.inferSchema(jrdd.rdd()) |
539 | | - return SchemaRDD(srdd, self) |
540 | | - |
541 | | - def registerRDDAsTable(self, rdd, tableName): |
542 | | - """ |
543 | | - Registers the given RDD as a temporary table in the catalog. Temporary tables exist only |
544 | | - during the lifetime of this instance of SQLContext. |
545 | | -
546 | | - >>> from pyspark.context import SQLContext |
547 | | - >>> sqlCtx = SQLContext(sc) |
548 | | - >>> rdd = sc.parallelize([{"field1" : 1, "field2" : "row1"}, |
549 | | - ... {"field1" : 2, "field2": "row2"}, {"field1" : 3, "field2": "row3"}]) |
550 | | - >>> srdd = sqlCtx.inferSchema(rdd) |
551 | | - >>> sqlCtx.registerRDDAsTable(srdd, "table1") |
552 | | - """ |
553 | | - if (rdd.__class__ is SchemaRDD): |
554 | | - jschema_rdd = rdd._jschema_rdd |
555 | | - self._ssql_ctx.registerRDDAsTable(jschema_rdd, tableName) |
556 | | - else: |
557 | | - raise ValueError("Can only register SchemaRDD as table") |
558 | | - |
559 | | - def parquetFile(self, path): |
560 | | - """ |
561 | | - Loads a Parquet file, returning the result as a L{SchemaRDD}. |
562 | | -
563 | | - >>> from pyspark.context import SQLContext |
564 | | - >>> sqlCtx = SQLContext(sc) |
565 | | - >>> rdd = sc.parallelize([{"field1" : 1, "field2" : "row1"}, |
566 | | - ... {"field1" : 2, "field2": "row2"}, {"field1" : 3, "field2": "row3"}]) |
567 | | - >>> srdd = sqlCtx.inferSchema(rdd) |
568 | | - >>> srdd.saveAsParquetFile("/tmp/tmp.parquet") |
569 | | - >>> srdd2 = sqlCtx.parquetFile("/tmp/tmp.parquet") |
570 | | - >>> srdd.collect() == srdd2.collect() |
571 | | - True |
572 | | - """ |
573 | | - jschema_rdd = self._ssql_ctx.parquetFile(path) |
574 | | - return SchemaRDD(jschema_rdd, self) |
575 | | - |
576 | | - def sql(self, sqlQuery): |
577 | | - """ |
578 | | - Executes a SQL query using Spark, returning the result as a L{SchemaRDD}. |
579 | | -
580 | | - >>> from pyspark.context import SQLContext |
581 | | - >>> sqlCtx = SQLContext(sc) |
582 | | - >>> rdd = sc.parallelize([{"field1" : 1, "field2" : "row1"}, |
583 | | - ... {"field1" : 2, "field2": "row2"}, {"field1" : 3, "field2": "row3"}]) |
584 | | - >>> srdd = sqlCtx.inferSchema(rdd) |
585 | | - >>> sqlCtx.registerRDDAsTable(srdd, "table1") |
586 | | - >>> srdd2 = sqlCtx.sql("SELECT field1 AS f1, field2 as f2 from table1") |
587 | | - >>> srdd2.collect() == [{"f1" : 1, "f2" : "row1"}, {"f1" : 2, "f2": "row2"}, |
588 | | - ... {"f1" : 3, "f2": "row3"}] |
589 | | - True |
590 | | - """ |
591 | | - return SchemaRDD(self._ssql_ctx.sql(sqlQuery), self) |
592 | | - |
593 | | - def table(self, tableName): |
594 | | - """ |
595 | | - Returns the specified table as a L{SchemaRDD}. |
596 | | -
597 | | - >>> from pyspark.context import SQLContext |
598 | | - >>> sqlCtx = SQLContext(sc) |
599 | | - >>> rdd = sc.parallelize([{"field1" : 1, "field2" : "row1"}, |
600 | | - ... {"field1" : 2, "field2": "row2"}, {"field1" : 3, "field2": "row3"}]) |
601 | | - >>> srdd = sqlCtx.inferSchema(rdd) |
602 | | - >>> sqlCtx.registerRDDAsTable(srdd, "table1") |
603 | | - >>> srdd2 = sqlCtx.table("table1") |
604 | | - >>> srdd.collect() == srdd2.collect() |
605 | | - True |
606 | | - """ |
607 | | - return SchemaRDD(self._ssql_ctx.table(tableName), self) |
608 | | - |
609 | | - def cacheTable(self, tableName): |
610 | | - """ |
611 | | - Caches the specified table in-memory. |
612 | | - """ |
613 | | - self._ssql_ctx.cacheTable(tableName) |
614 | | - |
615 | | - def uncacheTable(self, tableName): |
616 | | - """ |
617 | | - Removes the specified table from the in-memory cache. |
618 | | - """ |
619 | | - self._ssql_ctx.uncacheTable(tableName) |
620 | | - |
621 | | -class HiveContext(SQLContext): |
622 | | - """ |
623 | | - An instance of the Spark SQL execution engine that integrates with data stored in Hive. |
624 | | - Configuration for Hive is read from hive-site.xml on the classpath. It supports running both SQL |
625 | | - and HiveQL commands. |
626 | | - """ |
627 | | - |
628 | | - @property |
629 | | - def _ssql_ctx(self): |
630 | | - try: |
631 | | - if not hasattr(self, '_scala_HiveContext'): |
632 | | - self._scala_HiveContext = self._get_hive_ctx() |
633 | | - return self._scala_HiveContext |
634 | | - except Py4JError as e: |
635 | | - raise Exception("You must build Spark with Hive. Export 'SPARK_HIVE=true' and run " |
636 | | - "sbt/sbt assembly", e) |
637 | | - |
638 | | - def _get_hive_ctx(self): |
639 | | - return self._jvm.HiveContext(self._jsc.sc()) |
640 | | - |
641 | | - def hiveql(self, hqlQuery): |
642 | | - """ |
643 | | - Runs a query expressed in HiveQL, returning the result as a L{SchemaRDD}. |
644 | | - """ |
645 | | - return SchemaRDD(self._ssql_ctx.hiveql(hqlQuery), self) |
646 | | - |
647 | | - def hql(self, hqlQuery): |
648 | | - """ |
649 | | - Runs a query expressed in HiveQL, returning the result as a L{SchemaRDD}. |
650 | | - """ |
651 | | - return self.hiveql(hqlQuery) |
652 | | - |
653 | | -class LocalHiveContext(HiveContext): |
654 | | - """ |
655 | | - Starts up an instance of Hive where metadata is stored locally. An in-process metadata database |
656 | | - is created with data stored in ./metadata. Warehouse data is stored in ./warehouse. |
657 | | -
658 | | - >>> import os |
659 | | - >>> from pyspark.context import LocalHiveContext |
660 | | - >>> hiveCtx = LocalHiveContext(sc) |
661 | | - >>> try: |
662 | | - ... suppress = hiveCtx.hql("DROP TABLE src") |
663 | | - ... except Exception: |
664 | | - ... pass |
665 | | - >>> kv1 = os.path.join(os.environ["SPARK_HOME"], 'examples/src/main/resources/kv1.txt') |
666 | | - >>> suppress = hiveCtx.hql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)") |
667 | | - >>> suppress = hiveCtx.hql("LOAD DATA LOCAL INPATH '%s' INTO TABLE src" % kv1) |
668 | | - >>> results = hiveCtx.hql("FROM src SELECT value").map(lambda r: int(r.value.split('_')[1])) |
669 | | - >>> num = results.count() |
670 | | - >>> reduce_sum = results.reduce(lambda x, y: x + y) |
671 | | - >>> num |
672 | | - 500 |
673 | | - >>> reduce_sum |
674 | | - 130091 |
675 | | - """ |
676 | | - |
677 | | - def _get_hive_ctx(self): |
678 | | - return self._jvm.LocalHiveContext(self._jsc.sc()) |
679 | | - |
680 | | -class TestHiveContext(HiveContext): |
681 | | - |
682 | | - def _get_hive_ctx(self): |
683 | | - return self._jvm.TestHiveContext(self._jsc.sc()) |
684 | | - |
685 | 463 | def _test(): |
686 | 464 | import atexit |
687 | 465 | import doctest |
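
Below is a minimal usage sketch of the SQLContext API removed above, reconstructed from the doctests in the deleted docstrings. The import path pyspark.sql is an assumption about where these classes now live (their new location is not shown in this diff), and the app and table names are illustrative.

    from pyspark import SparkContext
    from pyspark.sql import SQLContext  # assumed new home of the removed class

    sc = SparkContext("local", "sqlcontext-sketch")  # illustrative app name
    sqlCtx = SQLContext(sc)

    # Infer a schema from an RDD of dicts, mirroring the removed inferSchema doctest.
    rdd = sc.parallelize([{"field1": 1, "field2": "row1"},
                          {"field1": 2, "field2": "row2"},
                          {"field1": 3, "field2": "row3"}])
    srdd = sqlCtx.inferSchema(rdd)

    # Register the SchemaRDD as a temporary table and query it with SQL.
    sqlCtx.registerRDDAsTable(srdd, "table1")
    srdd2 = sqlCtx.sql("SELECT field1 AS f1, field2 AS f2 FROM table1")
    print(srdd2.collect())

    sc.stop()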
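
A similar sketch for the HiveQL entry points removed above, based on the LocalHiveContext doctest. It assumes a Hive-enabled build (the removed error message suggests SPARK_HIVE=true and sbt/sbt assembly), that LocalHiveContext is importable from pyspark.sql after this change, and that SPARK_HOME points at a checkout containing the kv1.txt example file.

    import os
    from pyspark import SparkContext
    from pyspark.sql import LocalHiveContext  # assumed new location after this change

    sc = SparkContext("local", "hivecontext-sketch")  # illustrative app name
    hiveCtx = LocalHiveContext(sc)  # metastore in ./metadata, warehouse in ./warehouse

    # Load the bundled kv1.txt sample into a Hive table and query it with HiveQL.
    kv1 = os.path.join(os.environ["SPARK_HOME"], "examples/src/main/resources/kv1.txt")
    hiveCtx.hql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")
    hiveCtx.hql("LOAD DATA LOCAL INPATH '%s' INTO TABLE src" % kv1)

    results = hiveCtx.hql("FROM src SELECT value").map(lambda r: int(r.value.split('_')[1]))
    print(results.count())  # the doctest above expects 500

    sc.stop()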