
Commit 0dba382

Davies Liu authored and marmbrus committed
[SPARK-5872] [SQL] create a sqlCtx in pyspark shell
The sqlCtx will be a HiveContext if Hive is built into the assembly jar, or a SQLContext if not. It also skips the Hive tests in pyspark.sql.tests if Hive is not available.

Author: Davies Liu <[email protected]>

Closes #4659 from davies/sqlctx and squashes the following commits:

0e6629a [Davies Liu] sqlCtx in pyspark

(cherry picked from commit 4d4cc76)
Signed-off-by: Michael Armbrust <[email protected]>
1 parent cb06160 commit 0dba382
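
A minimal standalone sketch of the fallback described above, assuming an active SparkContext named sc and the py4j shipped with Spark (the helper name _create_sql_context is illustrative, not part of the patch, which inlines this logic in shell.py):

import py4j

from pyspark.sql import SQLContext, HiveContext


def _create_sql_context(sc):
    """Illustrative helper: return a HiveContext when Hive classes are on
    the classpath, otherwise fall back to a plain SQLContext."""
    try:
        # Instantiating HiveConf via the JVM gateway raises Py4JError
        # when Hive is not built into the assembly jar.
        sc._jvm.org.apache.hadoop.hive.conf.HiveConf()
        return HiveContext(sc)
    except py4j.protocol.Py4JError:
        return SQLContext(sc)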

File tree

2 files changed, +22 -3 lines changed

python/pyspark/shell.py

Lines changed: 12 additions & 1 deletion

@@ -31,8 +31,12 @@
 import atexit
 import os
 import platform
+
+import py4j
+
 import pyspark
 from pyspark.context import SparkContext
+from pyspark.sql import SQLContext, HiveContext
 from pyspark.storagelevel import StorageLevel
 
 # this is the deprecated equivalent of ADD_JARS
@@ -46,6 +50,13 @@
 sc = SparkContext(appName="PySparkShell", pyFiles=add_files)
 atexit.register(lambda: sc.stop())
 
+try:
+    # Try to access HiveConf, it will raise exception if Hive is not added
+    sc._jvm.org.apache.hadoop.hive.conf.HiveConf()
+    sqlCtx = HiveContext(sc)
+except py4j.protocol.Py4JError:
+    sqlCtx = SQLContext(sc)
+
 print("""Welcome to
       ____              __
      / __/__  ___ _____/ /__
@@ -57,7 +68,7 @@
     platform.python_version(),
     platform.python_build()[0],
     platform.python_build()[1]))
-print("SparkContext available as sc.")
+print("SparkContext available as sc, %s available as sqlCtx." % sqlCtx.__class__.__name__)
 
 if add_files is not None:
     print("Warning: ADD_FILES environment variable is deprecated, use --py-files argument instead")

python/pyspark/sql/tests.py

Lines changed: 10 additions & 2 deletions

@@ -25,6 +25,8 @@
 import shutil
 import tempfile
 
+import py4j
+
 if sys.version_info[:2] <= (2, 6):
     try:
         import unittest2 as unittest
@@ -329,9 +331,12 @@ class HiveContextSQLTests(ReusedPySparkTestCase):
     def setUpClass(cls):
         ReusedPySparkTestCase.setUpClass()
         cls.tempdir = tempfile.NamedTemporaryFile(delete=False)
+        try:
+            cls.sc._jvm.org.apache.hadoop.hive.conf.HiveConf()
+        except py4j.protocol.Py4JError:
+            cls.sqlCtx = None
+            return
         os.unlink(cls.tempdir.name)
-        print "type", type(cls.sc)
-        print "type", type(cls.sc._jsc)
         _scala_HiveContext =\
             cls.sc._jvm.org.apache.spark.sql.hive.test.TestHiveContext(cls.sc._jsc.sc())
         cls.sqlCtx = HiveContext(cls.sc, _scala_HiveContext)
@@ -344,6 +349,9 @@ def tearDownClass(cls):
         shutil.rmtree(cls.tempdir.name, ignore_errors=True)
 
     def test_save_and_load_table(self):
+        if self.sqlCtx is None:
+            return  # no hive available, skipped
+
         df = self.df
         tmpPath = tempfile.mkdtemp()
         shutil.rmtree(tmpPath)
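
The guard added to test_save_and_load_table skips by returning early, which the runner counts as a pass. An equivalent that reports the skip explicitly (a sketch using unittest's skipTest, not what the patch does) would be:

def test_save_and_load_table(self):
    if self.sqlCtx is None:
        self.skipTest("Hive is not available")  # counted as skipped, not passed
    # ... rest of the test body unchanged ...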
