From 80d577838cc2cee6509c5202b89a1e503f2f05c1 Mon Sep 17 00:00:00 2001
From: Marcelo Vanzin <vanzin@cloudera.com>
Date: Fri, 18 May 2018 15:35:59 -0700
Subject: [PATCH 1/2] [SPARK-16451][repl] Fail shell if SparkSession fails to
 start.

Currently, in spark-shell, if the session fails to start, the
user sees a bunch of unrelated errors which are caused by code
in the shell initialization that references the "spark" variable,
which does not exist in that case. Things like:

```
<console>:14: error: not found: value spark
       import spark.sql
```

The user is also left with a non-working shell (unless they want
to just write non-Spark Scala or Python code, that is).

This change fails the whole shell session at the point where the
failure occurs, so that the last error message is the one with
the actual information about the failure.

Tested with spark-shell, pyspark (with 2.7 and 3.5), by forcing an
error during SparkContext initialization.
---
 python/pyspark/shell.py                       | 26 ++-----
 python/pyspark/sql/session.py                 | 34 +++++++++
 .../org/apache/spark/repl/SparkILoop.scala    |  9 ++-
 .../scala/org/apache/spark/repl/Main.scala    | 72 ++++++++++---------
 4 files changed, 89 insertions(+), 52 deletions(-)
diff --git a/python/pyspark/shell.py b/python/pyspark/shell.py
index b5fcf7092d93a..472c3cd4452f0 100644
--- a/python/pyspark/shell.py
+++ b/python/pyspark/shell.py
@@ -38,25 +38,13 @@
 SparkContext._ensure_initialized()
 
 try:
-    # Try to access HiveConf, it will raise exception if Hive is not added
-    conf = SparkConf()
-    if conf.get('spark.sql.catalogImplementation', 'hive').lower() == 'hive':
-        SparkContext._jvm.org.apache.hadoop.hive.conf.HiveConf()
-        spark = SparkSession.builder\
-            .enableHiveSupport()\
-            .getOrCreate()
-    else:
-        spark = SparkSession.builder.getOrCreate()
-except py4j.protocol.Py4JError:
-    if conf.get('spark.sql.catalogImplementation', '').lower() == 'hive':
-        warnings.warn("Fall back to non-hive support because failing to access HiveConf, "
-                      "please make sure you build spark with hive")
-    spark = SparkSession.builder.getOrCreate()
-except TypeError:
-    if conf.get('spark.sql.catalogImplementation', '').lower() == 'hive':
-        warnings.warn("Fall back to non-hive support because failing to access HiveConf, "
-                      "please make sure you build spark with hive")
-    spark = SparkSession.builder.getOrCreate()
+    spark = SparkSession._create_shell_session()
+except Exception:
+    import sys
+    import traceback
+    warnings.warn("Failed to initialize Spark session.")
+    traceback.print_exc(file=sys.stderr)
+    sys.exit(1)
 
 sc = spark.sparkContext
 sql = spark.sql
diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py
index 13d6e2e53dbd0..a8930fbc7b268 100644
--- a/python/pyspark/sql/session.py
+++ b/python/pyspark/sql/session.py
@@ -547,6 +547,40 @@ def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
         df._schema = schema
         return df
 
+    @staticmethod
+    def _create_shell_session():
+        """
+        Initialize a SparkSession for a pyspark shell session. This is called from shell.py
+        to make error handling simpler without needing to declare local variables in that
+        script, which would expose those to users.
+        """
+        import py4j
+        from pyspark.conf import SparkConf
+        from pyspark.context import SparkContext
+        try:
+            # Try to access HiveConf, it will raise exception if Hive is not added
+            conf = SparkConf()
+            if conf.get('spark.sql.catalogImplementation', 'hive').lower() == 'hive':
+                SparkContext._jvm.org.apache.hadoop.hive.conf.HiveConf()
+                return SparkSession.builder\
+                    .enableHiveSupport()\
+                    .getOrCreate()
+            else:
+                return SparkSession.builder.getOrCreate()
+        except py4j.protocol.Py4JError:
+            if conf.get('spark.sql.catalogImplementation', '').lower() == 'hive':
+                warnings.warn("Fall back to non-hive support because failing to access HiveConf, "
+                              "please make sure you build spark with hive")
+
+        try:
+            return SparkSession.builder.getOrCreate()
+        except TypeError:
+            if conf.get('spark.sql.catalogImplementation', '').lower() == 'hive':
+                warnings.warn("Fall back to non-hive support because failing to access HiveConf, "
+                              "please make sure you build spark with hive")
+
+        return SparkSession.builder.getOrCreate()
+
     @since(2.0)
     @ignore_unicode_prefix
     def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=True):
diff --git a/repl/scala-2.12/src/main/scala/org/apache/spark/repl/SparkILoop.scala b/repl/scala-2.12/src/main/scala/org/apache/spark/repl/SparkILoop.scala
index ffb2e5f5db7e2..7c34f67eebeab 100644
--- a/repl/scala-2.12/src/main/scala/org/apache/spark/repl/SparkILoop.scala
+++ b/repl/scala-2.12/src/main/scala/org/apache/spark/repl/SparkILoop.scala
@@ -37,7 +37,14 @@ class SparkILoop(in0: Option[BufferedReader], out: JPrintWriter)
     @transient val spark = if (org.apache.spark.repl.Main.sparkSession != null) {
         org.apache.spark.repl.Main.sparkSession
       } else {
-        org.apache.spark.repl.Main.createSparkSession()
+        try {
+          org.apache.spark.repl.Main.createSparkSession()
+        } catch {
+          case e: Exception =>
+            println("Failed to initialize Spark session:")
+            e.printStackTrace()
+            sys.exit(1)
+        }
       }
     @transient val sc = {
       val _sc = spark.sparkContext
diff --git a/repl/src/main/scala/org/apache/spark/repl/Main.scala b/repl/src/main/scala/org/apache/spark/repl/Main.scala
index cc76a703bdf8f..e4ddcef9772e4 100644
--- a/repl/src/main/scala/org/apache/spark/repl/Main.scala
+++ b/repl/src/main/scala/org/apache/spark/repl/Main.scala
@@ -44,6 +44,7 @@ object Main extends Logging {
   var interp: SparkILoop = _
 
   private var hasErrors = false
+  private var isShellSession = false
 
   private def scalaOptionError(msg: String): Unit = {
     hasErrors = true
@@ -53,6 +54,7 @@ object Main extends Logging {
   }
 
   def main(args: Array[String]) {
+    isShellSession = true
     doMain(args, new SparkILoop)
   }
 
@@ -79,44 +81,50 @@ object Main extends Logging {
   }
 
   def createSparkSession(): SparkSession = {
-    val execUri = System.getenv("SPARK_EXECUTOR_URI")
-    conf.setIfMissing("spark.app.name", "Spark shell")
-    // SparkContext will detect this configuration and register it with the RpcEnv's
-    // file server, setting spark.repl.class.uri to the actual URI for executors to
-    // use. This is sort of ugly but since executors are started as part of SparkContext
-    // initialization in certain cases, there's an initialization order issue that prevents
-    // this from being set after SparkContext is instantiated.
-    conf.set("spark.repl.class.outputDir", outputDir.getAbsolutePath())
-    if (execUri != null) {
-      conf.set("spark.executor.uri", execUri)
-    }
-    if (System.getenv("SPARK_HOME") != null) {
-      conf.setSparkHome(System.getenv("SPARK_HOME"))
-    }
+    try {
+      val execUri = System.getenv("SPARK_EXECUTOR_URI")
+      conf.setIfMissing("spark.app.name", "Spark shell")
+      // SparkContext will detect this configuration and register it with the RpcEnv's
+      // file server, setting spark.repl.class.uri to the actual URI for executors to
+      // use. This is sort of ugly but since executors are started as part of SparkContext
+      // initialization in certain cases, there's an initialization order issue that prevents
+      // this from being set after SparkContext is instantiated.
+      conf.set("spark.repl.class.outputDir", outputDir.getAbsolutePath())
+      if (execUri != null) {
+        conf.set("spark.executor.uri", execUri)
+      }
+      if (System.getenv("SPARK_HOME") != null) {
+        conf.setSparkHome(System.getenv("SPARK_HOME"))
+      }
 
-    val builder = SparkSession.builder.config(conf)
-    if (conf.get(CATALOG_IMPLEMENTATION.key, "hive").toLowerCase(Locale.ROOT) == "hive") {
-      if (SparkSession.hiveClassesArePresent) {
-        // In the case that the property is not set at all, builder's config
-        // does not have this value set to 'hive' yet. The original default
-        // behavior is that when there are hive classes, we use hive catalog.
-        sparkSession = builder.enableHiveSupport().getOrCreate()
-        logInfo("Created Spark session with Hive support")
+      val builder = SparkSession.builder.config(conf)
+      if (conf.get(CATALOG_IMPLEMENTATION.key, "hive").toLowerCase(Locale.ROOT) == "hive") {
+        if (SparkSession.hiveClassesArePresent) {
+          // In the case that the property is not set at all, builder's config
+          // does not have this value set to 'hive' yet. The original default
+          // behavior is that when there are hive classes, we use hive catalog.
+          sparkSession = builder.enableHiveSupport().getOrCreate()
+          logInfo("Created Spark session with Hive support")
+        } else {
+          // Need to change it back to 'in-memory' if no hive classes are found
+          // in the case that the property is set to hive in spark-defaults.conf
+          builder.config(CATALOG_IMPLEMENTATION.key, "in-memory")
+          sparkSession = builder.getOrCreate()
+          logInfo("Created Spark session")
+        }
       } else {
-        // Need to change it back to 'in-memory' if no hive classes are found
-        // in the case that the property is set to hive in spark-defaults.conf
-        builder.config(CATALOG_IMPLEMENTATION.key, "in-memory")
+        // In the case that the property is set but not to 'hive', the internal
+        // default is 'in-memory'. So the sparkSession will use in-memory catalog.
         sparkSession = builder.getOrCreate()
         logInfo("Created Spark session")
       }
-    } else {
-      // In the case that the property is set but not to 'hive', the internal
-      // default is 'in-memory'. So the sparkSession will use in-memory catalog.
-      sparkSession = builder.getOrCreate()
-      logInfo("Created Spark session")
+      sparkContext = sparkSession.sparkContext
+      sparkSession
+    } catch {
+      case e: Exception if isShellSession =>
+        logError("Failed to initialize Spark session.", e)
+        sys.exit(1)
     }
-    sparkContext = sparkSession.sparkContext
-    sparkSession
   }
 
 }

From 6d53ca024a5f88d7d3dcd41257c3de72aadd40b6 Mon Sep 17 00:00:00 2001
From: Marcelo Vanzin <vanzin@cloudera.com>
Date: Mon, 21 May 2018 18:20:54 -0700
Subject: [PATCH 2/2] Revert scala 2.12 changes.

---
 .../main/scala/org/apache/spark/repl/SparkILoop.scala    | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/repl/scala-2.12/src/main/scala/org/apache/spark/repl/SparkILoop.scala b/repl/scala-2.12/src/main/scala/org/apache/spark/repl/SparkILoop.scala
index 7c34f67eebeab..ffb2e5f5db7e2 100644
--- a/repl/scala-2.12/src/main/scala/org/apache/spark/repl/SparkILoop.scala
+++ b/repl/scala-2.12/src/main/scala/org/apache/spark/repl/SparkILoop.scala
@@ -37,14 +37,7 @@ class SparkILoop(in0: Option[BufferedReader], out: JPrintWriter)
     @transient val spark = if (org.apache.spark.repl.Main.sparkSession != null) {
         org.apache.spark.repl.Main.sparkSession
       } else {
-        try {
-          org.apache.spark.repl.Main.createSparkSession()
-        } catch {
-          case e: Exception =>
-            println("Failed to initialize Spark session:")
-            e.printStackTrace()
-            sys.exit(1)
-        }
+        org.apache.spark.repl.Main.createSparkSession()
       }
     @transient val sc = {
       val _sc = spark.sparkContext