Java Spark Hive example:
@@ -17,6 +17,7 @@
 package org.apache.spark.examples.sql.hive;

 // $example on:spark_hive$
+import java.io.File;
 import java.io.Serializable;
 import java.util.ArrayList;
 import java.util.List;
@@ -56,7 +57,7 @@ public void setValue(String value) {
   public static void main(String[] args) {
     // $example on:spark_hive$
     // warehouseLocation points to the default location for managed databases and tables
-    String warehouseLocation = "spark-warehouse";
+    String warehouseLocation = new File("spark-warehouse").getAbsolutePath();
     SparkSession spark = SparkSession
       .builder()
       .appName("Java Spark Hive Example")
Binary Classification Metrics example (Python):
@@ -18,25 +18,20 @@
 Binary Classification Metrics Example.
 """
 from __future__ import print_function
-from pyspark.sql import SparkSession
+from pyspark import SparkContext
 # $example on$
 from pyspark.mllib.classification import LogisticRegressionWithLBFGS
 from pyspark.mllib.evaluation import BinaryClassificationMetrics
-from pyspark.mllib.regression import LabeledPoint
+from pyspark.mllib.util import MLUtils
 # $example off$

 if __name__ == "__main__":
-    spark = SparkSession\
-        .builder\
-        .appName("BinaryClassificationMetricsExample")\
-        .getOrCreate()
+    sc = SparkContext(appName="BinaryClassificationMetricsExample")
[Inline review thread on the SparkContext line]
Member Author: I just used SparkContext to be consistent with the other examples.
Member: Is the point that this is an .mllib example rather than .ml, so it should use the older API?
Member Author: Yes, that's my understanding.
Member: +1

     # $example on$
     # Several of the methods available in scala are currently missing from pyspark
     # Load training data in LIBSVM format
-    data = spark\
-        .read.format("libsvm").load("data/mllib/sample_binary_classification_data.txt")\
-        .rdd.map(lambda row: LabeledPoint(row[0], row[1]))
+    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_binary_classification_data.txt")

     # Split data into training (60%) and test (40%)
     training, test = data.randomSplit([0.6, 0.4], seed=11)
@@ -58,4 +53,4 @@
     print("Area under ROC = %s" % metrics.areaUnderROC)
     # $example off$

-    spark.stop()
+    sc.stop()
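A minimal sketch of the two APIs the review thread contrasts: the RDD-based spark.mllib load the example now uses, and the DataFrame-based load in spark.ml style that it replaces. The data path comes from the example; the app name and the rest are illustrative:

```python
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.mllib.util import MLUtils

sc = SparkContext(appName="LibSVMLoadSketch")  # illustrative app name

# RDD-based spark.mllib API: yields an RDD of LabeledPoint, which the
# RDD-based evaluators (e.g. BinaryClassificationMetrics) expect.
rdd_data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_binary_classification_data.txt")
print(rdd_data.first())

# DataFrame-based spark.ml style: yields a DataFrame with "label" and
# "features" columns, the input the newer spark.ml evaluators expect.
spark = SparkSession.builder.getOrCreate()  # reuses the existing SparkContext
df_data = spark.read.format("libsvm").load("data/mllib/sample_binary_classification_data.txt")
df_data.show(1)
```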
examples/src/main/python/mllib/bisecting_k_means_example.py (0 additions, 5 deletions):
@@ -40,11 +40,6 @@
     # Evaluate clustering
     cost = model.computeCost(parsedData)
     print("Bisecting K-means Cost = " + str(cost))
-
-    # Save and load model
-    path = "target/org/apache/spark/PythonBisectingKMeansExample/BisectingKMeansModel"
-    model.save(sc, path)
-    sameModel = BisectingKMeansModel.load(sc, path)
     # $example off$

     sc.stop()
Another Python example (file path not shown in the capture):
@@ -45,7 +45,7 @@
         print(each)

     print("transformedData2:")
-    for each in transformedData2.collect():
+    for each in transformedData2:
         print(each)

     sc.stop()
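The fix above works because a pyspark.mllib feature transformer returns an RDD when given an RDD but a plain local vector when given a single vector, and only the former has collect(). A minimal sketch; the diff does not name the file, so ElementwiseProduct as the transformer is an assumption:

```python
from pyspark import SparkContext
from pyspark.mllib.feature import ElementwiseProduct
from pyspark.mllib.linalg import Vectors

sc = SparkContext(appName="TransformReturnTypeSketch")  # illustrative app name

data = sc.parallelize([Vectors.dense([1.0, 2.0, 3.0])])
transformer = ElementwiseProduct(Vectors.dense([0.0, 1.0, 2.0]))  # assumed transformer

# Transforming an RDD yields an RDD, so .collect() is valid here.
transformed = transformer.transform(data)
print(transformed.collect())

# Transforming a single local vector yields a local DenseVector:
# iterate it directly, which is why the diff drops the .collect() call.
transformed2 = transformer.transform(data.first())
for each in transformed2:
    print(each)

sc.stop()
```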
examples/src/main/python/sql/hive.py (2 additions, 2 deletions):
@@ -18,7 +18,7 @@
 from __future__ import print_function

 # $example on:spark_hive$
-from os.path import expanduser, join
+from os.path import expanduser, join, abspath

 from pyspark.sql import SparkSession
 from pyspark.sql import Row
@@ -34,7 +34,7 @@
 if __name__ == "__main__":
     # $example on:spark_hive$
     # warehouse_location points to the default location for managed databases and tables
-    warehouse_location = 'spark-warehouse'
+    warehouse_location = abspath('spark-warehouse')

     spark = SparkSession \
         .builder \
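For context on why the absolute path matters: spark.sql.warehouse.dir is read when the session starts, and a relative value can resolve against whatever the working directory happens to be. A minimal sketch of how the location is typically wired into the builder; the config key is Spark's standard one, the app name is illustrative:

```python
from os.path import abspath
from pyspark.sql import SparkSession

# Resolve the warehouse directory to an absolute path up front.
warehouse_location = abspath('spark-warehouse')

spark = SparkSession \
    .builder \
    .appName("HiveWarehouseSketch") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .enableHiveSupport() \
    .getOrCreate()

print(spark.conf.get("spark.sql.warehouse.dir"))
```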
examples/src/main/python/status_api_demo.py (5 additions, 1 deletion):
@@ -19,7 +19,11 @@

 import time
 import threading
-import Queue
+import sys
+if sys.version >= '3':
+    import queue as Queue
+else:
+    import Queue

 from pyspark import SparkConf, SparkContext
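The guard exists because Python 3 renamed the Queue module to queue; after the shim, Queue.Queue() resolves on either interpreter. A quick usage sketch:

```python
import sys
if sys.version >= '3':
    import queue as Queue
else:
    import Queue

q = Queue.Queue()  # same spelling works on Python 2 and 3
q.put("job-1")
print(q.get())
```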
Scala SparkHiveExample:
@@ -17,6 +17,8 @@
 package org.apache.spark.examples.sql.hive

 // $example on:spark_hive$
+import java.io.File
+
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.SparkSession
 // $example off:spark_hive$
@@ -38,7 +40,7 @@ object SparkHiveExample {

     // $example on:spark_hive$
     // warehouseLocation points to the default location for managed databases and tables
-    val warehouseLocation = "spark-warehouse"
+    val warehouseLocation = new File("spark-warehouse").getAbsolutePath

     val spark = SparkSession
       .builder()