
Commit 75663b5

davies authored and mateiz committed
[SPARK-2652] [PySpark] Tuning some default configs for PySpark
Add several default configs for PySpark, related to serialization in the JVM:

    spark.serializer = org.apache.spark.serializer.KryoSerializer
    spark.serializer.objectStreamReset = 100
    spark.rdd.compress = True

This will help to reduce the memory usage during RDD.partitionBy().

Author: Davies Liu <[email protected]>

Closes #1568 from davies/conf and squashes the following commits:

cd316f1 [Davies Liu] remove duplicated line
f71a355 [Davies Liu] rebase to master, add spark.rdd.compress = True
8f63f45 [Davies Liu] Merge branch 'master' into conf
8bc9f08 [Davies Liu] fix unittest
c04a83d [Davies Liu] some default configs for PySpark
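The memory saving the message refers to shows up in shuffles such as RDD.partitionBy(), which move serialized Python objects through the JVM. A minimal sketch of such a job (not part of this commit; the app name, dataset size, and partition count are illustrative):

    from pyspark import SparkContext

    sc = SparkContext(appName="partitionBy-example")

    # partitionBy() shuffles pickled (key, value) pairs through the JVM;
    # Kryo serialization and RDD compression reduce the memory used here.
    pairs = sc.parallelize(range(100000)).map(lambda x: (x % 16, x))
    partitioned = pairs.partitionBy(16)
    print(partitioned.glom().map(len).collect())  # sizes of the 16 partitions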
1 parent 66f26a4 commit 75663b5

File tree

1 file changed: +12 -1 lines changed

python/pyspark/context.py

Lines changed: 12 additions & 1 deletion
@@ -37,6 +37,15 @@
 from py4j.java_collections import ListConverter
 
 
+# These are special default configs for PySpark, they will overwrite
+# the default ones for Spark if they are not configured by user.
+DEFAULT_CONFIGS = {
+    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
+    "spark.serializer.objectStreamReset": 100,
+    "spark.rdd.compress": True,
+}
+
+
 class SparkContext(object):
     """
     Main entry point for Spark functionality. A SparkContext represents the
@@ -101,7 +110,7 @@ def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None,
         else:
             self.serializer = BatchedSerializer(self._unbatched_serializer,
                                                 batchSize)
-        self._conf.setIfMissing("spark.rdd.compress", "true")
+
         # Set any parameters passed directly to us on the conf
         if master:
             self._conf.setMaster(master)
@@ -112,6 +121,8 @@ def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None,
         if environment:
             for key, value in environment.iteritems():
                 self._conf.setExecutorEnv(key, value)
+        for key, value in DEFAULT_CONFIGS.items():
+            self._conf.setIfMissing(key, value)
 
         # Check that we have at least the required parameters
         if not self._conf.contains("spark.master"):
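Because the defaults are applied via setIfMissing(), any value the user sets explicitly is left untouched. A minimal sketch of overriding one of the new defaults (illustrative, not part of this commit; the app name is made up):

    from pyspark import SparkConf, SparkContext

    conf = SparkConf().setAppName("override-example")
    # Explicit user setting: the setIfMissing() loop in SparkContext.__init__
    # leaves this key alone, so Java serialization is used instead of Kryo.
    conf.set("spark.serializer", "org.apache.spark.serializer.JavaSerializer")
    sc = SparkContext(conf=conf)
    # _conf is an internal attribute, read here only to illustrate the result.
    print(sc._conf.get("spark.serializer"))  # ...JavaSerializer (user value kept)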
