Skip to content

Commit 65f5331

Browse files
lins05 authored and srowen committed
[SPARK-18652][PYTHON] Include the example data and third-party licenses in pyspark package.
## What changes were proposed in this pull request? Since we already include the python examples in the pyspark package, we should include the example data with it as well. We should also include the third-party licences since we distribute their jars with the pyspark package. ## How was this patch tested? Manually tested with python2.7 and python3.4 ```sh $ ./build/mvn -DskipTests -Phive -Phive-thriftserver -Pyarn -Pmesos clean package $ cd python $ python setup.py sdist $ pip install dist/pyspark-2.1.0.dev0.tar.gz $ ls -1 /usr/local/lib/python2.7/dist-packages/pyspark/data/ graphx mllib streaming $ du -sh /usr/local/lib/python2.7/dist-packages/pyspark/data/ 600K /usr/local/lib/python2.7/dist-packages/pyspark/data/ $ ls -1 /usr/local/lib/python2.7/dist-packages/pyspark/licenses/|head -5 LICENSE-AnchorJS.txt LICENSE-DPark.txt LICENSE-Mockito.txt LICENSE-SnapTree.txt LICENSE-antlr.txt ``` Author: Shuai Lin <[email protected]> Closes #16082 from lins05/include-data-in-pyspark-dist. (cherry picked from commit bd9a4a5) Signed-off-by: Sean Owen <[email protected]>
1 parent d20e0d6 commit 65f5331

File tree

2 files changed

+21
-1
lines changed

2 files changed

+21
-1
lines changed

python/MANIFEST.in

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
global-exclude *.py[cod] __pycache__ .DS_Store
1818
recursive-include deps/jars *.jar
1919
graft deps/bin
20+
recursive-include deps/data *.data *.txt
21+
recursive-include deps/licenses *.txt
2022
recursive-include deps/examples *.py
2123
recursive-include lib *.zip
2224
include README.md

python/setup.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,10 +69,14 @@
6969

7070
EXAMPLES_PATH = os.path.join(SPARK_HOME, "examples/src/main/python")
7171
SCRIPTS_PATH = os.path.join(SPARK_HOME, "bin")
72+
DATA_PATH = os.path.join(SPARK_HOME, "data")
73+
LICENSES_PATH = os.path.join(SPARK_HOME, "licenses")
74+
7275
SCRIPTS_TARGET = os.path.join(TEMP_PATH, "bin")
7376
JARS_TARGET = os.path.join(TEMP_PATH, "jars")
7477
EXAMPLES_TARGET = os.path.join(TEMP_PATH, "examples")
75-
78+
DATA_TARGET = os.path.join(TEMP_PATH, "data")
79+
LICENSES_TARGET = os.path.join(TEMP_PATH, "licenses")
7680

7781
# Check and see if we are under the spark path in which case we need to build the symlink farm.
7882
# This is important because we only want to build the symlink farm while under Spark otherwise we
@@ -114,11 +118,15 @@ def _supports_symlinks():
114118
os.symlink(JARS_PATH, JARS_TARGET)
115119
os.symlink(SCRIPTS_PATH, SCRIPTS_TARGET)
116120
os.symlink(EXAMPLES_PATH, EXAMPLES_TARGET)
121+
os.symlink(DATA_PATH, DATA_TARGET)
122+
os.symlink(LICENSES_PATH, LICENSES_TARGET)
117123
else:
118124
# For windows fall back to the slower copytree
119125
copytree(JARS_PATH, JARS_TARGET)
120126
copytree(SCRIPTS_PATH, SCRIPTS_TARGET)
121127
copytree(EXAMPLES_PATH, EXAMPLES_TARGET)
128+
copytree(DATA_PATH, DATA_TARGET)
129+
copytree(LICENSES_PATH, LICENSES_TARGET)
122130
else:
123131
# If we are not inside of SPARK_HOME verify we have the required symlink farm
124132
if not os.path.exists(JARS_TARGET):
@@ -161,18 +169,24 @@ def _supports_symlinks():
161169
'pyspark.jars',
162170
'pyspark.python.pyspark',
163171
'pyspark.python.lib',
172+
'pyspark.data',
173+
'pyspark.licenses',
164174
'pyspark.examples.src.main.python'],
165175
include_package_data=True,
166176
package_dir={
167177
'pyspark.jars': 'deps/jars',
168178
'pyspark.bin': 'deps/bin',
169179
'pyspark.python.lib': 'lib',
180+
'pyspark.data': 'deps/data',
181+
'pyspark.licenses': 'deps/licenses',
170182
'pyspark.examples.src.main.python': 'deps/examples',
171183
},
172184
package_data={
173185
'pyspark.jars': ['*.jar'],
174186
'pyspark.bin': ['*'],
175187
'pyspark.python.lib': ['*.zip'],
188+
'pyspark.data': ['*.txt', '*.data'],
189+
'pyspark.licenses': ['*.txt'],
176190
'pyspark.examples.src.main.python': ['*.py', '*/*.py']},
177191
scripts=scripts,
178192
license='http://www.apache.org/licenses/LICENSE-2.0',
@@ -202,8 +216,12 @@ def _supports_symlinks():
202216
os.remove(os.path.join(TEMP_PATH, "jars"))
203217
os.remove(os.path.join(TEMP_PATH, "bin"))
204218
os.remove(os.path.join(TEMP_PATH, "examples"))
219+
os.remove(os.path.join(TEMP_PATH, "data"))
220+
os.remove(os.path.join(TEMP_PATH, "licenses"))
205221
else:
206222
rmtree(os.path.join(TEMP_PATH, "jars"))
207223
rmtree(os.path.join(TEMP_PATH, "bin"))
208224
rmtree(os.path.join(TEMP_PATH, "examples"))
225+
rmtree(os.path.join(TEMP_PATH, "data"))
226+
rmtree(os.path.join(TEMP_PATH, "licenses"))
209227
os.rmdir(TEMP_PATH)

0 commit comments

Comments (0)