
Commit ef68b61

test
1 parent be9f62f commit ef68b61

File tree

4 files changed, +223 -0 lines changed


dev/create-release/do-release

Whitespace-only changes.
Lines changed: 104 additions & 0 deletions
@@ -0,0 +1,104 @@
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

#
# Generate TPC-DS data for TPCDSQueryTestSuite.
# Run with "-h" for options.
#

set -e
SELF=$(cd $(dirname $0) && pwd)

# Re-uses helper funcs from the release scripts
. "$SELF/../create-release/release-util.sh"

function usage {
  local NAME=$(basename $0)
  cat <<EOF
Usage: $NAME [options]

This script generates TPC-DS data for TPCDSQueryTestSuite inside a docker image. The image is hardcoded to be called
"spark-tpcds" and will be re-generated (as needed) on every invocation of this script.

Options are:

  -d [path]   : required: working directory (output will be written to an "output" directory in
                the working directory).
EOF
}

WORKDIR=
IMGTAG=latest
while getopts ":d:h" opt; do
  case $opt in
    d) WORKDIR="$OPTARG" ;;
    h) usage; exit 0 ;;
    \?) error "Invalid option. Run with -h for help." ;;
  esac
done

if [ -z "$WORKDIR" ] || [ ! -d "$WORKDIR" ]; then
  error "Work directory (-d) must be defined and exist. Run with -h for help."
fi

if [ -d "$WORKDIR/output" ]; then
  read -p "Output directory already exists. Overwrite and continue? [y/n] " ANSWER
  if [ "$ANSWER" != "y" ]; then
    error "Exiting."
  fi
fi

cd "$WORKDIR"
rm -rf "$WORKDIR/output"
mkdir "$WORKDIR/output"

# Place all scripts in a local directory that must be defined in the command
# line. This directory is mounted into the image.
for f in "$SELF"/*; do
  if [ -f "$f" ]; then
    cp "$f" "$WORKDIR"
  fi
done

# Place `release-util.sh` for reuse
cp "$SELF/../create-release/release-util.sh" "$WORKDIR"

run_silent "Building spark-tpcds image with tag $IMGTAG..." "docker-build.log" \
  docker build -t "spark-tpcds:$IMGTAG" --build-arg UID=$UID "$SELF/spark-tpcds"

# Write the release information to a file with environment variables to be used when running the
# image.
ENVFILE="$WORKDIR/env.list"
fcreate_secure "$ENVFILE"

function cleanup {
  rm -f "$ENVFILE"
}

trap cleanup EXIT

cat > "$ENVFILE" <<EOF
RUNNING_IN_DOCKER=1
EOF

echo "Building Spark to generate TPC-DS data; output will be at $WORKDIR/output/tpcds-data"
docker run -ti \
  --env-file "$ENVFILE" \
  --volume "$WORKDIR:/opt/spark-tpcds" \
  "spark-tpcds:$IMGTAG"

dev/tpcds-datagen/do-datagen.sh

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

set -e
SELF=$(cd $(dirname $0) && pwd)

# Re-uses helper funcs from the release scripts
if [ "$RUNNING_IN_DOCKER" = "1" ]; then
  . "$SELF/release-util.sh"
else
  . "$SELF/../create-release/release-util.sh"
fi

export LC_ALL=C.UTF-8
export LANG=C.UTF-8

# Checks out tpcds-kit and builds dsdgen
rm -rf tpcds-kit
git clone https://github.com/databricks/tpcds-kit
cd tpcds-kit/tools
run_silent "Building dsdgen in tpcds-kit..." "$SELF/dsdgen-build.log" make OS=LINUX
cd ../..

# Builds Spark to generate TPC-DS data
if [ -z "$SCALE_FACTOR" ]; then
  SCALE_FACTOR=1
fi

rm -rf spark
# git clone https://github.com/apache/spark
SBT_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g -XX:+UseG1GC"
git clone https://github.com/maropu/spark
cd spark
git checkout tpcdsDatagen
./build/sbt "sql/test:runMain org.apache.spark.sql.GenTPCDSData --dsdgenDir $SELF/tpcds-kit/tools --location $SELF/tpcds-data --scaleFactor $SCALE_FACTOR"
# run_silent "Building Spark to generate TPC-DS data in $SELF/tpcds-data..." "$SELF/spark-build.log" \
#   ./build/sbt "sql/test:runMain org.apache.spark.sql.GenTPCDSData --dsdgenDir $SELF/tpcds-kit/tools --location $SELF/tpcds-data --scaleFactor $SCALE_FACTOR"
cd ..

rm -rf spark tpcds-kit
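
Since do-datagen.sh defaults SCALE_FACTOR to 1, a larger dataset can be generated by passing the variable into the container. This is only a sketch: the wrapper script above writes just RUNNING_IN_DOCKER=1 to env.list, so adding SCALE_FACTOR there is an assumed extension, not part of this commit.

# Sketch: run the image with a custom scale factor (assumed usage).
cat > env.list <<EOF
RUNNING_IN_DOCKER=1
SCALE_FACTOR=10
EOF
docker run -ti --env-file env.list --volume "$PWD:/opt/spark-tpcds" spark-tpcds:latest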
Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Image to generate TPC-DS data for TPCDSQueryTestSuite.
# Based on Ubuntu 20.04 to generate the same data as the one on GitHub Actions.
#
# Includes:
# * Java 8
#
# You can test it as below:
#   cd dev/tpcds-datagen
#   docker build -t spark-tpcds --build-arg UID=$UID .

FROM ubuntu:20.04

# For apt to be noninteractive
ENV DEBIAN_FRONTEND noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN true

# These arguments are just for reuse and not really meant to be customized.
ARG APT_INSTALL="apt-get install --no-install-recommends -y"

# This is all in a single "RUN" command so that if anything changes, "apt update" is run to fetch
# the most current package versions (instead of potentially using old versions cached by docker).
RUN apt-get clean && apt-get update && $APT_INSTALL gnupg ca-certificates && \
  gpg --keyserver keyserver.ubuntu.com --recv-key E298A3A825C0D65DFD57CBB651716619E084DAB9 && \
  gpg -a --export E084DAB9 | apt-key add - && \
  apt-get clean && \
  rm -rf /var/lib/apt/lists/* && \
  apt-get clean && \
  apt-get update && \
  $APT_INSTALL software-properties-common && \
  apt-get update && \
  # Install openjdk 8.
  $APT_INSTALL openjdk-8-jdk && \
  update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java && \
  # Install build / source control tools
  $APT_INSTALL curl wget git maven subversion make gcc bison flex lsof libffi-dev \
    libssl-dev libcurl4-openssl-dev libxml2-dev && \
  curl -sL https://deb.nodesource.com/setup_12.x | bash && \
  $APT_INSTALL libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev

WORKDIR /opt/spark-tpcds/output

ARG UID
RUN useradd -m -s /bin/bash -p spark-tpcds -u $UID spark-tpcds
USER spark-tpcds:spark-tpcds

ENTRYPOINT [ "/opt/spark-tpcds/do-datagen.sh" ]
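
The Dockerfile header shows the build command; a manual run without the wrapper might look like the sketch below, assuming the dev/tpcds-datagen scripts (including do-datagen.sh and release-util.sh) have already been copied into the mounted directory, which the wrapper normally does for you. The /path/to/workdir below is a placeholder.

# Assumed manual build-and-run; /path/to/workdir is a placeholder.
cd dev/tpcds-datagen
docker build -t spark-tpcds --build-arg UID=$UID .
docker run -ti \
  --env RUNNING_IN_DOCKER=1 \
  --volume "/path/to/workdir:/opt/spark-tpcds" \
  spark-tpcds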
