diff --git a/dev/pr-deps/spark-deps-hadoop-2.2 b/dev/pr-deps/spark-deps-hadoop-2.2 new file mode 100644 index 0000000000000..3a14499d9b4d9 --- /dev/null +++ b/dev/pr-deps/spark-deps-hadoop-2.2 @@ -0,0 +1,182 @@ +JavaEWAH-0.3.2.jar +RoaringBitmap-0.5.11.jar +ST4-4.0.4.jar +activation-1.1.jar +antlr-runtime-3.5.2.jar +aopalliance-1.0.jar +apache-log4j-extras-1.2.17.jar +arpack_combined_all-0.1.jar +asm-3.1.jar +asm-commons-3.1.jar +asm-tree-3.1.jar +avro-1.7.7.jar +avro-ipc-1.7.7-tests.jar +avro-ipc-1.7.7.jar +avro-mapred-1.7.7-hadoop2.jar +bonecp-0.8.0.RELEASE.jar +breeze-macros_2.11-0.11.2.jar +breeze_2.11-0.11.2.jar +calcite-avatica-1.2.0-incubating.jar +calcite-core-1.2.0-incubating.jar +calcite-linq4j-1.2.0-incubating.jar +chill-java-0.5.0.jar +chill_2.11-0.5.0.jar +commons-beanutils-1.7.0.jar +commons-beanutils-core-1.8.0.jar +commons-cli-1.2.jar +commons-codec-1.10.jar +commons-collections-3.2.2.jar +commons-compiler-2.7.6.jar +commons-compress-1.4.1.jar +commons-configuration-1.6.jar +commons-dbcp-1.4.jar +commons-digester-1.8.jar +commons-httpclient-3.1.jar +commons-io-2.1.jar +commons-lang-2.6.jar +commons-lang3-3.3.2.jar +commons-logging-1.1.3.jar +commons-math-2.1.jar +commons-math3-3.4.1.jar +commons-net-2.2.jar +commons-pool-1.5.4.jar +compress-lzf-1.0.3.jar +core-1.1.2.jar +curator-client-2.4.0.jar +curator-framework-2.4.0.jar +curator-recipes-2.4.0.jar +datanucleus-api-jdo-3.2.6.jar +datanucleus-core-3.2.10.jar +datanucleus-rdbms-3.2.9.jar +derby-10.10.1.1.jar +eigenbase-properties-1.1.5.jar +geronimo-annotation_1.0_spec-1.1.1.jar +geronimo-jaspic_1.0_spec-1.0.jar +geronimo-jta_1.1_spec-1.1.1.jar +gmbal-api-only-3.0.0-b023.jar +grizzly-framework-2.1.2.jar +grizzly-http-2.1.2.jar +grizzly-http-server-2.1.2.jar +grizzly-http-servlet-2.1.2.jar +grizzly-rcm-2.1.2.jar +groovy-all-2.1.6.jar +guice-3.0.jar +guice-servlet-3.0.jar +hadoop-annotations-2.2.0.jar +hadoop-auth-2.2.0.jar +hadoop-client-2.2.0.jar +hadoop-common-2.2.0.jar +hadoop-hdfs-2.2.0.jar +hadoop-mapreduce-client-app-2.2.0.jar +hadoop-mapreduce-client-common-2.2.0.jar +hadoop-mapreduce-client-core-2.2.0.jar +hadoop-mapreduce-client-jobclient-2.2.0.jar +hadoop-mapreduce-client-shuffle-2.2.0.jar +hadoop-yarn-api-2.2.0.jar +hadoop-yarn-client-2.2.0.jar +hadoop-yarn-common-2.2.0.jar +hadoop-yarn-server-common-2.2.0.jar +hadoop-yarn-server-web-proxy-2.2.0.jar +httpclient-4.3.2.jar +httpcore-4.3.2.jar +ivy-2.4.0.jar +jackson-annotations-2.5.3.jar +jackson-core-2.5.3.jar +jackson-core-asl-1.9.13.jar +jackson-databind-2.5.3.jar +jackson-jaxrs-1.9.13.jar +jackson-mapper-asl-1.9.13.jar +jackson-module-scala_2.11-2.5.3.jar +jackson-xc-1.9.13.jar +janino-2.7.8.jar +javax.inject-1.jar +javax.servlet-3.0.0.v201112011016.jar +javax.servlet-3.1.jar +javax.servlet-api-3.0.1.jar +javolution-5.5.1.jar +jaxb-api-2.2.2.jar +jaxb-impl-2.2.3-1.jar +jcl-over-slf4j-1.7.10.jar +jdo-api-3.0.1.jar +jersey-client-1.9.jar +jersey-core-1.9.jar +jersey-grizzly2-1.9.jar +jersey-guice-1.9.jar +jersey-json-1.9.jar +jersey-server-1.9.jar +jersey-test-framework-core-1.9.jar +jersey-test-framework-grizzly2-1.9.jar +jets3t-0.7.1.jar +jettison-1.1.jar +jetty-all-7.6.0.v20120127.jar +jetty-util-6.1.26.jar +jline-2.12.jar +joda-time-2.9.jar +jodd-core-3.5.2.jar +jpam-1.1.jar +json-20090211.jar +json4s-ast_2.11-3.2.10.jar +json4s-core_2.11-3.2.10.jar +json4s-jackson_2.11-3.2.10.jar +jsr305-1.3.9.jar +jta-1.1.jar +jtransforms-2.4.0.jar +jul-to-slf4j-1.7.10.jar +kryo-2.21.jar +leveldbjni-all-1.8.jar +libfb303-0.9.2.jar +libthrift-0.9.2.jar +log4j-1.2.17.jar 
+lz4-1.3.0.jar +mail-1.4.1.jar +management-api-3.0.0-b012.jar +mesos-0.21.1-shaded-protobuf.jar +metrics-core-3.1.2.jar +metrics-graphite-3.1.2.jar +metrics-json-3.1.2.jar +metrics-jvm-3.1.2.jar +minlog-1.2.jar +netty-3.8.0.Final.jar +netty-all-4.0.29.Final.jar +objenesis-1.2.jar +opencsv-2.3.jar +oro-2.0.8.jar +paranamer-2.6.jar +parquet-column-1.7.0.jar +parquet-common-1.7.0.jar +parquet-encoding-1.7.0.jar +parquet-format-2.3.0-incubating.jar +parquet-generator-1.7.0.jar +parquet-hadoop-1.7.0.jar +parquet-hadoop-bundle-1.6.0.jar +parquet-jackson-1.7.0.jar +pmml-agent-1.2.7.jar +pmml-model-1.2.7.jar +pmml-schema-1.2.7.jar +protobuf-java-2.5.0.jar +py4j-0.9.1.jar +pyrolite-4.9.jar +reflectasm-1.07-shaded.jar +scala-compiler-2.11.7.jar +scala-library-2.11.7.jar +scala-parser-combinators_2.11-1.0.4.jar +scala-reflect-2.11.7.jar +scala-xml_2.11-1.0.2.jar +scalap-2.11.7.jar +servlet-api-2.5.jar +slf4j-api-1.7.10.jar +slf4j-log4j12-1.7.10.jar +snappy-0.2.jar +snappy-java-1.1.2.jar +spire-macros_2.11-0.7.4.jar +spire_2.11-0.7.4.jar +stax-api-1.0-2.jar +stax-api-1.0.1.jar +stream-2.7.0.jar +super-csv-2.2.0.jar +univocity-parsers-1.5.6.jar +unused-1.0.0.jar +xbean-asm5-shaded-4.4.jar +xmlenc-0.52.jar +xz-1.0.jar +zookeeper-3.4.5.jar diff --git a/dev/pr-deps/spark-deps-hadoop-2.3 b/dev/pr-deps/spark-deps-hadoop-2.3 new file mode 100644 index 0000000000000..615836b3d3b77 --- /dev/null +++ b/dev/pr-deps/spark-deps-hadoop-2.3 @@ -0,0 +1,173 @@ +JavaEWAH-0.3.2.jar +RoaringBitmap-0.5.11.jar +ST4-4.0.4.jar +activation-1.1.1.jar +antlr-runtime-3.5.2.jar +aopalliance-1.0.jar +apache-log4j-extras-1.2.17.jar +arpack_combined_all-0.1.jar +asm-3.1.jar +asm-commons-3.1.jar +asm-tree-3.1.jar +avro-1.7.7.jar +avro-ipc-1.7.7-tests.jar +avro-ipc-1.7.7.jar +avro-mapred-1.7.7-hadoop2.jar +base64-2.3.8.jar +bcprov-jdk15on-1.51.jar +bonecp-0.8.0.RELEASE.jar +breeze-macros_2.11-0.11.2.jar +breeze_2.11-0.11.2.jar +calcite-avatica-1.2.0-incubating.jar +calcite-core-1.2.0-incubating.jar +calcite-linq4j-1.2.0-incubating.jar +chill-java-0.5.0.jar +chill_2.11-0.5.0.jar +commons-beanutils-1.7.0.jar +commons-beanutils-core-1.8.0.jar +commons-cli-1.2.jar +commons-codec-1.10.jar +commons-collections-3.2.2.jar +commons-compiler-2.7.6.jar +commons-compress-1.4.1.jar +commons-configuration-1.6.jar +commons-dbcp-1.4.jar +commons-digester-1.8.jar +commons-httpclient-3.1.jar +commons-io-2.4.jar +commons-lang-2.6.jar +commons-lang3-3.3.2.jar +commons-logging-1.1.3.jar +commons-math3-3.4.1.jar +commons-net-2.2.jar +commons-pool-1.5.4.jar +compress-lzf-1.0.3.jar +core-1.1.2.jar +curator-client-2.4.0.jar +curator-framework-2.4.0.jar +curator-recipes-2.4.0.jar +datanucleus-api-jdo-3.2.6.jar +datanucleus-core-3.2.10.jar +datanucleus-rdbms-3.2.9.jar +derby-10.10.1.1.jar +eigenbase-properties-1.1.5.jar +geronimo-annotation_1.0_spec-1.1.1.jar +geronimo-jaspic_1.0_spec-1.0.jar +geronimo-jta_1.1_spec-1.1.1.jar +groovy-all-2.1.6.jar +guice-3.0.jar +guice-servlet-3.0.jar +hadoop-annotations-2.3.0.jar +hadoop-auth-2.3.0.jar +hadoop-client-2.3.0.jar +hadoop-common-2.3.0.jar +hadoop-hdfs-2.3.0.jar +hadoop-mapreduce-client-app-2.3.0.jar +hadoop-mapreduce-client-common-2.3.0.jar +hadoop-mapreduce-client-core-2.3.0.jar +hadoop-mapreduce-client-jobclient-2.3.0.jar +hadoop-mapreduce-client-shuffle-2.3.0.jar +hadoop-yarn-api-2.3.0.jar +hadoop-yarn-client-2.3.0.jar +hadoop-yarn-common-2.3.0.jar +hadoop-yarn-server-common-2.3.0.jar +hadoop-yarn-server-web-proxy-2.3.0.jar +httpclient-4.3.2.jar +httpcore-4.3.2.jar +ivy-2.4.0.jar +jackson-annotations-2.5.3.jar 
+jackson-core-2.5.3.jar +jackson-core-asl-1.9.13.jar +jackson-databind-2.5.3.jar +jackson-jaxrs-1.9.13.jar +jackson-mapper-asl-1.9.13.jar +jackson-module-scala_2.11-2.5.3.jar +jackson-xc-1.9.13.jar +janino-2.7.8.jar +java-xmlbuilder-1.0.jar +javax.inject-1.jar +javax.servlet-3.0.0.v201112011016.jar +javolution-5.5.1.jar +jaxb-api-2.2.2.jar +jaxb-impl-2.2.3-1.jar +jcl-over-slf4j-1.7.10.jar +jdo-api-3.0.1.jar +jersey-core-1.9.jar +jersey-guice-1.9.jar +jersey-json-1.9.jar +jersey-server-1.9.jar +jets3t-0.9.3.jar +jettison-1.1.jar +jetty-6.1.26.jar +jetty-all-7.6.0.v20120127.jar +jetty-util-6.1.26.jar +jline-2.12.jar +joda-time-2.9.jar +jodd-core-3.5.2.jar +jpam-1.1.jar +json-20090211.jar +json4s-ast_2.11-3.2.10.jar +json4s-core_2.11-3.2.10.jar +json4s-jackson_2.11-3.2.10.jar +jsr305-1.3.9.jar +jta-1.1.jar +jtransforms-2.4.0.jar +jul-to-slf4j-1.7.10.jar +kryo-2.21.jar +leveldbjni-all-1.8.jar +libfb303-0.9.2.jar +libthrift-0.9.2.jar +log4j-1.2.17.jar +lz4-1.3.0.jar +mail-1.4.7.jar +mesos-0.21.1-shaded-protobuf.jar +metrics-core-3.1.2.jar +metrics-graphite-3.1.2.jar +metrics-json-3.1.2.jar +metrics-jvm-3.1.2.jar +minlog-1.2.jar +mx4j-3.0.2.jar +netty-3.8.0.Final.jar +netty-all-4.0.29.Final.jar +objenesis-1.2.jar +opencsv-2.3.jar +oro-2.0.8.jar +paranamer-2.6.jar +parquet-column-1.7.0.jar +parquet-common-1.7.0.jar +parquet-encoding-1.7.0.jar +parquet-format-2.3.0-incubating.jar +parquet-generator-1.7.0.jar +parquet-hadoop-1.7.0.jar +parquet-hadoop-bundle-1.6.0.jar +parquet-jackson-1.7.0.jar +pmml-agent-1.2.7.jar +pmml-model-1.2.7.jar +pmml-schema-1.2.7.jar +protobuf-java-2.5.0.jar +py4j-0.9.1.jar +pyrolite-4.9.jar +reflectasm-1.07-shaded.jar +scala-compiler-2.11.7.jar +scala-library-2.11.7.jar +scala-parser-combinators_2.11-1.0.4.jar +scala-reflect-2.11.7.jar +scala-xml_2.11-1.0.2.jar +scalap-2.11.7.jar +servlet-api-2.5.jar +slf4j-api-1.7.10.jar +slf4j-log4j12-1.7.10.jar +snappy-0.2.jar +snappy-java-1.1.2.jar +spire-macros_2.11-0.7.4.jar +spire_2.11-0.7.4.jar +stax-api-1.0-2.jar +stax-api-1.0.1.jar +stream-2.7.0.jar +super-csv-2.2.0.jar +univocity-parsers-1.5.6.jar +unused-1.0.0.jar +xbean-asm5-shaded-4.4.jar +xmlenc-0.52.jar +xz-1.0.jar +zookeeper-3.4.5.jar diff --git a/dev/pr-deps/spark-deps-hadoop-2.4 b/dev/pr-deps/spark-deps-hadoop-2.4 new file mode 100644 index 0000000000000..f275226f1d088 --- /dev/null +++ b/dev/pr-deps/spark-deps-hadoop-2.4 @@ -0,0 +1,174 @@ +JavaEWAH-0.3.2.jar +RoaringBitmap-0.5.11.jar +ST4-4.0.4.jar +activation-1.1.1.jar +antlr-runtime-3.5.2.jar +aopalliance-1.0.jar +apache-log4j-extras-1.2.17.jar +arpack_combined_all-0.1.jar +asm-3.1.jar +asm-commons-3.1.jar +asm-tree-3.1.jar +avro-1.7.7.jar +avro-ipc-1.7.7-tests.jar +avro-ipc-1.7.7.jar +avro-mapred-1.7.7-hadoop2.jar +base64-2.3.8.jar +bcprov-jdk15on-1.51.jar +bonecp-0.8.0.RELEASE.jar +breeze-macros_2.11-0.11.2.jar +breeze_2.11-0.11.2.jar +calcite-avatica-1.2.0-incubating.jar +calcite-core-1.2.0-incubating.jar +calcite-linq4j-1.2.0-incubating.jar +chill-java-0.5.0.jar +chill_2.11-0.5.0.jar +commons-beanutils-1.7.0.jar +commons-beanutils-core-1.8.0.jar +commons-cli-1.2.jar +commons-codec-1.10.jar +commons-collections-3.2.2.jar +commons-compiler-2.7.6.jar +commons-compress-1.4.1.jar +commons-configuration-1.6.jar +commons-dbcp-1.4.jar +commons-digester-1.8.jar +commons-httpclient-3.1.jar +commons-io-2.4.jar +commons-lang-2.6.jar +commons-lang3-3.3.2.jar +commons-logging-1.1.3.jar +commons-math3-3.4.1.jar +commons-net-2.2.jar +commons-pool-1.5.4.jar +compress-lzf-1.0.3.jar +core-1.1.2.jar +curator-client-2.4.0.jar 
+curator-framework-2.4.0.jar +curator-recipes-2.4.0.jar +datanucleus-api-jdo-3.2.6.jar +datanucleus-core-3.2.10.jar +datanucleus-rdbms-3.2.9.jar +derby-10.10.1.1.jar +eigenbase-properties-1.1.5.jar +geronimo-annotation_1.0_spec-1.1.1.jar +geronimo-jaspic_1.0_spec-1.0.jar +geronimo-jta_1.1_spec-1.1.1.jar +groovy-all-2.1.6.jar +guice-3.0.jar +guice-servlet-3.0.jar +hadoop-annotations-2.4.0.jar +hadoop-auth-2.4.0.jar +hadoop-client-2.4.0.jar +hadoop-common-2.4.0.jar +hadoop-hdfs-2.4.0.jar +hadoop-mapreduce-client-app-2.4.0.jar +hadoop-mapreduce-client-common-2.4.0.jar +hadoop-mapreduce-client-core-2.4.0.jar +hadoop-mapreduce-client-jobclient-2.4.0.jar +hadoop-mapreduce-client-shuffle-2.4.0.jar +hadoop-yarn-api-2.4.0.jar +hadoop-yarn-client-2.4.0.jar +hadoop-yarn-common-2.4.0.jar +hadoop-yarn-server-common-2.4.0.jar +hadoop-yarn-server-web-proxy-2.4.0.jar +httpclient-4.3.2.jar +httpcore-4.3.2.jar +ivy-2.4.0.jar +jackson-annotations-2.5.3.jar +jackson-core-2.5.3.jar +jackson-core-asl-1.9.13.jar +jackson-databind-2.5.3.jar +jackson-jaxrs-1.9.13.jar +jackson-mapper-asl-1.9.13.jar +jackson-module-scala_2.11-2.5.3.jar +jackson-xc-1.9.13.jar +janino-2.7.8.jar +java-xmlbuilder-1.0.jar +javax.inject-1.jar +javax.servlet-3.0.0.v201112011016.jar +javolution-5.5.1.jar +jaxb-api-2.2.2.jar +jaxb-impl-2.2.3-1.jar +jcl-over-slf4j-1.7.10.jar +jdo-api-3.0.1.jar +jersey-client-1.9.jar +jersey-core-1.9.jar +jersey-guice-1.9.jar +jersey-json-1.9.jar +jersey-server-1.9.jar +jets3t-0.9.3.jar +jettison-1.1.jar +jetty-6.1.26.jar +jetty-all-7.6.0.v20120127.jar +jetty-util-6.1.26.jar +jline-2.12.jar +joda-time-2.9.jar +jodd-core-3.5.2.jar +jpam-1.1.jar +json-20090211.jar +json4s-ast_2.11-3.2.10.jar +json4s-core_2.11-3.2.10.jar +json4s-jackson_2.11-3.2.10.jar +jsr305-1.3.9.jar +jta-1.1.jar +jtransforms-2.4.0.jar +jul-to-slf4j-1.7.10.jar +kryo-2.21.jar +leveldbjni-all-1.8.jar +libfb303-0.9.2.jar +libthrift-0.9.2.jar +log4j-1.2.17.jar +lz4-1.3.0.jar +mail-1.4.7.jar +mesos-0.21.1-shaded-protobuf.jar +metrics-core-3.1.2.jar +metrics-graphite-3.1.2.jar +metrics-json-3.1.2.jar +metrics-jvm-3.1.2.jar +minlog-1.2.jar +mx4j-3.0.2.jar +netty-3.8.0.Final.jar +netty-all-4.0.29.Final.jar +objenesis-1.2.jar +opencsv-2.3.jar +oro-2.0.8.jar +paranamer-2.6.jar +parquet-column-1.7.0.jar +parquet-common-1.7.0.jar +parquet-encoding-1.7.0.jar +parquet-format-2.3.0-incubating.jar +parquet-generator-1.7.0.jar +parquet-hadoop-1.7.0.jar +parquet-hadoop-bundle-1.6.0.jar +parquet-jackson-1.7.0.jar +pmml-agent-1.2.7.jar +pmml-model-1.2.7.jar +pmml-schema-1.2.7.jar +protobuf-java-2.5.0.jar +py4j-0.9.1.jar +pyrolite-4.9.jar +reflectasm-1.07-shaded.jar +scala-compiler-2.11.7.jar +scala-library-2.11.7.jar +scala-parser-combinators_2.11-1.0.4.jar +scala-reflect-2.11.7.jar +scala-xml_2.11-1.0.2.jar +scalap-2.11.7.jar +servlet-api-2.5.jar +slf4j-api-1.7.10.jar +slf4j-log4j12-1.7.10.jar +snappy-0.2.jar +snappy-java-1.1.2.jar +spire-macros_2.11-0.7.4.jar +spire_2.11-0.7.4.jar +stax-api-1.0-2.jar +stax-api-1.0.1.jar +stream-2.7.0.jar +super-csv-2.2.0.jar +univocity-parsers-1.5.6.jar +unused-1.0.0.jar +xbean-asm5-shaded-4.4.jar +xmlenc-0.52.jar +xz-1.0.jar +zookeeper-3.4.5.jar diff --git a/dev/pr-deps/spark-deps-hadoop-2.6 b/dev/pr-deps/spark-deps-hadoop-2.6 new file mode 100644 index 0000000000000..21432a16e3659 --- /dev/null +++ b/dev/pr-deps/spark-deps-hadoop-2.6 @@ -0,0 +1,181 @@ +JavaEWAH-0.3.2.jar +RoaringBitmap-0.5.11.jar +ST4-4.0.4.jar +activation-1.1.1.jar +antlr-runtime-3.5.2.jar +aopalliance-1.0.jar +apache-log4j-extras-1.2.17.jar 
+apacheds-i18n-2.0.0-M15.jar +apacheds-kerberos-codec-2.0.0-M15.jar +api-asn1-api-1.0.0-M20.jar +api-util-1.0.0-M20.jar +arpack_combined_all-0.1.jar +asm-3.1.jar +asm-commons-3.1.jar +asm-tree-3.1.jar +avro-1.7.7.jar +avro-ipc-1.7.7-tests.jar +avro-ipc-1.7.7.jar +avro-mapred-1.7.7-hadoop2.jar +base64-2.3.8.jar +bcprov-jdk15on-1.51.jar +bonecp-0.8.0.RELEASE.jar +breeze-macros_2.11-0.11.2.jar +breeze_2.11-0.11.2.jar +calcite-avatica-1.2.0-incubating.jar +calcite-core-1.2.0-incubating.jar +calcite-linq4j-1.2.0-incubating.jar +chill-java-0.5.0.jar +chill_2.11-0.5.0.jar +commons-beanutils-1.7.0.jar +commons-beanutils-core-1.8.0.jar +commons-cli-1.2.jar +commons-codec-1.10.jar +commons-collections-3.2.2.jar +commons-compiler-2.7.6.jar +commons-compress-1.4.1.jar +commons-configuration-1.6.jar +commons-dbcp-1.4.jar +commons-digester-1.8.jar +commons-httpclient-3.1.jar +commons-io-2.4.jar +commons-lang-2.6.jar +commons-lang3-3.3.2.jar +commons-logging-1.1.3.jar +commons-math3-3.4.1.jar +commons-net-2.2.jar +commons-pool-1.5.4.jar +compress-lzf-1.0.3.jar +core-1.1.2.jar +curator-client-2.6.0.jar +curator-framework-2.6.0.jar +curator-recipes-2.6.0.jar +datanucleus-api-jdo-3.2.6.jar +datanucleus-core-3.2.10.jar +datanucleus-rdbms-3.2.9.jar +derby-10.10.1.1.jar +eigenbase-properties-1.1.5.jar +geronimo-annotation_1.0_spec-1.1.1.jar +geronimo-jaspic_1.0_spec-1.0.jar +geronimo-jta_1.1_spec-1.1.1.jar +groovy-all-2.1.6.jar +gson-2.2.4.jar +guice-3.0.jar +guice-servlet-3.0.jar +hadoop-annotations-2.6.0.jar +hadoop-auth-2.6.0.jar +hadoop-client-2.6.0.jar +hadoop-common-2.6.0.jar +hadoop-hdfs-2.6.0.jar +hadoop-mapreduce-client-app-2.6.0.jar +hadoop-mapreduce-client-common-2.6.0.jar +hadoop-mapreduce-client-core-2.6.0.jar +hadoop-mapreduce-client-jobclient-2.6.0.jar +hadoop-mapreduce-client-shuffle-2.6.0.jar +hadoop-yarn-api-2.6.0.jar +hadoop-yarn-client-2.6.0.jar +hadoop-yarn-common-2.6.0.jar +hadoop-yarn-server-common-2.6.0.jar +hadoop-yarn-server-web-proxy-2.6.0.jar +htrace-core-3.0.4.jar +httpclient-4.3.2.jar +httpcore-4.3.2.jar +ivy-2.4.0.jar +jackson-annotations-2.5.3.jar +jackson-core-2.5.3.jar +jackson-core-asl-1.9.13.jar +jackson-databind-2.5.3.jar +jackson-jaxrs-1.9.13.jar +jackson-mapper-asl-1.9.13.jar +jackson-module-scala_2.11-2.5.3.jar +jackson-xc-1.9.13.jar +janino-2.7.8.jar +java-xmlbuilder-1.0.jar +javax.inject-1.jar +javax.servlet-3.0.0.v201112011016.jar +javolution-5.5.1.jar +jaxb-api-2.2.2.jar +jaxb-impl-2.2.3-1.jar +jcl-over-slf4j-1.7.10.jar +jdo-api-3.0.1.jar +jersey-client-1.9.jar +jersey-core-1.9.jar +jersey-guice-1.9.jar +jersey-json-1.9.jar +jersey-server-1.9.jar +jets3t-0.9.3.jar +jettison-1.1.jar +jetty-6.1.26.jar +jetty-all-7.6.0.v20120127.jar +jetty-util-6.1.26.jar +jline-2.12.jar +joda-time-2.9.jar +jodd-core-3.5.2.jar +jpam-1.1.jar +json-20090211.jar +json4s-ast_2.11-3.2.10.jar +json4s-core_2.11-3.2.10.jar +json4s-jackson_2.11-3.2.10.jar +jsr305-1.3.9.jar +jta-1.1.jar +jtransforms-2.4.0.jar +jul-to-slf4j-1.7.10.jar +kryo-2.21.jar +leveldbjni-all-1.8.jar +libfb303-0.9.2.jar +libthrift-0.9.2.jar +log4j-1.2.17.jar +lz4-1.3.0.jar +mail-1.4.7.jar +mesos-0.21.1-shaded-protobuf.jar +metrics-core-3.1.2.jar +metrics-graphite-3.1.2.jar +metrics-json-3.1.2.jar +metrics-jvm-3.1.2.jar +minlog-1.2.jar +mx4j-3.0.2.jar +netty-3.8.0.Final.jar +netty-all-4.0.29.Final.jar +objenesis-1.2.jar +opencsv-2.3.jar +oro-2.0.8.jar +paranamer-2.6.jar +parquet-column-1.7.0.jar +parquet-common-1.7.0.jar +parquet-encoding-1.7.0.jar +parquet-format-2.3.0-incubating.jar +parquet-generator-1.7.0.jar 
+parquet-hadoop-1.7.0.jar +parquet-hadoop-bundle-1.6.0.jar +parquet-jackson-1.7.0.jar +pmml-agent-1.2.7.jar +pmml-model-1.2.7.jar +pmml-schema-1.2.7.jar +protobuf-java-2.5.0.jar +py4j-0.9.1.jar +pyrolite-4.9.jar +reflectasm-1.07-shaded.jar +scala-compiler-2.11.7.jar +scala-library-2.11.7.jar +scala-parser-combinators_2.11-1.0.4.jar +scala-reflect-2.11.7.jar +scala-xml_2.11-1.0.2.jar +scalap-2.11.7.jar +servlet-api-2.5.jar +slf4j-api-1.7.10.jar +slf4j-log4j12-1.7.10.jar +snappy-0.2.jar +snappy-java-1.1.2.jar +spire-macros_2.11-0.7.4.jar +spire_2.11-0.7.4.jar +stax-api-1.0-2.jar +stax-api-1.0.1.jar +stream-2.7.0.jar +super-csv-2.2.0.jar +univocity-parsers-1.5.6.jar +unused-1.0.0.jar +xbean-asm5-shaded-4.4.jar +xercesImpl-2.9.1.jar +xmlenc-0.52.jar +xz-1.0.jar +zookeeper-3.4.6.jar diff --git a/dev/pr-deps/spark-deps-hadoop-2.7 b/dev/pr-deps/spark-deps-hadoop-2.7 new file mode 100644 index 0000000000000..20e09cd002635 --- /dev/null +++ b/dev/pr-deps/spark-deps-hadoop-2.7 @@ -0,0 +1,182 @@ +JavaEWAH-0.3.2.jar +RoaringBitmap-0.5.11.jar +ST4-4.0.4.jar +activation-1.1.1.jar +antlr-runtime-3.5.2.jar +aopalliance-1.0.jar +apache-log4j-extras-1.2.17.jar +apacheds-i18n-2.0.0-M15.jar +apacheds-kerberos-codec-2.0.0-M15.jar +api-asn1-api-1.0.0-M20.jar +api-util-1.0.0-M20.jar +arpack_combined_all-0.1.jar +asm-3.1.jar +asm-commons-3.1.jar +asm-tree-3.1.jar +avro-1.7.7.jar +avro-ipc-1.7.7-tests.jar +avro-ipc-1.7.7.jar +avro-mapred-1.7.7-hadoop2.jar +base64-2.3.8.jar +bcprov-jdk15on-1.51.jar +bonecp-0.8.0.RELEASE.jar +breeze-macros_2.11-0.11.2.jar +breeze_2.11-0.11.2.jar +calcite-avatica-1.2.0-incubating.jar +calcite-core-1.2.0-incubating.jar +calcite-linq4j-1.2.0-incubating.jar +chill-java-0.5.0.jar +chill_2.11-0.5.0.jar +commons-beanutils-1.7.0.jar +commons-beanutils-core-1.8.0.jar +commons-cli-1.2.jar +commons-codec-1.10.jar +commons-collections-3.2.2.jar +commons-compiler-2.7.6.jar +commons-compress-1.4.1.jar +commons-configuration-1.6.jar +commons-dbcp-1.4.jar +commons-digester-1.8.jar +commons-httpclient-3.1.jar +commons-io-2.4.jar +commons-lang-2.6.jar +commons-lang3-3.3.2.jar +commons-logging-1.1.3.jar +commons-math3-3.4.1.jar +commons-net-2.2.jar +commons-pool-1.5.4.jar +compress-lzf-1.0.3.jar +core-1.1.2.jar +curator-client-2.6.0.jar +curator-framework-2.6.0.jar +curator-recipes-2.6.0.jar +datanucleus-api-jdo-3.2.6.jar +datanucleus-core-3.2.10.jar +datanucleus-rdbms-3.2.9.jar +derby-10.10.1.1.jar +eigenbase-properties-1.1.5.jar +geronimo-annotation_1.0_spec-1.1.1.jar +geronimo-jaspic_1.0_spec-1.0.jar +geronimo-jta_1.1_spec-1.1.1.jar +groovy-all-2.1.6.jar +gson-2.2.4.jar +guice-3.0.jar +guice-servlet-3.0.jar +hadoop-annotations-2.7.0.jar +hadoop-auth-2.7.0.jar +hadoop-client-2.7.0.jar +hadoop-common-2.7.0.jar +hadoop-hdfs-2.7.0.jar +hadoop-mapreduce-client-app-2.7.0.jar +hadoop-mapreduce-client-common-2.7.0.jar +hadoop-mapreduce-client-core-2.7.0.jar +hadoop-mapreduce-client-jobclient-2.7.0.jar +hadoop-mapreduce-client-shuffle-2.7.0.jar +hadoop-yarn-api-2.7.0.jar +hadoop-yarn-client-2.7.0.jar +hadoop-yarn-common-2.7.0.jar +hadoop-yarn-server-common-2.7.0.jar +hadoop-yarn-server-web-proxy-2.7.0.jar +htrace-core-3.1.0-incubating.jar +httpclient-4.3.2.jar +httpcore-4.3.2.jar +ivy-2.4.0.jar +jackson-annotations-2.5.3.jar +jackson-core-2.5.3.jar +jackson-core-asl-1.9.13.jar +jackson-databind-2.5.3.jar +jackson-jaxrs-1.9.13.jar +jackson-mapper-asl-1.9.13.jar +jackson-module-scala_2.11-2.5.3.jar +jackson-xc-1.9.13.jar +janino-2.7.8.jar +java-xmlbuilder-1.0.jar +javax.inject-1.jar 
+javax.servlet-3.0.0.v201112011016.jar +javolution-5.5.1.jar +jaxb-api-2.2.2.jar +jaxb-impl-2.2.3-1.jar +jcl-over-slf4j-1.7.10.jar +jdo-api-3.0.1.jar +jersey-client-1.9.jar +jersey-core-1.9.jar +jersey-guice-1.9.jar +jersey-json-1.9.jar +jersey-server-1.9.jar +jets3t-0.9.3.jar +jettison-1.1.jar +jetty-6.1.26.jar +jetty-all-7.6.0.v20120127.jar +jetty-util-6.1.26.jar +jline-2.12.jar +joda-time-2.9.jar +jodd-core-3.5.2.jar +jpam-1.1.jar +json-20090211.jar +json4s-ast_2.11-3.2.10.jar +json4s-core_2.11-3.2.10.jar +json4s-jackson_2.11-3.2.10.jar +jsp-api-2.1.jar +jsr305-1.3.9.jar +jta-1.1.jar +jtransforms-2.4.0.jar +jul-to-slf4j-1.7.10.jar +kryo-2.21.jar +leveldbjni-all-1.8.jar +libfb303-0.9.2.jar +libthrift-0.9.2.jar +log4j-1.2.17.jar +lz4-1.3.0.jar +mail-1.4.7.jar +mesos-0.21.1-shaded-protobuf.jar +metrics-core-3.1.2.jar +metrics-graphite-3.1.2.jar +metrics-json-3.1.2.jar +metrics-jvm-3.1.2.jar +minlog-1.2.jar +mx4j-3.0.2.jar +netty-3.8.0.Final.jar +netty-all-4.0.29.Final.jar +objenesis-1.2.jar +opencsv-2.3.jar +oro-2.0.8.jar +paranamer-2.6.jar +parquet-column-1.7.0.jar +parquet-common-1.7.0.jar +parquet-encoding-1.7.0.jar +parquet-format-2.3.0-incubating.jar +parquet-generator-1.7.0.jar +parquet-hadoop-1.7.0.jar +parquet-hadoop-bundle-1.6.0.jar +parquet-jackson-1.7.0.jar +pmml-agent-1.2.7.jar +pmml-model-1.2.7.jar +pmml-schema-1.2.7.jar +protobuf-java-2.5.0.jar +py4j-0.9.1.jar +pyrolite-4.9.jar +reflectasm-1.07-shaded.jar +scala-compiler-2.11.7.jar +scala-library-2.11.7.jar +scala-parser-combinators_2.11-1.0.4.jar +scala-reflect-2.11.7.jar +scala-xml_2.11-1.0.2.jar +scalap-2.11.7.jar +servlet-api-2.5.jar +slf4j-api-1.7.10.jar +slf4j-log4j12-1.7.10.jar +snappy-0.2.jar +snappy-java-1.1.2.jar +spire-macros_2.11-0.7.4.jar +spire_2.11-0.7.4.jar +stax-api-1.0-2.jar +stax-api-1.0.1.jar +stream-2.7.0.jar +super-csv-2.2.0.jar +univocity-parsers-1.5.6.jar +unused-1.0.0.jar +xbean-asm5-shaded-4.4.jar +xercesImpl-2.9.1.jar +xmlenc-0.52.jar +xz-1.0.jar +zookeeper-3.4.6.jar diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Phrase.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Phrase.scala new file mode 100644 index 0000000000000..3488e06e0ea6b --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Phrase.scala @@ -0,0 +1,255 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.spark.ml.feature
+
+import scala.util.matching.Regex
+
+import org.apache.hadoop.fs.Path
+
+import org.apache.spark.annotation.{Experimental, Since}
+import org.apache.spark.ml.{Estimator, Model}
+import org.apache.spark.ml.param.{DoubleParam, IntParam, ParamMap, Params}
+import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
+import org.apache.spark.ml.util._
+import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.types.{StringType, StructField, StructType}
+
+/**
+ * Params for [[Word2Phrase]] and [[Word2PhraseModel]].
+ */
+private[feature] trait Word2PhraseParams extends Params with HasInputCol with HasOutputCol {
+
+  /**
+   * Discount subtracted from the bigram count when scoring, so that very rare
+   * bigrams are not promoted to phrases.
+   * Default: 100
+   * @group param
+   */
+  val delta: IntParam = new IntParam(this, "delta",
+    "discount subtracted from the bigram count when scoring")
+
+  /** @group getParam */
+  def getDelta: Int = $(delta)
+
+  /**
+   * Minimum number of occurrences of each constituent word before a bigram is scored.
+   * Default: 5
+   * @group param
+   */
+  val minWords: IntParam = new IntParam(this, "minWords",
+    "minimum number of occurrences of a word before it is counted")
+
+  /** @group getParam */
+  def getMinWords: Int = $(minWords)
+
+  /**
+   * Minimum score for a bigram to be turned into a phrase.
+   * Default: 0.00001
+   * @group param
+   */
+  val threshold: DoubleParam = new DoubleParam(this, "threshold",
+    "score threshold")
+
+  /** @group getParam */
+  def getThreshold: Double = $(threshold)
+
+  /** Validates and transforms the input schema. */
+  protected def validateAndTransformSchema(schema: StructType): StructType = {
+    validateParams()
+    val inputType = schema($(inputCol)).dataType
+    require(inputType == StringType,
+      s"Input column ${$(inputCol)} must be a string column but was $inputType.")
+    require(!schema.fieldNames.contains($(outputCol)),
+      s"Output column ${$(outputCol)} already exists.")
+    val outputFields = schema.fields :+ StructField($(outputCol), StringType, nullable = false)
+    StructType(outputFields)
+  }
+}
+
+/**
+ * :: Experimental ::
+ * Estimator for word2phrase: learns which bigrams in a text column occur together often
+ * enough, relative to the frequency of their constituent words, to be joined into single
+ * phrase tokens.
+ */
+@Experimental
+class Word2Phrase(override val uid: String)
+  extends Estimator[Word2PhraseModel] with Word2PhraseParams with DefaultParamsWritable {
+
+  def this() = this(Identifiable.randomUID("deltathresholdScal"))
+
+  setDefault(delta -> 100, threshold -> 0.00001, minWords -> 5)
+
+  /** @group setParam */
+  def setInputCol(value: String): this.type = set(inputCol, value)
+
+  /** @group setParam */
+  def setOutputCol(value: String): this.type = set(outputCol, value)
+
+  /** @group setParam */
+  def setDelta(value: Int): this.type = set(delta, value)
+
+  /** @group setParam */
+  def setThreshold(value: Double): this.type = set(threshold, value)
+
+  /** @group setParam */
+  def setMinWords(value: Int): this.type = set(minWords, value)
+
+  override def fit(dataset: DataFrame): Word2PhraseModel = {
+
+    import dataset.sqlContext.implicits._
+    val sqlContext = dataset.sqlContext
+
+    val tokenizer = new RegexTokenizer().setInputCol($(inputCol)).setOutputCol("words")
+      .setPattern("\\W")
+    val wordsData = tokenizer.transform(dataset)
+
+    val inputColName = $(inputCol)
+    val ind = wordsData.select(inputColName)
+
+    // count the number of times each word appears
+    val counts = ind.rdd.flatMap(line =>
+      line(0).asInstanceOf[String].toLowerCase.split("\\s+"))
+      .map(word => (word, 1)).reduceByKey(_ + _).toDF("word", "count")
+    val wordCountName = Identifiable.randomUID("wc")
+    counts.registerTempTable(wordCountName)
+
+    val ngram = new NGram().setInputCol("words").setOutputCol("ngrams")
+    val ngramDataFrame = ngram.transform(wordsData)
+
+    // count the bigrams (w1 w2, w2 w3, ...)
+    val biGramCount = ngramDataFrame.select("ngrams").rdd.flatMap(line =>
+      line(0).asInstanceOf[Seq[String]]).map(word => (word, 1)).reduceByKey(_ + _)
+      .toDF("biGram", "count")
+    val biGramCountName = Identifiable.randomUID("bgc")
+    biGramCount.registerTempTable(biGramCountName)
+
+    // score each bigram: score(w1, w2) = (count(w1 w2) - delta) / (count(w1) * count(w2)),
+    // ignoring bigrams whose constituent words occur fewer than minWords times
+    val deltaA = $(delta)
+    val minWordsA = $(minWords)
+    val biGramScoresName = Identifiable.randomUID("bgs")
+    sqlContext.sql(s"""select biGram, (bigram_count - $deltaA)/(word1_count * word2_count)
+      as score from (select biGrams.biGram as biGram, biGrams.count as bigram_count,
+      wc1.word as word1, wc1.count as word1_count, wc2.word as word2,
+      wc2.count as word2_count from (select biGram, count, split(biGram,' ')[0] as word1,
+      split(biGram,' ')[1] as word2 from $biGramCountName) biGrams inner join $wordCountName
+      as wc1 on (wc1.word = biGrams.word1) inner join $wordCountName as wc2 on
+      (wc2.word = biGrams.word2)) biGramsStats where word2_count > $minWordsA and
+      word1_count > $minWordsA order by score desc""").registerTempTable(biGramScoresName)
+
+    // keep only the bigrams whose score exceeds the threshold
+    val thresholdA = $(threshold)
+    val biGrams = sqlContext.sql(s"""select biGram from $biGramScoresName where
+      score > $thresholdA""").collect()
+    val bigramList = biGrams.map(row => row(0).toString)
+
+    copyValues(new Word2PhraseModel(uid, bigramList).setParent(this))
+  }
+
+  override def transformSchema(schema: StructType): StructType = {
+    validateAndTransformSchema(schema)
+  }
+
+  override def copy(extra: ParamMap): Word2Phrase = defaultCopy(extra)
+}
+
+@Since("1.6.0")
+object Word2Phrase extends DefaultParamsReadable[Word2Phrase] {
+
+  @Since("1.6.0")
+  override def load(path: String): Word2Phrase = super.load(path)
+}
+
+/**
+ * :: Experimental ::
+ * Model fitted by [[Word2Phrase]].
+ */
+@Experimental
+class Word2PhraseModel private[ml] (
+    override val uid: String,
+    val bigramList: Array[String])
+  extends Model[Word2PhraseModel] with Word2PhraseParams with MLWritable {
+
+  import Word2PhraseModel._
+
+  /** @group setParam */
+  def setInputCol(value: String): this.type = set(inputCol, value)
+
+  /** @group setParam */
+  def setOutputCol(value: String): this.type = set(outputCol, value)
+
+  override def transform(dataset: DataFrame): DataFrame = {
+    // Replace every learned bigram (case-insensitively) with the two words joined by "_".
+    val mapBiGrams = udf((t: String) => bigramList.foldLeft(t) { case (z, r) =>
+      z.replaceAll("(?i)" + Regex.quote(r), r.split(" ").mkString("_"))
+    })
+    dataset.withColumn($(outputCol), mapBiGrams(dataset($(inputCol))))
+  }
+
+  override def transformSchema(schema: StructType): StructType = {
+    validateAndTransformSchema(schema)
+  }
+
+  override def copy(extra: ParamMap): Word2PhraseModel = {
+    val copied = new Word2PhraseModel(uid, bigramList)
+    copyValues(copied, extra).setParent(parent)
+  }
+
+  @Since("1.6.0")
+  override def write: MLWriter = new Word2PhraseModelWriter(this)
+}
+
+@Since("1.6.0")
+object Word2PhraseModel extends MLReadable[Word2PhraseModel] {
+
+  private[Word2PhraseModel]
+  class Word2PhraseModelWriter(instance: Word2PhraseModel) extends MLWriter {
+
+    private case class Data(bigramList: Seq[String])
+
+    override protected def saveImpl(path: String): Unit = {
+      DefaultParamsWriter.saveMetadata(instance, path, sc)
+      val data = Data(instance.bigramList)
+      val dataPath = new Path(path, "data").toString
+      sqlContext.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath)
+    }
+  }
+
+  private class Word2PhraseModelReader extends MLReader[Word2PhraseModel] {
+
+    private val className = classOf[Word2PhraseModel].getName
+
+    override def load(path: String): Word2PhraseModel = {
+      val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
+      val dataPath = new Path(path, "data").toString
+      val data = sqlContext.read.parquet(dataPath).select("bigramList").head()
+      val bigramList = data.getAs[Seq[String]](0).toArray
+      val model = new Word2PhraseModel(metadata.uid, bigramList)
+      DefaultParamsReader.getAndSetParams(model, metadata)
+      model
+    }
+  }
+
+  @Since("1.6.0")
+  override def read: MLReader[Word2PhraseModel] = new Word2PhraseModelReader
+
+  @Since("1.6.0")
+  override def load(path: String): Word2PhraseModel = super.load(path)
+}
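For reviewers, a minimal usage sketch of the new estimator (not part of the patch). The toy corpus and the `id`/`text`/`phrased` column names are made up, the permissive parameter values are similar to those used in the test suite below, and a `SQLContext` named `sqlContext` is assumed to be in scope:

```scala
import org.apache.spark.ml.feature.Word2Phrase

// Toy corpus; any DataFrame with a string column works.
val docs = sqlContext.createDataFrame(Seq(
  (0, "i heard about spark and i heard about hadoop"),
  (1, "i heard about spark at work")
)).toDF("id", "text")

val word2Phrase = new Word2Phrase()
  .setInputCol("text")
  .setOutputCol("phrased")
  .setDelta(0)        // no discount on bigram counts
  .setMinWords(0)     // consider every word, however rare
  .setThreshold(-1.0) // accept every scored bigram (toy setting)

// fit() learns the bigram list; transform() rewrites e.g. "i heard" -> "i_heard".
val model = word2Phrase.fit(docs)
model.transform(docs).select("phrased").show(false)
```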
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2PhraseSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2PhraseSuite.scala
new file mode 100644
index 0000000000000..d65fbf207d375
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2PhraseSuite.scala
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.ml.util.DefaultReadWriteTest
+import org.apache.spark.mllib.util.MLlibTestSparkContext
+
+class Word2PhraseSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {
+
+  test("Word2Phrase trained model and set functions") {
+
+    val wordDataFrame = sqlContext.createDataFrame(Seq(
+      (0, "Hi I heard about Spark"),
+      (1, "I heard Java could use case classes"),
+      (2, "I heard Logistic regression models are neat")
+    )).toDF("label", "inputWords")
+
+    // Permissive settings so that every bigram in the tiny corpus becomes a phrase.
+    val t = new Word2Phrase().setInputCol("inputWords").setOutputCol("out")
+    t.setDelta(0)
+    t.setThreshold(-100)
+    t.setMinWords(0)
+
+    val model = t.fit(wordDataFrame)
+    val actualDf = model.transform(wordDataFrame)
+
+    val expectedDf = sqlContext.createDataFrame(Seq(
+      (0, "hi_i_heard_about_spark"),
+      (1, "i_heard_java_could_use_case_classes"),
+      (2, "i_heard_logistic_regression_models_are_neat")
+    )).toDF("label", "bigrams")
+
+    val expected = expectedDf.map(row => row(1).toString).collect()
+    val actual = actualDf.map(row => row(2).toString).collect()
+
+    assert(expected.deep == actual.deep)
+  }
+
+  test("Word2PhraseModel read/write") {
+
+    val wordDataFrame = sqlContext.createDataFrame(Seq(
+      (0, "Hi I heard about Spark"),
+      (1, "I wish Java could use case classes"),
+      (2, "Logistic regression models are neat")
+    )).toDF("label", "inputWords")
+
+    val t = new Word2Phrase().setInputCol("inputWords").setOutputCol("out")
+
+    val model = t.fit(wordDataFrame)
+
+    val newInstance = testDefaultReadWrite(model)
+    assert(newInstance.bigramList === model.bigramList)
+  }
+}
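A sketch of explicit save/load for a fitted model, complementing the `testDefaultReadWrite` round-trip above. It assumes `wordDataFrame` is the DataFrame defined in the read/write test, and the output path is illustrative only:

```scala
val model = new Word2Phrase()
  .setInputCol("inputWords")
  .setOutputCol("out")
  .fit(wordDataFrame)

// Persist the Params plus the learned bigram list (written as Parquet under <path>/data) ...
model.write.overwrite().save("/tmp/word2phrase-model")

// ... and restore an equivalent model later.
val restored = Word2PhraseModel.load("/tmp/word2phrase-model")
assert(restored.bigramList.sameElements(model.bigramList))
```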