From 2e5fb77da13f53c834361d50a806f170ae2b4c83 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 19 Sep 2019 19:24:33 +0800 Subject: [PATCH 1/2] run sqlflow with hive --- doc/run_with_hive.md | 51 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 doc/run_with_hive.md diff --git a/doc/run_with_hive.md b/doc/run_with_hive.md new file mode 100644 index 0000000000..188a359e42 --- /dev/null +++ b/doc/run_with_hive.md @@ -0,0 +1,51 @@ +# Run SQLFlow with Hive via HiveServer2 + +This is a tutorial on how to run SQLFlow which connects to the hive server2. + +For the most production environment, the system administrators may setup hive server with [authentication configuration](https://cwiki.apache.org/confluence/display/Hive/Setting+Up+HiveServer2#SettingUpHiveServer2-Authentication/SecurityConfiguration): e.g. KERBEROS, LDAP, PAM or CUSTOM. + +## Connect Hive Server wih No SASL + +Launch your standalone hive server Docker container by running: + +``` bash +> docker run -d -p 8888:8888 --name=hive sqlflow/gohive:dev python3 -m http.server 8899 +``` + +This implies settings in `hive-site.xml`: + +``` text +hive.server2.authentication = NOSASL +``` + +Test SQLFlow by running a query in Jupyter Notebook + +``` bash +> docker run --rm --net=container:hive sqlflow/sqlflow \ +bash -c "sqlflowserver --datasource='hive://root:root@localhost:10000/' & +SQLFLOW_SERVER=localhost:50051 jupyter notebook --ip=0.0.0.0 --port=8888 --allow-root --NotebookApp.token=''" +``` + +## Connect Hive Server with PLAIN SASL + +This section would use the [PAM](https://cwiki.apache.org/confluence/display/Hive/Setting+Up+HiveServer2#SettingUpHiveServer2-PluggableAuthenticationModules(PAM)) authentication to do the demonstration. 
+ +Launch your standalone hive server Docker container with enable the PAM authentication: + +``` bash +> docker run -d -e WITH_HS2_PAM_AUTH=ON -p 8888:8888 --name=hive sqlflow/gohive:dev python3 -m http.server 8899 +``` + +This implies settings in `hive-site.xml`: + +``` text +hive.server2.authentication = PAM +``` + +Test SQLFlow by running a query in Jupyter Notebook: + +``` bash +> docker run --rm --net=container:hive sqlflow/sqlflow \ +bash -c "sqlflowserver --datasource='hive://sqlflow:sqlflow@localhost:10000/?auth=PLAIN' & +SQLFLOW_SERVER=localhost:50051 jupyter notebook --ip=0.0.0.0 --port=8888 --allow-root --NotebookApp.token=''" +``` From e52c83f858f6c2c1c17d0072672cb0342392641f Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Fri, 20 Sep 2019 19:21:01 +0800 Subject: [PATCH 2/2] add how sqlflow connects with hive tutorial --- doc/run_with_hive.md | 53 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 11 deletions(-) diff --git a/doc/run_with_hive.md b/doc/run_with_hive.md index 188a359e42..e9f81e6942 100644 --- a/doc/run_with_hive.md +++ b/doc/run_with_hive.md @@ -1,24 +1,54 @@ -# Run SQLFlow with Hive via HiveServer2 +# How SQLFlow connects with Hive -This is a tutorial on how to run SQLFlow which connects to the hive server2. +This document is a tutorial on how SQLFlow connects Hive via [HiveServer2](https://cwiki.apache.org/confluence/display/Hive/HiveServer2+Overview). -For the most production environment, the system administrators may setup hive server with [authentication configuration](https://cwiki.apache.org/confluence/display/Hive/Setting+Up+HiveServer2#SettingUpHiveServer2-Authentication/SecurityConfiguration): e.g. KERBEROS, LDAP, PAM or CUSTOM. 
+## Connect Existing Hive server -## Connect Hive Server wih No SASL +To connect an existing Hive server instance, we only need to configure a `datasource` string in the format of + +``` text +hive://user:password@ip:port/dbname[?auth=<auth_mechanism>&session.<key1>=<value1>...&session.<keyN>=<valueN>] +``` + +In the above format, + +- `user:password` is the username and password of hiveserver2. +- `ip:port` is the endpoint which the hiveserver2 instance listens on. +- `dbname` is the default database name. +- `auth_mechanism` is the authentication mechanism of hiveserver2; it can be `NOSASL` for insecure transport or `PLAIN` for SASL transport. +- parameters with the prefix `session.` are the session configuration of the Hive Thrift API; for example, `session.mapreduce_job_queuename=mr` implies `mapreduce.job.queuename=mr`. + +You can find more details at [gohive](https://sql-machine-learning.github.io/doc_index/gohive.html). + +Using the `datasource` string, you can launch an all-in-one Docker container by running: + +``` bash +docker run --rm -p 8888:8888 sqlflow/sqlflow bash -c \ +"sqlflowserver --datasource='hive://root:root@localhost:10000/iris' & +SQLFLOW_SERVER=localhost:50051 jupyter notebook --ip=0.0.0.0 --port=8888 --allow-root --NotebookApp.token=''" +``` + +Then you can open a web browser and go to `localhost:8888`. There are many SQLFlow tutorials, e.g. `tutorial_dnn_iris.ipynb`. You can follow the tutorials and substitute the data for your own use. + +## Connect standalone Hive server for testing + +We also pack a standalone Hive server Docker image for testing. 
+ +### Connect Hive server with NOSASL Transport Launch your standalone hive server Docker container by running: ``` bash -> docker run -d -p 8888:8888 --name=hive sqlflow/gohive:dev python3 -m http.server 8899 +> docker run -d -p 8888:8888 --name=hive sqlflow/gohive:dev ``` This implies settings in `hive-site.xml`: ``` text -hive.server2.authentication = NOSASL +hive.server2.authentication=NOSASL ``` -Test SQLFlow by running a query in Jupyter Notebook +Test SQLFlow by running the tutorials in Jupyter Notebook: ``` bash > docker run --rm --net=container:hive sqlflow/sqlflow \ @@ -26,23 +56,24 @@ bash -c "sqlflowserver --datasource='hive://root:root@localhost:10000/' & SQLFLOW_SERVER=localhost:50051 jupyter notebook --ip=0.0.0.0 --port=8888 --allow-root --NotebookApp.token=''" ``` -## Connect Hive Server with PLAIN SASL +### Connect Hive server with PLAIN SASL Transport This section would use the [PAM](https://cwiki.apache.org/confluence/display/Hive/Setting+Up+HiveServer2#SettingUpHiveServer2-PluggableAuthenticationModules(PAM)) authentication to do the demonstration. Launch your standalone hive server Docker container with enable the PAM authentication: ``` bash -> docker run -d -e WITH_HS2_PAM_AUTH=ON -p 8888:8888 --name=hive sqlflow/gohive:dev python3 -m http.server 8899 +> docker run -d -e WITH_HS2_PAM_AUTH=ON -p 8888:8888 --name=hive sqlflow/gohive:dev ``` This implies settings in `hive-site.xml`: ``` text -hive.server2.authentication = PAM +hive.server2.authentication=PAM +hive.server2.authentication.pam.services=login,sshd ``` -Test SQLFlow by running a query in Jupyter Notebook: +Test SQLFlow by running the tutorials in Jupyter Notebook: ``` bash > docker run --rm --net=container:hive sqlflow/sqlflow \