use spark-submit to run spark interpreter process when SPARK_HOME is defined

This commit is contained in:
Lee moon soo 2015-09-01 19:45:07 -07:00
parent 5de01c6800
commit 2d27e9cb7d
4 changed files with 73 additions and 86 deletions

View file

@ -124,9 +124,7 @@ If you set `SPARK_HOME`, you should deploy spark binary on the same location to
Yarn
# ./conf/zeppelin-env.sh
export HADOOP_CONF_DIR=/path/to/hadoop_conf_dir
`HADOOP_CONF_DIR` should contains yarn-site.xml and core-site.xml.
export SPARK_HOME=/path/to/spark_dir
### Run
./bin/zeppelin-daemon.sh start

View file

@ -72,77 +72,55 @@ fi
# set spark related env variables
if [[ "${INTERPRETER_ID}" == "spark" ]]; then
# add Hadoop jars into classpath
if [[ -n "${HADOOP_HOME}" ]]; then
# Apache
addEachJarInDir "${HADOOP_HOME}/share"
# CDH
addJarInDir "${HADOOP_HOME}"
addJarInDir "${HADOOP_HOME}/lib"
fi
# autodetect HADOOP_CONF_HOME by heuristic
if [[ -n "${HADOOP_HOME}" ]] && [[ -z "${HADOOP_CONF_DIR}" ]]; then
if [[ -d "${HADOOP_HOME}/etc/hadoop" ]]; then
export HADOOP_CONF_DIR="${HADOOP_HOME}/etc/hadoop"
elif [[ -d "/etc/hadoop/conf" ]]; then
export HADOOP_CONF_DIR="/etc/hadoop/conf"
fi
fi
if [[ -n "${HADOOP_CONF_DIR}" ]] && [[ -d "${HADOOP_CONF_DIR}" ]]; then
ZEPPELIN_CLASSPATH+=":${HADOOP_CONF_DIR}"
fi
# add Spark jars into classpath
if [[ -n "${SPARK_HOME}" ]]; then
addJarInDir "${SPARK_HOME}/lib"
PYSPARKPATH="${SPARK_HOME}/python:${SPARK_HOME}/python/lib/pyspark.zip:${SPARK_HOME}/python/lib/py4j-0.8.2.1-src.zip"
SPARK_SUBMIT="${SPARK_HOME}/bin/spark-submit"
SPARK_APP_JAR="$(ls ${ZEPPELIN_HOME}/interpreter/spark/zeppelin-spark*.jar)"
else
# add Hadoop jars into classpath
if [[ -n "${HADOOP_HOME}" ]]; then
# Apache
addEachJarInDir "${HADOOP_HOME}/share"
# CDH
addJarInDir "${HADOOP_HOME}"
addJarInDir "${HADOOP_HOME}/lib"
fi
addJarInDir "${INTERPRETER_DIR}/dep"
PYSPARKPATH="${ZEPPELIN_HOME}/interpreter/spark/pyspark/pyspark.zip:${ZEPPELIN_HOME}/interpreter/spark/pyspark/py4j-0.8.2.1-src.zip"
fi
# autodetect SPARK_CONF_DIR
if [[ -n "${SPARK_HOME}" ]] && [[ -z "${SPARK_CONF_DIR}" ]]; then
if [[ -d "${SPARK_HOME}/conf" ]]; then
SPARK_CONF_DIR="${SPARK_HOME}/conf"
if [[ -z "${PYTHONPATH}" ]]; then
export PYTHONPATH="${PYSPARKPATH}"
else
export PYTHONPATH="${PYTHONPATH}:${PYSPARKPATH}"
fi
fi
unset PYSPARKPATH
# read spark-*.conf if exists
if [[ -d "${SPARK_CONF_DIR}" ]]; then
ls ${SPARK_CONF_DIR}/spark-*.conf > /dev/null 2>&1
if [[ "$?" -eq 0 ]]; then
for file in ${SPARK_CONF_DIR}/spark-*.conf; do
while read -r line; do
echo "${line}" | grep -e "^spark[.]" > /dev/null
if [ "$?" -ne 0 ]; then
# skip the line not started with 'spark.'
continue;
fi
SPARK_CONF_KEY=`echo "${line}" | sed -e 's/\(^spark[^ ]*\)[ \t]*\(.*\)/\1/g'`
SPARK_CONF_VALUE=`echo "${line}" | sed -e 's/\(^spark[^ ]*\)[ \t]*\(.*\)/\2/g'`
export ZEPPELIN_JAVA_OPTS+=" -D${SPARK_CONF_KEY}=\"${SPARK_CONF_VALUE}\""
done < "${file}"
done
# autodetect HADOOP_CONF_HOME by heuristic
if [[ -n "${HADOOP_HOME}" ]] && [[ -z "${HADOOP_CONF_DIR}" ]]; then
if [[ -d "${HADOOP_HOME}/etc/hadoop" ]]; then
export HADOOP_CONF_DIR="${HADOOP_HOME}/etc/hadoop"
elif [[ -d "/etc/hadoop/conf" ]]; then
export HADOOP_CONF_DIR="/etc/hadoop/conf"
fi
fi
fi
if [[ -z "${PYTHONPATH}" ]]; then
export PYTHONPATH="${PYSPARKPATH}"
else
export PYTHONPATH="${PYTHONPATH}:${PYSPARKPATH}"
fi
if [[ -n "${HADOOP_CONF_DIR}" ]] && [[ -d "${HADOOP_CONF_DIR}" ]]; then
ZEPPELIN_CLASSPATH+=":${HADOOP_CONF_DIR}"
fi
unset PYSPARKPATH
export SPARK_CLASSPATH+=":${ZEPPELIN_CLASSPATH}"
fi
fi
export SPARK_CLASSPATH+=":${ZEPPELIN_CLASSPATH}"
CLASSPATH+=":${ZEPPELIN_CLASSPATH}"
${ZEPPELIN_RUNNER} ${JAVA_INTP_OPTS} -cp ${CLASSPATH} ${ZEPPELIN_SERVER} ${PORT} &
if [[ -n "${SPARK_SUBMIT}" ]]; then
${SPARK_SUBMIT} --class ${ZEPPELIN_SERVER} --driver-class-path "${CLASSPATH}" --driver-java-options "${JAVA_INTP_OPTS}" ${SPARK_SUBMIT_OPTIONS} ${SPARK_APP_JAR} ${PORT} &
else
${ZEPPELIN_RUNNER} ${JAVA_INTP_OPTS} -cp ${CLASSPATH} ${ZEPPELIN_SERVER} ${PORT} &
fi
pid=$!
if [[ -z "${pid}" ]]; then
return 1;

View file

@ -33,14 +33,8 @@
# export ZEPPELIN_IDENT_STRING # A string representing this instance of zeppelin. $USER by default.
# export ZEPPELIN_NICENESS # The scheduling priority for daemons. Defaults to 0.
# export ZEPPELIN_SPARK_USEHIVECONTEXT # Use HiveContext instead of SQLContext if set true. true by default.
# export ZEPPELIN_SPARK_CONCURRENTSQL # Execute multiple SQL concurrently if set true. false by default.
# export ZEPPELIN_SPARK_MAXRESULT # Max number of SparkSQL result to display. 1000 by default.
# Options read in YARN client mode
# export HADOOP_CONF_DIR # yarn-site.xml is located in configuration directory in HADOOP_CONF_DIR.
# Pyspark (supported with Spark 1.2.1 and above)
# To configure pyspark, you need to set spark distribution's path to 'spark.home' property in Interpreter setting screen in Zeppelin GUI
# export PYSPARK_PYTHON # path to the python command. must be the same path on the driver(Zeppelin) and all workers.
# export PYTHONPATH # extra PYTHONPATH.
## Spark configuration
# export SPARK_HOME # When it is defined, load it instead of Zeppelin embedded Spark libraries
# export SPARK_SUBMIT_OPTIONS # options to pass to spark submit. eg) "--driver-memory 512M --executor-memory 1G".
# export ZEPPELIN_SPARK_CONCURRENTSQL # Execute multiple SQL concurrently if set true. false by default.
# export ZEPPELIN_SPARK_MAXRESULT # Max number of SparkSQL result to display. 1000 by default.

View file

@ -65,7 +65,6 @@
<groupId>${project.groupId}</groupId>
<artifactId>zeppelin-interpreter</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
@ -366,25 +365,43 @@
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.3</version>
<configuration>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
<transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
<transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
<resource>reference.conf</resource>
</transformer>
</transformers>
</configuration>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<version>2.8</version>
<executions>
<execution>
<id>copy-dependencies</id>
<phase>package</phase>
<goals>
<goal>copy-dependencies</goal>
</goals>
<configuration>
<outputDirectory>${project.build.directory}/../../interpreter/spark</outputDirectory>
<overWriteReleases>false</overWriteReleases>
<overWriteSnapshots>false</overWriteSnapshots>
<overWriteIfNewer>true</overWriteIfNewer>
<includeScope>runtime</includeScope>
</configuration>
</execution>
<execution>
<phase>package</phase>
<goals>