mirror of
https://github.com/apache/zeppelin
synced 2026-05-24 09:38:26 +00:00
use spark-submit to run spark interpreter process when SPARK_HOME is defined
This commit is contained in:
parent
5de01c6800
commit
2d27e9cb7d
4 changed files with 73 additions and 86 deletions
|
|
@ -124,9 +124,7 @@ If you set `SPARK_HOME`, you should deploy spark binary on the same location to
|
|||
Yarn
|
||||
|
||||
# ./conf/zeppelin-env.sh
|
||||
export HADOOP_CONF_DIR=/path/to/hadoop_conf_dir
|
||||
|
||||
`HADOOP_CONF_DIR` should contain yarn-site.xml and core-site.xml.
|
||||
export SPARK_HOME=/path/to/spark_dir
|
||||
|
||||
### Run
|
||||
./bin/zeppelin-daemon.sh start
|
||||
|
|
|
|||
|
|
@ -72,77 +72,55 @@ fi
|
|||
|
||||
# set spark related env variables
|
||||
if [[ "${INTERPRETER_ID}" == "spark" ]]; then
|
||||
# add Hadoop jars into classpath
|
||||
if [[ -n "${HADOOP_HOME}" ]]; then
|
||||
# Apache
|
||||
addEachJarInDir "${HADOOP_HOME}/share"
|
||||
|
||||
# CDH
|
||||
addJarInDir "${HADOOP_HOME}"
|
||||
addJarInDir "${HADOOP_HOME}/lib"
|
||||
fi
|
||||
|
||||
# autodetect HADOOP_CONF_DIR by heuristic
|
||||
if [[ -n "${HADOOP_HOME}" ]] && [[ -z "${HADOOP_CONF_DIR}" ]]; then
|
||||
if [[ -d "${HADOOP_HOME}/etc/hadoop" ]]; then
|
||||
export HADOOP_CONF_DIR="${HADOOP_HOME}/etc/hadoop"
|
||||
elif [[ -d "/etc/hadoop/conf" ]]; then
|
||||
export HADOOP_CONF_DIR="/etc/hadoop/conf"
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ -n "${HADOOP_CONF_DIR}" ]] && [[ -d "${HADOOP_CONF_DIR}" ]]; then
|
||||
ZEPPELIN_CLASSPATH+=":${HADOOP_CONF_DIR}"
|
||||
fi
|
||||
|
||||
# add Spark jars into classpath
|
||||
if [[ -n "${SPARK_HOME}" ]]; then
|
||||
addJarInDir "${SPARK_HOME}/lib"
|
||||
PYSPARKPATH="${SPARK_HOME}/python:${SPARK_HOME}/python/lib/pyspark.zip:${SPARK_HOME}/python/lib/py4j-0.8.2.1-src.zip"
|
||||
SPARK_SUBMIT="${SPARK_HOME}/bin/spark-submit"
|
||||
SPARK_APP_JAR="$(ls ${ZEPPELIN_HOME}/interpreter/spark/zeppelin-spark*.jar)"
|
||||
else
|
||||
# add Hadoop jars into classpath
|
||||
if [[ -n "${HADOOP_HOME}" ]]; then
|
||||
# Apache
|
||||
addEachJarInDir "${HADOOP_HOME}/share"
|
||||
|
||||
# CDH
|
||||
addJarInDir "${HADOOP_HOME}"
|
||||
addJarInDir "${HADOOP_HOME}/lib"
|
||||
fi
|
||||
|
||||
addJarInDir "${INTERPRETER_DIR}/dep"
|
||||
PYSPARKPATH="${ZEPPELIN_HOME}/interpreter/spark/pyspark/pyspark.zip:${ZEPPELIN_HOME}/interpreter/spark/pyspark/py4j-0.8.2.1-src.zip"
|
||||
fi
|
||||
|
||||
# autodetect SPARK_CONF_DIR
|
||||
if [[ -n "${SPARK_HOME}" ]] && [[ -z "${SPARK_CONF_DIR}" ]]; then
|
||||
if [[ -d "${SPARK_HOME}/conf" ]]; then
|
||||
SPARK_CONF_DIR="${SPARK_HOME}/conf"
|
||||
if [[ -z "${PYTHONPATH}" ]]; then
|
||||
export PYTHONPATH="${PYSPARKPATH}"
|
||||
else
|
||||
export PYTHONPATH="${PYTHONPATH}:${PYSPARKPATH}"
|
||||
fi
|
||||
fi
|
||||
unset PYSPARKPATH
|
||||
|
||||
# read spark-*.conf if exists
|
||||
if [[ -d "${SPARK_CONF_DIR}" ]]; then
|
||||
ls ${SPARK_CONF_DIR}/spark-*.conf > /dev/null 2>&1
|
||||
if [[ "$?" -eq 0 ]]; then
|
||||
for file in ${SPARK_CONF_DIR}/spark-*.conf; do
|
||||
while read -r line; do
|
||||
echo "${line}" | grep -e "^spark[.]" > /dev/null
|
||||
if [ "$?" -ne 0 ]; then
|
||||
# skip lines that do not start with 'spark.'
|
||||
continue;
|
||||
fi
|
||||
SPARK_CONF_KEY=`echo "${line}" | sed -e 's/\(^spark[^ ]*\)[ \t]*\(.*\)/\1/g'`
|
||||
SPARK_CONF_VALUE=`echo "${line}" | sed -e 's/\(^spark[^ ]*\)[ \t]*\(.*\)/\2/g'`
|
||||
export ZEPPELIN_JAVA_OPTS+=" -D${SPARK_CONF_KEY}=\"${SPARK_CONF_VALUE}\""
|
||||
done < "${file}"
|
||||
done
|
||||
# autodetect HADOOP_CONF_DIR by heuristic
|
||||
if [[ -n "${HADOOP_HOME}" ]] && [[ -z "${HADOOP_CONF_DIR}" ]]; then
|
||||
if [[ -d "${HADOOP_HOME}/etc/hadoop" ]]; then
|
||||
export HADOOP_CONF_DIR="${HADOOP_HOME}/etc/hadoop"
|
||||
elif [[ -d "/etc/hadoop/conf" ]]; then
|
||||
export HADOOP_CONF_DIR="/etc/hadoop/conf"
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ -z "${PYTHONPATH}" ]]; then
|
||||
export PYTHONPATH="${PYSPARKPATH}"
|
||||
else
|
||||
export PYTHONPATH="${PYTHONPATH}:${PYSPARKPATH}"
|
||||
fi
|
||||
if [[ -n "${HADOOP_CONF_DIR}" ]] && [[ -d "${HADOOP_CONF_DIR}" ]]; then
|
||||
ZEPPELIN_CLASSPATH+=":${HADOOP_CONF_DIR}"
|
||||
fi
|
||||
|
||||
unset PYSPARKPATH
|
||||
export SPARK_CLASSPATH+=":${ZEPPELIN_CLASSPATH}"
|
||||
fi
|
||||
fi
|
||||
|
||||
export SPARK_CLASSPATH+=":${ZEPPELIN_CLASSPATH}"
|
||||
CLASSPATH+=":${ZEPPELIN_CLASSPATH}"
|
||||
|
||||
${ZEPPELIN_RUNNER} ${JAVA_INTP_OPTS} -cp ${CLASSPATH} ${ZEPPELIN_SERVER} ${PORT} &
|
||||
if [[ -n "${SPARK_SUBMIT}" ]]; then
|
||||
${SPARK_SUBMIT} --class ${ZEPPELIN_SERVER} --driver-class-path "${CLASSPATH}" --driver-java-options "${JAVA_INTP_OPTS}" ${SPARK_SUBMIT_OPTIONS} ${SPARK_APP_JAR} ${PORT} &
|
||||
else
|
||||
${ZEPPELIN_RUNNER} ${JAVA_INTP_OPTS} -cp ${CLASSPATH} ${ZEPPELIN_SERVER} ${PORT} &
|
||||
fi
|
||||
|
||||
pid=$!
|
||||
if [[ -z "${pid}" ]]; then
|
||||
return 1;
|
||||
|
|
|
|||
|
|
@ -33,14 +33,8 @@
|
|||
# export ZEPPELIN_IDENT_STRING # A string representing this instance of zeppelin. $USER by default.
|
||||
# export ZEPPELIN_NICENESS # The scheduling priority for daemons. Defaults to 0.
|
||||
|
||||
# export ZEPPELIN_SPARK_USEHIVECONTEXT # Use HiveContext instead of SQLContext if set true. true by default.
|
||||
# export ZEPPELIN_SPARK_CONCURRENTSQL # Execute multiple SQL concurrently if set true. false by default.
|
||||
# export ZEPPELIN_SPARK_MAXRESULT # Max number of SparkSQL results to display. 1000 by default.
|
||||
|
||||
# Options read in YARN client mode
|
||||
# export HADOOP_CONF_DIR # yarn-site.xml is located in the configuration directory given by HADOOP_CONF_DIR.
|
||||
|
||||
# Pyspark (supported with Spark 1.2.1 and above)
|
||||
# To configure pyspark, set the 'spark.home' property to the Spark distribution's path in the Interpreter setting screen of the Zeppelin GUI
|
||||
# export PYSPARK_PYTHON # path to the python command. must be the same path on the driver(Zeppelin) and all workers.
|
||||
# export PYTHONPATH # extra PYTHONPATH.
|
||||
## Spark configuration
|
||||
# export SPARK_HOME # When it is defined, load it instead of Zeppelin embedded Spark libraries
|
||||
# export SPARK_SUBMIT_OPTIONS # Options to pass to spark-submit, e.g. "--driver-memory 512M --executor-memory 1G".
|
||||
# export ZEPPELIN_SPARK_CONCURRENTSQL # Execute multiple SQL concurrently if set true. false by default.
|
||||
# export ZEPPELIN_SPARK_MAXRESULT # Max number of SparkSQL results to display. 1000 by default.
|
||||
|
|
|
|||
|
|
@ -65,7 +65,6 @@
|
|||
<groupId>${project.groupId}</groupId>
|
||||
<artifactId>zeppelin-interpreter</artifactId>
|
||||
<version>${project.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
|
|
@ -366,25 +365,43 @@
|
|||
</configuration>
|
||||
</plugin>
|
||||
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-shade-plugin</artifactId>
|
||||
<version>2.3</version>
|
||||
<configuration>
|
||||
<filters>
|
||||
<filter>
|
||||
<artifact>*:*</artifact>
|
||||
<excludes>
|
||||
<exclude>META-INF/*.SF</exclude>
|
||||
<exclude>META-INF/*.DSA</exclude>
|
||||
<exclude>META-INF/*.RSA</exclude>
|
||||
</excludes>
|
||||
</filter>
|
||||
</filters>
|
||||
<transformers>
|
||||
<transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
|
||||
<transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
|
||||
<resource>reference.conf</resource>
|
||||
</transformer>
|
||||
</transformers>
|
||||
</configuration>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>package</phase>
|
||||
<goals>
|
||||
<goal>shade</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-dependency-plugin</artifactId>
|
||||
<version>2.8</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>copy-dependencies</id>
|
||||
<phase>package</phase>
|
||||
<goals>
|
||||
<goal>copy-dependencies</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<outputDirectory>${project.build.directory}/../../interpreter/spark</outputDirectory>
|
||||
<overWriteReleases>false</overWriteReleases>
|
||||
<overWriteSnapshots>false</overWriteSnapshots>
|
||||
<overWriteIfNewer>true</overWriteIfNewer>
|
||||
<includeScope>runtime</includeScope>
|
||||
</configuration>
|
||||
</execution>
|
||||
<execution>
|
||||
<phase>package</phase>
|
||||
<goals>
|
||||
|
|
|
|||
Loading…
Reference in a new issue