support hdfs file for

This commit is contained in:
Jeff Zhang 2021-06-17 23:07:55 +08:00
parent ee8d11b8fd
commit bdb4dcbbfa
4 changed files with 63 additions and 39 deletions

View file

@ -80,11 +80,11 @@ Zeppelin supports python language which is very popular in data analytics and ma
<tr>
<td>zeppelin.yarn.dist.archives</td>
<td></td>
<td>Comma separated list of archives to be extracted into the working directory of interpreter. e.g. You can specify conda pack archive files via this property in python's yarn mode</td>
<td>Comma separated list of archives to be extracted into the working directory of interpreter. e.g. You can specify conda pack archive files via this property in python's yarn mode. It could be either files in local filesystem or files on hadoop compatible file systems</td>
</tr>
<tr>
<td>zeppelin.interpreter.conda.env.name</td>
<td>environment</td>
<td></td>
<td>conda environment name, aka the folder name in the working directory of interpreter</td>
</tr>
</table>
@ -377,41 +377,50 @@ Here's one example of yml file which could be used to generate a conda environme
* Create yml file for conda environment, write the following content into file `env_python_3.yml`
```text
name: python_3
name: python_3_env
channels:
- conda-forge
- defaults
dependencies:
- python=3.7
- pycodestyle=2.5.0
- numpy=1.17.3
- pandas=0.25.0
- scipy=1.3.1
- grpcio=1.22.0
- hvplot=0.5.2
- protobuf=3.10.0
- pandasql=0.7.3
- ipython=7.8.0
- matplotlib=3.0.3
- ipykernel=5.1.2
- jupyter_client=5.3.4
- bokeh=1.3.4
- panel=0.6.0
- holoviews=1.12.3
- pip
- pip:
- bkzep==0.6.1
- python=3.7
- pycodestyle
- numpy
- pandas
- scipy
- grpcio
- protobuf
- pandasql
- ipython
- ipykernel
- jupyter_client
- panel
- pyyaml
- seaborn
- plotnine
- hvplot
- intake
- intake-parquet
- intake-xarray
- altair
- vega_datasets
- pyarrow
```
* Create conda environment via this yml file
* Create conda environment via this yml file using either `conda` or `mamba`
```bash
conda env create -f env_python_3.yml
```
* Pack the conda environment
```bash
mamba env create -f python_3_env
```
* Pack the conda environment using either `conda`
```bash
@ -426,14 +435,11 @@ Specify the following properties to enable yarn mode for python interpreter, and
zeppelin.interpreter.launcher yarn
zeppelin.yarn.dist.archives /home/hadoop/python_3.tar.gz#environment
zeppelin.interpreter.conda.env.name environment
zeppelin.python ./environment/bin/python
```
`zeppelin.yarn.dist.archives` is the python conda environment tar which is created in step 1.
This tar will be shipped to yarn container and untar in the working directory of yarn container.
`environment` is the folder name after untar. Also you need to specify `zeppelin.python` as `./environment/bin/python` which is the
python path of the conda environment in yarn container.
`environment` in `/home/hadoop/python_3.tar.gz#environment` is the folder name after untar. This folder name should be the same as `zeppelin.interpreter.conda.env.name`.
## Python environments (used for non-yarn mode)

View file

@ -223,9 +223,12 @@ public class PythonInterpreter extends Interpreter {
// Run python script
// Choose python in the order of
// condaPythonExec > zeppelin.python
// {conda.env.name}/bin/python > condaPythonExec > zeppelin.python
protected String getPythonExec() {
if (condaPythonExec != null) {
String condaEnv = getProperty("zeppelin.interpreter.conda.env.name");
if (StringUtils.isNotBlank(condaEnv)) {
return condaEnv + "/bin/python";
} else if (condaPythonExec != null) {
return condaPythonExec;
} else {
return getProperty("zeppelin.python", "python");

View file

@ -242,7 +242,7 @@ public class JupyterKernelInterpreter extends AbstractInterpreter {
CommandLine cmd = CommandLine.parse(pythonExecutable);
cmd.addArgument(kernelWorkDir.getAbsolutePath() + "/kernel_server.py");
cmd.addArgument(getKernelName());
cmd.addArgument(kernelPort + "");
cmd.addArgument(String.valueOf(kernelPort));
Map<String, String> envs = setupKernelEnv();
jupyterKernelProcessLauncher = new JupyterKernelProcessLauncher(cmd, envs);

View file

@ -276,18 +276,24 @@ public class YarnRemoteInterpreterProcess extends RemoteInterpreterProcess {
String yarnDistArchives = launchContext.getProperties().getProperty("zeppelin.yarn.dist.archives");
if (StringUtils.isNotBlank(yarnDistArchives)) {
for (String localArchive : yarnDistArchives.split(",")) {
URI localURI = null;
for (String distArchive : yarnDistArchives.split(",")) {
URI distArchiveURI = null;
try {
localURI = new URI(localArchive);
distArchiveURI = new URI(distArchive);
} catch (URISyntaxException e) {
throw new IOException("Invalid uri: " + localArchive, e);
throw new IOException("Invalid uri: " + distArchive, e);
}
if (distArchiveURI.getScheme() == null || "file".equals(distArchiveURI.getScheme())) {
// zeppelin.yarn.dist.archives is local file
srcPath = localFs.makeQualified(new Path(distArchiveURI));
destPath = copyFileToRemote(stagingDir, srcPath, (short) 1);
} else {
// zeppelin.yarn.dist.archives is files on any hadoop compatible file system
destPath = new Path(removeLink(distArchive));
}
srcPath = localFs.makeQualified(new Path(localURI));
destPath = copyFileToRemote(stagingDir, srcPath, (short) 1);
String linkName = srcPath.getName();
if (localURI.getFragment() != null) {
linkName = localURI.getFragment();
if (distArchiveURI.getFragment() != null) {
linkName = distArchiveURI.getFragment();
}
addResource(fs, destPath, localResources, LocalResourceType.ARCHIVE, linkName);
}
@ -352,6 +358,15 @@ public class YarnRemoteInterpreterProcess extends RemoteInterpreterProcess {
return amContainer;
}
private String removeLink(String path) {
int pos = path.lastIndexOf("#");
if (pos != -1) {
return path.substring(0, pos);
} else {
return path;
}
}
/**
* Populate the classpath entry in the given environment map with any application
* classpath specified through the Hadoop and Yarn configurations.