Modifying pom file and Making documentation

This commit is contained in:
mahmoudelgamal 2016-08-01 12:23:48 +02:00
parent 26fc59bc53
commit 5695077d13
3 changed files with 197 additions and 155 deletions

View file

@ -16,47 +16,13 @@
<artifactId>zeppelin-beam</artifactId>
<version>0.7.0-SNAPSHOT</version>
<repositories>
<repository>
<id>apache-beam</id>
<url>https://repository.apache.org/content/repositories/snapshots/org/apache/beam/</url>
</repository>
</repositories>
<dependencies>
<!-- <dependency> <groupId>com.google.cloud.dataflow</groupId> <artifactId>google-cloud-dataflow-java-sdk-all</artifactId>
<version>1.6.0</version> </dependency> <dependency> <groupId>org.apache.beam</groupId>
<artifactId>beam-runners-parent</artifactId> <version>0.1.0-incubating</version>
<type>pom</type> </dependency> <dependency> <groupId>org.apache.beam</groupId>
<artifactId>beam-sdks-java-io-google-cloud-platform</artifactId> <version>0.1.0-incubating</version>
</dependency> <dependency> <groupId>org.apache.beam</groupId> <artifactId>beam-parent</artifactId>
<version>0.1.0-incubating</version> <type>pom</type> </dependency>
<dependency> <groupId>org.apache.beam</groupId> <artifactId>beam-runners-google-cloud-dataflow-java</artifactId>
<version>0.2.0-incubating</version> </dependency> <dependency> <groupId>org.apache.beam</groupId>
<artifactId>beam-runners-core-java</artifactId> <version>0.2.0-incubating</version>
</dependency> <dependency> <groupId>org.apache.beam</groupId> <artifactId>beam-sdks-java-core</artifactId>
<version>0.1.0-incubating</version> </dependency> -->
<!--
<dependency>
<groupId>com.google.cloud.dataflow</groupId>
<artifactId>google-cloud-dataflow-java-sdk-all</artifactId>
<version>1.6.0</version>
<exclusions>
<exclusion>
<artifactId>google-api-client-jackson2</artifactId>
<groupId>com.google.api-client</groupId>
</exclusion>
<exclusion>
<artifactId>google-http-client-jackson2</artifactId>
<groupId>com.google.http-client</groupId>
</exclusion>
</exclusions>
</dependency>
-->
<dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.10</artifactId>
<version>1.4.1</version>
@ -89,74 +55,74 @@
<version>1.4.1</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>2.3.0</version>
<exclusions>
<exclusion>
<artifactId>slf4j-log4j12</artifactId>
<groupId>org.slf4j</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.3.0</version>
<exclusions>
<exclusion>
<artifactId>slf4j-log4j12</artifactId>
<groupId>org.slf4j</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.3.0</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.3.0</version>
<exclusions>
<exclusion>
<artifactId>slf4j-log4j12</artifactId>
<groupId>org.slf4j</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-annotations</artifactId>
<version>2.3.0</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-yarn-common</artifactId>
<version>2.3.0</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-common</artifactId>
<version>2.3.0</version>
<exclusions>
<exclusion>
<artifactId>slf4j-log4j12</artifactId>
<groupId>org.slf4j</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-core</artifactId>
<version>1.2.1</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>2.3.0</version>
<exclusions>
<exclusion>
<artifactId>slf4j-log4j12</artifactId>
<groupId>org.slf4j</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.3.0</version>
<exclusions>
<exclusion>
<artifactId>slf4j-log4j12</artifactId>
<groupId>org.slf4j</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.3.0</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.3.0</version>
<exclusions>
<exclusion>
<artifactId>slf4j-log4j12</artifactId>
<groupId>org.slf4j</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-annotations</artifactId>
<version>2.3.0</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-yarn-common</artifactId>
<version>2.3.0</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-common</artifactId>
<version>2.3.0</version>
<exclusions>
<exclusion>
<artifactId>slf4j-log4j12</artifactId>
<groupId>org.slf4j</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-core</artifactId>
<version>1.2.1</version>
</dependency>
<dependency>
<groupId>com.thoughtworks.qdox</groupId>
<artifactId>qdox</artifactId>
@ -164,18 +130,6 @@
</dependency>
<dependency>
<groupId>org.mdkt.compiler</groupId>
<artifactId>InMemoryJavaCompiler</artifactId>
<version>1.2</version>
</dependency>
<dependency>
<groupId>com.ning</groupId>
<artifactId>async-http-client</artifactId>
<version>1.9.31</version>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
@ -195,20 +149,7 @@
<version>0.1.0-incubating</version>
<type>pom</type>
</dependency>
<!--
<dependency>
<groupId>org.apache.beam</groupId>
<artifactId>beam-examples-java</artifactId>
<version>0.1.0-incubating</version>
<exclusions>
<exclusion>
<artifactId>slf4j-jdk14</artifactId>
<groupId>org.slf4j</groupId>
</exclusion>
</exclusions>
</dependency>
-->
<dependency>
<groupId>org.apache.beam</groupId>
<artifactId>beam-runners-core-java</artifactId>
@ -240,7 +181,7 @@
<artifactId>netty-all</artifactId>
<groupId>io.netty</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
@ -255,7 +196,7 @@
</exclusion>
</exclusions>
</dependency>
<!--
<dependency>
<groupId>org.apache.beam</groupId>
<artifactId>beam-runners-google-cloud-dataflow-java</artifactId>
@ -268,33 +209,17 @@
</exclusion>
</exclusions>
</dependency>
--><dependency>
<dependency>
<groupId>org.apache.beam</groupId>
<artifactId>beam-runners-spark</artifactId>
<version>0.1.0-incubating</version>
<type>jar</type>
</dependency>
<!-- <dependency>
<groupId>com.google.api.client</groupId>
<artifactId>google-api-client-json</artifactId>
<version>1.2.3-alpha</version>
</dependency>
<dependency>
<groupId>com.google.api.client</groupId>
<artifactId>google-api-client-util</artifactId>
<version>1.2.3-alpha</version>
</dependency>
<dependency>
<groupId>com.google.api.client</groupId>
<artifactId>google-api-client-auth</artifactId>
<version>1.2.3-alpha</version>
</dependency>
-->
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>zeppelin-interpreter</artifactId>
@ -306,9 +231,7 @@
<artifactId>commons-exec</artifactId>
<version>1.3</version>
</dependency>
<!-- <dependency> <groupId>org.slf4j</groupId> <artifactId>slf4j-api</artifactId>
</dependency> <dependency> <groupId>org.slf4j</groupId> <artifactId>slf4j-log4j12</artifactId>
</dependency> -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>

View file

@ -0,0 +1,11 @@
[
{
"group": "beam",
"name": "beam",
"className": "org.apache.zeppelin.beam.BeamInterpreter",
"defaultInterpreter": true,
"properties": {
}
}
]

108
docs/interpreter/beam.md Normal file
View file

@ -0,0 +1,108 @@
---
layout: page
title: "Beam Interpreter"
description: ""
group: interpreter
---
{% include JB/setup %}
# Beam interpreter for Apache Zeppelin
<div id="toc"></div>
## Overview
[Apache Beam](http://beam.incubator.apache.org) is an open source, unified programming model that you can use to create a data processing pipeline. You start by building a program that defines the pipeline using one of the open source Beam SDKs. The pipeline is then executed by one of Beams supported distributed processing back-ends, which include Apache Flink, Apache Spark, and Google Cloud Dataflow.
Beam is particularly useful for Embarrassingly Parallel data processing tasks, in which the problem can be decomposed into many smaller bundles of data that can be processed independently and in parallel. You can also use Beam for Extract, Transform, and Load (ETL) tasks and pure data integration. These tasks are useful for moving data between different storage media and data sources, transforming data into a more desirable format, or loading data onto a new system.
# Apache Beam Pipeline Runners
The Beam Pipeline Runners translate the data processing pipeline you define with your Beam program into the API compatible with the distributed processing back-end of your choice. When you run your Beam program, youll need to specify the appropriate runner for the back-end where you want to execute your pipeline.
* Beam currently supports Runners that work with the following distributed processing back-ends:
- Google Cloud Dataflow
- Apache Flink
- Apache Spark
## How to use
Basically, You can write normal java code and determine the runner inside the code.
You should write the main method inside the main class beacuase the interpreter invoke this main to execute pipline.
Each paragraph is considered as separate job, there isn't any relate to any another job, Beacuse the interpreter is a static repl of java, we compile and run each paragraph apart.
**Example for Flink Runner**
```
%beam
import java.util.ArrayList;
import java.util.List;
import org.apache.spark.api.java.*;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.SparkConf;
import org.apache.spark.streaming.*;
import org.apache.spark.SparkContext;
import org.apache.beam.runners.direct.*;
import org.apache.beam.sdk.runners.*;
import org.apache.beam.sdk.options.*;
import org.apache.beam.runners.spark.*;
import org.apache.beam.runners.spark.io.ConsoleIO;
import org.apache.beam.runners.flink.*;
import org.apache.beam.runners.flink.examples.WordCount.Options;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Count;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.SimpleFunction;
import org.apache.beam.sdk.values.KV;
public class MinimalWordCount {
static List<String> s = new ArrayList<>();
public static void main(String[] args) {
Options options = PipelineOptionsFactory.create().as(Options.class);
options.setRunner(FlinkPipelineRunner.class);
Pipeline p = Pipeline.create(options);
p.apply(TextIO.Read.from("/home/admin/mahmoud/work/bigdata/beam/shakespeare/input/file1.txt"))
.apply(ParDo.named("ExtractWords").of(new DoFn<String, String>() {
@Override
public void processElement(ProcessContext c) {
for (String word : c.element().split("[^a-zA-Z']+")) {
if (!word.isEmpty()) {
c.output(word);
}
}
}
}))
.apply(Count.<String> perElement())
.apply("FormatResults", ParDo.of(new DoFn<KV<String, Long>, String>() {
@Override
public void processElement(DoFn<KV<String, Long>, String>.ProcessContext arg0)
throws Exception {
s.add("\n" + arg0.element().getKey() + "\t" + arg0.element().getValue());
}
}));
p.run();
System.out.println("%table word\tcount");
for (int i = 0; i < s.size(); i++) {
System.out.print(s.get(i));
}
}
}
```