avro-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tomwh...@apache.org
Subject svn commit: r1424014 - in /avro/trunk: ./ doc/ doc/examples/mr-example/ doc/examples/mr-example/src/ doc/examples/mr-example/src/main/ doc/examples/mr-example/src/main/java/ doc/examples/mr-example/src/main/java/example/ doc/src/content/xdocs/
Date Wed, 19 Dec 2012 18:32:13 GMT
Author: tomwhite
Date: Wed Dec 19 18:32:12 2012
New Revision: 1424014

URL: http://svn.apache.org/viewvc?rev=1424014&view=rev
Log:
AVRO-1211. Add MR guide to documentation. Contributed by Skye Wanderman-Milne.

Added:
    avro/trunk/doc/examples/mr-example/
    avro/trunk/doc/examples/mr-example/pom.xml   (with props)
    avro/trunk/doc/examples/mr-example/src/
    avro/trunk/doc/examples/mr-example/src/main/
    avro/trunk/doc/examples/mr-example/src/main/java/
    avro/trunk/doc/examples/mr-example/src/main/java/example/
    avro/trunk/doc/examples/mr-example/src/main/java/example/AvroWordCount.java   (with props)
    avro/trunk/doc/examples/mr-example/src/main/java/example/ColorCount.java   (with props)
    avro/trunk/doc/examples/mr-example/src/main/java/example/GenerateData.java   (with props)
    avro/trunk/doc/src/content/xdocs/mr.xml   (with props)
Modified:
    avro/trunk/CHANGES.txt
    avro/trunk/doc/build.xml
    avro/trunk/doc/src/content/xdocs/site.xml

Modified: avro/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/avro/trunk/CHANGES.txt?rev=1424014&r1=1424013&r2=1424014&view=diff
==============================================================================
--- avro/trunk/CHANGES.txt (original)
+++ avro/trunk/CHANGES.txt Wed Dec 19 18:32:12 2012
@@ -6,6 +6,9 @@ Trunk (not yet released)
 
   IMPROVEMENTS
 
+    AVRO-1211. Add MR guide to documentation. (Skye Wanderman-Milne via
+    tomwhite)
+
   BUG FIXES
 
 Avro 1.7.3 (6 December 2012)

Modified: avro/trunk/doc/build.xml
URL: http://svn.apache.org/viewvc/avro/trunk/doc/build.xml?rev=1424014&r1=1424013&r2=1424014&view=diff
==============================================================================
--- avro/trunk/doc/build.xml (original)
+++ avro/trunk/doc/build.xml Wed Dec 19 18:32:12 2012
@@ -21,6 +21,9 @@
       <arg value="-Dproject.content-dir=src"/>
       <arg value="-Dproject.site=../${build.dir}/"/>
     </exec>
+    <copy todir="${build.dir}/examples">
+      <fileset dir="examples"/>
+    </copy>
   </target>
 
   <target name="forrest.check" unless="forrest.home">

Added: avro/trunk/doc/examples/mr-example/pom.xml
URL: http://svn.apache.org/viewvc/avro/trunk/doc/examples/mr-example/pom.xml?rev=1424014&view=auto
==============================================================================
--- avro/trunk/doc/examples/mr-example/pom.xml (added)
+++ avro/trunk/doc/examples/mr-example/pom.xml Wed Dec 19 18:32:12 2012
@@ -0,0 +1,59 @@
+<!--
+  Maven build for the Avro MapReduce example.  Besides compiling the example
+  classes it runs the Avro Maven plugin so that the Avro-generated classes the
+  examples import are produced before compilation.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <groupId>example</groupId>
+  <artifactId>mr-example</artifactId>
+  <version>1.0</version>
+  <packaging>jar</packaging>
+
+  <name>mr-example</name>
+
+  <build>
+    <plugins>
+      <!-- Compile the example at Java 1.6 source/target level. -->
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-compiler-plugin</artifactId>
+        <configuration>
+          <source>1.6</source>
+          <target>1.6</target>
+        </configuration>
+      </plugin>
+      <!-- Run the Avro "schema" goal in the generate-sources phase. -->
+      <plugin>
+        <groupId>org.apache.avro</groupId>
+        <artifactId>avro-maven-plugin</artifactId>
+        <version>1.7.2</version>
+        <executions>
+          <execution>
+            <phase>generate-sources</phase>
+            <goals>
+              <goal>schema</goal>
+            </goals>
+            <configuration>
+              <!--
+                Input is the parent directory of this project; generated Java
+                is written straight into this project's src/main/java tree.
+              -->
+              <sourceDirectory>${project.basedir}/../</sourceDirectory>
+              <outputDirectory>${project.basedir}/src/main/java/</outputDirectory>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+
+  <!-- Avro core, the Avro MapReduce bindings, and Hadoop itself. -->
+  <dependencies>
+    <dependency>
+      <groupId>org.apache.avro</groupId>
+      <artifactId>avro</artifactId>
+      <version>1.7.2</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.avro</groupId>
+      <artifactId>avro-mapred</artifactId>
+      <version>1.7.2</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-core</artifactId>
+      <version>1.1.0</version>
+    </dependency>
+  </dependencies>
+</project>

Propchange: avro/trunk/doc/examples/mr-example/pom.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: avro/trunk/doc/examples/mr-example/src/main/java/example/AvroWordCount.java
URL: http://svn.apache.org/viewvc/avro/trunk/doc/examples/mr-example/src/main/java/example/AvroWordCount.java?rev=1424014&view=auto
==============================================================================
--- avro/trunk/doc/examples/mr-example/src/main/java/example/AvroWordCount.java (added)
+++ avro/trunk/doc/examples/mr-example/src/main/java/example/AvroWordCount.java Wed Dec 19 18:32:12 2012
@@ -0,0 +1,105 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package example;
+
+import java.io.IOException;
+import java.util.*;
+
+import org.apache.avro.*;
+import org.apache.avro.Schema.Type;
+import org.apache.avro.mapred.*;
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.util.*;
+
+/**
+ * The classic WordCount example modified to output Avro
+ * {@code Pair<CharSequence, Integer>} records instead of text.
+ *
+ * <p>The mapper is an ordinary Hadoop {@code Mapper} emitting
+ * {@code (Text, IntWritable)} pairs; only the reducer produces Avro data, by
+ * wrapping each result in an {@code AvroWrapper}.
+ */
+public class AvroWordCount extends Configured implements Tool {
+
+  /** Splits each input line into tokens and emits {@code (token, 1)} for every token. */
+  public static class Map extends MapReduceBase
+    implements Mapper<LongWritable, Text, Text, IntWritable> {
+    private final static IntWritable one = new IntWritable(1);
+    // Single Text instance that is overwritten for every token.
+    private Text word = new Text();
+
+    /**
+     * @param key unused
+     * @param value one line of input text
+     * @param output receives one {@code (word, 1)} pair per token in the line
+     * @param reporter unused
+     */
+    public void map(LongWritable key, Text value,
+        OutputCollector<Text, IntWritable> output, Reporter reporter)
+        throws IOException {
+      String line = value.toString();
+      // StringTokenizer with no explicit delimiters splits on whitespace.
+      StringTokenizer tokenizer = new StringTokenizer(line);
+      while (tokenizer.hasMoreTokens()) {
+        word.set(tokenizer.nextToken());
+        output.collect(word, one);
+      }
+    }
+  }
+
+  /**
+   * Sums the counts for a word and emits the total as an Avro
+   * {@code Pair<CharSequence, Integer>} wrapped in an {@code AvroWrapper};
+   * the output value is always {@code NullWritable}.
+   */
+  public static class Reduce extends MapReduceBase
+    implements Reducer<Text, IntWritable,
+                       AvroWrapper<Pair<CharSequence, Integer>>, NullWritable> {
+
+    /**
+     * @param key the word being counted
+     * @param values the partial counts collected for {@code key}
+     * @param output receives a single {@code (word, total)} pair
+     * @param reporter unused
+     */
+    public void reduce(Text key, Iterator<IntWritable> values,
+        OutputCollector<AvroWrapper<Pair<CharSequence, Integer>>, NullWritable> output,
+        Reporter reporter) throws IOException {
+      int sum = 0;
+      while (values.hasNext()) {
+        sum += values.next().get();
+      }
+      output.collect(new AvroWrapper<Pair<CharSequence, Integer>>(
+          new Pair<CharSequence, Integer>(key.toString(), sum)),
+          NullWritable.get());
+    }
+  }
+
+  /**
+   * Configures and runs the word-count job.
+   *
+   * @param args exactly two elements: the input path and the output path
+   * @return 0 once the job has run, or -1 if the argument count is wrong
+   * @throws Exception if the job cannot be configured or run
+   */
+  public int run(String[] args) throws Exception {
+    if (args.length != 2) {
+      System.err.println("Usage: AvroWordCount <input path> <output path>");
+      return -1;
+    }
+
+    // Build on the Configuration handed to this Tool (ToolRunner sets it in
+    // main) so that generic options such as -D, -conf and -fs are honoured
+    // instead of being silently dropped.
+    JobConf conf = new JobConf(getConf(), AvroWordCount.class);
+    conf.setJobName("wordcount");
+
+    // We call setOutputSchema first so we can override the configuration
+    // parameters it sets
+    AvroJob.setOutputSchema(conf, Pair.getPairSchema(Schema.create(Type.STRING),
+        Schema.create(Type.INT)));
+
+    conf.setMapperClass(Map.class);
+    conf.setReducerClass(Reduce.class);
+
+    conf.setInputFormat(TextInputFormat.class);
+
+    conf.setMapOutputKeyClass(Text.class);
+    conf.setMapOutputValueClass(IntWritable.class);
+    conf.setOutputKeyComparatorClass(Text.Comparator.class);
+
+    FileInputFormat.setInputPaths(conf, new Path(args[0]));
+    FileOutputFormat.setOutputPath(conf, new Path(args[1]));
+
+    JobClient.runJob(conf);
+    return 0;
+  }
+
+  /** Runs the job through {@code ToolRunner} and exits with the status returned by {@link #run}. */
+  public static void main(String[] args) throws Exception {
+    int res = ToolRunner.run(new Configuration(), new AvroWordCount(), args);
+    System.exit(res);
+  }
+}

Propchange: avro/trunk/doc/examples/mr-example/src/main/java/example/AvroWordCount.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: avro/trunk/doc/examples/mr-example/src/main/java/example/ColorCount.java
URL: http://svn.apache.org/viewvc/avro/trunk/doc/examples/mr-example/src/main/java/example/ColorCount.java?rev=1424014&view=auto
==============================================================================
--- avro/trunk/doc/examples/mr-example/src/main/java/example/ColorCount.java (added)
+++ avro/trunk/doc/examples/mr-example/src/main/java/example/ColorCount.java Wed Dec 19 18:32:12 2012
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package example;
+
+import java.io.IOException;
+
+import org.apache.avro.*;
+import org.apache.avro.Schema.Type;
+import org.apache.avro.mapred.*;
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.util.*;
+
+import example.avro.User;
+
+/**
+ * MapReduce job, written against the old {@code org.apache.hadoop.mapred}
+ * API, that reads Avro {@code User} records and writes one Avro
+ * {@code Pair<CharSequence, Integer>} per favorite color holding the number
+ * of users that chose it.
+ */
+public class ColorCount extends Configured implements Tool {
+
+  /**
+   * Emits {@code (favoriteColor, 1)} for every {@code User}; a user whose
+   * favorite color is null is counted under the key {@code "none"}.
+   */
+  public static class ColorCountMapper extends AvroMapper<User, Pair<CharSequence,
Integer>> {
+    @Override
+    public void map(User user, AvroCollector<Pair<CharSequence, Integer>> collector,
Reporter reporter)
+        throws IOException {
+      CharSequence color = user.getFavoriteColor();
+      // We need this check because the User.favorite_color field has type ["string", "null"]
+      if (color == null) {
+        color = "none";
+      }
+      collector.collect(new Pair<CharSequence, Integer>(color, 1));
+    }
+  }
+
+  /** Adds up the values collected for one color and emits {@code (color, total)}. */
+  public static class ColorCountReducer extends AvroReducer<CharSequence, Integer,
+                                                            Pair<CharSequence, Integer>>
{
+    @Override
+    public void reduce(CharSequence key, Iterable<Integer> values,
+                       AvroCollector<Pair<CharSequence, Integer>> collector,
+                       Reporter reporter)
+        throws IOException {
+      int sum = 0;
+      for (Integer value : values) {
+        sum += value;
+      }
+      collector.collect(new Pair<CharSequence, Integer>(key, sum));
+    }
+  }
+
+  /**
+   * Configures and runs the color-count job.
+   *
+   * @param args exactly two elements: the input path and the output path
+   * @return 0 once the job has run, or -1 if the argument count is wrong
+   * @throws Exception if the job cannot be configured or run
+   */
+  public int run(String[] args) throws Exception {
+    if (args.length != 2) {
+      System.err.println("Usage: ColorCount <input path> <output path>");
+      return -1;
+    }
+
+    // Start from the Configuration held by this Tool (set via ToolRunner in main).
+    JobConf conf = new JobConf(getConf(), ColorCount.class);
+    conf.setJobName("colorcount");
+
+    FileInputFormat.setInputPaths(conf, new Path(args[0]));
+    FileOutputFormat.setOutputPath(conf, new Path(args[1]));
+
+    AvroJob.setMapperClass(conf, ColorCountMapper.class);
+    AvroJob.setReducerClass(conf, ColorCountReducer.class);
+
+    // Note that AvroJob.setInputSchema and AvroJob.setOutputSchema set
+    // relevant config options such as input/output format, map output
+    // classes, and output key class.
+    AvroJob.setInputSchema(conf, User.SCHEMA$);
+    AvroJob.setOutputSchema(conf, Pair.getPairSchema(Schema.create(Type.STRING),
+        Schema.create(Type.INT)));
+
+    JobClient.runJob(conf);
+    return 0;
+  }
+
+  /** Runs the job through {@code ToolRunner} and exits with the status returned by {@code run}. */
+  public static void main(String[] args) throws Exception {
+    int res = ToolRunner.run(new Configuration(), new ColorCount(), args);
+    System.exit(res);
+  }
+}

Propchange: avro/trunk/doc/examples/mr-example/src/main/java/example/ColorCount.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: avro/trunk/doc/examples/mr-example/src/main/java/example/GenerateData.java
URL: http://svn.apache.org/viewvc/avro/trunk/doc/examples/mr-example/src/main/java/example/GenerateData.java?rev=1424014&view=auto
==============================================================================
--- avro/trunk/doc/examples/mr-example/src/main/java/example/GenerateData.java (added)
+++ avro/trunk/doc/examples/mr-example/src/main/java/example/GenerateData.java Wed Dec 19 18:32:12 2012
@@ -0,0 +1,39 @@
+package example;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Random;
+
+import org.apache.avro.file.DataFileWriter;
+import org.apache.avro.io.DatumWriter;
+import org.apache.avro.specific.SpecificDatumWriter;
+
+import example.avro.User;
+
+/**
+ * Writes an Avro data file of randomly generated {@code User} records and
+ * echoes each record to stdout.
+ */
+public class GenerateData {
+  /** Candidate favorite colors; the null entry yields users with no favorite color. */
+  public static final String[] COLORS =
+      {"red", "orange", "yellow", "green", "blue", "purple", null};
+  /** Number of users to generate. */
+  public static final int USERS = 20;
+  /** Output file, resolved against the current working directory. */
+  public static final String PATH = "./input/users.avro";
+
+  /**
+   * Creates {@link #PATH} (and its parent directory if needed) and appends
+   * {@link #USERS} users to it, each with a favorite color picked at random
+   * from {@link #COLORS}.
+   *
+   * @param args unused
+   * @throws IOException if the data file cannot be created or written
+   */
+  public static void main(String[] args) throws IOException {
+    // Open data file
+    File file = new File(PATH);
+    if (file.getParentFile() != null) {
+      file.getParentFile().mkdirs();
+    }
+    DatumWriter<User> userDatumWriter = new SpecificDatumWriter<User>(User.class);
+    DataFileWriter<User> dataFileWriter = new DataFileWriter<User>(userDatumWriter);
+    dataFileWriter.create(User.SCHEMA$, file);
+
+    // From here on the writer holds an open file, so make sure it is closed
+    // even if an append fails part-way through.
+    try {
+      // Create random users
+      Random random = new Random();
+      for (int i = 0; i < USERS; i++) {
+        User user = new User("user", null, COLORS[random.nextInt(COLORS.length)]);
+        dataFileWriter.append(user);
+        System.out.println(user);
+      }
+    } finally {
+      dataFileWriter.close();
+    }
+  }
+}

Propchange: avro/trunk/doc/examples/mr-example/src/main/java/example/GenerateData.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: avro/trunk/doc/src/content/xdocs/mr.xml
URL: http://svn.apache.org/viewvc/avro/trunk/doc/src/content/xdocs/mr.xml?rev=1424014&view=auto
==============================================================================
--- avro/trunk/doc/src/content/xdocs/mr.xml (added)
+++ avro/trunk/doc/src/content/xdocs/mr.xml Wed Dec 19 18:32:12 2012
@@ -0,0 +1,410 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+  -->
+<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN"
+   "http://forrest.apache.org/dtd/document-v20.dtd" [
+  <!ENTITY % avro-entities PUBLIC "-//Apache//ENTITIES Avro//EN"
+	   "../../../../build/avro.ent">
+  %avro-entities;
+]>
+<document>
+  <header>
+    <title>Apache Avro&#8482; &AvroVersion; Hadoop MapReduce guide</title>
+  </header>
+  <body>
+    <p>
+      Avro provides a convenient way to represent complex data structures within
+      a Hadoop MapReduce job.  Avro data can be used as both input to and output
+      from a MapReduce job, as well as the intermediate format.  The example in
+      this guide uses Avro data for all three, but it's possible to mix and
+      match; for instance, MapReduce can be used to aggregate a particular field
+      in an Avro record.
+    </p>
+    <p>
+      This guide assumes basic familiarity with both Hadoop MapReduce and Avro.
+      See the <a href="http://hadoop.apache.org/docs/current/">Hadoop
+      documentation</a> and the <a href="gettingstartedjava.html">Avro getting
+      started guide</a> for introductions to these projects.  This guide uses
+      the old MapReduce API (<code>org.apache.hadoop.mapred</code>).
+    </p>
+    <section>
+      <title>Setup</title>
+      <p>
+        The code from this guide is included in the Avro docs under
+        <em>examples/mr-example</em>.  The example is set up as a Maven project
+        that includes the necessary Avro and MapReduce dependencies and the Avro
+        Maven plugin for code generation, so no external jars are needed to run
+        the example.  In particular, the POM includes the following dependencies:
+      </p>
+      <source>
+&#60;dependency&#62;
+  &#60;groupId&#62;org.apache.avro&#60;/groupId&#62;
+  &#60;artifactId&#62;avro&#60;/artifactId&#62;
+  &#60;version&#62;&AvroVersion;&#60;/version&#62;
+&#60;/dependency&#62;
+&#60;dependency&#62;
+  &#60;groupId&#62;org.apache.avro&#60;/groupId&#62;
+  &#60;artifactId&#62;avro-mapred&#60;/artifactId&#62;
+  &#60;version&#62;&AvroVersion;&#60;/version&#62;
+&#60;/dependency&#62;
+&#60;dependency&#62;
+  &#60;groupId&#62;org.apache.hadoop&#60;/groupId&#62;
+  &#60;artifactId&#62;hadoop-core&#60;/artifactId&#62;
+  &#60;version&#62;1.1.0&#60;/version&#62;
+&#60;/dependency&#62;
+      </source>
+      <p>
+        And the following plugin:
+      </p>
+      <source>
+&#60;plugin>
+  &#60;groupId>org.apache.avro&#60;/groupId>
+  &#60;artifactId>avro-maven-plugin&#60;/artifactId>
+  &#60;version>&AvroVersion;&#60;/version>
+  &#60;executions>
+    &#60;execution>
+      &#60;phase>generate-sources&#60;/phase>
+      &#60;goals>
+        &#60;goal>schema&#60;/goal>
+      &#60;/goals>
+      &#60;configuration>
+        &#60;sourceDirectory>${project.basedir}/../&#60;/sourceDirectory>
+        &#60;outputDirectory>${project.basedir}/src/main/java/&#60;/outputDirectory>
+      &#60;/configuration>
+    &#60;/execution>
+  &#60;/executions>
+&#60;/plugin>
+      </source>
+      <p>
+        Alternatively, Avro jars can be downloaded directly from the <a
+        href="http://avro.apache.org/releases.html">Apache Avro&#8482;
+        Releases</a> page.  The relevant Avro jars for this guide are
+        <em>avro-&AvroVersion;.jar</em> and
+        <em>avro-mapred-&AvroVersion;.jar</em>, as well as
+        <em>avro-tools-&AvroVersion;.jar</em> for code generation and viewing
+        Avro data files as JSON.  In addition, you will need to install Hadoop
+        in order to use MapReduce.
+      </p>
+    </section>
+
+    <section>
+      <title>Example: ColorCount</title>
+      <p>
+        Below is a simple example of a MapReduce job that uses Avro.  This example
+        can be found in the Avro docs under
+        <em>examples/mr-example/src/main/java/example/ColorCount.java</em>.
+        We'll go over the specifics of what's going on in subsequent sections.
+      </p>
+      <source>
+package example;
+
+import java.io.IOException;
+
+import org.apache.avro.*;
+import org.apache.avro.Schema.Type;
+import org.apache.avro.mapred.*;
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.util.*;
+
+import example.avro.User;
+
+public class ColorCount extends Configured implements Tool {
+
+  public static class ColorCountMapper extends AvroMapper&#60;User, Pair&#60;CharSequence, Integer>> {
+    @Override
+    public void map(User user, AvroCollector&#60;Pair&#60;CharSequence, Integer>> collector, Reporter reporter)
+        throws IOException {
+      CharSequence color = user.getFavoriteColor();
+      // We need this check because the User.favorite_color field has type ["string", "null"]
+      if (color == null) {
+        color = "none";
+      }
+      collector.collect(new Pair&#60;CharSequence, Integer>(color, 1));
+    }
+  }
+
+  public static class ColorCountReducer extends AvroReducer&#60;CharSequence, Integer,
+                                                            Pair&#60;CharSequence, Integer>> {
+    @Override
+    public void reduce(CharSequence key, Iterable&#60;Integer> values,
+                       AvroCollector&#60;Pair&#60;CharSequence, Integer>> collector,
+                       Reporter reporter)
+        throws IOException {
+      int sum = 0;
+      for (Integer value : values) {
+        sum += value;
+      }
+      collector.collect(new Pair&#60;CharSequence, Integer>(key, sum));
+    }
+  }
+
+  public int run(String[] args) throws Exception {
+    if (args.length != 2) {
+      System.err.println("Usage: ColorCount &#60;input path> &#60;output path>");
+      return -1;
+    }
+
+    JobConf conf = new JobConf(getConf(), ColorCount.class);
+    conf.setJobName("colorcount");
+
+    FileInputFormat.setInputPaths(conf, new Path(args[0]));
+    FileOutputFormat.setOutputPath(conf, new Path(args[1]));
+
+    AvroJob.setMapperClass(conf, ColorCountMapper.class);
+    AvroJob.setReducerClass(conf, ColorCountReducer.class);
+
+    // Note that AvroJob.setInputSchema and AvroJob.setOutputSchema set
+    // relevant config options such as input/output format, map output
+    // classes, and output key class.
+    AvroJob.setInputSchema(conf, User.SCHEMA$);
+    AvroJob.setOutputSchema(conf, Pair.getPairSchema(Schema.create(Type.STRING),
+        Schema.create(Type.INT)));
+
+    JobClient.runJob(conf);
+    return 0;
+  }
+
+  public static void main(String[] args) throws Exception {
+    int res = ToolRunner.run(new Configuration(), new ColorCount(), args);
+    System.exit(res);
+  }
+}
+      </source>
+      <p>
+        ColorCount reads in data files containing <code>User</code> records,
+        defined in <em>examples/user.avsc</em>, and counts the number of
+        instances of each favorite color.  (This example draws inspiration from
+        the canonical WordCount MapReduce application.)  The <code>User</code>
+        schema is defined as follows:
+      </p>
+      <source>
+{"namespace": "example.avro",
+ "type": "record",
+ "name": "User",
+ "fields": [
+     {"name": "name", "type": "string"},
+     {"name": "favorite_number",  "type": ["int", "null"]},
+     {"name": "favorite_color", "type": ["string", "null"]}
+ ]
+}
+      </source>
+      <p>
+        This schema is compiled into the <code>User</code> class used by
+        ColorCount via the Avro Maven plugin (see
+        <em>examples/mr-example/pom.xml</em> for how this is set up).
+      </p>
+      <p>
+        ColorCountMapper essentially takes a <code>User</code> as input and
+        extracts the <code>User</code>'s favorite color, emitting the key-value
+        pair <code>&#60;</code><em>favoriteColor</em><code>, 1></code>.
+        ColorCountReducer then adds up how many occurrences of a particular
+        favorite color were emitted, and outputs the result as a
+        <code>Pair</code> record.  These <code>Pair</code>s are serialized to an
+        Avro data file.
+      </p>
+      <section>
+        <title>Running ColorCount</title>
+        <p>
+          The ColorCount application is provided as a Maven project in the Avro
+          docs under <em>examples/mr-example</em>.  To build the project,
+          including the code generation of the User schema, run:
+        </p>
+        <source>
+mvn compile
+        </source>
+        <p>
+          Next, run GenerateData to create an Avro data file,
+          <em>input/users.avro</em>, containing 20 <code>User</code>s with
+          favorite colors chosen randomly from a list:
+        </p>
+        <source>
+mvn exec:java -q -Dexec.mainClass=example.GenerateData
+        </source>
+        <p>
+          Besides creating the data file, GenerateData prints the JSON
+          representations of the Users generated to stdout, for example:
+        </p>
+        <source>
+{"name": "user", "favorite_number": null, "favorite_color": "red"}
+{"name": "user", "favorite_number": null, "favorite_color": "green"}
+{"name": "user", "favorite_number": null, "favorite_color": "purple"}
+{"name": "user", "favorite_number": null, "favorite_color": null}
+...
+        </source>
+        <p>
+          Now we're ready to run ColorCount.  We specify our freshly-generated
+          <em>input</em> folder as the input path and <em>output</em> as our
+          output folder (note that MapReduce will not start a job if the output
+          folder already exists):
+        </p>
+        <source>
+mvn exec:java -q -Dexec.mainClass=example.ColorCount -Dexec.args="input output"
+        </source>
+        <p>
+          Once ColorCount completes, checking the contents of the new
+          <em>output</em> directory should yield the following:
+        </p>
+        <source>
+$ ls output/
+part-00000.avro  _SUCCESS
+        </source>
+        <p>
+          You can check the contents of the generated Avro file using the avro-tools jar:
+        </p>
+        <source>
+$ java -jar /path/to/avro-tools-&AvroVersion;.jar tojson output/part-00000.avro
+{"value": 3, "key": "blue"}
+{"value": 7, "key": "green"}
+{"value": 1, "key": "none"}
+{"value": 2, "key": "orange"}
+{"value": 3, "key": "purple"}
+{"value": 2, "key": "red"}
+{"value": 2, "key": "yellow"}
+        </source>
+      </section>
+    </section>
+    <p>Now let's go over the ColorCount example in detail.</p>
+    <section>
+      <title>AvroMapper</title>
+      <p>
+        The easiest way to use Avro data files as input to a MapReduce job is to
+        subclass <code>AvroMapper</code>.  An <code>AvroMapper</code> defines a
+        map function that takes an Avro datum as input and outputs a key/value
+        pair represented as a <code>Pair</code> record.  In the ColorCount
+        example, <code>ColorCountMapper</code> is an <code>AvroMapper</code>
+        that takes a <code>User</code> as input and outputs a
+        <code>Pair&#60;CharSequence, Integer></code>, where the
+        <code>CharSequence</code> key is the user's favorite color and the
+        <code>Integer</code> value is 1.
+      </p>
+      <source>
+public static class ColorCountMapper extends AvroMapper&#60;User, Pair&#60;CharSequence, Integer>> {
+  @Override
+  public void map(User user, AvroCollector&#60;Pair&#60;CharSequence, Integer>> collector, Reporter reporter)
+      throws IOException {
+    CharSequence color = user.getFavoriteColor();
+    // We need this check because the User.favorite_color field has type ["string", "null"]
+    if (color == null) {
+      color = "none";
+    }
+    collector.collect(new Pair&#60;CharSequence, Integer>(color, 1));
+  }
+}
+      </source>
+      <p>
+        In order to use our <code>AvroMapper</code>, we must call
+        <code>AvroJob.setMapperClass</code> and
+        <code>AvroJob.setInputSchema</code>.
+      </p>
+      <source>
+AvroJob.setMapperClass(conf, ColorCountMapper.class);
+AvroJob.setInputSchema(conf, User.SCHEMA$);
+      </source>
+      <p>
+        Note that <code>AvroMapper</code> does not implement the
+        <code>Mapper</code> interface.  Under the hood, the specified Avro data
+        files are deserialized into <code>AvroWrapper</code>s containing the
+        actual data, which are processed by a <code>Mapper</code> that calls the
+        configured <code>AvroMapper</code>'s map function.
+        <code>AvroJob.setInputSchema</code> sets up the relevant configuration
+        parameters needed to make this happen, thus you should not need to call
+        <code>JobConf.setMapperClass</code>,
+        <code>JobConf.setInputFormat</code>,
+        <code>JobConf.setMapOutputKeyClass</code>,
+        <code>JobConf.setMapOutputValueClass</code>, or
+        <code>JobConf.setOutputKeyComparatorClass</code>.
+      </p>
+    </section>
+    <section>
+      <title>AvroReducer</title>
+      <p>
+        Analogously to <code>AvroMapper</code>, an <code>AvroReducer</code>
+        defines a reducer function that takes the key/value types output by an
+        <code>AvroMapper</code> (or any mapper that outputs <code>Pair</code>s)
+        and outputs a key/value pair represented as a <code>Pair</code> record.  In
+        the ColorCount example, <code>ColorCountReducer</code> is an
+        <code>AvroReducer</code> that takes the <code>CharSequence</code> key
+        representing a favorite color and the <code>Iterable&#60;Integer></code>
+        representing the counts for that color (they should all be 1 in this
+        example) and adds up the counts.
+      </p>
+      <source>
+public static class ColorCountReducer extends AvroReducer&#60;CharSequence, Integer,
+                                                          Pair&#60;CharSequence, Integer>> {
+  @Override
+  public void reduce(CharSequence key, Iterable&#60;Integer> values,
+                     AvroCollector&#60;Pair&#60;CharSequence, Integer>> collector,
+                     Reporter reporter)
+      throws IOException {
+    int sum = 0;
+    for (Integer value : values) {
+      sum += value;
+    }
+    collector.collect(new Pair&#60;CharSequence, Integer>(key, sum));
+  }
+}
+      </source>
+      <p>
+        In order to use our <code>AvroReducer</code>, we must call
+        <code>AvroJob.setReducerClass</code> and
+        <code>AvroJob.setOutputSchema</code>.
+      </p>
+      <source>
+AvroJob.setReducerClass(conf, ColorCountReducer.class);
+AvroJob.setOutputSchema(conf, Pair.getPairSchema(Schema.create(Type.STRING),
+                                                 Schema.create(Type.INT)));
+      </source>
+      <p>
+        Note that <code>AvroReducer</code> does not implement the
+        <code>Reducer</code> interface.  The intermediate <code>Pair</code>s
+        output by the mapper are split into <code>AvroKey</code>s and
+        <code>AvroValue</code>s, which are processed by a <code>Reducer</code>
+        that calls the configured <code>AvroReducer</code>'s reduce function.
+        <code>AvroJob.setOutputSchema</code> sets up the relevant configuration
+        parameters needed to make this happen, thus you should not need to call
+        <code>JobConf.setReducerClass</code>,
+        <code>JobConf.setOutputFormat</code>,
+        <code>JobConf.setOutputKeyClass</code>,
+        <code>JobConf.setMapOutputKeyClass</code>,
+        <code>JobConf.setMapOutputValueClass</code>, or
+        <code>JobConf.setOutputKeyComparatorClass</code>.
+      </p>
+    </section>
+    <section>
+      <title>Learning more</title>
+      <p>
+        It's possible to mix <code>AvroMapper</code>s and
+        <code>AvroReducer</code>s with non-Avro <code>Mapper</code>s and
+        <code>Reducer</code>s.  See the <a
+        href="http://avro.apache.org/docs/current/api/java/org/apache/avro/mapred/package-summary.html">
+        <code>org.apache.avro.mapred</code> documentation</a> for more details.
+        There is also a <a
+        href="http://avro.apache.org/docs/current/api/java/org/apache/avro/mapreduce/package-summary.html">
+        <code>org.apache.avro.mapreduce</code> package</a> for use with the new
+        MapReduce API (<code>org.apache.hadoop.mapreduce</code>).  It's also
+        possible to implement your own <code>Mapper</code>s and
+        <code>Reducer</code>s directly using the public classes provided in
+        these libraries.  See the AvroWordCount application, found under
+        <em>examples/mr-example/src/main/java/example/AvroWordCount.java</em> in
+        the Avro documentation, for an example of implementing a
+        <code>Reducer</code> that outputs Avro data.
+      </p>
+    </section>
+  </body>
+</document>

Propchange: avro/trunk/doc/src/content/xdocs/mr.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: avro/trunk/doc/src/content/xdocs/site.xml
URL: http://svn.apache.org/viewvc/avro/trunk/doc/src/content/xdocs/site.xml?rev=1424014&r1=1424013&r2=1424014&view=diff
==============================================================================
--- avro/trunk/doc/src/content/xdocs/site.xml (original)
+++ avro/trunk/doc/src/content/xdocs/site.xml Wed Dec 19 18:32:12 2012
@@ -49,6 +49,7 @@ See http://forrest.apache.org/docs/linki
     <c-api      label="C API"             href="ext:api/c/index" />
     <cpp-api    label="C++ API"           href="ext:api/cpp/index" />
     <csharp-api label="C# API"            href="ext:api/csharp/index" />
+    <mr         label="MapReduce guide"  href="mr.html" />
     <idl        label="IDL language"      href="idl.html" />
     <sasl       label="SASL profile"      href="sasl.html" />
     <wiki       label="Wiki"              href="ext:wiki" />



Mime
View raw message