• No results found

Introduction To Hadoop

N/A
N/A
Protected

Academic year: 2021

Share "Introduction To Hadoop"

Copied!
22
0
0

Loading.... (view fulltext now)

Full text

(1)

Introduction To Hadoop

Kenneth Heafield

Google Inc

January 14, 2008

Example code from Hadoop 0.13.1 used under the Apache License Version 2.0 and modified for presentation. Except as otherwise noted, the content of this

(2)

Outline

1 Word Count Code Mapper

Reducer Main

2 How it Works Serialization Data Flow

3 Lab

(3)

Word Count Code Mapper

Mapper

< “wikipedia.org”, “The Free” >→ < “The”, 1 >, < “Free”, 1 >

/**
 * Emits a (token, 1) pair for every token found in the input value.
 *
 * <p>The value is cast to Text, converted to a String, and split with
 * StringTokenizer's default delimiters.
 *
 * @param key      input key; not read by this method
 * @param value    input record; must be a Text
 * @param output   collector that receives one (Text, IntWritable(1)) pair per token
 * @param reporter not used by this method
 * @throws IOException as declared in the signature
 */
public void map(WritableComparable key, Writable value,
    OutputCollector output, Reporter reporter) throws IOException {
  final String text = ((Text) value).toString();
  // A single Text instance is reused for every token; only its contents change.
  final Text token = new Text();
  for (StringTokenizer tokens = new StringTokenizer(text); tokens.hasMoreTokens(); ) {
    token.set(tokens.nextToken());
    output.collect(token, new IntWritable(1));
  }
}

(4)

Word Count Code Mapper

Mapper

< “wikipedia.org”, “The Free” > → < “The”, 1 >, < “Free”, 1 >

/**
 * Emits a (token, 1) pair for every token found in the input value.
 *
 * <p>The value is cast to Text, converted to a String, and split with
 * StringTokenizer's default delimiters.
 *
 * @param key      input key; not read by this method
 * @param value    input record; must be a Text
 * @param output   collector that receives one (Text, IntWritable(1)) pair per token
 * @param reporter not used by this method
 * @throws IOException as declared in the signature
 */
public void map(WritableComparable key, Writable value,
    OutputCollector output, Reporter reporter) throws IOException {
  final String text = ((Text) value).toString();
  // A single Text instance is reused for every token; only its contents change.
  final Text token = new Text();
  for (StringTokenizer tokens = new StringTokenizer(text); tokens.hasMoreTokens(); ) {
    token.set(tokens.nextToken());
    output.collect(token, new IntWritable(1));
  }
}

(5)

Word Count Code Mapper

Mapper

< “wikipedia.org”, “The Free” > → < “The”, 1 >, < “Free”, 1 >

/**
 * Emits a (token, 1) pair for every token found in the input value.
 *
 * <p>The value is cast to Text, converted to a String, and split with
 * StringTokenizer's default delimiters.
 *
 * @param key      input key; not read by this method
 * @param value    input record; must be a Text
 * @param output   collector that receives one (Text, IntWritable(1)) pair per token
 * @param reporter not used by this method
 * @throws IOException as declared in the signature
 */
public void map(WritableComparable key, Writable value,
    OutputCollector output, Reporter reporter) throws IOException {
  final String text = ((Text) value).toString();
  // A single Text instance is reused for every token; only its contents change.
  final Text token = new Text();
  for (StringTokenizer tokens = new StringTokenizer(text); tokens.hasMoreTokens(); ) {
    token.set(tokens.nextToken());
    output.collect(token, new IntWritable(1));
  }
}

(6)

Word Count Code Mapper

Mapper

< “wikipedia.org”, “The Free” >→ < “The”, 1 >, < “Free”, 1 >

/**
 * Emits a (token, 1) pair for every token found in the input value.
 *
 * <p>The value is cast to Text, converted to a String, and split with
 * StringTokenizer's default delimiters.
 *
 * @param key      input key; not read by this method
 * @param value    input record; must be a Text
 * @param output   collector that receives one (Text, IntWritable(1)) pair per token
 * @param reporter not used by this method
 * @throws IOException as declared in the signature
 */
public void map(WritableComparable key, Writable value,
    OutputCollector output, Reporter reporter) throws IOException {
  final String text = ((Text) value).toString();
  // A single Text instance is reused for every token; only its contents change.
  final Text token = new Text();
  for (StringTokenizer tokens = new StringTokenizer(text); tokens.hasMoreTokens(); ) {
    token.set(tokens.nextToken());
    output.collect(token, new IntWritable(1));
  }
}

(7)

Word Count Code Mapper

Mapper

< “wikipedia.org”, “The Free” >→ < “The”, 1 >, < “Free”, 1 >

/**
 * Emits a (token, 1) pair for every token found in the input value.
 *
 * <p>The value is cast to Text, converted to a String, and split with
 * StringTokenizer's default delimiters.
 *
 * @param key      input key; not read by this method
 * @param value    input record; must be a Text
 * @param output   collector that receives one (Text, IntWritable(1)) pair per token
 * @param reporter not used by this method
 * @throws IOException as declared in the signature
 */
public void map(WritableComparable key, Writable value,
    OutputCollector output, Reporter reporter) throws IOException {
  final String text = ((Text) value).toString();
  // A single Text instance is reused for every token; only its contents change.
  final Text token = new Text();
  for (StringTokenizer tokens = new StringTokenizer(text); tokens.hasMoreTokens(); ) {
    token.set(tokens.nextToken());
    output.collect(token, new IntWritable(1));
  }
}

(8)

Word Count Code Reducer

Reducer

< “The”, 1 >, < “The”, 1 >→ < “The”, 2 >

/**
 * Adds up the IntWritable values supplied for a key and emits (key, total).
 *
 * @param key      key the values belong to; passed through to the output unchanged
 * @param values   iterator whose elements are cast to IntWritable
 * @param output   collector that receives the single (key, IntWritable(total)) pair
 * @param reporter not used by this method
 * @throws IOException as declared in the signature
 */
public void reduce(WritableComparable key, Iterator values,
    OutputCollector output, Reporter reporter) throws IOException {
  int total = 0;
  for (Iterator remaining = values; remaining.hasNext(); ) {
    IntWritable count = (IntWritable) remaining.next();
    total += count.get();
  }
  output.collect(key, new IntWritable(total));
}

(9)

Word Count Code Reducer

Reducer

< “The”, 1 >, < “The”, 1 > → < “The”, 2 >

/**
 * Adds up the IntWritable values supplied for a key and emits (key, total).
 *
 * @param key      key the values belong to; passed through to the output unchanged
 * @param values   iterator whose elements are cast to IntWritable
 * @param output   collector that receives the single (key, IntWritable(total)) pair
 * @param reporter not used by this method
 * @throws IOException as declared in the signature
 */
public void reduce(WritableComparable key, Iterator values,
    OutputCollector output, Reporter reporter) throws IOException {
  int total = 0;
  for (Iterator remaining = values; remaining.hasNext(); ) {
    IntWritable count = (IntWritable) remaining.next();
    total += count.get();
  }
  output.collect(key, new IntWritable(total));
}

(10)

Word Count Code Reducer

Reducer

< “The”, 1 >, < “The”, 1 > → < “The”, 2 >

/**
 * Adds up the IntWritable values supplied for a key and emits (key, total).
 *
 * @param key      key the values belong to; passed through to the output unchanged
 * @param values   iterator whose elements are cast to IntWritable
 * @param output   collector that receives the single (key, IntWritable(total)) pair
 * @param reporter not used by this method
 * @throws IOException as declared in the signature
 */
public void reduce(WritableComparable key, Iterator values,
    OutputCollector output, Reporter reporter) throws IOException {
  int total = 0;
  for (Iterator remaining = values; remaining.hasNext(); ) {
    IntWritable count = (IntWritable) remaining.next();
    total += count.get();
  }
  output.collect(key, new IntWritable(total));
}

(11)

Word Count Code Reducer

Reducer

< “The”, 1 >, < “The”, 1 >→ < “The”, 2 >

/**
 * Adds up the IntWritable values supplied for a key and emits (key, total).
 *
 * @param key      key the values belong to; passed through to the output unchanged
 * @param values   iterator whose elements are cast to IntWritable
 * @param output   collector that receives the single (key, IntWritable(total)) pair
 * @param reporter not used by this method
 * @throws IOException as declared in the signature
 */
public void reduce(WritableComparable key, Iterator values,
    OutputCollector output, Reporter reporter) throws IOException {
  int total = 0;
  for (Iterator remaining = values; remaining.hasNext(); ) {
    IntWritable count = (IntWritable) remaining.next();
    total += count.get();
  }
  output.collect(key, new IntWritable(total));
}

(12)

Word Count Code Main

Main

public static void main(String[] args) throws IOException {

JobConf conf = new JobConf(WordCount.class);

conf.setJobName("wordcount");

conf.setMapperClass(MapClass.class);

conf.setCombinerClass(ReduceClass.class);

conf.setReducerClass(ReduceClass.class);

conf.setNumMapTasks(new Integer(40));

conf.setNumReduceTasks(new Integer(30));

conf.setInputPath(new Path("/shared/wikipedia_small"));

conf.setOutputPath(new Path("/user/kheafield/word_count"));

conf.setOutputKeyClass(Text.class);

conf.setOutputValueClass(IntWritable.class);

(13)

Word Count Code Main

Main

public static void main(String[] args) throws IOException {

JobConf conf = new JobConf(WordCount.class);

conf.setJobName("wordcount");

conf.setMapperClass(MapClass.class);

conf.setCombinerClass(ReduceClass.class);

conf.setReducerClass(ReduceClass.class);

conf.setNumMapTasks(new Integer(40));

conf.setNumReduceTasks(new Integer(30));

conf.setInputPath(new Path("/shared/wikipedia_small"));

conf.setOutputPath(new Path("/user/kheafield/word_count"));

conf.setOutputKeyClass(Text.class);

conf.setOutputValueClass(IntWritable.class);

(14)

Word Count Code Main

Main

public static void main(String[] args) throws IOException {

JobConf conf = new JobConf(WordCount.class);

conf.setJobName("wordcount");

conf.setMapperClass(MapClass.class);

conf.setCombinerClass(ReduceClass.class);

conf.setReducerClass(ReduceClass.class);

conf.setNumMapTasks(new Integer(40));

conf.setNumReduceTasks(new Integer(30));

conf.setInputPath(new Path("/shared/wikipedia_small"));

conf.setOutputPath(new Path("/user/kheafield/word_count"));

conf.setOutputKeyClass(Text.class);

conf.setOutputValueClass(IntWritable.class);

(15)

Word Count Code Main

Main

public static void main(String[] args) throws IOException {

JobConf conf = new JobConf(WordCount.class);

conf.setJobName("wordcount");

conf.setMapperClass(MapClass.class);

conf.setCombinerClass(ReduceClass.class);

conf.setReducerClass(ReduceClass.class);

conf.setNumMapTasks(new Integer(40));

conf.setNumReduceTasks(new Integer(30));

conf.setInputPath(new Path("/shared/wikipedia_small"));

conf.setOutputPath(new Path("/user/kheafield/word_count"));

conf.setOutputKeyClass(Text.class);

conf.setOutputValueClass(IntWritable.class);

(16)

How it Works Serialization

Types

Purpose

Simple serialization for keys, values, and other data

Interface Writable

Read and write binary format. Convert to String for text formats.

WritableComparable adds sorting order for keys

Example Implementations

ArrayWritable (is only Writable, not WritableComparable); BooleanWritable

(17)

How it Works Serialization

A Writable

public class IntPairWritable implements Writable { public int first;

public int second;

public void write(DataOutput out) throws IOException { out.writeInt(first);

out.writeInt(second);

}

public void readFields(DataInput in) throws IOException { first = in.readInt();

second = in.readInt();

}

public int hashCode() { return first + second; } public String toString() {

(18)

How it Works Serialization

WritableComparable Method

/**
 * Orders pairs by {@code first}, breaking ties with {@code second}.
 *
 * @param other object to compare against; cast to IntPairWritable
 * @return -1 if this pair sorts before {@code other}, 1 if after, 0 if both fields match
 */
public int compareTo(Object other) {
  IntPairWritable that = (IntPairWritable) other;
  // Explicit comparisons rather than subtraction, so large values cannot overflow.
  if (first != that.first) {
    return first < that.first ? -1 : 1;
  }
  if (second != that.second) {
    return second < that.second ? -1 : 1;
  }
  return 0;
}

(19)

How it Works Data Flow

Data Flow

Default Flow

1 Mappers read from HDFS

2 Map output is partitioned by key and sent to Reducers

3 Reducers sort input by key

4 Reduce output is written to HDFS

1 HDFS 2 Mapper 3 Reducer 4 HDFS

Input Map Sort Reduce Output

(20)

How it Works Data Flow

Combiners

Concept

Add counts at Mapper before sending to Reducer.

Word count is 6 minutes with combiners and 14 without.

Implementation

Mapper caches output and periodically calls Combiner. Input to Combine may be from Map or Combine. Combiner uses the same interface as Reducer.

Mapper

Input Map

Combine Sort Reduce Output

(21)

Lab

Exercises

Recommended: Word Count. Get word count running.

Bigrams

Count bigrams and unigrams efficiently.

Capitalization

With what probability is a word capitalized?

Indexer

In what documents does each word appear? Where in the documents?

(22)

Lab

Instructions

1 Login to the cluster successfully (and set your password).

2 Get Eclipse installed, so you can build Java code.

3 Install the Hadoop plugin for Eclipse so you can deploy jobs to the cluster.

4 Set up your Eclipse workspace from a template that we provide.

5 Run the word counter example over the Wikipedia data set.

References

Related documents

public String createFile(String title, String type, String description, String parentId, InputStream content) throws UserRecoverableAuthIOException, IOException;. public

public void doGet( HttpServletRequest request , HttpServletResponse response) throws IOException, ServletException { response.setContentType("text/html"); PrintWriter out

public void doGet (HttpServletRequest req, HttpServletResponse res) throws ServletException, IOException { String who = req.getParameter("who"); HttpSession sesh

public void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { response.setContentType("text/html"); PrintWriter out

• void map(K1 key, V1 value, OutputCollector<K2, V2> output, Reporter reporter) • void

public void service(HttpServletRequest request, HttpServletResponse response) throws IOException { response.setContentType("text/html"); PrintWriter ut =

• map function implemented as public void map() – key and value as input (according to type parameters) – OutputCollector used to emit key/value pairs – Reporter for logging

public static void main(String[] args) throws IOException { Configuration conf = HBaseConfiguration.create();. HBaseAdmin admin = new HBaseAdmin(conf); byte [] tableName