Word Count Example
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.*;
import org.apache.hadoop.mapreduce.lib.output.*;
//MAIN (DRIVER) CLASS
//The driver class contains the Mapper class, the Reducer class and the main method
public class WordCountNew {
//MAPPER CLASS takes 4 Arguments i.e. Mapper <KEYIN, VALUEIN, KEYOUT, VALUEOUT>
public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
//one holds the constant value 1 emitted for every token; word holds the current token
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
/*We override the map method defined in the parent (Mapper) class. It takes 3 arguments as
inputs: map(KEYIN key, VALUEIN value, Context context).
In the map method we receive one record (a single line), which is stored in the String variable
line. Using StringTokenizer we break the line into individual words, called tokens, with
whitespace as the delimiter. If the line were "Hello There", StringTokenizer would give the two
tokens Hello and There. Finally, using the context object, we emit the Mapper output. So for our
example the output from the Mapper will be (Hello, 1), (There, 1) and so on. The output of the
Mapper is taken as input by the Reducer. */
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
//Converting the record (single line) to String and storing it in the String variable line
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
//Running a while loop to get each token (word) one by one from StringTokenizer
while (tokenizer.hasMoreTokens()) {
//Writing the output as (word, one): word is set to the current token and one is the constant 1
word.set(tokenizer.nextToken());
context.write(word, one);
}
}
}
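/*Illustrative trace (an assumed sample input, not produced by the code above): for the two
input lines "Hello There" and "Hello World", the map calls emit (Hello, 1), (There, 1),
(Hello, 1), (World, 1). The framework then shuffles and groups these pairs by key, so the
Reducer receives, e.g., Hello <1,1>. */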
//REDUCER CLASS takes 4 Arguments i.e. Reducer <KEYIN, VALUEIN, KEYOUT, VALUEOUT>
public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
/*We override the reduce method defined in the parent (Reducer) class. It takes 3 arguments as
inputs: reduce(KEYIN key, VALUEIN values, Context context).
In the reduce method we receive a key (a word) and a list of values as input, e.g. Hello <1,1,1,1>.
To find the number of occurrences of the word Hello in the input file, we simply sum all the
values in the list. Hence we run a for loop to iterate over the values one by one, adding each to
the variable sum. Finally we write the output, i.e. key (word) and value (sum), using the context
object. So for the example above the output will be (Hello, 4). */
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
//Adding each value to the variable sum over every iteration
for (IntWritable value : values) {
sum += value.get();
}
//Finally writing the key and the value of sum (the number of times the word occurred in the
//input file) to the output
context.write(key, new IntWritable(sum));
}
}
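/*Continuing the illustrative trace (still assumed sample data): reduce receives Hello <1,1>,
There <1> and World <1>, and writes (Hello, 2), (There, 1) and (World, 1). */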
//main method: the entry point of the application. This is the method that is called as soon as
//the jar is executed
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
//Creating the Job object, passing the conf object and the job name as arguments. The Job
//class allows the user to configure the job, submit it and control its execution.
Job job = Job.getInstance(conf, "WordCount");
job.setJarByClass(WordCountNew.class);
//Key and value types of the job output: the word as Text and its count as IntWritable
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
//Wiring in the Mapper and Reducer classes defined above
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
//TextInputFormat feeds the Mapper one line at a time; TextOutputFormat writes key<TAB>value lines
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
//Adding the path that will act as input for the MR job; args[0] means it will use the first
//argument given on the terminal as the input path
FileInputFormat.addInputPath(job, new Path(args[0]));
//Setting the path to the directory where the MR job will dump its output; args[1] means it will
//use the second argument given on the terminal as the output path
FileOutputFormat.setOutputPath(job,new Path(args[1]));
//Submitting the job to the cluster and waiting for its completion
job.waitForCompletion(true);
}
}
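As a quick usage sketch (the jar name wordcount.jar and the HDFS paths /input and /output are
assumptions for illustration, not part of the code above), a typical run looks like:

hadoop jar wordcount.jar WordCountNew /input /output
hdfs dfs -cat /output/part-r-00000

For an input file containing the lines "Hello There" and "Hello World", the single reducer sorts
the keys, and the final output (key and count separated by a tab, as written by TextOutputFormat)
would be:

Hello	2
There	1
World	1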