Practical 2-3
package in.project.mapreduce;
import java.io.IOException;
import java.util.StringTokenizer;
/* All org.apache.hadoop packages can be imported using the jars present in the lib directory of this Java project. */
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
/* The Patent program finds the number of sub-patents associated with each patent id in the provided input file.
We write MapReduce code to achieve this, where the mapper builds key-value pairs from the input file and the
reducer aggregates those key-value pairs. */
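/* For illustration (the sample line below is hypothetical, not taken from the practical's dataset): given the input
line "1 13 1 14 2 21", where tokens alternate between a patent id and a sub-patent id, the mapper emits the pairs
(1,13), (1,14) and (2,21); the reducer then receives (1,[13,14]) and (2,[21]) and outputs the counts 1 -> 2 and
2 -> 1. */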
public class Patent {
//Mapper
/* The Map class is static and extends Mapper with the four Hadoop generic types LongWritable, Text, Text,
Text. */
public static class Map extends Mapper<LongWritable, Text, Text, Text> {
/* This method takes the input as the Text data type and tokenizes it by taking whitespace as the delimiter. The
resulting key-value pairs are passed to the reducer. @method_arguments key, value, context
@return void */
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//Converting the record (single line) to String and storing it in a String variable line
String line = value.toString();
//StringTokenizer breaks the record (line) on the whitespace delimiter
StringTokenizer tokenizer = new StringTokenizer(line, " ");
//Iterating through all the tokens and forming the key-value pairs
while (tokenizer.hasMoreTokens()) {
/* The first token goes into patentId, the second into subPatentId, the third into patentId, the fourth into
subPatentId, and so on; the tokens are consumed in pairs, which assumes every line holds an even number of
tokens. */
String patentId = tokenizer.nextToken();
String subPatentId = tokenizer.nextToken();
//Emitting the (patent id, sub-patent id) pair to the reducer
context.write(new Text(patentId), new Text(subPatentId));
}
}
}
//Reducer
/* The Reduce class is static and extends Reducer with the four Hadoop generic types Text, Text, Text,
IntWritable. */
public static class Reduce extends Reducer<Text, Text, Text, IntWritable> {
@Override
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
/* Iterates through all the values available with a key, counts them, and gives the final result as the key and
the number of its values */
int sum = 0;
for (Text value : values) {
sum++;
}
//Emitting the patent id along with its sub-patent count
context.write(key, new IntWritable(sum));
}
}
//Driver
/* This method is used for setting all the configuration properties. It acts as the driver for the MapReduce code.
@method_arguments args
@return void
@throws Exception */
public static void main(String[] args) throws Exception {
//Reading the default configuration of the cluster from the configuration XML files
Configuration conf = new Configuration();
//Initializing the job with the default configuration of the cluster
Job job = new Job(conf, "patent");
//Assigning the driver class name
job.setJarByClass(Patent.class);
//Assigning the Mapper and Reducer classes to the job
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
//Explicitly setting the output key/value types from the mapper, since they are not the same as the reducer's
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
//Defining the output key class for the final output, i.e. from the reducer
job.setOutputKeyClass(Text.class);
//Defining the output value class for the final output, i.e. from the reducer
job.setOutputValueClass(IntWritable.class);
//Defining the input format class, which is responsible for parsing the dataset into key-value pairs
job.setInputFormatClass(TextInputFormat.class);
/* Defining the output format class, which is responsible for writing the final key-value output from the MR
framework to a text file on the disk */
job.setOutputFormatClass(TextOutputFormat.class);
//Configuring the input/output path from the filesystem into the job
FileInputFormat.addInputPath(job, new Path(args[0]));
Path outputPath = new Path(args[1]);
FileOutputFormat.setOutputPath(job, outputPath);
//Deleting the output path automatically from HDFS so that we don't have to delete it explicitly
outputPath.getFileSystem(conf).delete(outputPath, true);
//Submitting the job to the cluster and waiting for it to complete
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
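With the class packaged into a jar, the job can be submitted with the hadoop jar command. A minimal sketch,
assuming the jar is named patent.jar and using placeholder HDFS paths:
hadoop jar patent.jar in.project.mapreduce.Patent /user/hadoop/patent/input /user/hadoop/patent/output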
Output:
Patent    Number of Associated Sub-patents
1         13
2         10
3         4
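The final counts can be inspected directly on HDFS. A minimal sketch, assuming the placeholder output path used
above (the reduce output file is conventionally named part-r-00000):
hdfs dfs -cat /user/hadoop/patent/output/part-r-00000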