Hive and Pig
Juliana Freire
New York University
Some slides from J. Lin
Need for High-Level Languages!
• Hadoop is great for large-data processing!
• But writing Java programs for everything is verbose and slow
• Not everyone wants to (or can) write Java code
• Solution: develop higher-level data processing languages
• Hive: HQL is like SQL
• Pig: Pig Latin is a bit like Perl
Hive and Pig!
• Hive: data warehousing application in Hadoop
• Query language is HQL, variant of SQL
• Tables stored on HDFS as flat files
• Developed by Facebook, now open source
• Pig: large-scale data processing system
• Scripts are written in Pig Latin, a dataflow language
• Developed by Yahoo!, now open source
• Roughly 1/3 of all Yahoo! internal jobs
• Common idea:
• Provide higher-level language to facilitate large-data processing
• Higher-level language “compiles down” to Hadoop jobs
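As a rough illustration, a single HQL statement like the one below (the visits table and its columns are hypothetical, used only for illustration) does the work of a hand-written MapReduce job; Hive compiles it into one or more Hadoop jobs:

    -- count page views per user; Hive compiles this into Hadoop job(s)
    SELECT user, COUNT(*) AS views
    FROM visits
    GROUP BY user;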
Hive: Background!
• Started at Facebook
• Data was collected by nightly cron jobs into Oracle DB
• “ETL” via hand-coded Python
• Grew from 10s of GBs (2006) to 1 TB/day of new data (2007), now 10x that
[Figure: OLTP systems feed Hadoop via ETL (Extract, Transform, and Load); Hadoop supports OLAP-style analysis.]
Loading data into a Hive table (the file is stored in HDFS; a SerDe interprets the records):

    LOAD DATA LOCAL INPATH '/logs/status_updates'
    INTO TABLE status_updates PARTITION (ds='2009-03-20')
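For context, a minimal sketch of how such a partitioned table might be declared and later queried — the column names and row format here are assumptions, not taken from the original slides:

    -- hypothetical schema; one partition per day (ds)
    CREATE TABLE status_updates (userid INT, status STRING)
    PARTITIONED BY (ds STRING)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

    -- after the LOAD above, query just that day's partition
    SELECT COUNT(1) FROM status_updates WHERE ds = '2009-03-20';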
Example: given a Visits log and a Pages table with pageranks, find the users who tend to visit high-pagerank pages (average pagerank of visited pages above 0.5).

Visits:
    user   url                time
    Amy    www.cnn.com        8:00
    Amy    www.crap.com       8:05
    Amy    www.myblog.com     10:00
    Amy    www.flickr.com     10:05
    Fred   cnn.com/index.htm  12:00
    ...

Pages:
    url              pagerank
    www.cnn.com      0.9
    www.flickr.com   0.9
    www.myblog.com   0.7
    www.crap.com     0.2
    ...
Conceptual dataflow:
    load Visits → canonicalize URLs
    load Pages
    join by url (Visits.url = Pages.url)
    group by user
    compute average pagerank per user
    filter avgPR > 0.5
    → the answer
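The same pipeline collapses to a handful of lines in a higher-level language. A sketch in HQL (table and column names follow the figure; canonicalize() stands in for a hypothetical URL-canonicalizing UDF):

    SELECT t.user, t.avgPR
    FROM (
      SELECT v.user AS user, AVG(p.pagerank) AS avgPR
      FROM (SELECT user, canonicalize(url) AS url FROM visits) v
      JOIN pages p ON (v.url = p.url)
      GROUP BY v.user
    ) t
    WHERE t.avgPR > 0.5;

In Pig Latin, the same computation is written as an explicit sequence of load, join, group, and filter operators, mirroring the dataflow above.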
Pig Slides adapted from Olston et al.
MapReduce Code!
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.jobcontrol.Job;
import org.apache.hadoop.mapred.jobcontrol.JobControl;
import org.apache.hadoop.mapred.lib.IdentityMapper;

public class MRExample {
    public static class LoadPages extends MapReduceBase
        implements Mapper<LongWritable, Text, Text, Text> {

        public void map(LongWritable k, Text val,
                OutputCollector<Text, Text> oc,
                Reporter reporter) throws IOException {
            // Pull the key out
            String line = val.toString();
            int firstComma = line.indexOf(',');
            String key = line.substring(0, firstComma);
            String value = line.substring(firstComma + 1);
            Text outKey = new Text(key);
            // Prepend an index to the value so we know which file
            // it came from.
            Text outVal = new Text("1" + value);
            oc.collect(outKey, outVal);
        }
    }
    public static class LoadAndFilterUsers extends MapReduceBase
        implements Mapper<LongWritable, Text, Text, Text> {

        public void map(LongWritable k, Text val,
                OutputCollector<Text, Text> oc,
                Reporter reporter) throws IOException {
            // Pull the key out
            String line = val.toString();
            int firstComma = line.indexOf(',');
            String value = line.substring(firstComma + 1);
            int age = Integer.parseInt(value);
            if (age < 18 || age > 25) return;
            String key = line.substring(0, firstComma);
            Text outKey = new Text(key);
            // Prepend an index to the value so we know which file
            // it came from.
            Text outVal = new Text("2" + value);
            oc.collect(outKey, outVal);
        }
    }
    public static class Join extends MapReduceBase
        implements Reducer<Text, Text, Text, Text> {

        public void reduce(Text key,
                Iterator<Text> iter,
                OutputCollector<Text, Text> oc,
                Reporter reporter) throws IOException {
            // For each value, figure out which file it's from and
            // store it accordingly.
            List<String> first = new ArrayList<String>();
            List<String> second = new ArrayList<String>();

            while (iter.hasNext()) {
                Text t = iter.next();
                String value = t.toString();
                if (value.charAt(0) == '1')
                    first.add(value.substring(1));
                else second.add(value.substring(1));
                reporter.setStatus("OK");
            }

            // Do the cross product and collect the values
            for (String s1 : first) {
                for (String s2 : second) {
                    String outval = key + "," + s1 + "," + s2;
                    oc.collect(null, new Text(outval));
                    reporter.setStatus("OK");
                }
            }
        }
    }
    public static class LoadJoined extends MapReduceBase
        implements Mapper<Text, Text, Text, LongWritable> {

        public void map(
                Text k,
                Text val,
                OutputCollector<Text, LongWritable> oc,
                Reporter reporter) throws IOException {
            // Find the url
            String line = val.toString();
            int firstComma = line.indexOf(',');
            int secondComma = line.indexOf(',', firstComma + 1);
            String key = line.substring(firstComma + 1, secondComma);
            // drop the rest of the record, I don't need it anymore,
            // just pass a 1 for the combiner/reducer to sum instead.
            Text outKey = new Text(key);
            oc.collect(outKey, new LongWritable(1L));
        }
    }
    public static class ReduceUrls extends MapReduceBase
        implements Reducer<Text, LongWritable, WritableComparable, Writable> {

        public void reduce(
                Text key,
                Iterator<LongWritable> iter,
                OutputCollector<WritableComparable, Writable> oc,
                Reporter reporter) throws IOException {
            // Add up all the values we see
            long sum = 0;
            while (iter.hasNext()) {
                sum += iter.next().get();
                reporter.setStatus("OK");
            }
            oc.collect(key, new LongWritable(sum));
        }
    }
    public static class LoadClicks extends MapReduceBase
        implements Mapper<WritableComparable, Writable, LongWritable, Text> {

        public void map(
                WritableComparable key,
                Writable val,
                OutputCollector<LongWritable, Text> oc,
                Reporter reporter) throws IOException {
            oc.collect((LongWritable)val, (Text)key);
        }
    }
    public static class LimitClicks extends MapReduceBase
        implements Reducer<LongWritable, Text, LongWritable, Text> {

        int count = 0;
        public void reduce(
                LongWritable key,
                Iterator<Text> iter,
                OutputCollector<LongWritable, Text> oc,
                Reporter reporter) throws IOException {
            // Only output the first 100 records
            while (count < 100 && iter.hasNext()) {
                oc.collect(key, iter.next());
                count++;
            }
        }
    }
    public static void main(String[] args) throws IOException {
        JobConf lp = new JobConf(MRExample.class);
        lp.setJobName("Load Pages");
        lp.setInputFormat(TextInputFormat.class);
        lp.setOutputKeyClass(Text.class);
        lp.setOutputValueClass(Text.class);
        lp.setMapperClass(LoadPages.class);
        FileInputFormat.addInputPath(lp, new Path("/user/gates/pages"));
        FileOutputFormat.setOutputPath(lp,
            new Path("/user/gates/tmp/indexed_pages"));
        lp.setNumReduceTasks(0);
        Job loadPages = new Job(lp);

        JobConf lfu = new JobConf(MRExample.class);
        lfu.setJobName("Load and Filter Users");
        lfu.setInputFormat(TextInputFormat.class);
        lfu.setOutputKeyClass(Text.class);
        lfu.setOutputValueClass(Text.class);
        lfu.setMapperClass(LoadAndFilterUsers.class);
        FileInputFormat.addInputPath(lfu, new Path("/user/gates/users"));
        FileOutputFormat.setOutputPath(lfu,
            new Path("/user/gates/tmp/filtered_users"));
        lfu.setNumReduceTasks(0);
        Job loadUsers = new Job(lfu);

        JobConf join = new JobConf(MRExample.class);
        join.setJobName("Join Users and Pages");
        join.setInputFormat(KeyValueTextInputFormat.class);
        join.setOutputKeyClass(Text.class);
        join.setOutputValueClass(Text.class);
        join.setMapperClass(IdentityMapper.class);
        join.setReducerClass(Join.class);
        FileInputFormat.addInputPath(join,
            new Path("/user/gates/tmp/indexed_pages"));
        FileInputFormat.addInputPath(join,
            new Path("/user/gates/tmp/filtered_users"));
        FileOutputFormat.setOutputPath(join, new Path("/user/gates/tmp/joined"));
        join.setNumReduceTasks(50);
        Job joinJob = new Job(join);
        joinJob.addDependingJob(loadPages);
        joinJob.addDependingJob(loadUsers);

        JobConf group = new JobConf(MRExample.class);
        group.setJobName("Group URLs");
        group.setInputFormat(KeyValueTextInputFormat.class);
        group.setOutputKeyClass(Text.class);
        group.setOutputValueClass(LongWritable.class);
        group.setOutputFormat(SequenceFileOutputFormat.class);
        group.setMapperClass(LoadJoined.class);
        group.setCombinerClass(ReduceUrls.class);
        group.setReducerClass(ReduceUrls.class);
        FileInputFormat.addInputPath(group, new Path("/user/gates/tmp/joined"));
        FileOutputFormat.setOutputPath(group, new Path("/user/gates/tmp/grouped"));
        group.setNumReduceTasks(50);
        Job groupJob = new Job(group);
        groupJob.addDependingJob(joinJob);

        JobConf top100 = new JobConf(MRExample.class);
        top100.setJobName("Top 100 sites");
        top100.setInputFormat(SequenceFileInputFormat.class);
        top100.setOutputKeyClass(LongWritable.class);
        top100.setOutputValueClass(Text.class);
        top100.setOutputFormat(SequenceFileOutputFormat.class);
        top100.setMapperClass(LoadClicks.class);
        top100.setCombinerClass(LimitClicks.class);
        top100.setReducerClass(LimitClicks.class);
        FileInputFormat.addInputPath(top100, new Path("/user/gates/tmp/grouped"));
        FileOutputFormat.setOutputPath(top100,
            new Path("/user/gates/top100sitesforusers18to25"));
        top100.setNumReduceTasks(1);
        Job limit = new Job(top100);
        limit.addDependingJob(groupJob);

        JobControl jc = new JobControl("Find top 100 sites for users 18 to 25");
        jc.addJob(loadPages);
        jc.addJob(loadUsers);
        jc.addJob(joinJob);
        jc.addJob(groupJob);
        jc.addJob(limit);
        jc.run();
    }
}
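For comparison, the Java program above computes the top 100 sites visited by users aged 18 to 25. A hedged HQL sketch of the same computation, assuming a pages(user, url) click log and a users(user, age) table corresponding to the two input files:

    SELECT p.url, COUNT(*) AS clicks
    FROM pages p JOIN users u ON (p.user = u.user)
    WHERE u.age >= 18 AND u.age <= 25
    GROUP BY p.url
    ORDER BY clicks DESC
    LIMIT 100;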
[Figure: performance comparison charts; y-axes show runtime in minutes.]