MapReduce example
// Map step of the word-count example: emits the pair (w, 1) once for
// every word w found in `value`. `key` is not read by the body.
map(String key, String value):
for each word w in value:
// one count per occurrence; the counts are added up in reduce
emit(w, 1);
// Reduce step of the word-count example: adds up every entry of `values`
// and emits the total.
//   key    - the key being reduced (not read by the body)
//   values - iterator over the counts to add up
reduce(String key, Iterator values):
    int result = 0;
    for each v in values:
        // accumulate into the declared accumulator `result`
        result += v;
    emit(result);
More Sawzall examples
# Output tables; each one adds up every value emitted to it.
count: table sum of int;      # receives 1 per input record
total: table sum of float;    # receives the input value itself
squares: table sum of float;  # receives the square of the input value
# The input record, read as a float.
x : float = input;
emit count <- 1;
# Emit to the declared `total` table; `sum` is the table kind, not a table name.
emit total <- x;
emit squares <- x * x;
# Pulls in document.proto (presumably where the Document type is declared).
proto "document.proto"
# Indexed by domain; maximum(1) keeps, per domain, the single url with the
# highest pagerank weight.
max_pagerank: table maximum(1) [domain: string] of url: string weight pagerank: int;
# The input record, interpreted as a Document.
doc : Document = input;
# File this document's url under its domain, weighted by its pagerank.
emit max_pagerank[domain(doc.url)] <- doc.url weight doc.pagerank;
['google.com' => 'google.com/index.html',
'adobe.com' => 'adobe.com/download/acrobat/',
'bangkok.com' => 'bangkok.com/nightlife/whatsup.htm']
Yahoo Pig!
- Yahoo's high-level algebra (unpublished)
- Tuple (record): string, bag, map, tuple
- Operators: load, group, cogroup, foreach
- Open-source, stand-alone or on top of Hadoop
- Feels like relational operators, same as Sawzall
-- Computes the average number of visits per user.
-- Load '/visits' with the fields (user, url, time).
VISITS = load '/visits' as (user, url, time);
-- Collect the visit records of each user into one group.
USER_VISITS = group VISITS by user;
-- Per user: the group key (renamed to user) and the number of visits.
USER_COUNTS = foreach USER_VISITS generate group as user, COUNT(VISITS) as numvisits;
-- Put every per-user count into a single group so it can be aggregated.
ALL_COUNTS = group USER_COUNTS all;
-- Average of numvisits over all users.
AVG_COUNT = foreach ALL_COUNTS generate AVG(USER_COUNTS.numvisits);
-- Selects the users whose visited pages have an average pagerank above 0.5.
-- Load '/visits' with the fields (user, url, time).
VISITS = load '/visits' as (user, url, time);
-- Load '/pages' with the fields (url, pagerank).
PAGES = load '/pages' as (url, pagerank);
-- Pair each visit with the page record that has the same url.
VISITS_PAGES = join VISITS by url, PAGES by url;
-- Collect the joined records of each user into one group.
USER_VISITS = group VISITS_PAGES by user;
-- Per user: the group key and the average pagerank of the joined records.
USER_AVGPR = foreach USER_VISITS generate group,
AVG(VISITS_PAGES.pagerank) as avgpr;
-- NOTE(review): the threshold is written as the quoted constant '0.5';
-- confirm the target Pig version compares it numerically, not as a string.
GOOD_USERS = filter USER_AVGPR by avgpr > '0.5';