Hadoop Streaming:用 PHP 寫 MapReduce 程式
OS:
Ubuntu 12.04 LTS (x86)
jdk 1.7.0_67
Hadoop 2.2.0
$ cd /usr/local/hadoop
$ mkdir input/
$ cd input
$ gedit file1.txt
hello world
hello ray
hello Hadoop
$ gedit file2.txt
Hadoop ok
Hadoop fail
Hadoop 2.3
$ cd ..
加入hadoop系統
$ ./bin/hadoop fs -mkdir /data
$ ./bin/hadoop fs -put -f input/file1.txt input/file2.txt /data
(若出現 "Name node is in safe mode" 的錯誤訊息,先執行以下指令把 safemode 解除,再重新執行上面的 -put 指令:
$ cd /usr/local/hadoop
$ ./bin/hdfs dfsadmin -safemode leave
)
$ cd /usr/local/hadoop
$ mkdir php/
$ cd php
$ sudo gedit mapper.php
複製以下內容 ( 網路上找來的範例 )
#!/usr/bin/php
<?php
/**
 * Hadoop Streaming mapper for word count.
 *
 * Reads text lines from STDIN, lower-cases each line, splits it on non-word
 * characters and, once the input is exhausted, writes one
 * "word<TAB>count" line to STDOUT for every distinct word seen.
 *
 * Counts are accumulated in memory, so each word is emitted once per mapper
 * run rather than once per occurrence.
 */
$word2count = array();

while (($line = fgets(STDIN)) !== false) {
    $line = strtolower(trim($line));

    // PREG_SPLIT_NO_EMPTY drops the empty strings produced by adjacent separators.
    $words = preg_split('/\W/', $line, 0, PREG_SPLIT_NO_EMPTY);

    foreach ($words as $word) {
        // Initialise the counter on first sight: "+=" on a missing key raises
        // an "undefined index" diagnostic, which can end up mixed into the
        // emitted records depending on the PHP error settings.
        if (!isset($word2count[$word])) {
            $word2count[$word] = 0;
        }
        $word2count[$word] += 1;
    }
}

// chr(9) is TAB, the separator between key and value in each emitted line.
foreach ($word2count as $word => $count) {
    echo $word, chr(9), $count, PHP_EOL;
}
?>
$ sudo gedit reducer.php
複製以下內容 ( 網路上找來的範例 )
#!/usr/bin/php
<?php
/**
 * Hadoop Streaming reducer for word count.
 *
 * Reads "word<TAB>count" lines from STDIN, sums the counts per word and,
 * once the input is exhausted, writes one "word<TAB>total" line per word to
 * STDOUT with the words ordered by ksort().
 *
 * Lines whose count is missing, non-numeric or not positive are ignored.
 */
$word2count = array();

while (($line = fgets(STDIN)) !== false) {
    $line = trim($line);

    // Pad to two fields so a line without a TAB (e.g. a blank line) yields a
    // count of 0 instead of reading an undefined offset.
    list($word, $count) = array_pad(explode(chr(9), $line), 2, '0');
    $count = intval($count);

    if ($count > 0) {
        // Initialise the total on first sight; "+=" on a missing key raises
        // an "undefined index" diagnostic.
        if (!isset($word2count[$word])) {
            $word2count[$word] = 0;
        }
        $word2count[$word] += $count;
    }
}

// Order the output by word before emitting.
ksort($word2count);

// chr(9) is TAB, the separator between key and value in each emitted line.
foreach ($word2count as $word => $count) {
    echo $word, chr(9), $count, PHP_EOL;
}
?>
執行 (先讓兩支 PHP 腳本具有執行權限,否則 Hadoop Streaming 無法直接啟動它們)
$ cd ..
$ sudo chmod +x php/mapper.php php/reducer.php
$ bin/hadoop jar share/hadoop/tools/lib/hadoop-streaming-2.2.0.jar -mapper /usr/local/hadoop/php/mapper.php -reducer /usr/local/hadoop/php/reducer.php -input /data/* -output /output/
顯示結果
$ ./bin/hadoop fs -cat /output/part-00000
fail 1
hadoop 4
hello 3
ok 1
ray 1
world 1
2 1
3 1