在 MapReduce 中使用 Python（基于 Hadoop Streaming）
1.创建文件目录
mkdir -p /opt/pyshell/mapreduce/
2.新建mapper脚本
vi /opt/pyshell/mapreduce/mapper.py
#!/usr/bin/env python
# coding=utf-8
"""Hadoop Streaming mapper for word count.

Reads text lines from standard input and writes one ``word<TAB>1`` record
to standard output for every whitespace-separated token.
"""
import sys


def map_words(lines):
    """Yield a ``(word, 1)`` pair for every whitespace-separated token.

    ``lines`` is any iterable of text lines (for example ``sys.stdin``).
    """
    for line in lines:
        # str.split() with no separator already discards leading/trailing
        # whitespace (including the newline), so no explicit strip is needed.
        for word in line.split():
            yield word, 1


def main(stdin=None, stdout=None):
    """Run the mapper.

    ``stdin`` / ``stdout`` default to the process streams; they can be
    replaced by any file-like objects, which makes the script testable.
    """
    source = sys.stdin if stdin is None else stdin
    sink = sys.stdout if stdout is None else stdout
    for word, count in map_words(source):
        sink.write("{0}\t{1}\n".format(word, count))


if __name__ == "__main__":
    main()
3.新建reducer脚本
vi /opt/pyshell/mapreduce/reducer.py
#!/usr/bin/env python
# coding=utf-8
"""Hadoop Streaming reducer for word count.

Reads ``word<TAB>count`` records from standard input and writes one
``word<TAB>total`` record per run of consecutive identical words.
Only adjacent records are merged, so the input must arrive grouped by
word (Hadoop Streaming sorts mapper output by key before the reducer).
"""
from itertools import groupby
from operator import itemgetter
import sys


def _parse_records(lines):
    """Yield ``(word, count)`` for every well-formed ``word<TAB>count`` line."""
    for line in lines:
        word, tab, raw_count = line.strip(" \r\n").partition("\t")
        if not tab:
            # Blank or tab-less line: skip it instead of crashing on unpacking.
            continue
        try:
            count = int(raw_count)
        except ValueError:
            # If the count is not a number, simply ignore the record.
            continue
        yield word, count


def reduce_counts(lines):
    """Yield ``(word, total)`` for each run of consecutive equal words.

    Malformed records are dropped before grouping, so they neither split a
    run nor suppress the final group.  Empty input yields nothing.
    """
    for word, records in groupby(_parse_records(lines), key=itemgetter(0)):
        yield word, sum(count for _, count in records)


def main(stdin=None, stdout=None):
    """Run the reducer.

    ``stdin`` / ``stdout`` default to the process streams; they can be
    replaced by any file-like objects, which makes the script testable.
    """
    source = sys.stdin if stdin is None else stdin
    sink = sys.stdout if stdout is None else stdout
    for word, total in reduce_counts(source):
        # Plain write() behaves the same on Python 2 and Python 3, unlike
        # the Python 2-only print statement.
        sink.write("%s\t%s\n" % (word, total))


if __name__ == "__main__":
    main()
4.上传文件到 HDFS
hadoop fs -put /opt/data/*.txt /input
5.启动yarn
service yarn start
（若 service yarn 命令不可用，请参考“将 yarn 注册为 chkconfig 管理的服务”；也可以直接执行 Hadoop 自带的 sbin/start-yarn.sh 启动。）
6.执行脚本
cd /usr/apps/hadoop/hadoop-2.6.4/share/hadoop/tools/lib/
# Submit the word-count job through Hadoop Streaming.
# * The generic -files option (which must precede the streaming options)
#   ships both scripts into every task's working directory; it replaces the
#   deprecated streaming-specific -file option.
# * The scripts are run through the interpreter by bare file name, so they
#   need neither the execute bit nor the same absolute path on every node.
# * The input glob is quoted so the local shell cannot expand it.
# * NOTE: the job fails if the -output directory already exists.
hadoop jar hadoop-streaming-2.6.4.jar \
-files /opt/pyshell/mapreduce/mapper.py,/opt/pyshell/mapreduce/reducer.py \
-mapper "python mapper.py" \
-reducer "python reducer.py" \
-input '/input/*' -output /output/out