// Applies only to lucene-2.2.0
I. Add the lib files in Eclipse
1. lucene-analyzers-2.2.0.jar
2. lucene-core-2.2.0.jar
3. paoding-analysis.jar
4. commons-logging.jar
II. Configure the Paoding dic path (on all nodes)
1. Unpack the dictionary into /home/hadoop to get the dic folder (any other path should work as well)
2. Append export PAODING_DIC_HOME=/home/hadoop/dic to the end of hadoop-env.sh
3. Restart all nodes (a quick check that the variable took effect is sketched after this list)
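Paoding locates its dictionaries through the PAODING_DIC_HOME environment variable, so it is worth confirming that the variable is actually visible to the JVM on each node. A minimal sketch, assuming it is run on a node after the restart (the class name DicHomeCheck is only illustrative):

package org.znufe.cnwc;

public class DicHomeCheck {
    public static void main(String[] args) {
        // Paoding resolves its dictionary directory from this environment variable;
        // null here means hadoop-env.sh was not picked up on this node.
        System.out.println("PAODING_DIC_HOME = " + System.getenv("PAODING_DIC_HOME"));
    }
}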
III. Change the encoding to avoid garbled characters
In Eclipse: Window -> Preferences -> General -> Workspace, set Text file encoding to UTF-8.
On the Linux nodes: # vi /etc/sysconfig/i18n and change the value of LANG to zh_CN.UTF-8 (a quick check is sketched below).
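Once the new locale is in effect for the session, the JVM's default charset should follow it. A one-line check, assuming it is run on the node after logging in again (the class name CharsetCheck is only illustrative):

public class CharsetCheck {
    public static void main(String[] args) {
        // Should print UTF-8 once LANG=zh_CN.UTF-8 is in effect for the session
        System.out.println(java.nio.charset.Charset.defaultCharset());
    }
}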
IV. Code
1. The Mapper class
package org.znufe.cnwc;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

public class CNWordMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    @Override
    public void map(LongWritable ikey, Text ivalue, Context context)
            throws IOException, InterruptedException {
        // Text reuses its backing array, so only the first getLength() bytes are valid;
        // decode them explicitly as UTF-8 instead of the platform default encoding.
        byte[] bt = ivalue.getBytes();
        InputStream ip = new ByteArrayInputStream(bt, 0, ivalue.getLength());
        Reader read = new InputStreamReader(ip, "UTF-8");

        // Paoding analyzer for Chinese word segmentation (Lucene 2.2.0 API)
        Analyzer analyzer = new PaodingAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream(word.toString(), read);
        Token t;
        while ((t = tokenStream.next()) != null) {
            word.set(t.termText());
            context.write(word, one);
        }
    }
}
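To check the segmentation behaviour outside of MapReduce, the same Lucene 2.2.0 token loop can be run locally. A minimal sketch, assuming PAODING_DIC_HOME is set in the local environment and the jars from section I are on the classpath (the class name PaodingLocalTest and the sample sentence are only illustrative):

package org.znufe.cnwc;

import java.io.StringReader;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

public class PaodingLocalTest {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new PaodingAnalyzer();
        // Same next()/termText() loop as in the Mapper, printed to stdout
        TokenStream ts = analyzer.tokenStream("", new StringReader("中文分词测试"));
        Token t;
        while ((t = ts.next()) != null) {
            System.out.println(t.termText());
        }
    }
}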
2. The Reducer class
package org.znufe.cnwc;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class CNWordReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    private IntWritable result = new IntWritable();

    @Override
    public void reduce(Text _key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Sum the counts emitted for this word
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(_key, result);
    }
}
3. The Driver class
package org.znufe.cnwc;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class CNWordMain {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

        // Make sure exactly one input path and one output path were supplied
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }

        Job job = Job.getInstance(conf, "CN Word Count");
        job.setJarByClass(CNWordMain.class);

        job.setMapperClass(CNWordMapper.class);
        // The reducer doubles as a combiner, since summing per-word counts is associative
        job.setCombinerClass(CNWordReducer.class);
        job.setReducerClass(CNWordReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Input and output paths are directories, not files
        FileInputFormat.setInputPaths(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
V. Package, deploy, and run
I grabbed some arbitrary sample text and ran the job; the output is the familiar word-count pairs (one word and its count per line).
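For reference, a typical invocation after exporting the project as a jar (the jar name and HDFS paths here are only placeholders):

hadoop jar cnwc.jar org.znufe.cnwc.CNWordMain /user/hadoop/input /user/hadoop/output
hadoop fs -cat /user/hadoop/output/part-r-00000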
*VI. On the permission problem when operating HDFS directly from Eclipse
Add the following to hdfs-site.xml on all nodes:
<property>
    <name>dfs.permissions</name>
    <value>false</value>
</property>
Then restart all nodes.