自定义 WordCount 程序
1.MyWordCount代码:
package com.hadoop.mr; import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; public class MyWordCount { Configuration conf =new Configuration(true); Job job= Job.getInstance(conf); // Create a new Job // Job job = Job.getInstance(); job.setJarByClass(MyWordCount.class); // Specify various job-specific parameters job.setJobName("myjob"); // job.setInputPath(new Path("in")); // job.setOutputPath(new Path("out")); Path inPath=new Path("/input/LICENSE.txt"); FileInputFormat.addInputPath(job, inPath); //可以支持多个输入文件处理 Path outPath=new Path("/outpath"); if( outPath.getFileSystem(conf).exists(outPath)){ outPath.getFileSystem(conf).delete(outPath,true);//如果存在这个目录就递归删除 } job.setMapperClass(MyMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setReducerClass(MyReducer.class); //上面的各种set就是产生一个对应的xml文件 // Submit the job, then poll for progress until the job is complete job.waitForCompletion(true);
2.MyMapper代码
package com.hadoop.mr; import java.io.IOException; import java.util.StringTokenizer; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; public class MyMapper extends Mapper<Object, Text, Text, IntWritable>{ //将这两个对象放在循环体外可以避免多次创建对象造成jvm内存过大,gc处理过于频繁影响程序运行。 private final static IntWritable one = new IntWritable(1); private Text word = new Text(); public void map(Object key, Text value, Context context) throws IOException, InterruptedException { StringTokenizer itr = new StringTokenizer(value.toString()); while (itr.hasMoreTokens()) { word.set(itr.nextToken()); context.write(word, one); } } }
3.MyReducer 代码:
package com.hadoop.mr; import java.io.IOException; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Reducer; /** * * @author Administrator * 1.shuffer :reduce从map中copy属于自己的数据过程。 * 2.sort * 3.reduce */ public class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable>{ //相同的key为一组,调用一次reduce方法,在方法迭代这一组数据,进行计算:sum count max min ... private IntWritable result = new IntWritable(); public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException { //hello 1 //hello 1 //hello 1 //hello 1 //hello 1 //key: hello //values:(1,1,1,1,1) int sum = 0; for (IntWritable val : values) { sum += val.get(); } result.set(sum); context.write(key, result); } }
4.代码导出为jar包
代码写完毕后
右键export
这个jar包导入到服务器
5.运行自定义wordcount程序
使用这个 jar 包运行自定义 wordcount 程序:
[root@node01 ~]# hadoop jar MyWordCount.jar com.hadoop.mr.MyWordCount 2018-12-27 02:32:35,703 INFO client.ConfiguredRMFailoverProxyProvider: Failing over to rm2 2018-12-27 02:32:36,301 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this. 2018-12-27 02:32:36,393 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /user/root/.staging/job_1545843601871_0001 2018-12-27 02:32:37,281 INFO input.FileInputFormat: Total input files to process : 1 2018-12-27 02:32:37,531 INFO mapreduce.JobSubmitter: number of splits:1 2018-12-27 02:32:37,853 INFO Configuration.deprecation: yarn.resourcemanager.zk-address is deprecated. Instead, use hadoop.zk.address 2018-12-27 02:32:37,854 INFO Configuration.deprecation: yarn.resourcemanager.system-metrics-publisher.enabled is deprecated. Instead, use yarn.system-metrics-publisher.enabled 2018-12-27 02:32:38,548 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1545843601871_0001 2018-12-27 02:32:38,550 INFO mapreduce.JobSubmitter: Executing with tokens: [] 2018-12-27 02:32:39,466 INFO conf.Configuration: resource-types.xml not found 2018-12-27 02:32:39,466 INFO resource.ResourceUtils: Unable to find 'resource-types.xml'. 
2018-12-27 02:32:39,684 INFO impl.YarnClientImpl: Submitted application application_1545843601871_0001 2018-12-27 02:32:39,788 INFO mapreduce.Job: The url to track the job: http://node04:8088/proxy/application_1545843601871_0001/ 2018-12-27 02:32:39,788 INFO mapreduce.Job: Running job: job_1545843601871_0001 2018-12-27 02:32:53,436 INFO mapreduce.Job: Job job_1545843601871_0001 running in uber mode : false 2018-12-27 02:32:53,437 INFO mapreduce.Job: map 0% reduce 0% 2018-12-27 02:33:14,764 INFO mapreduce.Job: map 100% reduce 0% 2018-12-27 02:33:21,853 INFO mapreduce.Job: map 100% reduce 100% 2018-12-27 02:33:22,896 INFO mapreduce.Job: Job job_1545843601871_0001 completed successfully 2018-12-27 02:33:23,093 INFO mapreduce.Job: Counters: 53 File System Counters FILE: Number of bytes read=271802 FILE: Number of bytes written=977919 FILE: Number of read operations=0 FILE: Number of large read operations=0 FILE: Number of write operations=0 HDFS: Number of bytes read=147243 HDFS: Number of bytes written=34795 HDFS: Number of read operations=8 HDFS: Number of large read operations=0 HDFS: Number of write operations=2 Job Counters Launched map tasks=1 Launched reduce tasks=1 Rack-local map tasks=1 Total time spent by all maps in occupied slots (ms)=17818 Total time spent by all reduces in occupied slots (ms)=4648 Total time spent by all map tasks (ms)=17818 Total time spent by all reduce tasks (ms)=4648 Total vcore-milliseconds taken by all map tasks=17818 Total vcore-milliseconds taken by all reduce tasks=4648 Total megabyte-milliseconds taken by all map tasks=18245632 Total megabyte-milliseconds taken by all reduce tasks=4759552 Map-Reduce Framework Map input records=2746 Map output records=21463 Map output bytes=228869 Map output materialized bytes=271802 Input split bytes=99 Combine input records=0 Combine output records=0 Reduce input groups=2965 Reduce shuffle bytes=271802 Reduce input records=21463 Reduce output records=2965 Spilled Records=42926 Shuffled Maps =1 
Failed Shuffles=0 Merged Map outputs=1 GC time elapsed (ms)=543 CPU time spent (ms)=7040 Physical memory (bytes) snapshot=329469952 Virtual memory (bytes) snapshot=5474177024 Total committed heap usage (bytes)=143904768 Peak Map Physical memory (bytes)=206819328 Peak Map Virtual memory (bytes)=2734362624 Peak Reduce Physical memory (bytes)=122650624 Peak Reduce Virtual memory (bytes)=2739814400 Shuffle Errors BAD_ID=0 CONNECTION=0 IO_ERROR=0 WRONG_LENGTH=0 WRONG_MAP=0 WRONG_REDUCE=0 File Input Format Counters Bytes Read=147144 File Output Format Counters Bytes Written=34795