hadoop基本操作

hadoop的基本操作是对6个类的重写实现的

TextInputFormat，Mapper，Combiner，HashPartitioner，Reducer，TextOutFormat

//基本的设置，对于同的问题，只需要重写6个类即可</span>

	public static void test2() throws Exception {
		Configuration conf = new Configuration();
		String arg1 = "hdfs://101.200.200.114:9008/yj/input";
		String arg2 = "hdfs://101.200.200.114:9008/yj/output";
		Job job = new Job(conf, "word count");
		job.setJarByClass(WordCount.class);
		//对hadoop的操作基本上是通过对下面6个类重写实现的
		job.setInputFormatClass(TextInputFormat.class);//<span style="font-family: Arial, Helvetica, sans-serif; font-size: 12px;">TextInputFormat为默认，将输入文件分割成小文件，解析成<key,value>对，key默认为字符偏移量，value默认为行值</span>
		job.setMapperClass(MyMapper.class);
		job.setCombinerClass(IntSumReducer.class);
		job.setPartitionerClass(HashPartitioner.class);//<span style="font-family: Arial, Helvetica, sans-serif; font-size: 12px;">HashPartitioner为默认</span>
		job.setReducerClass(MyReducer.class);
		job.setOutputFormatClass(TextOutFormat.class);//<span style="font-family: Arial, Helvetica, sans-serif; font-size: 12px;">TextOutFormat为默认</span>
		
		FileInputFormat.addInputPath(job, new Path(arg1));
		FileOutputFormat.setOutputPath(job, new Path(arg2));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}

	//统计单词个数
	public static void test3() throws Exception {
		Configuration conf = new Configuration();
		String arg1 = "hdfs://101.200.200.114:9008/yj/input";
		String arg2 = "hdfs://101.200.200.114:9008/yj/output";
		Job job = new Job(conf, "word count");
		job.setJarByClass(WordCount.class);
		
		job.setInputFormatClass(TextInputFormat.class);//使用默认
		job.setMapperClass(TokenizerMapper.class);//将每行分割成<word,1>的格式传给下一个过程
		job.setCombinerClass(null);//将map后的相同key先进行一个合并，增加这个过程可以减少io，提高速度
		job.setPartitionerClass(HashPartitioner.class);//shuffle过程中将数据分给不同reducer的策略，默认使用hash
		job.setReducerClass(IntSumReducer.class);//合并中间结果
		job.setOutputKeyClass(Text.class);//由于输出的每行包含两个，分别是key和value，故需如此设置，也可重写OutFormat
		job.setOutputValueClass(IntWritable.class);
		
		FileInputFormat.addInputPath(job, new Path(arg1));
		FileOutputFormat.setOutputPath(job, new Path(arg2));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}

posted on 2016-04-08 12:29 长456风阅读(243) 评论(0) 编辑收藏举报

刷新页面返回顶部

长456风

hadoop基本操作

导航

公告