Passing parameters in Hadoop
The key code for passing the parameters:

// Read the parameters from the configuration file; this must happen before the job is created
conf.addResource("hadoop-bigdata.xml");
keepUrl = conf.get("KeepUrlString", "");
filterUrl = conf.get("FilterUrlString", "");
conf.set("FilterUrl", filterUrl);
conf.set("KeepUrl", keepUrl);

// Read the parameters back inside a map or reduce task
String fstr = context.getConfiguration().get("FilterUrl");
String kstr = context.getConfiguration().get("KeepUrl");
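As a side note, the same values can also be supplied on the command line instead of a custom XML file: Hadoop's GenericOptionsParser turns -D property=value options into Configuration entries. A minimal sketch of that variant (the class name ParseOptions is just for illustration; the property name KeepUrl is carried over from the snippet above):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.GenericOptionsParser;

public class ParseOptions {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Consumes options such as: -D KeepUrl=anjueke.com|soufun.com
        // and leaves the remaining arguments (e.g. <in> <out>) untouched
        String[] remaining = new GenericOptionsParser(conf, args).getRemainingArgs();
        System.out.println("KeepUrl = " + conf.get("KeepUrl", ""));
        System.out.println("remaining args: " + remaining.length);
    }
}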
package org.apache.hadoop.examples;

import java.io.IOException;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class FilterUrl {

    public static class FilterUrlMap extends Mapper<Object, Text, Text, Text> {
        private static Text word = new Text();

        public void map(Object key, Text values, Context context)
                throws IOException, InterruptedException {
            // Read the parameters
            String fstr = context.getConfiguration().get("FilterUrl");
            String kstr = context.getConfiguration().get("KeepUrl");
            // Loop-based alternative:
            // StringTokenizer fitr = new StringTokenizer(fstr, "|");
            // StringTokenizer kitr = new StringTokenizer(kstr, "|");
            // Build regular expressions; escape '.' so it matches a literal dot
            Pattern filter = Pattern.compile(fstr.replace(".", "\\."));
            Pattern keep = Pattern.compile(kstr.replace(".", "\\."));
            // The input value may contain several lines of content
            StringTokenizer itr = new StringTokenizer(values.toString(), "\n");
            String url = "";
            while (itr.hasMoreTokens()) {
                // Reset the flags for every URL; declaring them once outside the
                // loop would let a match on one URL leak into the next
                boolean fflag = false;
                boolean kflag = false;
                url = itr.nextToken().toLowerCase();
                // Regular-expression pattern matching
                Matcher mkeep = keep.matcher(url);
                if (mkeep.find()) {
                    kflag = true;
                    Matcher mfilter = filter.matcher(url);
                    if (mfilter.find())
                        fflag = true;
                }
                // URLs that should be kept
                /*
                // Loop-based pattern matching
                while (kitr.hasMoreTokens()) {
                    if (url.indexOf(kitr.nextToken()) > 0) {
                        kflag = true;
                        break;
                    }
                }
                // URLs that should be filtered out
                while (kflag && fitr.hasMoreTokens()) {
                    if (url.indexOf(fitr.nextToken()) > 0) {
                        fflag = true;
                        break;
                    }
                }
                */
                // The URL should be kept and is not one to be filtered out
                if (kflag && !fflag) {
                    word.set(url);
                    context.write(word, new Text(""));
                }
            }
        }
    }

    public static class FilterUrlReduce extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // The reduce phase deduplicates the URLs emitted by the mappers
            context.write(key, new Text(""));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String filterUrl = "";
        String keepUrl = "";
        if (args.length != 2) {
            System.err.println("please input two args: <in> <out>");
            System.exit(2);
        }
        // Read the parameters from the configuration file;
        // this must come before the job is created
        conf.addResource("hadoop-bigdata.xml");
        keepUrl = conf.get("KeepUrlString", "");
        filterUrl = conf.get("FilterUrlString", "");
        conf.set("FilterUrl", filterUrl);
        conf.set("KeepUrl", keepUrl);
        // This line must come after the parameter-setting statements above,
        // otherwise the parameters cannot be read in the tasks
        Job job = new Job(conf, "filter url");
        job.setJarByClass(FilterUrl.class);
        job.setMapperClass(FilterUrlMap.class);
        job.setReducerClass(FilterUrlReduce.class);
        // job.setNumReduceTasks(0); // without the reduce phase the output is many small files
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
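The ordering matters because the Job constructor takes its own copy of the Configuration: any conf.set(...) call issued after the Job object exists only changes the local Configuration object and is never shipped to the cluster, so context.getConfiguration().get("FilterUrl") in the tasks would return null and parameter retrieval fails exactly as the comment warns.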
The parameters to be read from the configuration file (hadoop-bigdata.xml):
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <property>
        <!-- C net: URL strings to keep -->
        <name>KeepUrlString</name>
        <value>anjueke.com|soufun.com</value>
    </property>
    <property>
        <!-- URL strings to filter out -->
        <name>FilterUrlString</name>
        <value>.js|.jpg|.jpeg|.gif|.png|.css|error.html</value>
    </property>
</configuration>
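For conf.addResource("hadoop-bigdata.xml") to find this file, it has to be on the classpath, for example packaged into the job jar or placed in Hadoop's conf directory; if the resource is missing, the conf.get(...) calls simply fall back to their "" defaults, which is easy to miss. A quick standalone check (the class name CheckResource is just for illustration):

import org.apache.hadoop.conf.Configuration;

public class CheckResource {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        conf.addResource("hadoop-bigdata.xml"); // looked up on the classpath
        // Prints the configured value, or the empty default if the file was not found
        System.out.println("KeepUrlString = " + conf.get("KeepUrlString", ""));
    }
}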