hadoop参数传递

传参关键代码:

  //从配置文件获取参数,必须在作业创建的前面            

conf.addResource("hadoop-bigdata.xml");                          
keepUrl=conf.get("KeepUrlString","");        
filterUrl=conf.get("FilterUrlString","");
conf.set("FilterUrl", filterUrl);
conf.set("KeepUrl", keepUrl);
//获取参数(以下两行位于 Mapper 的 map() 方法中,通过 context 读取 main() 里用 conf.set 写入的值)
String fstr=context.getConfiguration().get("FilterUrl");
String kstr=context.getConfiguration().get("KeepUrl");
 
package org.apache.hadoop.examples;

import java.io.IOException;
import java.util.Locale;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class FilterUrl {
    
    public static class FilterUrlMap extends Mapper<Object,Text,Text,Text>
    {
        private static Text word=new Text();
        
        public void map(Object key,Text values,Context context) throws
        IOException,InterruptedException
        {
            boolean fflag=false;
            boolean kflag=false;
            //获取参数
            String fstr=context.getConfiguration().get("FilterUrl");
            String kstr=context.getConfiguration().get("KeepUrl");
            //循环的方式
//            StringTokenizer fitr=new StringTokenizer(fstr,"|");
//            StringTokenizer kitr=new StringTokenizer(kstr,"|");
    
            //正则表达式,替换特殊字符
            Pattern filter=Pattern.compile(fstr.replace(".","\\."));
            Pattern keep=Pattern.compile(kstr.replace(".","\\."));            
            
            //有一大段的内容
            StringTokenizer itr = new StringTokenizer(values.toString(),"\n");
            String url="";
            while(itr.hasMoreTokens())
            {
                url=itr.nextToken().toLowerCase();
                
                //正则表达式的模式匹配                
                Matcher mkeep=keep.matcher(url);
                if(mkeep.find())
                {
                    kflag=true;                                    
                    Matcher mfilter=filter.matcher(url);    
                    if(mfilter.find())
                        fflag=true;                
                }
                
                //需要保留的URL                
                /**
                 //循环的模式匹配
                while(kitr.hasMoreTokens())
                {
                    if(url.indexOf(kitr.nextToken())>0)
                    {
                        kflag=true;
                        break;
                    }
                }
                //需要过滤掉的URL
                while(kflag && fitr.hasMoreTokens())
                {
                    if(url.indexOf(fitr.nextToken())>0)
                    {
                        fflag=true;
                        break;
                    }
                }
                */
                //是需要保留的并且不是需要过滤掉的URL
                if(kflag && !fflag)
                {
                    word.set(url);
                    context.write(word,new Text(""));
                }
            }
        }
    }
    public static class FilterUrlReduce extends Reducer<Text,Text,Text,Text>
    {
        public void reduce(Text key,Iterable<Text> values,Context context) throws 
        IOException,InterruptedException
        {
            context.write(key, new Text(""));
        }
    }
    public static void main(String[] args) throws Exception{
        // TODO Auto-generated method stub
        Configuration conf=new Configuration();
        String filterUrl=new String();
        String keepUrl=new String();
        if(args.length!=2)
        {
            System.err.println("please input two args:<in> <out>");
            System.exit(2);
        }
        //从配置文件获取参数,必须在作业创建的前面            
        conf.addResource("hadoop-bigdata.xml");                          
        keepUrl=conf.get("KeepUrlString","");        
        filterUrl=conf.get("FilterUrlString","");
        conf.set("FilterUrl", filterUrl);
        conf.set("KeepUrl", keepUrl);
        
        //这句必须在参数设置语句的后面,否则参数获取失败
        Job job=new Job(conf,"filter url");
        job.setJarByClass(FilterUrl.class);
        job.setMapperClass(FilterUrlMap.class);
        job.setReducerClass(FilterUrlReduce.class);
        //job.setNumReduceTasks(0); //如果不要的话会有多个小的文件
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job,new Path(args[1]));
        System.exit(job.waitForCompletion(true)?0:1);
    }
}

需要从配置文件(hadoop-bigdata.xml)获取的参数:

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Parameters that the FilterUrl driver reads through conf.addResource("hadoop-bigdata.xml"). -->
<configuration>
     <property>
        <!-- URLs to keep (original note: "C net keep url string"). Alternatives are separated by '|'; a URL is kept only if it contains one of them. -->
        <name>KeepUrlString</name>
        <value>anjueke.com|soufun.com</value>
     </property>
     <property>
        <!-- URLs to filter out. Alternatives are separated by '|'; a URL containing any of them is dropped even if it matches KeepUrlString. -->
        <name>FilterUrlString</name>
        <value>.js|.jpg|.jpeg|.gif|.png|.css|error.html</value>
    </property>
</configuration>

 

posted on 2014-03-21 10:03  ringwang  阅读(3850)  评论(0)  编辑  收藏  举报