WordCount: using the built-in Hadoop jar and implementing it in Java
Option 1: run the WordCount example that ships with Hadoop directly from its examples jar.
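A minimal sketch of that invocation, assuming Hadoop 2.7.6 installed under /usr/local/soft/hadoop-2.7.6 (the install path and the /wordcount input directory are taken from later in this post; the /output directory is an example and must not already exist):

hadoop jar /usr/local/soft/hadoop-2.7.6/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.6.jar wordcount /wordcount /output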
Option 2: implement WordCount in Java.
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Demo1 {

    // Map class
    // The first k/v type pair of the generics describes the input format,
    // the second k/v pair describes the output format.
    public static class map extends Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Input is read one line at a time; the LongWritable key comes first because it is the byte offset of the line
            String line = value.toString();
            // Emit the line content together with a count of 1, converting both to Writable types
            // (here the whole line is treated as one word; the later demos split the line)
            context.write(new Text(line), new LongWritable(1));
        }
    }

    // Reduce class
    // Receives the data emitted by the map side
    public static class reduce extends Reducer<Text, LongWritable, Text, LongWritable> {
        /**
         * The reduce method is called once per key.
         * By default there is a single reduce task.
         * key: one word
         * values: all values emitted on the map side for this key
         */
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            long sum = 0;
            // After the shuffle, the values for one key look like (key, 1, 1, 1, 1), so iterate over them
            for (LongWritable value : values) {
                // LongWritable wraps a long; get() returns the primitive value
                sum += value.get();
            }
            // Wrap the long sum back into a LongWritable before writing it out
            context.write(key, new LongWritable(sum));
        }
    }

    // MapReduce program entry point
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Create a job
        Job job = Job.getInstance();
        // Name the job
        job.setJobName("first run submitted from my own jar");
        // Specify the class that contains this main method
        job.setJarByClass(Demo1.class);
        // Specify the map class
        job.setMapperClass(map.class);
        // Specify the map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // Specify the reduce class
        job.setReducerClass(reduce.class);
        // Specify the reduce output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // Specify the input path (an HDFS path)
        Path in = new Path("/wordcount");
        FileInputFormat.addInputPath(job, in);
        // Specify the output path
        Path out = new Path("/output1");
        // If the output path already exists, delete it
        FileSystem fs = FileSystem.get(new Configuration());
        if (fs.exists(out)) {
            fs.delete(out, true);   // true allows deleting a non-empty directory
        }
        FileOutputFormat.setOutputPath(job, out);
        // Start the job
        job.waitForCompletion(true);
        /**
         * Submitting the job:
         * 1. Package the project with Maven's package goal and upload it to the server.
         * 2. Run it, e.g.: hadoop jar hadoop-mapreduce-examples-2.7.6.jar com.shujia.hadoop.Demo01WordCount /word /output
         */
        System.out.println("wordcount finished successfully");
    }
}
After finishing the code, package the project (with Maven's package goal) and upload the resulting jar via Xftp to
/usr/local/soft/hadoop-2.7.6/share/hadoop/mapreduce
Then run the jar on the cluster with the hadoop jar command.
The fully qualified class name to pass is the class containing the main method; its package path can be looked up in IDEA.
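For example, assuming the Maven build produced a jar named hadoop-demo-1.0.jar and Demo1 lives in the com.shujia.hadoop package (the jar name is a placeholder and the package is taken from the comment in the code above; adjust both to your project):

hadoop jar hadoop-demo-1.0.jar com.shujia.hadoop.Demo1

Demo1 hardcodes its input (/wordcount) and output (/output1) paths in main, so no extra arguments are needed.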
Splitting each line on commas
Only the map stage needs to change:
public static class map extends Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split each line on commas and emit every field with a count of 1
        String s = value.toString();
        String[] split = s.split(",");
        for (String s1 : split) {
            context.write(new Text(s1), new LongWritable(1));
        }
    }
}
Summing the ages per class (clazz) in students.txt; only the map is shown here:
public static class map extends Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String s = value.toString();
        String[] split = s.split(",");
        // Field 2 is the age, field 4 is the class (clazz)
        String s1 = split[2];
        LongWritable age = new LongWritable(Integer.valueOf(s1));
        String s2 = split[4];
        Text clazz = new Text(s2);
        // Emit (clazz, age); the reduce side sums the ages per class
        context.write(clazz, age);
    }
}
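The summing reducer from Demo1 can be reused as-is, since the shuffle now groups the ages by class. For completeness, a sketch of that reducer (assumed; it is not part of the original listing):

public static class reduce extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        long sum = 0;
        // key is the class name; values are the ages of every student in that class
        for (LongWritable value : values) {
            sum += value.get();
        }
        // Emit (class, total age)
        context.write(key, new LongWritable(sum));
    }
}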
Counting the number of male and female students in students.txt
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class Demo4 {

    public static class map extends Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Field 3 is the gender; emit (gender, 1)
            String s = value.toString().split(",")[3];
            context.write(new Text(s), new LongWritable(1));
        }
    }

    public static class reduce extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            // Sum the 1s for each gender
            long sum = 0L;
            for (LongWritable value : values) {
                sum += value.get();
            }
            context.write(key, new LongWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        job.setJobName("count of male and female students");
        job.setJarByClass(Demo4.class);
        job.setMapperClass(map.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setReducerClass(reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        Path in = new Path("/data/students.txt");
        FileInputFormat.addInputPath(job, in);
        Path out = new Path("/output4");
        // Delete the output directory if it already exists
        FileSystem fs = FileSystem.get(new Configuration());
        if (fs.exists(out)) {
            fs.delete(out, true);
        }
        FileOutputFormat.setOutputPath(job, out);
        job.waitForCompletion(true);
        System.out.println("demo 4 finished");
    }
}
Filtering all records of male students out of students.txt. There is no reduce stage because no aggregation is needed.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class Demo5 {

    public static class map extends Mapper<LongWritable, Text, Text, NullWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Field 3 is the gender; keep only male ("男") records and emit the whole line as the key
            String s = value.toString().split(",")[3];
            if (s.equals("男")) {
                context.write(value, NullWritable.get());
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        job.setJobName("filter male students, no reduce");
        job.setJarByClass(Demo5.class);
        job.setMapperClass(map.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        Path in = new Path("/data/students.txt");
        FileInputFormat.addInputPath(job, in);
        Path out = new Path("/output5");
        FileSystem fs = FileSystem.get(new Configuration());
        if (fs.exists(out)) {
            fs.delete(out, true);
        }
        FileOutputFormat.setOutputPath(job, out);
        job.waitForCompletion(true);
    }
}
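One caveat (not addressed in the original code): even if no reducer class is set, Hadoop still runs a single default identity reduce task, so the records pass through an unnecessary shuffle. To make the job truly map-only, the reduce task count can be set to zero in main, e.g.:

// With 0 reduce tasks, the map output is written straight to the output directory
job.setNumReduceTasks(0);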
Joining two tables (student records and score records):
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.ArrayList;

public class Demo6 {

    public static class map extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Ask the context for the input split to find out which file this record came from
            InputSplit is = context.getInputSplit();
            // InputSplit is abstract and has no path accessor, so cast it to FileSplit
            FileSplit fileSplit = (FileSplit) is;
            // Get the file path of the split (the full path, not just the file name)
            String s = fileSplit.getPath().toString();
            if (s.contains("students")) {
                // Tag student records with "*"
                String s1 = "*" + value.toString();
                String id = value.toString().split(",")[0];
                context.write(new Text(id), new Text(s1));
            } else {
                // Tag score records with "#"
                String s1 = "#" + value.toString();
                String id = value.toString().split(",")[0];
                context.write(new Text(id), new Text(s1));
            }
        }
    }

    public static class reduce extends Reducer<Text, Text, Text, NullWritable> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // key is the student id; values holds every record with that id:
            // one student record and six score records. Collect the scores into a list
            // so each of the six scores can be appended to the student record in turn.
            String st = "";
            ArrayList<String> sc = new ArrayList<String>();
            for (Text value : values) {
                String s = value.toString();
                if (s.startsWith("*")) {
                    // The tag still sits at index 0, so strip it with substring(1)
                    st = s.substring(1);
                } else {
                    sc.add(s.substring(1));
                }
            }
            // Join the two tables: one output line per score record
            for (String s : sc) {
                String s1 = s.split(",")[2];
                String end = st + "," + s1;
                context.write(new Text(end), NullWritable.get());
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        job.setJobName("join two files");
        job.setJarByClass(Demo6.class);
        job.setMapperClass(map.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setReducerClass(reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        Path in = new Path("/datajava");
        FileInputFormat.addInputPath(job, in);
        Path out = new Path("/output6");
        FileOutputFormat.setOutputPath(job, out);
        job.waitForCompletion(true);
        System.out.println("demo 6 finished");
    }
}
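Note that, unlike the other demos, Demo6 does not delete an existing output directory, so a second run fails if /output6 already exists. The same guard used in the other programs could be added before FileOutputFormat.setOutputPath (this also requires the FileSystem and Configuration imports):

FileSystem fs = FileSystem.get(new Configuration());
if (fs.exists(out)) {
    fs.delete(out, true);
}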
Counting by gender with a combiner
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class Demo8 {

    public static class map extends Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Field 3 is the gender; emit (gender, 1)
            String sex = value.toString().split(",")[3];
            context.write(new Text(sex), new LongWritable(1));
        }
    }

    // Combiner: runs on the map side and pre-aggregates the counts before the shuffle
    public static class combine extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            long sum = 0L;
            for (LongWritable value : values) {
                sum += value.get();
            }
            context.write(key, new LongWritable(sum));
        }
    }

    public static class reduce extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            long sum = 0L;
            for (LongWritable value : values) {
                sum += value.get();
            }
            context.write(key, new LongWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        job.setJobName("count by gender with a combiner");
        job.setJarByClass(Demo8.class);
        job.setMapperClass(map.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // Register the combiner
        job.setCombinerClass(combine.class);
        job.setReducerClass(reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        Path in = new Path("/data/students.txt");
        FileInputFormat.addInputPath(job, in);
        Path out = new Path("/output8");
        FileSystem fs = FileSystem.get(new Configuration());
        if (fs.exists(out)) {
            fs.delete(out, true);
        }
        FileOutputFormat.setOutputPath(job, out);
        job.waitForCompletion(true);
        System.out.println("demo 8 finished");
    }
}
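Because the combine and reduce classes above perform exactly the same summation, a common simplification (not used in the original) is to drop the separate combine class and register the reducer itself as the combiner; this is safe here because summing is associative and commutative:

// Pre-aggregate the per-gender counts on the map side using the reduce class itself
job.setCombinerClass(reduce.class);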