hadoop word count

1.环境准备

1.将centos的hadoop安装包解压到windows桌面
2.添加环境变量
    HADOOP_HOME:E:\03-software_java\04-hadoop\hadoop-2.6.0-cdh5.9.0
    path:%HADOOP_HOME%/bin
3.把winutils.exe放在hadoop/bin文件夹下
    https://github.com/steveloughran/winutils   下载地址
4.更改NativeIO.java源码
    // Patched copy of NativeIO's access check, used for running Hadoop jobs locally on Windows:
    // the native access0 call is skipped and every path is reported as accessible.
    public static boolean access(String path, AccessRight desiredAccess)
      throws IOException {
//    return access0(path, desiredAccess.accessRight());   // original native check, disabled
    return true;
    }
    在项目源码目录下创建包目录 org.apache.hadoop.io.nativeio
    创建 NativeIO.java 把源码复制过来更改

2.FileUtilsDelete.java // 删除文件

package com.imooc.bigdata.hadoop.mr.wc;

import java.io.File;

public class FileUtilsDelete {
    /**
     * Deletes the given path, which may be a regular file or a directory.
     *
     * @param fileName path of the file or directory to delete
     * @return {@code true} if the path existed and was deleted, {@code false} otherwise
     */
    public static boolean delete(String fileName) {
        File file = new File(fileName);
        if (!file.exists()) {
            System.out.println("删除文件失败:" + fileName + "不存在！");
            return false;
        }
        if (file.isFile()) {
            return deleteFile(fileName);
        }
        return deleteDirectory(fileName);
    }

    /**
     * Deletes a single regular file.
     *
     * @param fileName path of the file to delete
     * @return {@code true} if the file existed and was deleted; {@code false} if it does not
     *         exist, is not a regular file, or could not be deleted
     */
    public static boolean deleteFile(String fileName) {
        File file = new File(fileName);
        // Only an existing regular file is handled here; directories go through deleteDirectory.
        if (!file.exists() || !file.isFile()) {
            System.out.println("删除单个文件失败：" + fileName + "不存在！");
            return false;
        }
        if (file.delete()) {
            System.out.println("删除单个文件" + fileName + "成功！");
            return true;
        }
        System.out.println("删除单个文件" + fileName + "失败！");
        return false;
    }

    /**
     * Recursively deletes a directory together with everything inside it.
     * Deletion stops at the first entry that cannot be removed.
     *
     * @param dir path of the directory to delete
     * @return {@code true} if the directory and all of its contents were deleted,
     *         {@code false} otherwise
     */
    public static boolean deleteDirectory(String dir) {
        // Append the file separator when dir does not already end with one.
        if (!dir.endsWith(File.separator)) {
            dir = dir + File.separator;
        }
        File dirFile = new File(dir);
        // Bail out when the path does not exist or is not a directory.
        if (!dirFile.exists() || !dirFile.isDirectory()) {
            System.out.println("删除目录失败：" + dir + "不存在！");
            return false;
        }

        // listFiles() returns null when an I/O error occurs (e.g. the directory is unreadable);
        // treat that as a failed deletion instead of crashing with a NullPointerException.
        File[] files = dirFile.listFiles();
        if (files == null) {
            System.out.println("删除目录失败！");
            return false;
        }

        // Remove every child (files and sub-directories) before the directory itself.
        for (File child : files) {
            boolean deleted = true;
            if (child.isFile()) {
                deleted = deleteFile(child.getAbsolutePath());
            } else if (child.isDirectory()) {
                deleted = deleteDirectory(child.getAbsolutePath());
            }
            if (!deleted) {
                System.out.println("删除目录失败！");
                return false;
            }
        }

        // Finally remove the now-empty directory.
        if (dirFile.delete()) {
            System.out.println("删除目录" + dir + "成功！");
            return true;
        }
        // Report the failure instead of returning false silently.
        System.out.println("删除目录" + dir + "失败！");
        return false;
    }

    public static void main(String[] args) {
        // Manual smoke test against a hard-coded local directory.
        String dir = "C:/Users/Administrator/Downloads/a";
        delete(dir);
    }

}

3.WordCountMapper.java // mapper逻辑处理

package com.imooc.bigdata.hadoop.mr.wc;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Word-count mapper: turns every input line into {@code (word, 1)} pairs.
 * <p>
 * Type parameters:
 * <ul>
 *   <li>KEYIN ({@link LongWritable}): offset of the line's start within the input</li>
 *   <li>VALUEIN ({@link Text}): the content of one input line</li>
 *   <li>KEYOUT ({@link Text}): a single word</li>
 *   <li>VALUEOUT ({@link IntWritable}): the count for that occurrence, always 1</li>
 * </ul>
 * {@code LongWritable}, {@code Text} and {@code IntWritable} are Hadoop's serializable
 * counterparts of Java's {@code Long}, {@code String} and {@code Integer}.
 * <p>
 * Example input:
 * <pre>
 * hello world welcome
 * hello welcome
 * </pre>
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    /** The constant count emitted for every word occurrence. */
    private static final IntWritable ONE = new IntWritable(1);

    /** Output key holder, reused across records instead of allocating one Text per word. */
    private final Text outputWord = new Text();

    /**
     * Splits one line into whitespace-separated words and emits {@code (word, 1)} for each.
     * Words are emitted exactly as written, so counting is case-sensitive; apply
     * {@code word.toLowerCase()} before emitting to make it case-insensitive.
     *
     * @param key     offset of the line within the input (unused)
     * @param value   the line to tokenize
     * @param context sink for the emitted pairs
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split on any run of whitespace so tabs and repeated spaces do not yield empty words.
        for (String word : value.toString().split("\\s+")) {
            // A blank line or leading whitespace still produces one empty token; skip it.
            if (word.isEmpty()) {
                continue;
            }
            outputWord.set(word);
            context.write(outputWord, ONE);
        }
    }
}

4.WordCountReducer.java // reducer逻辑处理

package com.imooc.bigdata.hadoop.mr.wc;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.Iterator;

/**
 * Word-count reducer: sums the counts emitted for each word.
 * <p>
 * Map output is grouped by key before it reaches a reducer, e.g.
 * <pre>
 * (hello,1)(hello,1)(hello,1)  ==&gt; (hello, &lt;1,1,1&gt;)
 * (world,1)(world,1)(world,1)  ==&gt; (world, &lt;1,1,1&gt;)
 * (welcome,1)                  ==&gt; (welcome, &lt;1&gt;)
 * </pre>
 * Input and output types are identical, so the same class can also be registered
 * as the job's combiner.
 */
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Adds up all counts received for one word and emits {@code (word, total)}.
     *
     * @param key     the word
     * @param values  the counts collected for that word
     * @param context sink for the emitted total
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable value : values) {
            count += value.get();
        }
        context.write(key, new IntWritable(count));
    }
}

5.WordCountApp.java // 连接hadoop 处理hadoop文件进行 word count 统计

package com.imooc.bigdata.hadoop.mr.wc;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;


import java.net.URI;

/**
 * 使用MR统计HDFS上的文件对应的词频
 * <p>
 * Driver: 配置Mapper，Reducer的相关属性
 * <p>
 * 提交到本地运行：开发过程中使用
 */
public class WordCountApp {
    /**
     * Driver for the word-count job: reads /hdfsapi/test on HDFS and writes the
     * per-word totals to /hdfsapi/output, replacing any previous output.
     *
     * @param args unused
     * @throws Exception if the file system cannot be reached or the job fails to run
     */
    public static void main(String[] args) throws Exception {
        // Act as the "root" user when talking to Hadoop.
        System.setProperty("HADOOP_USER_NAME", "root");

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.107.216:8020");

        Job wordCountJob = Job.getInstance(conf);
        wordCountJob.setJarByClass(WordCountApp.class);

        // Processing classes; the reducer doubles as the combiner.
        wordCountJob.setMapperClass(WordCountMapper.class);
        wordCountJob.setCombinerClass(WordCountReducer.class);
        wordCountJob.setReducerClass(WordCountReducer.class);

        // Key/value types produced by the map phase.
        wordCountJob.setMapOutputKeyClass(Text.class);
        wordCountJob.setMapOutputValueClass(IntWritable.class);

        // Key/value types produced by the reduce phase.
        wordCountJob.setOutputKeyClass(Text.class);
        wordCountJob.setOutputValueClass(IntWritable.class);

        // Remove the output directory left over from an earlier run, if any.
        Path outputPath = new Path("/hdfsapi/output");
        FileSystem hdfs = FileSystem.get(new URI("hdfs://192.168.107.216:8020/"), conf, "root");
        if (hdfs.exists(outputPath)) {
            hdfs.delete(outputPath, true);
        }

        // Job input and output locations.
        FileInputFormat.setInputPaths(wordCountJob, new Path("/hdfsapi/test"));
        FileOutputFormat.setOutputPath(wordCountJob, outputPath);

        // Submit the job and block until it finishes.
        boolean succeeded = wordCountJob.waitForCompletion(true);
        if (succeeded) {
            System.exit(0);
        } else {
            System.exit(-1);
        }
    }
}

6.WordCountCombinerLocalApp.java // 处理本地文件进行 word count 统计

package com.imooc.bigdata.hadoop.mr.wc;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


import java.io.File;
import java.net.URI;

public class WordCountCombinerLocalApp {
  /**
   * Driver for the word-count job on local files: reads the "input" directory and
   * writes the per-word totals to the "output" directory, clearing old output first.
   *
   * @param args unused
   * @throws Exception if the job cannot be configured or run
   */
  public static void main(String[] args) throws Exception {
    Job localJob = Job.getInstance(new Configuration());
    localJob.setJarByClass(WordCountCombinerLocalApp.class);

    // Processing classes; the reducer doubles as the combiner.
    localJob.setMapperClass(WordCountMapper.class);
    localJob.setCombinerClass(WordCountReducer.class);
    localJob.setReducerClass(WordCountReducer.class);

    // Key/value types produced by the map phase.
    localJob.setMapOutputKeyClass(Text.class);
    localJob.setMapOutputValueClass(IntWritable.class);

    // Key/value types produced by the reduce phase.
    localJob.setOutputKeyClass(Text.class);
    localJob.setOutputValueClass(IntWritable.class);

    // Clear the output directory left over from an earlier run, if any.
    FileUtilsDelete.deleteDirectory("output");

    // Job input and output locations.
    FileInputFormat.setInputPaths(localJob, new Path("input"));
    FileOutputFormat.setOutputPath(localJob, new Path("output"));

    // Submit the job and block until it finishes.
    boolean succeeded = localJob.waitForCompletion(true);
    if (succeeded) {
      System.exit(0);
    } else {
      System.exit(-1);
    }
  }
}

发表于 2020-05-05 10:57 守护式等待阅读(52) 评论(0) 编辑收藏举报

1.环境准备

2.FileUtilsDelete.java // 删除文件

3.WordCountMapper.java // mapper逻辑处理

4.WordCountReducer.java // reducer逻辑处理

5.WordCountApp.java // 连接hadoop 处理hadoop文件 进行 word count 统计

6.WordCountCombinerLocalApp.java // 处理本地文件 进行 word count 统计

5.WordCountApp.java // 连接hadoop 处理hadoop文件进行 word count 统计

6.WordCountCombinerLocalApp.java // 处理本地文件进行 word count 统计