Hadoop Components --- HDFS & Yarn & MapReduce
I. HDFS
1. Role of the NameNode
Stores the file system metadata: the directory tree, the file-to-block mapping, and the block locations reported by the DataNodes.
2. Role of the DataNode
3. Block replication
4. Rack awareness
5. Command operations
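Day-to-day operations are typically done from the shell with hdfs dfs subcommands such as -ls, -mkdir, -put, -get, -cat and -rm; the demo below performs equivalent operations through the Java FileSystem API instead.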
DEMO
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.junit.Test;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URI;

public class HadoopTestDemo {

    // Four ways to connect to Hadoop and obtain a FileSystem (getFileSystem1-4)
    @Test
    public void getFileSystem1() throws IOException {
        Configuration configuration = new Configuration();
        configuration.set("fs.defaultFS", "hdfs://192.168.217.4:8020/");
        configuration.set("fs.hdfs.impl", "org.apache.hadoop.hdfs.DistributedFileSystem");
        FileSystem fileSystem = FileSystem.get(configuration);
        System.out.println(fileSystem.toString());
    }

    @Test
    public void getFileSystem2() throws Exception {
        FileSystem fileSystem = FileSystem.get(new URI("hdfs://192.168.217.4:8020"), new Configuration());
        System.out.println(fileSystem.toString());
    }

    @Test
    public void getFileSystem3() throws IOException {
        Configuration configuration = new Configuration();
        configuration.set("fs.defaultFS", "hdfs://192.168.217.4:8020/");
        configuration.set("fs.hdfs.impl", "org.apache.hadoop.hdfs.DistributedFileSystem");
        FileSystem fileSystem = FileSystem.newInstance(configuration);
        System.out.println(fileSystem.toString());
    }

    @Test
    public void getFileSystem4() throws Exception {
        FileSystem fileSystem = FileSystem.newInstance(new URI("hdfs://192.168.217.4:8020"), new Configuration());
        System.out.println(fileSystem.toString());
    }

    // List all files in HDFS recursively
    @Test
    public void listMyFiles() throws Exception {
        FileSystem fileSystem = FileSystem.newInstance(new URI("hdfs://192.168.217.4:8020"), new Configuration());
        RemoteIterator<LocatedFileStatus> locatedFileStatusRemoteIterator = fileSystem.listFiles(new Path("/"), true);
        while (locatedFileStatusRemoteIterator.hasNext()) {
            LocatedFileStatus next = locatedFileStatusRemoteIterator.next();
            System.out.println(next.getPath().toString());
        }
        fileSystem.close();
    }

    // Create a directory on HDFS
    @Test
    public void mkdirs() throws Exception {
        FileSystem fileSystem = FileSystem.newInstance(new URI("hdfs://192.168.217.4:8020"), new Configuration());
        boolean mkdirs = fileSystem.mkdirs(new Path("/hello/mydir/test"));
        System.out.println(mkdirs);
        fileSystem.close();
    }

    // Download a file from HDFS to the local file system
    @Test
    public void downloadFile() throws Exception {
        FileSystem fileSystem = FileSystem.newInstance(new URI("hdfs://192.168.217.4:8020"), new Configuration());
        FSDataInputStream inputStream = fileSystem.open(new Path("/test/a.txt"));
        FileOutputStream fileOutputStream = new FileOutputStream(new File("./a.txt"));
        IOUtils.copy(inputStream, fileOutputStream);
        IOUtils.closeQuietly(inputStream);
        IOUtils.closeQuietly(fileOutputStream);
        System.out.println("end");
        fileSystem.close();
    }

    // Upload a local file to HDFS
    @Test
    public void uploadFile() throws Exception {
        FileSystem fileSystem = FileSystem.newInstance(new URI("hdfs://192.168.217.4:8020"), new Configuration());
        fileSystem.copyFromLocalFile(new Path("./delubi.jpg"), new Path("/test"));
        fileSystem.close();
    }

    // Merge small local files into a single HDFS file
    @Test
    public void mergeFile() throws Exception {
        FileSystem fileSystem = FileSystem.newInstance(new URI("hdfs://192.168.217.4:8020"), new Configuration());
        FSDataOutputStream fsDataOutputStream = fileSystem.create(new Path("/test/lucian.txt"));
        LocalFileSystem local = FileSystem.getLocal(new Configuration());
        FileStatus[] fileStatuses = local.listStatus(new Path("./input"));
        for (FileStatus fileStatus : fileStatuses) {
            FSDataInputStream inputStream = local.open(fileStatus.getPath());
            IOUtils.copy(inputStream, fsDataOutputStream);
            IOUtils.closeQuietly(inputStream);
        }
        IOUtils.closeQuietly(fsDataOutputStream);
        local.close();
        fileSystem.close();
    }

}
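Two other FileSystem operations that come up often are renaming and deleting paths. Below is a minimal sketch of an extra test method that could be added to the HadoopTestDemo class above; the paths are purely illustrative.

    // Minimal sketch: rename and delete on HDFS (paths are illustrative)
    @Test
    public void renameAndDelete() throws Exception {
        FileSystem fileSystem = FileSystem.newInstance(new URI("hdfs://192.168.217.4:8020"), new Configuration());
        // Rename (move) a file within HDFS
        boolean renamed = fileSystem.rename(new Path("/test/a.txt"), new Path("/test/b.txt"));
        System.out.println(renamed);
        // Delete a directory; the second argument enables recursive deletion
        boolean deleted = fileSystem.delete(new Path("/hello"), true);
        System.out.println(deleted);
        fileSystem.close();
    }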
II. MapReduce
1. Basic usage
WordCountMapper.java
package com.luyizhou.mapreduce;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/*
 * The four generic types are: k1 (input key), v1 (input value), k2 (output key), v2 (output value)
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    // The map method converts k1/v1 into k2/v2
    // k1: byte offset of the line; v1: the text of the line
    // context: the job context object
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        Text text = new Text();
        LongWritable longWritable = new LongWritable();
        // 1. Split the line of text
        String[] split = value.toString().split(",");
        // 2. Iterate over the words and build k2/v2
        for (String word : split) {
            // 3. Write k2/v2 to the context
            text.set(word);
            longWritable.set(1);
            context.write(text, longWritable);
        }
    }
}
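For example, assuming comma-separated input, the line "hello,world,hello" (with its byte offset as k1) is turned into the three pairs (hello,1), (world,1), (hello,1).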
WordCountReducer.java
package com.luyizhou.mapreduce;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

    /*
     * Convert the shuffled k2/v2 into k3/v3 and write them to the context.
     * key: the grouped k2
     * values: the collection of v2 values for that key
     * context: the job context object
     */
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        long count = 0;
        // 1. Iterate over the collection and sum the values to get v3
        for (LongWritable value : values) {
            count += value.get();
        }
        // 2. Write k3/v3 to the context
        context.write(key, new LongWritable(count));
    }
}
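Continuing the example above, after the shuffle the reducer receives (hello,[1,1]) and (world,[1]) and writes out (hello,2) and (world,1).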
JobMain.java
package com.luyizhou.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.net.URI;


public class JobMain extends Configured implements Tool {

    // This method defines and configures a Job
    @Override
    public int run(String[] strings) throws Exception {
        // 1. Create the Job object
        Job job = Job.getInstance(super.getConf(), "wordcountOfluyizhou");
        job.setJarByClass(JobMain.class); // Required, otherwise running from a packaged jar fails (version-dependent)
        // 2. Configure the Job in eight steps

        // Map stage
        // Step 1: specify how and where the input is read
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("hdfs://192.168.217.4:8020/wordcount"));
        // Local mode
        // TextInputFormat.addInputPath(job, new Path("file:///F:\\tmp\\input"));
        // Step 2: specify the map class and its output types
        job.setMapperClass(WordCountMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // Shuffle stage
        // Steps 3-6 (partition, sort, combine, group): use the defaults for now

        // Reduce stage
        // Step 7: specify the reduce class and its output types
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // Step 8: specify the output format and output path
        job.setOutputFormatClass(TextOutputFormat.class);
        Path path = new Path("hdfs://192.168.217.4:8020/wordcount_out");
        TextOutputFormat.setOutputPath(job, path);

        // Local mode
        // TextOutputFormat.setOutputPath(job, new Path("file:///F:\\tmp\\output"));

        // Get a FileSystem and check whether the output directory already exists
        FileSystem fileSystem = FileSystem.get(new URI("hdfs://192.168.217.4:8020"), new Configuration());
        boolean exists = fileSystem.exists(path);
        if (exists) {
            // Delete the existing output directory
            fileSystem.delete(path, true);
        }

        // Wait for the job to finish
        boolean b = job.waitForCompletion(true);
        return b ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        // Launch the job
        int run = ToolRunner.run(configuration, new JobMain(), args);
        System.exit(run);
    }
}
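Once packaged, the job is typically submitted with the hadoop jar command, e.g. hadoop jar wordcount.jar com.luyizhou.mapreduce.JobMain (the jar name here is only illustrative). Because JobMain goes through ToolRunner, generic Hadoop options such as -D key=value can also be passed on the command line.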
2. Partitioning
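A Partitioner decides which reduce task each (k2, v2) pair is sent to during the shuffle; the default HashPartitioner distributes keys by their hashCode, and the example below replaces it with a custom rule based on a field value.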
com.luyizhou.partition.PartitionMapper
package com.luyizhou.partition;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/*
 * k1: byte offset of the line
 * v1: the text of the line
 * k2: the whole line of text
 * v2: placeholder (NullWritable)
 */
public class PartitionMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    // The map method converts k1/v1 into k2/v2
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Counter approach 1: obtain a counter by group name and counter name
        Counter counter = context.getCounter("MR_COUNTER", "partition_counter");
        // Each time this method runs, increment the counter by 1
        counter.increment(1L);

        context.write(value, NullWritable.get());
    }
}
com.luyizhou.partition.PartitionReducer
package com.luyizhou.partition;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/*
 * k2: Text, a line of text
 * v2: NullWritable
 * k3: Text, a line of text
 * v3: NullWritable
 */
public class PartitionReducer extends Reducer<Text, NullWritable, Text, NullWritable> {

    public static enum Counter {
        MY_INPUT_RECORDS, MY_INPUT_BYTES
    }

    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {

        // Counter approach 2: define the counter with an enum
        context.getCounter(Counter.MY_INPUT_RECORDS).increment(1L);

        context.write(key, NullWritable.get());
    }
}
com.luyizhou.partition.MyPartition
package com.luyizhou.partition;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;


public class MyPartition extends Partitioner<Text, NullWritable> {

    /*
     * Define the partitioning rule:
     * return the partition number for each record.
     */
    @Override
    public int getPartition(Text text, NullWritable nullWritable, int i) {
        // 1. Split the line of text (k2) and take the sixth field (index 5)
        String[] split = text.toString().split("\t");
        String numStr = split[5];
        // 2. Compare that field with 15 and return the corresponding partition number
        if (Integer.parseInt(numStr) > 15) {
            return 1;
        } else {
            return 0;
        }
    }
}
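For example, with tab-separated input, a line whose sixth field is 20 is sent to partition 1, while a line whose sixth field is 8 is sent to partition 0.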
com.luyizhou.partition.JobMain
package com.luyizhou.partition;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class JobMain extends Configured implements Tool {

    @Override
    public int run(String[] strings) throws Exception {
        Job job = Job.getInstance(super.getConf(), "partition_mapreduce");
        job.setJarByClass(JobMain.class);

        // Map stage
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("hdfs://192.168.217.4:8020/input"));
        job.setMapperClass(PartitionMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        // Shuffle stage
        // Partitioning
        job.setPartitionerClass(MyPartition.class);
        // Sorting, combining and grouping use the defaults

        // Reduce stage
        job.setReducerClass(PartitionReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        job.setNumReduceTasks(2);

        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("hdfs://192.168.217.4:8020/output/partition_out"));

        boolean b = job.waitForCompletion(true);

        return b ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        int run = ToolRunner.run(configuration, new JobMain(), args);
        System.exit(run);
    }
}
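Note that setNumReduceTasks(2) matches the two partition numbers (0 and 1) that MyPartition can return; each reduce task writes its own output file (part-r-00000 and part-r-00001) under the output directory.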
3. Sorting & serialization
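Map output keys are serialized with Hadoop's Writable mechanism and sorted during the shuffle, so a custom key type implements WritableComparable: write/readFields define the serialization and compareTo defines the sort order. Below is a minimal sketch of such a bean; the package name and the fields word and count are hypothetical, not taken from the examples above.

package com.luyizhou.sort;   // hypothetical package

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

// Minimal sketch of a custom key: serialized via write/readFields, sorted via compareTo
public class SortBean implements WritableComparable<SortBean> {

    private String word;   // hypothetical field
    private int count;     // hypothetical field

    // Serialization: write the fields in a fixed order
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(word);
        out.writeInt(count);
    }

    // Deserialization: read the fields back in the same order
    @Override
    public void readFields(DataInput in) throws IOException {
        this.word = in.readUTF();
        this.count = in.readInt();
    }

    // Sort rule used during the shuffle: by word first, then by count descending
    @Override
    public int compareTo(SortBean other) {
        int result = this.word.compareTo(other.word);
        return result != 0 ? result : Integer.compare(other.count, this.count);
    }

    @Override
    public String toString() {
        return word + "\t" + count;
    }
}

Used as the map output key (k2), such a bean is serialized automatically between the map and reduce stages, and each reducer receives its keys in the order defined by compareTo.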
4. Combiner
Key idea: a combiner performs local aggregation ahead of time; it is essentially part of the reduce logic run early on the map side, which reduces the number of network I/O transfers and improves performance.
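For the word-count job in section 1, the reduce logic (summing counts) is associative and commutative, so the existing WordCountReducer can be reused as the combiner. A minimal sketch of the extra configuration line that would go into JobMain.run (this line is an addition, not part of the original code above):

        // Combiner: run the reducer logic on the map side to pre-aggregate (k2, v2) pairs
        job.setCombinerClass(WordCountReducer.class);

With this in place each mapper emits at most one (word, partialCount) pair per word instead of one pair per occurrence. Hadoop may invoke the combiner zero or more times on the map output, so it must never change the final result.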