数据清洗——使用本机的 Java 代码

1. 驱动类

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class ETLDriver implements Tool {

private Configuration configuration;

public int run(String[] strings) throws Exception {
//创建Job
Job job = Job.getInstance(configuration);

//设置运行环境
job.setJarByClass(ETLDriver.class);

//设置对应的MapperReduce类
job.setMapperClass(ETLMapper.class);

//设置Mapper输出的
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class);

//设置全局的输出
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);

//设置输出输入路径
FileInputFormat.setInputPaths(job,new Path(strings[0]));
FileOutputFormat.setOutputPath(job,new Path(strings[1]));

//不需要reduce
job.setNumReduceTasks(0);

//提交
job.submit();
return 1;
}

public void setConf(Configuration configuration) {
this.configuration=configuration;
}

public Configuration getConf() {
return configuration;
}

//主函数
public static void main(String[] args) throws Exception{
ToolRunner.run(new ETLDriver(),args);
}
}

2. Maven 依赖

<dependencies>
    <!-- Unit testing. Pinned to a fixed version: the RELEASE metaversion is
         deprecated in Maven and makes builds non-reproducible. -->
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.13.2</version>
    </dependency>
    <!-- Logging. Versions before 2.15.0 (including 2.8.2) are affected by
         CVE-2021-44228 (Log4Shell); use a patched release.
         NOTE(review): 2.17.1 requires Java 8; use 2.12.4 if the build targets Java 7. -->
    <dependency>
        <groupId>org.apache.logging.log4j</groupId>
        <artifactId>log4j-core</artifactId>
        <version>2.17.1</version>
    </dependency>
    <!-- Hadoop libraries; keep all three on the same version. -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.7.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.7.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>2.7.2</version>
    </dependency>
</dependencies>

posted @ 2021-12-17 19:38  晨enough  阅读(299)  评论(0编辑  收藏  举报