Overview
The data is cleaned by writing a MapReduce job.
Raw data format to be cleaned (one record per line, fields separated by spaces):

```
180.153.11.130 - - 2018-03-18 11:42:44 "POST https://www.taobao.com/category/d HTTP/1.1" 200 12901 https://www.taobao.com/category/b Google Chrome Chromium/Blinkwindows 山西 37.54 112.33 57
```
After cleaning, the fields are reassembled and returned in the format we need:

```
ip,
time,
request_url,
status,
body_bytes,
referer_url,
user_agent,
province,
latitude,
longitude,
age
```
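For the sample record shown above, the cleaned and reassembled output is a single comma-separated line (status 200 passes the error filter, and the user-agent tokens are rejoined with spaces):

```
180.153.11.130,2018-03-18 11:42:44,https://www.taobao.com/category/d,200,12901,https://www.taobao.com/category/b,Google Chrome Chromium/Blinkwindows,山西,37.54,112.33,57
```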
Writing the Java code
The pom.xml file
```xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>dataclean</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <hadoop.version>2.8.5</hadoop.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
    </dependencies>

    <build>
        <finalName>dataClean</finalName>
    </build>
</project>
```
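The dependency setup can be sanity-checked from the project root before writing any code (optional; assumes Maven is installed):

```bash
# download and resolve the declared Hadoop dependencies
mvn -q dependency:resolve
```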
Writing the Mapper code
```java
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class DataCleanMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] array = line.split(" ");

        // Drop records that do not contain all of the expected fields
        if (array.length < 16) {
            return;
        }

        // Drop records whose HTTP status marks a client or server error,
        // as well as records whose status field is not a number
        int status;
        try {
            status = Integer.parseInt(array[8]);
        } catch (NumberFormatException e) {
            return;
        }
        if (status >= 400) {
            return;
        }

        // "-" marks a missing value; drop records missing any key field
        String ipAddr = array[0];
        if (ipAddr.contains("-")) {
            return;
        }
        String province = array[array.length - 4];
        if (province.contains("-")) {
            return;
        }
        String latitude = array[array.length - 3];
        if (latitude.contains("-")) {
            return;
        }
        String longitude = array[array.length - 2];
        if (longitude.contains("-")) {
            return;
        }

        String time = array[3] + " " + array[4];
        String requestUrl = array[6];
        String bodyBytes = array[9];
        String refererUrl = array[10];

        // The user agent spans a variable number of tokens between the
        // referer and the last four fields; rejoin them with spaces
        StringBuilder userAgent = new StringBuilder();
        for (int i = 11; i <= array.length - 5; i++) {
            if (userAgent.length() > 0) {
                userAgent.append(" ");
            }
            userAgent.append(array[i]);
        }
        String age = array[array.length - 1];

        // Reassemble the cleaned fields into one comma-separated record
        String result = ipAddr + "," + time + "," + requestUrl + "," + status + ","
                + bodyBytes + "," + refererUrl + "," + userAgent + ","
                + province + "," + latitude + "," + longitude + "," + age;
        context.write(new Text(result), NullWritable.get());
    }
}
```
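To see why these indices line up, split the sample record on spaces. A quick shell sketch (assuming the sample line from the top of this post) prints each token with its 0-based index, mirroring the mapper's line.split(" "): token 8 is the status 200, tokens 11 through 13 form the user agent, and token 14 (array.length - 4 with 18 tokens) is the province.

```bash
# print each space-separated token with its 0-based index,
# mirroring the mapper's line.split(" ")
echo '180.153.11.130 - - 2018-03-18 11:42:44 "POST https://www.taobao.com/category/d HTTP/1.1" 200 12901 https://www.taobao.com/category/b Google Chrome Chromium/Blinkwindows 山西 37.54 112.33 57' \
  | tr ' ' '\n' | nl -v 0
```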
Writing the Driver code
```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

public class DataCleanDriver {
    public static void main(String[] args) throws IOException, URISyntaxException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://node1:9000");
        // Pass conf to the job so the fs.defaultFS setting takes effect
        Job job = Job.getInstance(conf);
        job.setJarByClass(DataCleanDriver.class);

        job.setMapperClass(DataCleanMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        // Map-only job: the cleaning needs no reduce phase
        job.setNumReduceTasks(0);

        // Echo the input path for debugging
        System.out.println(args[0]);
        FileInputFormat.setInputPaths(job, new Path(args[0]));

        // Remove the output path first: the job fails if it already exists
        Path path = new Path(args[1]);
        FileSystem fileSystem = FileSystem.get(new URI("hdfs://node1:9000"), conf, "root");
        if (fileSystem.exists(path)) {
            fileSystem.delete(path, true);
        }
        FileOutputFormat.setOutputPath(job, path);

        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
```
Run a local test
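One way to test from the development machine (a sketch: it assumes node1 is reachable and the input data is already in HDFS) is to run DataCleanDriver.main directly from the IDE, passing the two paths as program arguments, for example:

```
/project/20220809/* /dataClean
```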
Run from the command line in Xshell
Build the jar
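Since pom.xml sets <finalName>dataClean</finalName>, a standard Maven build produces target/dataClean.jar:

```bash
mvn clean package
```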

Upload the jar
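One way to copy it to the server (a sketch: the destination directory comes from the run command below, and the root account matches the one used in the driver):

```bash
scp target/dataClean.jar root@node1:/opt/project/dataClean/
```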

Run the jar

```bash
hadoop jar /opt/project/dataClean/dataClean.jar DataCleanDriver /project/20220809/* /dataClean
```
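After the job completes, the cleaned records can be inspected directly; a map-only job writes its output as part-m-* files:

```bash
hadoop fs -cat /dataClean/part-m-00000 | head
```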
Write a shell script that wraps the command; then the job can be launched by simply running the script.

```bash
#!/bin/bash

echo "============================= project data cleaning job started =============================="
timeStr=$(date "+%Y%m%d")

inpath="/project/$timeStr/*"
echo "MR input path resolved; cleaning data under $inpath"
outpath="/dataClean"
hadoop jar /opt/project/dataClean/dataClean.jar DataCleanDriver $inpath $outpath
```
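To launch it (a sketch: the script name dataClean.sh is assumed; since the input path uses today's date, data for the current day must already be under /project/<yyyyMMdd>/):

```bash
chmod +x dataClean.sh
./dataClean.sh
```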