Liu Yichen's Software Engineering Notes
Shijiazhuang Tiedao University, Fall 2021
Class of 2019 In-Class Test: MongoDB Log Analysis
Course: Large-Scale Database Application Technology; Instructor: Wang Jianmin; Test duration: 150 minutes
Description of the Result file data:
Ip: 106.39.41.166 (source IP, later mapped to a city)
Date: 10/Nov/2016:00:01:02 +0800 (date)
Day: 10 (day of month)
Traffic: 54 (traffic)
Type: video (type: video or article)
Id: 8701 (id of the video or article)
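Putting the fields above together, a raw line in result.txt presumably looks something like this (comma-separated, matching the split(",") in the cleaning code below; the exact spacing is an assumption):
106.39.41.166,10/Nov/2016:00:01:02 +0800,10, 54 ,video,8701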
Test requirements:
1. Data cleaning: clean the data as required and import the cleaned data into the MongoDB database.
Two-stage data cleaning:
(1) Stage 1: extract the required fields from the raw log
ip: 199.30.25.88
time: 10/Nov/2016:00:01:03 +0800
traffic: 62
article: article/11325
video: video/3235
Source code:
package com.Use;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
// Stage 1 cleaning job: extracts the required fields from the raw access log
public class cleanData {
public static class Map extends Mapper<Object , Text , Text , IntWritable>{
private static Text newKey=new Text();
// Convert a date such as "10/Nov/2016:00:01:02 +0800" into "2016-11-10 00:01:02"
private static String chage(String data) {
	char[] str = data.toCharArray();
	String[] time = new String[7];
	int j = 0;
	int k = 0;
	// Split on '/', ':' and space into day/month/year/hour/minute/second/zone
	for(int i = 0; i < str.length; i++) {
		if(str[i] == '/' || str[i] == ':' || str[i] == ' ') {
			time[k] = data.substring(j, i);
			j = i + 1;
			k++;
		}
	}
	time[k] = data.substring(j, data.length());
	// Map the English month abbreviation to its two-digit number
	switch(time[1]) {
		case "Jan": time[1] = "01"; break;
		case "Feb": time[1] = "02"; break;
		case "Mar": time[1] = "03"; break;
		case "Apr": time[1] = "04"; break;
		case "May": time[1] = "05"; break;
		case "Jun": time[1] = "06"; break;
		case "Jul": time[1] = "07"; break;
		case "Aug": time[1] = "08"; break;
		case "Sep": time[1] = "09"; break;
		case "Oct": time[1] = "10"; break;
		case "Nov": time[1] = "11"; break;
		case "Dec": time[1] = "12"; break;
	}
	// Reassemble as "yyyy-MM-dd HH:mm:ss"
	data = time[2] + "-" + time[1] + "-" + time[0] + " " + time[3] + ":" + time[4] + ":" + time[5];
	return data;
}
// Each input line is a comma-separated record: ip,date,day,traffic,type,id
public void map(Object key, Text value, Context context) throws IOException, InterruptedException{
	String line = value.toString();
	System.out.println(line);
	String arr[] = line.split(",");
	String ip = arr[0];
	String date = arr[1];
	String day = arr[2];
	String traffic = arr[3];
	String type = arr[4];
	String id = arr[5];
	// Normalize the date and strip the trailing character from the traffic field
	date = chage(date);
	traffic = traffic.substring(0, traffic.length() - 1);
	// Key: tab-separated cleaned fields; value: the numeric id
	newKey.set(ip + '\t' + date + '\t' + day + '\t' + traffic + '\t' + type);
	//newKey.set(ip+','+date+','+day+','+traffic+','+type);
	int click = Integer.parseInt(id);
	context.write(newKey, new IntWritable(click));
}
}
// Identity reduce: write each cleaned record straight through
public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable>{
public void reduce(Text key,Iterable<IntWritable> values,Context context) throws IOException, InterruptedException{
for(IntWritable val : values){
context.write(key, val);
}
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException{
Configuration conf=new Configuration();
System.out.println("start");
Job job =new Job(conf,"cleanData");
job.setJarByClass(cleanData.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
Path in=new Path("hdfs://192.168.137.112:9000/tutorial/in/result.txt");
Path out=new Path("hdfs://192.168.137.112:9000/tutorial/out");
FileInputFormat.addInputPath(job,in);
FileOutputFormat.setOutputPath(job,out);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
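To produce the cleaned output on HDFS, the class above is packaged into a jar and submitted with hadoop jar; the jar name below is only an assumed example (the input and output paths are hard-coded in main):
hadoop jar cleanData.jar com.Use.cleanData
hdfs dfs -cat /tutorial/out/part-r-00000 | head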
What the data looks like after CleanData has run:
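Given the Mapper above, each cleaned line should contain the tab-separated key fields followed by the id, roughly like this (values are illustrative):
106.39.41.166	2016-11-10 00:01:02	10	54	video	8701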
Import the cleaned file into an Excel sheet and convert it to CSV format (with Tab set as the delimiter).
(2) Stage 2: refine the extracted fields
ip ---> city: city(IP)
date --> time: 2016-11-10 00:01:03
day: 10
traffic: 62
type: article/video
id: 11325
(3) MongoDB database table structure:
create table data( ip string, time string, day string, traffic bigint, type string, id string )
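MongoDB has no CREATE TABLE; the statement above only documents the fields each document in the collection will carry. After the CSV import, a single document would look roughly like this (values are illustrative; mongoimport infers numeric types for purely numeric columns):
{ "ip" : "106.39.41.166", "time" : "2016-11-10 00:01:02", "day" : 10, "traffic" : 54, "type" : "video", "id" : 8701 }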
Importing into MongoDB
D:\>cd \Program Files\MongoDB\Server\4.0
D:\Program Files\MongoDB\Server\4.0>cd bin
D:\Program Files\MongoDB\Server\4.0\bin>mongod --dbpath "D:\Program Files\MongoDB\Server\4.0\db"
This starts the MongoDB server.
mongoimport -d movie -c Rec --type csv --headerline --file C:\Users\79096.LAPTOP-1607ORT2\Desktop\result.csv
This imports the CSV file.
The import succeeds.
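A quick way to verify the import, using the database and collection names from the command above, is to open the mongo shell and inspect the collection:
mongo
> use movie
> db.Rec.count()
> db.Rec.findOne()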
2. Data processing:
· Count the Top 10 most-visited videos/articles by number of accesses (video/article)
· Count the Top 10 most popular courses by city (ip)
· Count the Top 10 most popular courses by traffic (traffic)
import java.io.IOException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class Map extends Mapper<Object, Text, Text, NullWritable> {
	public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
		// Parse one raw record and rewrite it with a normalized date
		String line = value.toString();
		String arr[] = line.split(",");
		// arr[1] looks like "10/Nov/2016:00:01:02 +0800"
		String oldData = arr[1];
		String dataTemp[] = oldData.split("/");
		// The sample log only contains November data; anything else falls back to "1"
		if(dataTemp[1].equals("Nov")) {
			dataTemp[1] = "11";
		} else {
			dataTemp[1] = "1";
		}
		String dataYear = dataTemp[2].substring(0, 4);   // "2016"
		String dataTime = dataTemp[2].substring(5, 13);  // "00:01:02"
		String newData = dataYear + "-" + dataTemp[1] + "-" + dataTemp[0] + " " + dataTime;
		// traffic: strip spaces
		String traffic = arr[3].replace(" ", "");
		String type = arr[4];
		// id
		String id = arr[5];
		//String sum="ip:"+arr[0]+",date:"+newData+",day:"+arr[2]+",traffice:"+traffic+",type:"+type+",id:"+id;
		String sum = arr[0] + "," + newData + "," + arr[2] + "," + traffic + "," + type + "," + id;
		context.write(new Text(sum), NullWritable.get());
	}
}
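The fragment above only rewrites each record with a normalized date; the Top 10 statistics themselves still need a counting step. Below is a minimal sketch for the first requirement (access counts per video/article), assuming the cleaned comma-separated records produced above as input; the class names are illustrative, and the same Hadoop imports and driver setup as in cleanData apply. Sorting the (small) counted output and taking the Top 10 can then be done in a follow-up step.
// Sketch only: count accesses per "type/id" (e.g. "video/8701") from the cleaned records
public static class CountMap extends Mapper<Object, Text, Text, IntWritable> {
	private static final IntWritable ONE = new IntWritable(1);
	private final Text outKey = new Text();
	public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
		String[] arr = value.toString().split(",");
		// arr[4] = type (video/article), arr[5] = id
		outKey.set(arr[4] + "/" + arr[5]);
		context.write(outKey, ONE);
	}
}
public static class CountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
	public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
		int sum = 0;
		for (IntWritable v : values) {
			sum += v.get();
		}
		context.write(key, new IntWritable(sum));
	}
}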
3. Data visualization: load the statistics into a MySQL database and present them with charts. (Incomplete)
package Test;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCount{
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Job job = Job.getInstance();
job.setJobName("WordCount");
job.setJarByClass(WordCount.class);
job.setMapperClass(doMapper.class);
job.setReducerClass(doReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
Path in = new Path("hdfs://localhost:9000/user/hadoop/name/result.txt");
Path out = new Path("hdfs://localhost:9000/user/hadoop/name/out2");
FileInputFormat.addInputPath(job, in);
FileOutputFormat.setOutputPath(job, out);
System.exit(job.waitForCompletion(true) ? 0 : 1);
	}

	// Mapper: emit each token with a count of 1
	public static class doMapper extends Mapper<Object, Text, Text, IntWritable> {
		public static final IntWritable one = new IntWritable(1);
		public static Text word = new Text();

		@Override
		protected void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {
			// With an empty delimiter set, the whole line is one token,
			// so identical records are counted together
			StringTokenizer tokenizer = new StringTokenizer(value.toString(), "");
			word.set(tokenizer.nextToken());
			context.write(word, one);
		}
	}

	// Reducer: sum the counts for each key
	public static class doReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
		private IntWritable result = new IntWritable();

		@Override
		protected void reduce(Text key, Iterable<IntWritable> values, Context context)
				throws IOException, InterruptedException {
			int sum = 0;
			for (IntWritable value : values) {
				sum += value.get();
			}
			result.set(sum);
			context.write(key, result);
		}
	}
}
Screenshot of the results:
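Since the visualization step is marked as incomplete, here is only a rough sketch of how the counted results could be loaded into MySQL with plain JDBC before charting them; the connection URL, credentials, and table top10(name, cnt) are all assumptions, and the MySQL JDBC driver must be on the classpath:
import java.io.BufferedReader;
import java.io.FileReader;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;

// Sketch: read "key<TAB>count" lines produced by the job and insert them into MySQL
public class ExportToMysql {
	public static void main(String[] args) throws Exception {
		String url = "jdbc:mysql://localhost:3306/test?useSSL=false";   // assumed database
		try (Connection conn = DriverManager.getConnection(url, "root", "password");
			 BufferedReader in = new BufferedReader(new FileReader("part-r-00000"))) {
			PreparedStatement ps = conn.prepareStatement("insert into top10(name, cnt) values(?, ?)");
			String line;
			while ((line = in.readLine()) != null) {
				String[] kv = line.split("\t");            // key and count are tab-separated
				ps.setString(1, kv[0]);
				ps.setInt(2, Integer.parseInt(kv[1]));
				ps.executeUpdate();
			}
			ps.close();
		}
	}
}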