Liu Yichen's Software Engineering Notes
Shijiazhuang Tiedao University, Fall 2021
Class of 2019 In-Class Test: MongoDB Log Analysis
Course: Large-Scale Database Application Technology; Instructor: Wang Jianmin; Test duration: 150 minutes
Description of the Result file data:
Ip: 106.39.41.166 (source IP, later mapped to a city)
Date: 10/Nov/2016:00:01:02 +0800 (date)
Day: 10 (day of month)
Traffic: 54 (traffic)
Type: video (type: video or article)
Id: 8701 (id of the video or article)
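Putting the fields above together, a raw line in result.txt presumably looks something like this (comma-separated, matching the split(",") in the cleaning code below; the exact spacing is an assumption):
106.39.41.166,10/Nov/2016:00:01:02 +0800,10, 54 ,video,8701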
Test requirements:
1. Data cleaning: clean the data as required and import the cleaned data into the MongoDB database.
Two-stage data cleaning:
(1) Stage 1: extract the required fields from the raw log
ip: 199.30.25.88
time: 10/Nov/2016:00:01:03 +0800
traffic: 62
article: article/11325
video: video/3235
Source code:
package com.Use;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
// Stage 1 cleaning job: extracts the required fields from the raw access log
public class cleanData {
public static class Map extends Mapper<Object , Text , Text , IntWritable>{
private static Text newKey=new Text();
// Convert a date such as "10/Nov/2016:00:01:02 +0800" into "2016-11-10 00:01:02"
private static String chage(String data) {
	char[] str = data.toCharArray();
	String[] time = new String[7];
	int j = 0;
	int k = 0;
	// Split on '/', ':' and space into day/month/year/hour/minute/second/zone
	for(int i = 0; i < str.length; i++) {
		if(str[i] == '/' || str[i] == ':' || str[i] == ' ') {
			time[k] = data.substring(j, i);
			j = i + 1;
			k++;
		}
	}
	time[k] = data.substring(j, data.length());
	// Map the English month abbreviation to its two-digit number
	switch(time[1]) {
		case "Jan": time[1] = "01"; break;
		case "Feb": time[1] = "02"; break;
		case "Mar": time[1] = "03"; break;
		case "Apr": time[1] = "04"; break;
		case "May": time[1] = "05"; break;
		case "Jun": time[1] = "06"; break;
		case "Jul": time[1] = "07"; break;
		case "Aug": time[1] = "08"; break;
		case "Sep": time[1] = "09"; break;
		case "Oct": time[1] = "10"; break;
		case "Nov": time[1] = "11"; break;
		case "Dec": time[1] = "12"; break;
	}
	// Reassemble as "yyyy-MM-dd HH:mm:ss"
	data = time[2] + "-" + time[1] + "-" + time[0] + " " + time[3] + ":" + time[4] + ":" + time[5];
	return data;
}
// Each input line is a comma-separated record: ip,date,day,traffic,type,id
public void map(Object key, Text value, Context context) throws IOException, InterruptedException{
	String line = value.toString();
	System.out.println(line);
	String arr[] = line.split(",");
	String ip = arr[0];
	String date = arr[1];
	String day = arr[2];
	String traffic = arr[3];
	String type = arr[4];
	String id = arr[5];
	// Normalize the date and strip the trailing character from the traffic field
	date = chage(date);
	traffic = traffic.substring(0, traffic.length() - 1);
	// Key: tab-separated cleaned fields; value: the numeric id
	newKey.set(ip + '\t' + date + '\t' + day + '\t' + traffic + '\t' + type);
	//newKey.set(ip+','+date+','+day+','+traffic+','+type);
	int click = Integer.parseInt(id);
	context.write(newKey, new IntWritable(click));
}
}
// Identity reduce: write each cleaned record straight through
public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable>{
public void reduce(Text key,Iterable<IntWritable> values,Context context) throws IOException, InterruptedException{
for(IntWritable val : values){
context.write(key, val);
}
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException{
Configuration conf=new Configuration();
System.out.println("start");
Job job =new Job(conf,"cleanData");
job.setJarByClass(cleanData.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
Path in=new Path("hdfs://192.168.137.112:9000/tutorial/in/result.txt");
Path out=new Path("hdfs://192.168.137.112:9000/tutorial/out");
FileInputFormat.addInputPath(job,in);
FileOutputFormat.setOutputPath(job,out);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
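To produce the cleaned output on HDFS, the class above is packaged into a jar and submitted with hadoop jar; the jar name below is only an assumed example (the input and output paths are hard-coded in main):
hadoop jar cleanData.jar com.Use.cleanData
hdfs dfs -cat /tutorial/out/part-r-00000 | head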
What the data looks like after CleanData has run:
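Given the Mapper above, each cleaned line should contain the tab-separated key fields followed by the id, roughly like this (values are illustrative):
106.39.41.166	2016-11-10 00:01:02	10	54	video	8701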
Import the cleaned file into an Excel sheet and convert it to CSV format (with Tab set as the delimiter).
(2) Stage 2: refine the extracted fields
ip ---> city: city(IP)
date --> time: 2016-11-10 00:01:03
day: 10
traffic: 62
type: article/video
id: 11325
(3) MongoDB database table structure:
create table data( ip string, time string, day string, traffic bigint, type string, id string )
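MongoDB has no CREATE TABLE; the statement above only documents the fields each document in the collection will carry. After the CSV import, a single document would look roughly like this (values are illustrative; mongoimport infers numeric types for purely numeric columns):
{ "ip" : "106.39.41.166", "time" : "2016-11-10 00:01:02", "day" : 10, "traffic" : 54, "type" : "video", "id" : 8701 }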
Importing into MongoDB
D:\>cd \Program Files\MongoDB\Server\4.0
D:\Program Files\MongoDB\Server\4.0>cd bin
D:\Program Files\MongoDB\Server\4.0\bin>mongod --dbpath "D:\Program Files\MongoDB\Server\4.0\db"
This starts the MongoDB server.
mongoimport -d movie -c Rec --type csv --headerline --file C:\Users\79096.LAPTOP-1607ORT2\Desktop\result.csv
This imports the CSV file.
The import succeeds.
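A quick way to verify the import, using the database and collection names from the command above, is to open the mongo shell and inspect the collection:
mongo
> use movie
> db.Rec.count()
> db.Rec.findOne()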
2. Data processing:
· Count the Top 10 most-visited videos/articles by number of accesses (video/article)
· Count the Top 10 most popular courses by city (ip)
· Count the Top 10 most popular courses by traffic (traffic)
import java.io.IOException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class Map extends Mapper<Object, Text, Text, NullWritable> {
	public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
		// Parse one raw record and rewrite it with a normalized date
		String line = value.toString();
		String arr[] = line.split(",");
		// arr[1] looks like "10/Nov/2016:00:01:02 +0800"
		String oldData = arr[1];
		String dataTemp[] = oldData.split("/");
		// The sample log only contains November data; anything else falls back to "1"
		if(dataTemp[1].equals("Nov")) {
			dataTemp[1] = "11";
		} else {
			dataTemp[1] = "1";
		}
		String dataYear = dataTemp[2].substring(0, 4);   // "2016"
		String dataTime = dataTemp[2].substring(5, 13);  // "00:01:02"
		String newData = dataYear + "-" + dataTemp[1] + "-" + dataTemp[0] + " " + dataTime;
		// traffic: strip spaces
		String traffic = arr[3].replace(" ", "");
		String type = arr[4];
		// id
		String id = arr[5];
		//String sum="ip:"+arr[0]+",date:"+newData+",day:"+arr[2]+",traffice:"+traffic+",type:"+type+",id:"+id;
		String sum = arr[0] + "," + newData + "," + arr[2] + "," + traffic + "," + type + "," + id;
		context.write(new Text(sum), NullWritable.get());
	}
}
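The fragment above only rewrites each record with a normalized date; the Top 10 statistics themselves still need a counting step. Below is a minimal sketch for the first requirement (access counts per video/article), assuming the cleaned comma-separated records produced above as input; the class names are illustrative, and the same Hadoop imports and driver setup as in cleanData apply. Sorting the (small) counted output and taking the Top 10 can then be done in a follow-up step.
// Sketch only: count accesses per "type/id" (e.g. "video/8701") from the cleaned records
public static class CountMap extends Mapper<Object, Text, Text, IntWritable> {
	private static final IntWritable ONE = new IntWritable(1);
	private final Text outKey = new Text();
	public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
		String[] arr = value.toString().split(",");
		// arr[4] = type (video/article), arr[5] = id
		outKey.set(arr[4] + "/" + arr[5]);
		context.write(outKey, ONE);
	}
}
public static class CountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
	public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
		int sum = 0;
		for (IntWritable v : values) {
			sum += v.get();
		}
		context.write(key, new IntWritable(sum));
	}
}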
3. Data visualization: load the statistics into a MySQL database and present them with charts. (Incomplete)
package Test;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCount{
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Job job = Job.getInstance();
job.setJobName("WordCount");
job.setJarByClass(WordCount.class);
job.setMapperClass(doMapper.class);
job.setReducerClass(doReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
Path in = new Path("hdfs://localhost:9000/user/hadoop/name/result.txt");
Path out = new Path("hdfs://localhost:9000/user/hadoop/name/out2");
FileInputFormat.addInputPath(job, in);
FileOutputFormat.setOutputPath(job, out);
System.exit(job.waitForCompletion(true) ? 0 : 1);
	}

	// Mapper: emit each token with a count of 1
	public static class doMapper extends Mapper<Object, Text, Text, IntWritable> {
		public static final IntWritable one = new IntWritable(1);
		public static Text word = new Text();

		@Override
		protected void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {
			// With an empty delimiter set, the whole line is one token,
			// so identical records are counted together
			StringTokenizer tokenizer = new StringTokenizer(value.toString(), "");
			word.set(tokenizer.nextToken());
			context.write(word, one);
		}
	}

	// Reducer: sum the counts for each key
	public static class doReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
		private IntWritable result = new IntWritable();

		@Override
		protected void reduce(Text key, Iterable<IntWritable> values, Context context)
				throws IOException, InterruptedException {
			int sum = 0;
			for (IntWritable value : values) {
				sum += value.get();
			}
			result.set(sum);
			context.write(key, result);
		}
	}
}
Screenshot of the results:
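Since the visualization step is marked as incomplete, here is only a rough sketch of how the counted results could be loaded into MySQL with plain JDBC before charting them; the connection URL, credentials, and table top10(name, cnt) are all assumptions, and the MySQL JDBC driver must be on the classpath:
import java.io.BufferedReader;
import java.io.FileReader;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;

// Sketch: read "key<TAB>count" lines produced by the job and insert them into MySQL
public class ExportToMysql {
	public static void main(String[] args) throws Exception {
		String url = "jdbc:mysql://localhost:3306/test?useSSL=false";   // assumed database
		try (Connection conn = DriverManager.getConnection(url, "root", "password");
			 BufferedReader in = new BufferedReader(new FileReader("part-r-00000"))) {
			PreparedStatement ps = conn.prepareStatement("insert into top10(name, cnt) values(?, ?)");
			String line;
			while ((line = in.readLine()) != null) {
				String[] kv = line.split("\t");            // key and count are tab-separated
				ps.setString(1, kv[0]);
				ps.setInt(2, Integer.parseInt(kv[1]));
				ps.executeUpdate();
			}
			ps.close();
		}
	}
}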