随堂测试——数据清洗
Result文件数据说明:
Ip:106.39.41.166,(城市)
Date:10/Nov/2016:00:01:02 +0800,(日期)
Day:10,(天数)
Traffic: 54 ,(流量)
Type: video,(类型:视频video或文章article)
Id: 8701(视频或者文章的id)
测试要求:
1、 数据清洗:按照进行数据清洗,并将清洗后的数据导入hive数据库中。
两阶段数据清洗:
(1)第一阶段:把需要的信息从原始日志中提取出来
ip: 199.30.25.88
time: 10/Nov/2016:00:01:03 +0800
traffic: 62
文章: article/11325
视频: video/3235
(2)第二阶段:根据提取出来的信息做精细化操作
ip--->城市 city(IP)
date--> time:2016-11-10 00:01:03
day: 10
traffic:62
type:article/video
id:11325
(3)hive数据库表结构:
create table data( ip string, time string , day string, traffic bigint,
type string, id string )
第一阶段结果展示
第二阶段
第二阶段目前有些问题,只完成一部分代码
一二阶段部分代码
import java.io.IOException;
import java.sql.Date;
import java.text.SimpleDateFormat;
import java.util.Locale;
import java.util.StringTokenizer;
import java.sql.Date;
import java.text.SimpleDateFormat;
import java.util.Locale;
import java.util.StringTokenizer;
import javax.xml.bind.helpers.ParseConversionEventImpl;
import org.omg.CORBA.PUBLIC_MEMBER;
import com.sun.org.apache.bcel.internal.generic.RETURN;
import com.sun.org.apache.xerces.internal.impl.xs.identity.FieldActivator;
import com.sun.org.apache.xerces.internal.impl.xs.identity.Selector.Matcher;
import com.sun.org.apache.xerces.internal.impl.xs.identity.Selector.XPath;
import com.sun.xml.internal.bind.CycleRecoverable.Context;
import com.sun.xml.internal.bind.Locatable;
import com.sun.xml.internal.ws.config.management.policy.ManagementPrefixMapper;
import com.sun.xml.internal.ws.policy.privateutil.PolicyUtils.Text;
import com.sun.org.apache.xerces.internal.impl.xs.identity.FieldActivator;
import com.sun.org.apache.xerces.internal.impl.xs.identity.Selector.Matcher;
import com.sun.org.apache.xerces.internal.impl.xs.identity.Selector.XPath;
import com.sun.xml.internal.bind.CycleRecoverable.Context;
import com.sun.xml.internal.bind.Locatable;
import com.sun.xml.internal.ws.config.management.policy.ManagementPrefixMapper;
import com.sun.xml.internal.ws.policy.privateutil.PolicyUtils.Text;
import javafx.scene.chart.PieChart.Data;
import javafx.scene.shape.Line;
import jdk.internal.dynalink.beans.StaticClass;
import javafx.scene.shape.Line;
import jdk.internal.dynalink.beans.StaticClass;
public class Namecount {
public String[] parse(String line) {
String ip=parseIp(line);
String time=parseTime(line);
String day=parseDay(line);
String traffic=parseTraffic(line);
String type=parseType(line);
String id=parseId(line);
return new String[] {
ip,time,day,traffic,type,id
};
}
private String parseId(String line)
{
final String trim=line.substring(line.lastIndexOf("\"")+1).trim();
String id=trim.split(",")[1];
return id;
}
private String parseType(String line)
{
final String trim=line.substring(line.lastIndexOf("\"")+1).trim();
String type=trim.split(",")[1];
return type;
}
private String parseTraffic(String line)
{
final String trim=line.substring(line.lastIndexOf("\"")+1).trim();
String traffic=trim.split(",")[1];
return traffic;
}
private String parseDay(String line)
{
final String trim=line.substring(line.lastIndexOf("\"")+1).trim();
String day=trim.split(",")[1];
return day;
}
private String parseTime(String line)
{
final int first=line.indexOf("[");
final int last=line.indexOf("+0800]");
String date=line.substring(first + 1,last).trim();
String time = null;
Date date1=parseDateFormat(time);
return dateformat1.format(date1);
}
private String parseIp(String line) {
String ip=line.split(",")[0].trim();
return ip;
}
public String[] parse(String line) {
String ip=parseIp(line);
String time=parseTime(line);
String day=parseDay(line);
String traffic=parseTraffic(line);
String type=parseType(line);
String id=parseId(line);
return new String[] {
ip,time,day,traffic,type,id
};
}
private String parseId(String line)
{
final String trim=line.substring(line.lastIndexOf("\"")+1).trim();
String id=trim.split(",")[1];
return id;
}
private String parseType(String line)
{
final String trim=line.substring(line.lastIndexOf("\"")+1).trim();
String type=trim.split(",")[1];
return type;
}
private String parseTraffic(String line)
{
final String trim=line.substring(line.lastIndexOf("\"")+1).trim();
String traffic=trim.split(",")[1];
return traffic;
}
private String parseDay(String line)
{
final String trim=line.substring(line.lastIndexOf("\"")+1).trim();
String day=trim.split(",")[1];
return day;
}
private String parseTime(String line)
{
final int first=line.indexOf("[");
final int last=line.indexOf("+0800]");
String date=line.substring(first + 1,last).trim();
String time = null;
Date date1=parseDateFormat(time);
return dateformat1.format(date1);
}
private String parseIp(String line) {
String ip=line.split(",")[0].trim();
return ip;
}
public void map(Locatable key,Text value,Context context)
throws IOException,InterruptedException{
Text outputValue=new Text();
String line =value.toString();
Namecount aa=new Namecount();
StringTokenizer tokenizerArticle=new StringTokenizer(line,"\n");
while (tokenizerArticle.hasMoreElements()) {
String stra=tokenizerArticle.nextToken().toString();
String [] Newstr=aa.parse(stra);
}
}
public static final SimpleDateFormat FORMAT=new SimpleDateFormat("xx/yyy/nnnn:ss:ff:mm",Locale.ENGLISH);
public static final SimpleDateFormat dateformat1=new SimpleDateFormat("nnnn-yyy-xx ss:ff;mm");
private Date parseDateFormat(String string) {
Date parse=null;
try {
parse=(Date) FORMAT.parse(string);
}catch(Exception e) {
e.printStackTrace();
}
return parse;
}
}
throws IOException,InterruptedException{
Text outputValue=new Text();
String line =value.toString();
Namecount aa=new Namecount();
StringTokenizer tokenizerArticle=new StringTokenizer(line,"\n");
while (tokenizerArticle.hasMoreElements()) {
String stra=tokenizerArticle.nextToken().toString();
String [] Newstr=aa.parse(stra);
}
}
public static final SimpleDateFormat FORMAT=new SimpleDateFormat("xx/yyy/nnnn:ss:ff:mm",Locale.ENGLISH);
public static final SimpleDateFormat dateformat1=new SimpleDateFormat("nnnn-yyy-xx ss:ff;mm");
private Date parseDateFormat(String string) {
Date parse=null;
try {
parse=(Date) FORMAT.parse(string);
}catch(Exception e) {
e.printStackTrace();
}
return parse;
}
}