Hive Class Test 1
1. Data cleaning: clean the raw log data as specified below, then load the cleaned data into a Hive table.
The cleaning runs in two phases:
(1) Phase 1: extract the required fields from the raw log
ip: 199.30.25.88
time: 10/Nov/2016:00:01:03 +0800
traffic: 62
article: article/11325
video: video/3235
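For reference: the parser below splits each record on commas, in the order ip, time, day, traffic, type, id, so a raw line presumably looks something like this (the exact layout is an assumption inferred from the code, not shown in the original logs):

199.30.25.88,10/Nov/2016:00:01:03 +0800,10,62,article,11325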
(2) Phase 2: refine the extracted fields
ip ---> city: city(IP)
date ---> time: 2016-11-10 00:01:03
day: 10
traffic: 62
type: article/video
id: 11325
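The job below emits one tab-separated line per record in the order ip, time, day, traffic, type, id, so a cleaned record should come out roughly like this (the ip ---> city lookup is not implemented in the code below, so the ip column is passed through unchanged):

199.30.25.88    2016-11-10 00:01:03    10    62    article    11325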
(3) Hive table schema:
create table data(ip string, time string, day string, traffic bigint,
type string, id string)
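Because the job below writes tab-separated text, the table needs a matching row format before a LOAD will parse the columns; a minimal sketch, assuming the output directory hard-coded in main() and Hive's default database (with a single reducer the output file is typically part-r-00000):

create table data(ip string, time string, day string, traffic bigint,
type string, id string)
row format delimited fields terminated by '\t';

load data inpath '/mapReduce/mymapreduce1/out/part-r-00000' into table data;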
The full source code:
package test;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class test3 {
    // source time format, e.g. 10/Nov/2016:00:01:03
    public static final SimpleDateFormat FORMAT =
            new SimpleDateFormat("d/MMM/yyyy:HH:mm:ss", Locale.ENGLISH);
    // target time format required by the spec, e.g. 2016-11-10 00:01:03
    public static final SimpleDateFormat dateformat1 =
            new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

    // convert the raw timestamp string into a Date
    private static Date parseDateFormat(String string) {
        Date parse = null;
        try {
            parse = FORMAT.parse(string);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return parse;
    }

    // split one raw record into the six cleaned fields
    public static String[] parse(String line) {
        String ip = parseIP(line);            // ip
        String time = parseTime(line);        // reformatted timestamp
        String day = parseDay(line);          // day of month
        String type = parseType(line);        // "video" or "article"
        String id = parseId(line);            // video/article id
        String traffic = parseTraffic(line);  // traffic
        return new String[] { ip, time, day, traffic, type, id };
    }

    private static String parseIP(String line) {
        return line.split(",")[0].trim();  // trim() strips leading/trailing spaces
    }

    private static String parseTime(String line) {
        // the timestamp sits between the first comma and the " +0800," zone suffix
        final int first = line.indexOf(",");
        final int last = line.indexOf(" +0800,");
        String time = line.substring(first + 1, last).trim();
        Date date = parseDateFormat(time);
        return dateformat1.format(date);
    }

    private static String parseDay(String line) {
        return line.split(",")[2].trim();
    }

    private static String parseTraffic(String line) {
        return line.split(",")[3].trim();  // kept as a string; Hive reads it as bigint
    }

    private static String parseType(String line) {
        return line.split(",")[4].replace(" ", "");  // strip all spaces
    }

    private static String parseId(String line) {
        return line.split(",")[5].replace(" ", "");
    }

    public static class Map extends Mapper<Object, Text, Text, NullWritable> {
        public static Text word = new Text();

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // turn the input line into a String and parse out the fields
            String line = value.toString();
            String[] arr = parse(line);
            // join with '\t' only: spaces as separators cause hard-to-debug problems
            word.set(arr[0] + "\t" + arr[1] + "\t" + arr[2] + "\t"
                    + arr[3] + "\t" + arr[4] + "\t" + arr[5]);
            context.write(word, NullWritable.get());
        }
    }

    public static class Reduce extends Reducer<Text, NullWritable, Text, NullWritable> {
        // identity reduce: writes each unique cleaned line exactly once
        public void reduce(Text key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            context.write(key, NullWritable.get());
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        System.out.println("start");
        Job job = Job.getInstance(conf);
        job.setJarByClass(test3.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);            // applies to both map and reduce output
        job.setOutputValueClass(NullWritable.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        Path in = new Path("hdfs://localhost:8020/mapReduce/mymapreduce1/result.txt");
        Path out = new Path("hdfs://localhost:8020/mapReduce/mymapreduce1/out");
        FileInputFormat.addInputPath(job, in);
        FileOutputFormat.setOutputPath(job, out);
        boolean flag = job.waitForCompletion(true);
        System.out.println(flag);
        System.exit(flag ? 0 : 1);
    }
}
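After mvn package, the job can presumably be launched with something like the line below (the jar name depends on your pom; the input and output paths are hard-coded in main(), so no arguments are needed):

hadoop jar target/test3.jar test.test3

Note that the output directory must not already exist, or FileOutputFormat will abort the job.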
Remember to create the project as a Maven project, then add the Hadoop dependencies in pom.xml.
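A minimal sketch of the dependency section, assuming a Hadoop 2.x cluster (match the version to your own installation):

<dependencies>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.7.3</version>
    </dependency>
</dependencies>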