MapReduce + Hive Data Operations ------- Day 1
Assignment: given a text file, clean the data in it, load the cleaned data into a Hive database, and then run the relevant statistics on it.
The assignment wants us to use MapReduce for the data cleaning and for the statistics. As a MapReduce beginner, I'm still not clear on how MapReduce actually works, so for now I did the cleaning in plain Java and then loaded the result into the database.
The code:
Java data-cleaning code:
package Data;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;

public class Data {

    public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
        FileReader read = new FileReader("result.txt");
        BufferedReader br = new BufferedReader(read);
        File outFile = new File("result2.txt");
        Writer writer = new OutputStreamWriter(new FileOutputStream(outFile), "utf-8");
        BufferedWriter bw = new BufferedWriter(writer);
        String row;
        String[] data;
        try {
            while ((row = br.readLine()) != null) {
                data = change(row);  // split the raw line on commas
                data = chage(data);  // normalize the timestamp, trim the last field
                // Echo the cleaned fields for a quick sanity check.
                for (int i = 0; i < data.length; i++) {
                    System.out.print(data[i] + "\t");
                }
                System.out.println();
                row = data[0] + "," + data[1] + "," + data[2] + "," + data[3] + "," + data[4] + "," + data[5];
                bw.write(row + "\r\n");
            }
            // Close the writer so the buffer is flushed; without this the tail
            // of the output is silently lost (see problem 1 below).
            bw.close();
            br.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // Rewrites the timestamp field (day/Mon/year:hh:mm:ss, split on '/', ':'
    // and space) into "year-month-day hh:mm:ss", and strips the trailing
    // character from field 3.
    private static String[] chage(String[] data) {
        char[] str = data[1].toCharArray();
        String[] time = new String[7];
        int j = 0;
        int k = 0;
        for (int i = 0; i < str.length; i++) {
            if (str[i] == '/' || str[i] == ':' || str[i] == ' ') {
                time[k] = data[1].substring(j, i);
                j = i + 1;
                k++;
            }
        }
        time[k] = data[1].substring(j);
        // Map the English month abbreviation to its two-digit number.
        switch (time[1]) {
            case "Jan": time[1] = "01"; break;
            case "Feb": time[1] = "02"; break;
            case "Mar": time[1] = "03"; break;
            case "Apr": time[1] = "04"; break;
            case "May": time[1] = "05"; break;
            case "Jun": time[1] = "06"; break;
            case "Jul": time[1] = "07"; break;
            case "Aug": time[1] = "08"; break;
            case "Sep": time[1] = "09"; break;
            case "Oct": time[1] = "10"; break;
            case "Nov": time[1] = "11"; break;
            case "Dec": time[1] = "12"; break;
        }
        data[1] = time[2] + "-" + time[1] + "-" + time[0] + " " + time[3] + ":" + time[4] + ":" + time[5];
        data[3] = data[3].substring(0, data[3].length() - 1);
        return data;
    }

    // Splits one comma-separated input line into its six fields.
    private static String[] change(String row) {
        char[] str1 = row.toCharArray();
        String[] data = new String[6];
        int j = 0;
        int k = 0;
        for (int i = 0; i < str1.length; i++) {
            if (str1[i] == ',') {
                data[k] = row.substring(j, i);
                j = i + 1;
                k++;
            }
        }
        data[k] = row.substring(j);
        return data;
    }
}
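As an aside, the manual splitting and month-name switch in chage() could be replaced with java.text.SimpleDateFormat. A minimal sketch, assuming timestamps of the form day/Mon/year:hh:mm:ss (the sample value and class name here are illustrative, not from the original code):

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Locale;

public class TimeFormat {
    public static void main(String[] args) throws ParseException {
        // Parse the log-style timestamp; Locale.US so "Jan", "Feb", ... are recognized.
        SimpleDateFormat in = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.US);
        // Emit it in the "yyyy-MM-dd HH:mm:ss" form produced by chage().
        SimpleDateFormat out = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        System.out.println(out.format(in.parse("30/Jan/2019:17:38:20")));
        // prints: 2019-01-30 17:38:20
    }
}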
Code for loading the data into the database:
package Hive;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Statement;

import org.apache.log4j.Logger;

public class Data {

    private static String driverName = "org.apache.hive.jdbc.HiveDriver";
    private static String url = "jdbc:hive2://192.168.43.18:10000/text";
    private static String user = "hive";
    private static String password = "hive";
    private static String sql;
    private static final Logger log = Logger.getLogger(Data.class);

    public static void main(String[] args) {
        try {
            Class.forName(driverName);
            Connection conn = DriverManager.getConnection(url, user, password);
            Statement stmt = conn.createStatement();
            // Load the cleaned local file into the Hive table "data".
            sql = "load data local inpath '/home/hadoop/下载/result2.txt' overwrite into table data";
            System.out.println("Running: " + sql);
            boolean f = stmt.execute(sql);
            System.out.println("result: " + f);
            conn.close();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
            log.error(driverName + " not found!", e);
            System.exit(1);
        } catch (SQLException e) {
            e.printStackTrace();
            log.error("Connection error!", e);
            System.exit(1);
        }
    }
}
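One thing the LOAD DATA statement takes for granted: the table data must already exist in the text database, with a delimiter matching the cleaned file. A sketch of creating such a table over the same JDBC connection; the six column names and STRING types are assumptions (the real schema isn't shown here), while the comma delimiter and six-field layout follow from the cleaning code:

package Hive;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

// Sketch: create the target table before running LOAD DATA.
// Column names f1..f6 are hypothetical placeholders.
public class CreateTable {
    public static void main(String[] args) throws Exception {
        Class.forName("org.apache.hive.jdbc.HiveDriver");
        Connection conn = DriverManager.getConnection(
                "jdbc:hive2://192.168.43.18:10000/text", "hive", "hive");
        Statement stmt = conn.createStatement();
        stmt.execute("create table if not exists data ("
                + "f1 string, f2 string, f3 string, f4 string, f5 string, f6 string) "
                + "row format delimited fields terminated by ','");
        conn.close();
    }
}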
Screenshot: (omitted)
Problems encountered so far:
1. With BufferedWriter, part of the data at the very end never gets written to the output file. This is the classic symptom of a buffer that is never flushed: the original code never called bw.close(), which the cleaned-up version above now does. Still something to verify properly later.
2. I'm still not very familiar with MapReduce or the principles behind it.
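Since the assignment ultimately calls for MapReduce, it's worth noting that this kind of line-by-line cleaning maps naturally onto a map-only job: each map() call receives one line, cleans it, and emits it, with no reducer. Below is a minimal sketch under that assumption; CleanJob, CleanMapper, and the clean() placeholder are illustrative names, and clean() would hold the change()/chage() logic from above.

package Data;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Sketch of the same cleaning expressed as a map-only MapReduce job.
public class CleanJob {

    public static class CleanMapper
            extends Mapper<LongWritable, Text, Text, NullWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Each input line is cleaned and written straight to the output.
            String cleaned = clean(value.toString());
            context.write(new Text(cleaned), NullWritable.get());
        }

        private String clean(String row) {
            // Placeholder: reuse the comma splitting and timestamp
            // rewriting from the plain-Java version above.
            return row;
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "data clean");
        job.setJarByClass(CleanJob.class);
        job.setMapperClass(CleanMapper.class);
        job.setNumReduceTasks(0);  // map-only: mapper output is the final output
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}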