对爬取到的数据进行清洗，按照一定的规则把“脏数据”“洗掉”。
数据清洗是对数据进行重新审查和校验的过程，目的在于删除重复信息、纠正存在的错误，并保证数据一致性。
import java.io.BufferedReader; import java.io.InputStreamReader; import java.io.IOException; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.JobContext; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.lib.input.FileSplit; public class FileRecordReader extends RecordReader<text,text>{ private FileSplit fileSplit; private JobContext jobContext; private Text currentKey = new Text(); private Text currentValue = new Text(); private boolean finishConverting = false; @Override public void close() throws IOException { @Override public Text getCurrentKey() throws IOException, InterruptedException { return currentKey; } @Override public Text getCurrentValue() throws IOException, InterruptedException { return currentValue; } @Override public float getProgress() throws IOException, InterruptedException { float progress = 0; if(finishConverting){ progress = 1; } return progress; } @Override public void initialize(InputSplit arg0, TaskAttemptContext arg1) throws IOException, InterruptedException { this.fileSplit = (FileSplit) arg0; this.jobContext = arg1; String filename = fileSplit.getPath().getName(); this.currentKey = new Text(filename); } @Override public boolean nextKeyValue() throws IOException, InterruptedException { if(!finishConverting){ int len = (int)fileSplit.getLength(); // byte[] content = new byte[len]; Path file = fileSplit.getPath(); FileSystem fs = file.getFileSystem(jobContext.getConfiguration()); FSDataInputStream in = fs.open(file); //根据实际网页的编码格式修改 // BufferedReader br = new BufferedReader(new InputStreamReader(in,"gbk")); BufferedReader br = new BufferedReader(new InputStreamReader(in,"utf-8")); String line=""; String total=""; while((line= br.readLine())!= null){ total 
=total+line+"\n"; } br.close(); in.close(); fs.close(); currentValue = new Text(total); finishConverting = true; return true; } return false; } }
完整代码:
import java.io.IOException; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import cn.wanghaomiao.xpath.exception.XpathSyntaxErrorException; import cn.wanghaomiao.xpath.model.JXDocument; public class QingxiHtml { public static class doMapper extends Mapper<object, text,="" text=""> { public static final IntWritable one = new IntWritable(1); public static Text word = new Text(); @Override protected void map(Object key, Text value, Context context) throws IOException, InterruptedException { String htmlStr = value.toString(); JXDocument Document = new JXDocument(htmlStr); if (htmlStr.indexOf("mail_track_h2") > 0) { try { String leixing = Document .sel("//span[@class='font12 gray']/a[2]/text()") .get(0).toString(); String biaoti = Document .sel("//h2[@class='mail_track_h2']/text()").get(0) .toString(); String leixinren = Document .sel("//p[@class='font12 gray time_mail']/span[1]/text()") .get(0).toString().replaceAll("来信人:", ""); String shijian = Document .sel("//p[@class='font12 gray time_mail']/span[2]/text()") .get(0).toString().replaceAll("时间:", ""); String number = Document .sel("//p[@class='font12 gray time_mail']/span[3]/allText()") .get(0).toString().replace("网友同问: ", "").replace("网友评价数: ", ""); String problem = Document .sel("//span[@class='font14 mail_problem']/text()") .get(0).toString(); if (htmlStr.indexOf("margin-bottom:31px") > 0) { String offic = Document .sel("//div[@class='con_left float_left']/div[2]/span[1]/text()") .get(0).toString(); String officpt = Document .sel("//div[@class='con_left float_left']/div[2]/span[2]/text()") .get(0).toString(); String officp = Document .sel("//div[@class='con_left float_left']/div[2]/p[1]/text()") .get(0).toString(); String dataout = leixing + "\t" + 
biaoti + "\t" + leixinren + "\t" + shijian + "\t" + number + "\t" + problem + "\t" + offic + "\t" + officpt + "\t"+ officp; System.out.println(dataout); Text oneLines = new Text(dataout); context.write(oneLines, new Text("")); } else { String dataout = leixing + "\t" + biaoti + "\t" + leixinren + "\t" + shijian + "\t" + number + "\t" + problem; System.out.println(dataout); Text oneLines = new Text(dataout); context.write(oneLines, new Text("")); } } catch (XpathSyntaxErrorException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } } public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { Job job = Job.getInstance(); job.setJobName("QingxiHtml"); job.setJarByClass(QingxiHtml.class); job.setMapperClass(doMapper.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setInputFormatClass(FileInput.class); Path in = new Path("hdfs://localhost:9000//myedu2/in"); Path out = new Path("hdfs://localhost:9000//myedu2/out/1"); FileInputFormat.addInputPath(job, in); FileOutputFormat.setOutputPath(job, out); System.exit(job.waitForCompletion(true) ? 0 : 1); } }