Java正则匹配之 淘宝信息爬取

爬取页面分析:

1)每件商品以 <div class="item J_MouserOnverReq 开头,以 </div></div></div></div> 结尾(对应下文 App 类中的 regex)

 

 2) 商品中有用的数据为(对应下文 App 类中 regex2 的 7 个捕获组):trace-nid、trace-price、img 的 src、img 的 alt、deal-cnt、location、data-nick

 

 

 

 

 

 

一、工具类 DataCenter

package cn.kgc.regex.file;

import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class DataCenter {
    // 每一个商品中有一个list的数据
    private List<List<String>> list = new ArrayList<>();

    // 文件解析到String
    public StringBuilder combine(String file){
        BufferedReader br = null;
        StringBuilder builder = new StringBuilder();
        try {
            br = new BufferedReader(new FileReader(file));
            String line = null;
            while (null != (line=br.readLine())){
                if((line=line.trim()).length()==0){
                    continue;
                }
                builder.append(line);
            }
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        } finally {
            if (null != br) {
                try {
                    br.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return builder;
    }

    // 页面筛选之 商品筛选
    public List<String> split(String regex, String src){
        List<String> list = new ArrayList<>();
        Matcher matcher = Pattern.compile(regex).matcher(src);
        while (matcher.find()){
            list.add(matcher.group());
        }
        return list;
    }

    // 商品解析
    public void parse(String regex,String src){
        List<String> list = new ArrayList<>(4);
        Matcher matcher = Pattern.compile(regex).matcher(src);
        if (matcher.find()) {
            list.add(matcher.group(1));
            list.add(matcher.group(2));
            list.add(matcher.group(3));
            list.add(matcher.group(4));
            list.add(matcher.group(5));
            list.add(matcher.group(6));
            list.add(matcher.group(7));
        }
        this.list.add(list);
    }

    // 当作状态值,为了判断是否解析结束
    public int getCount(){
        return list.size();
    }

    // 遍历输出展示
    public void foreach(){
        for (List<String> strs : list) {
            for (String str : strs) {
                System.out.print(str+"\t");
            }
            System.out.println();
        }
    }
}

二、App类

package cn.kgc.regex;

import cn.kgc.regex.file.DataCenter;

import java.util.Iterator;
import java.util.List;

/**
 * Command-line entry point: reads a saved product-listing HTML page, cuts it into one fragment
 * per product, extracts the fields of each fragment on its own thread, and prints the result.
 *
 * <p>The page path may be given as the first command-line argument; without arguments the
 * built-in default path is used.
 */
public class App 
{
    // Page that is read when no path is passed on the command line.
    private static final String DEFAULT_PAGE =
            "D:\\JAVA学习资料\\Java正则爬取信息\\regexdemo\\src\\main\\java\\cn\\kgc\\regex\\file\\tb-shirt.html";

    /**
     * Runs the extraction.
     *
     * @param args optional; {@code args[0]} overrides the default page path
     * @throws InterruptedException if the main thread is interrupted while waiting for the
     *                              worker threads
     */
    public static void main( String[] args ) throws InterruptedException {
        DataCenter dc = new DataCenter();
        String path = args.length > 0 ? args[0] : DEFAULT_PAGE;

        // 1. Read the whole page into one string. combine() returns null when reading fails.
        StringBuilder page = dc.combine(path);
        if (page == null) {
            System.err.println("Unable to read page: " + path);
            return;
        }
        String src = page.toString();

        // 2. regex: the markup that wraps a single product.
        final String regex = "<div class=\"item J_MouserOnverReq .*?</div></div></div></div>";
        //    regex2: each (.*?) group captures one field of a product.
        final String regex2 = ".*?trace-nid=\"(.*?)\" .*? trace-price=\"(.*?)\" .*?><img.*?src=\"(.*?)\".*?alt=\"(.*?)\".*?<div class=\"deal-cnt\">(.*?)</div>.*?<div class=\"location\">(.*?)</div>.*?data-nick=\"(.*?)\".*?";

        // One list element per product fragment.
        List<String> split = dc.split(regex, src);

        // 3. Parse every fragment on its own thread.
        Thread[] workers = new Thread[split.size()];
        int next = 0;
        for (String fragment : split) {
            Thread worker = new Thread(() -> dc.parse(regex2, fragment));
            workers[next++] = worker;
            worker.start();
        }

        // 4. join() returns even if a worker died with an exception, so this cannot hang the
        //    way polling a record counter can.
        for (Thread worker : workers) {
            worker.join();
        }

        // 5. All workers are done: print the collected records.
        dc.foreach();
    }
}

 

posted @ 2020-09-08 19:48  PEAR2020  阅读(588)  评论(0编辑  收藏  举报