天下之事,必先处之难,而后易之。
君临
知我者谓我心忧,不知我者谓我何求

这里从车商网上抓取数据,请保持良好的职业道德,不要将数据用于商业用途。工信部官网有汽车方面的公告目录,那里有最全的 PDF 或 Word 数据;鉴于 Word 和 PDF 的解析既繁琐又耗时,我暂时用这个网站的数据进行测试。

Spider主要代码:

package tk.mybatis.springboot.util;

import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.mysql.jdbc.Connection;
import com.mysql.jdbc.PreparedStatement;
import tk.mybatis.springboot.model.AutobatchDirectory;
import tk.mybatis.springboot.service.AutobatchDirectoryService;

public class AutoBatchSpider {

    // Site root that relative links scraped from the pages are prefixed with.
    // Original source of the batch list: http://www.cn357.com/notice_list/
    public static final String web = "http://www.cn357.com";
    
    // Timeout handed to every Jsoup connection in this class via timeout(...);
    // jsoup interprets the value as milliseconds.
    private static final int timeOut=30000;
    
    /**
     * Crawls every announcement batch linked from the list page and stores each
     * parsed vehicle record through {@link #saveByJdbc(AutobatchDirectory)}.
     *
     * <p>Batches are processed in the reverse of their order on the list page.
     * Items without a detail link and detail pages whose layout is not recognised
     * (the parser returns {@code null}) are skipped. A failed insert is reported
     * and the crawl continues.
     *
     * @param listurl URL of the batch list page (the page containing the element
     *                with id {@code noticeList})
     * @throws InterruptedException if the thread is interrupted during one of the
     *                              pauses made by the page fetchers
     * @throws IOException          if a page cannot be fetched or the list page
     *                              has no {@code noticeList} element
     */
    public static void getBatchFromUrl(String listurl) throws InterruptedException, IOException {
        List<String> urls = collectBatchUrls(listurl);
        System.out.println("总批次数:" + urls.size());
        for (int j = 0, k = urls.size(); j < k; j++) {
            String url = urls.get(j);
            printBatchProgress(url, j, k);
            for (Map<String, Object> map : getDetailsPageFromBatchItems(url)) {
                String detailUrl = (String) map.get("href");
                if (detailUrl == null) {
                    // An item without a link cannot be fetched.
                    continue;
                }
                AutobatchDirectory autobatchDirectory = getDetailOfAutoBatchInfo(detailUrl);
                if (autobatchDirectory == null) {
                    // Unrecognised page layout: nothing to store.
                    continue;
                }
                try {
                    saveByJdbc(autobatchDirectory);
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Crawls every announcement batch linked from the list page and hands each
     * parsed vehicle record to {@code autobatchDirectoryService.save(...)}.
     *
     * <p>Same traversal as {@link #getBatchFromUrl(String)}; only the persistence
     * call differs. Items without a detail link and unparseable detail pages are
     * skipped instead of being passed to the service as {@code null}.
     *
     * @param autobatchDirectoryService service that receives every parsed record
     * @param listurl                   URL of the batch list page
     * @throws InterruptedException if the thread is interrupted during one of the
     *                              pauses made by the page fetchers
     * @throws IOException          if a page cannot be fetched or the list page
     *                              has no {@code noticeList} element
     */
    public static void getBatchFromUrl(AutobatchDirectoryService autobatchDirectoryService, String listurl)
            throws InterruptedException, IOException {
        List<String> urls = collectBatchUrls(listurl);
        System.out.println("总批次数:" + urls.size());
        for (int j = 0, k = urls.size(); j < k; j++) {
            String url = urls.get(j);
            printBatchProgress(url, j, k);
            for (Map<String, Object> map : getDetailsPageFromBatchItems(url)) {
                String detailUrl = (String) map.get("href");
                if (detailUrl == null) {
                    continue;
                }
                AutobatchDirectory autobatchDirectory = getDetailOfAutoBatchInfo(detailUrl);
                if (autobatchDirectory != null) {
                    autobatchDirectoryService.save(autobatchDirectory);
                }
            }
        }
    }

    /**
     * Fetches the list page and returns the absolute URL of every batch it links
     * to, in reverse page order. Each link text and URL is echoed to stdout.
     */
    private static List<String> collectBatchUrls(String listurl) throws IOException {
        Document doc = Jsoup.connect(listurl).userAgent("Mozilla/5.0").timeout(timeOut).get();
        Element batchElement = doc.getElementById("noticeList");
        if (batchElement == null) {
            throw new IOException("Element with id 'noticeList' not found in " + listurl);
        }
        List<String> urls = new ArrayList<String>();
        for (Element element : batchElement.children()) {
            String href = element.attr("href");
            if (href.isEmpty()) {
                // Not a link; prefixing it would only yield the bare site root.
                continue;
            }
            String batchUrl = resolveBatchHref(href);
            System.out.println(element.text() + "\t\t" + batchUrl);
            urls.add(batchUrl);
        }
        // Reverse the page order before crawling.
        Collections.reverse(urls);
        return urls;
    }

    /** Returns {@code href} unchanged when it is already absolute, otherwise prefixes the site root. */
    private static String resolveBatchHref(String href) {
        if (href.startsWith("http://") || href.startsWith("https://")) {
            return href;
        }
        return web + href;
    }

    /** Prints the batch label (second "_"-separated URL part, or the URL itself) and the progress fraction. */
    private static void printBatchProgress(String url, int j, int k) {
        String[] parts = url.split("_");
        String batchNo = parts.length > 1 ? parts[1] : url;
        // Integer percentage first, then scaled to a two-decimal fraction (0.0 - 0.99).
        System.out.println("第" + batchNo + "批数据获取进度:" + (j * 100 / k) / 100.0);
    }
    
    /**
     * Collects the detail-page links of one announcement batch across all of its
     * result pages.
     *
     * <p>The batch page is fetched once to read the highest page number from the
     * elements whose class is {@code page}; then {@code url + "_" + n} is fetched
     * for every page {@code n}, pausing two seconds before each request. Every
     * {@code noticeLotItem} block that contains a link yields one map with the keys
     * {@code "href"} (absolute URL), {@code "href_text"} (link text) and, when a
     * following child of class {@code c} exists, {@code "type_text"}. Blocks
     * without a usable link are skipped, so every returned map has a non-null
     * {@code "href"}.
     *
     * @param url URL of the batch page
     * @return one map per linked item, in page order; never {@code null}
     * @throws InterruptedException if the thread is interrupted while pausing
     * @throws IOException          if a page cannot be fetched
     */
    public static List<Map<String, Object>> getDetailsPageFromBatchItems(String url)
            throws InterruptedException, IOException {

        Document doc = Jsoup.connect(url).userAgent("Mozilla/5.0").timeout(timeOut).get();
        Thread.sleep(1000);

        /************ read paging information ************/
        int totalBatchPage = 1;
        for (Element pager : doc.getElementsByAttributeValue("class", "page")) {
            for (Element link : pager.getElementsByTag("a")) {
                String label = link.text().trim();
                // Only plain page numbers count; labels such as "下一页"/"上一页"
                // or any other non-numeric text are ignored instead of throwing.
                if (label.matches("\\d{1,9}")) {
                    totalBatchPage = Math.max(totalBatchPage, Integer.parseInt(label));
                }
            }
        }

        List<Map<String, Object>> list = new ArrayList<Map<String, Object>>();
        /************ read every page ************/
        for (int i = 1; i <= totalBatchPage; i++) {
            System.out.println("分页数据获取进度:" + (i * 100 / totalBatchPage) / 100.0);
            Thread.sleep(2000); // wait two seconds before each request
            doc = Jsoup.connect(url + "_" + i).userAgent("Mozilla/5.0").timeout(timeOut).get();
            for (Element div : doc.getElementsByAttributeValue("class", "noticeLotItem")) {
                // Stays null until a usable link is found in this block.
                Map<String, Object> item = null;
                for (Element child : div.children()) {
                    String cssClass = child.attr("class");
                    if (cssClass.equals("m")) {
                        Elements links = child.getElementsByAttribute("href");
                        if (links.isEmpty()) {
                            continue;
                        }
                        Element a = links.get(0);
                        String href = a.attr("href");
                        if (href.isEmpty()) {
                            continue;
                        }
                        item = new HashMap<String, Object>();
                        boolean absolute = href.startsWith("http://") || href.startsWith("https://");
                        item.put("href", absolute ? href : web + href);
                        item.put("href_text", a.text());
                    } else if (cssClass.equals("c") && item != null) {
                        item.put("type_text", child.text());
                    }
                }
                if (item != null) {
                    list.add(item);
                }
            }
        }
        System.out.println("Total rows:" + list.size());
        return list;
    }

    /**
     * Fetches one vehicle detail page and parses its parameter table into an
     * {@link AutobatchDirectory}.
     *
     * <p>The table is the first element whose class is {@code noticeAttr mt5}.
     * Two layouts are understood: 22 rows (no engine parameters) and 23 rows
     * (row 22 holds a nested engine table). Rows 1-21 each carry two values, and
     * the last row carries the remark. The method pauses two seconds before the
     * request.
     *
     * @param url URL of the detail page
     * @return the parsed record, or {@code null} when the page has no parameter
     *         table or the table has neither 22 nor 23 rows
     * @throws IOException          if the page cannot be fetched
     * @throws InterruptedException if the thread is interrupted while pausing
     */
    public static AutobatchDirectory getDetailOfAutoBatchInfo(String url) throws IOException, InterruptedException {
        Thread.sleep(2000);
        Document doc = Jsoup.connect(url).userAgent("Mozilla/5.0").timeout(timeOut).get();

        Elements attrTables = doc.getElementsByAttributeValue("class", "noticeAttr mt5");
        if (attrTables.isEmpty()) {
            System.err.println("No parameter table found, skipping: " + url);
            return null;
        }
        Elements tableBodies = attrTables.get(0).getElementsByTag("tbody");
        if (tableBodies.isEmpty()) {
            System.err.println("Parameter table has no body, skipping: " + url);
            return null;
        }
        Elements tableElements = tableBodies.get(0).children();
        int rowCount = tableElements.size();

        boolean hasEngineRow;
        if (rowCount == 22) {
            // 22 rows: no engine parameters
            hasEngineRow = false;
            System.out.println("......22行表示没有发动机参数.....解析中.......");
        } else if (rowCount == 23) {
            // 23 rows: engine parameters present in row 22
            hasEngineRow = true;
            System.out.println("......23行表示有发动机参数.....解析中.......");
        } else {
            return null;
        }

        AutobatchDirectory autobatchDirectory = new AutobatchDirectory();
        // Rows 1-21 are identical in both layouts: two label/value pairs each.
        for (int i = 1; i <= 21; i++) {
            setPropertyToObject(autobatchDirectory, i, tableElements.get(i - 1));
        }
        if (hasEngineRow) {
            applyEngineRow(autobatchDirectory, tableElements.get(21));
        }
        // The last row holds the remark in its second cell.
        String remark = tableElements.get(rowCount - 1).children().get(1).text();
        autobatchDirectory.setfRemark(remark);
        System.out.println(remark);
        return autobatchDirectory;
    }

    /**
     * Copies the engine values from the nested table of the engine row: the cells
     * of the nested table's second row are, in order, engine model, engine
     * manufacturer, engine trademark, displacement and power.
     */
    private static void applyEngineRow(AutobatchDirectory autobatchDirectory, Element engineRow) {
        Element engineTable = engineRow.getElementsByTag("table").get(0);
        Elements cells = engineTable.getElementsByTag("tbody").get(0).children().get(1).children();
        for (int idx = 0; idx < cells.size() && idx < 5; idx++) {
            String text = cells.get(idx).text();
            switch (idx) {
            case 0:
                autobatchDirectory.setfEngineType(text);
                break;
            case 1:
                autobatchDirectory.setfEnginePro(text);
                break;
            case 2:
                autobatchDirectory.setfEngineTrademark(text);
                break;
            case 3:
                autobatchDirectory.setfOutputVolume(text);
                break;
            default:
                autobatchDirectory.setfPower(text);
                break;
            }
        }
    }

    /**
     * Copies the two values of one parameter-table row into the record.
     *
     * <p>The second cell of the row feeds the first property of the pair and the
     * fourth cell feeds the second one; both texts are also echoed to stdout,
     * separated by a space. Row numbers outside 1-21 only produce the echo.
     *
     * @param autobatchDirectory record being filled
     * @param i                  1-based row number in the parameter table
     * @param element            the table row
     */
    private static void setPropertyToObject(AutobatchDirectory autobatchDirectory, int i, Element element) {
        Elements cells = element.children();
        final String first = cells.get(1).text();
        final String second = cells.get(3).text();
        System.out.println(first + " " + second);

        switch (i) {
        case 1: // announcement model | announcement batch
            autobatchDirectory.setfAnnouType(first);
            autobatchDirectory.setfAnnouBatch(second);
            break;
        case 2: // brand | vehicle type
            autobatchDirectory.setfVehicleBrand(first);
            autobatchDirectory.setfVehicleType(second);
            break;
        case 3: // rated mass (e.g. 32000,32700) | total mass
            autobatchDirectory.setfMaxMass(first);
            autobatchDirectory.setfTotalMass(second);
            break;
        case 4: // curb mass (e.g. 8000,7300) | fuel type
            autobatchDirectory.setfWholeMass(first);
            autobatchDirectory.setfFuelType(second);
            break;
        case 5: // emission standard | number of axles
            autobatchDirectory.setfBlowoffStandard(first);
            autobatchDirectory.setfAxleNumber(second);
            break;
        case 6: // wheelbase (e.g. 7250+1310+1310) | axle load
            autobatchDirectory.setfWheelbase(first);
            autobatchDirectory.setfAxleWeight(second);
            break;
        case 7: // leaf-spring count (e.g. -/8/8/8) | tyre count
            autobatchDirectory.setfSpringNumber(first);
            autobatchDirectory.setfTyreNumber(second);
            break;
        case 8: // tyre size (e.g. 11.00R20 12PR) | approach/departure angle
            autobatchDirectory.setfTyreSize(first);
            autobatchDirectory.setfDepartureAngle(second);
            break;
        case 9: // front/rear overhang (e.g. -/2080) | front track
            autobatchDirectory.setfFrearSuspension(first);
            autobatchDirectory.setfFrontGauge(second);
            break;
        case 10: // rear track (e.g. 1830/1830/1830) | identification (VIN) code
            autobatchDirectory.setfBackGauge(first);
            autobatchDirectory.setfVinCode(second);
            break;
        case 11: // overall length (e.g. 13000) | overall width
            autobatchDirectory.setfVehicleLength(first);
            autobatchDirectory.setfVehicleWidth(second);
            break;
        case 12: // overall height | cargo box length
            autobatchDirectory.setfVehicleHeight(first);
            autobatchDirectory.setfCargoLength(second);
            break;
        case 13: // cargo box width | cargo box height
            autobatchDirectory.setfCargoWidth(first);
            autobatchDirectory.setfCargoHeight(second);
            break;
        case 14: // maximum speed | rated passenger count
            autobatchDirectory.setfMaxSpeed(first);
            autobatchDirectory.setfMaxPassenger(second);
            break;
        case 15: // cab seating capacity | steering type
            autobatchDirectory.setfCabNumber(first);
            autobatchDirectory.setfSteeringType(second);
            break;
        case 16: // permitted trailer total mass | load mass utilisation factor
            autobatchDirectory.setfTotalMassTrailer(first);
            autobatchDirectory.setfLoadMassFactor(second);
            break;
        case 17: // maximum semi-trailer saddle load (e.g. 16000,16150) | enterprise name
            autobatchDirectory.setfMaxSemitrailer(first);
            autobatchDirectory.setfEnterpriseName(second);
            break;
        case 18: // enterprise address | phone number
            autobatchDirectory.setfEnterpriseAddress(first);
            autobatchDirectory.setfEnterprisePhone(second);
            break;
        case 19: // fax number | postcode
            autobatchDirectory.setfEnterpriseFax(first);
            autobatchDirectory.setfPostcode(second);
            break;
        case 20: // chassis 1 | chassis 2
            autobatchDirectory.setfChassisOne(first);
            autobatchDirectory.setfChassisTwo(second);
            break;
        case 21: // chassis 3 | chassis 4
            autobatchDirectory.setfChassisThree(first);
            autobatchDirectory.setfChassisFour(second);
            break;
        default:
            // Any other row number carries no mapped property.
            break;
        }
    }

    /**
     * Inserts one record into the {@code autobatch_directory} table through a
     * connection obtained from {@code MycatJdbc.getConnection()}.
     *
     * <p>The 48 column values are bound in the order of the column list. The
     * engine fields and the remark are written as an empty string when they are
     * {@code null}; all other values are bound as they are. Any failure while
     * preparing or executing the statement is printed and not rethrown, so one
     * bad record does not stop a running crawl. A {@code null} record is reported
     * and ignored.
     *
     * @param autobatchDirectory record to insert; {@code null} is a no-op
     * @throws SQLException declared for callers; within this method only the
     *                      connection lookup happens outside the internal handling
     */
    public static void saveByJdbc(AutobatchDirectory autobatchDirectory) throws SQLException {
        if (autobatchDirectory == null) {
            System.err.println("saveByJdbc: record is null, nothing to insert");
            return;
        }
        String sql = "insert into autobatch_directory ("
                + "F_ANNOU_TYPE, F_ANNOU_BATCH, F_VEHICLE_BRAND, F_VEHICLE_TYPE,"
                + "F_MAX_MASS, F_TOTAL_MASS, F_WHOLE_MASS, F_FUEL_TYPE, "
                + "F_BLOWOFF_STANDARD, F_AXLE_NUMBER, F_WHEELBASE,  F_AXLE_WEIGHT,"
                + "F_SPRING_NUMBER, F_TYRE_NUMBER,F_TYRE_SIZE, F_DEPARTURE_ANGLE, "
                + "F_FREAR_SUSPENSION, F_FRONT_GAUGE, F_BACK_GAUGE, F_VIN_CODE, "
                + "F_VEHICLE_LENGTH, F_VEHICLE_WIDTH, F_VEHICLE_HEIGHT,F_CARGO_LENGTH,"
                + "F_CARGO_WIDTH, F_CARGO_HEIGHT,F_MAX_SPEED, F_MAX_PASSENGER,"
                + "F_CAB_NUMBER, F_STEERING_TYPE, F_TOTAL_MASS_TRAILER,"
                + "F_LOAD_MASS_FACTOR, F_MAX_SEMITRAILER, F_ENTERPRISE_NAME, F_ENTERPRISE_ADDRESS, "
                + "F_ENTERPRISE_PHONE, F_ENTERPRISE_FAX, F_POSTCODE, F_CHASSIS_ONE,"
                + "F_CHASSIS_TWO, F_CHASSIS_THREE, F_CHASSIS_FOUR, F_ENGINE_TYPE,"
                + "F_ENGINE_PRO,  F_ENGINE_TRADEMARK, F_OUTPUT_VOLUME, F_POWER, F_REMARK) values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,"
                        + "?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)";

        // Values in exactly the order of the column list above; binding them in
        // a loop keeps placeholder numbers and columns from drifting apart.
        String[] values = {
                autobatchDirectory.getfAnnouType(),
                autobatchDirectory.getfAnnouBatch(),
                autobatchDirectory.getfVehicleBrand(),
                autobatchDirectory.getfVehicleType(),
                autobatchDirectory.getfMaxMass(),
                autobatchDirectory.getfTotalMass(),
                autobatchDirectory.getfWholeMass(),
                autobatchDirectory.getfFuelType(),
                autobatchDirectory.getfBlowoffStandard(),
                autobatchDirectory.getfAxleNumber(),
                autobatchDirectory.getfWheelbase(),
                autobatchDirectory.getfAxleWeight(),
                autobatchDirectory.getfSpringNumber(),
                autobatchDirectory.getfTyreNumber(),
                autobatchDirectory.getfTyreSize(),
                autobatchDirectory.getfDepartureAngle(),
                autobatchDirectory.getfFrearSuspension(),
                autobatchDirectory.getfFrontGauge(),
                autobatchDirectory.getfBackGauge(),
                autobatchDirectory.getfVinCode(),
                autobatchDirectory.getfVehicleLength(),
                autobatchDirectory.getfVehicleWidth(),
                autobatchDirectory.getfVehicleHeight(),
                autobatchDirectory.getfCargoLength(),
                autobatchDirectory.getfCargoWidth(),
                autobatchDirectory.getfCargoHeight(),
                autobatchDirectory.getfMaxSpeed(),
                autobatchDirectory.getfMaxPassenger(),
                autobatchDirectory.getfCabNumber(),
                autobatchDirectory.getfSteeringType(),
                autobatchDirectory.getfTotalMassTrailer(),
                autobatchDirectory.getfLoadMassFactor(),
                autobatchDirectory.getfMaxSemitrailer(),
                autobatchDirectory.getfEnterpriseName(),
                autobatchDirectory.getfEnterpriseAddress(),
                autobatchDirectory.getfEnterprisePhone(),
                autobatchDirectory.getfEnterpriseFax(),
                autobatchDirectory.getfPostcode(),
                autobatchDirectory.getfChassisOne(),
                autobatchDirectory.getfChassisTwo(),
                autobatchDirectory.getfChassisThree(),
                autobatchDirectory.getfChassisFour(),
                // Engine fields and remark are absent on some pages: store "".
                nullToEmpty(autobatchDirectory.getfEngineType()),
                nullToEmpty(autobatchDirectory.getfEnginePro()),
                nullToEmpty(autobatchDirectory.getfEngineTrademark()),
                nullToEmpty(autobatchDirectory.getfOutputVolume()),
                nullToEmpty(autobatchDirectory.getfPower()),
                nullToEmpty(autobatchDirectory.getfRemark())
        };

        // Standard JDBC interfaces are sufficient here; no driver-specific cast.
        // NOTE(review): the connection is not closed in this method - presumably
        // MycatJdbc owns and reuses it; confirm, otherwise it leaks per insert.
        java.sql.Connection conn = MycatJdbc.getConnection();
        java.sql.PreparedStatement pstm = null;
        try {
            System.out.println(sql);
            pstm = conn.prepareStatement(sql);
            for (int idx = 0; idx < values.length; idx++) {
                pstm.setString(idx + 1, values[idx]); // JDBC parameters are 1-based
            }
            pstm.executeUpdate();
        } catch (Exception e) {
            // Best effort: report the failed insert and let the caller carry on.
            e.printStackTrace();
        } finally {
            if (pstm != null) {
                try {
                    pstm.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /** Returns {@code value}, or the empty string when {@code value} is {@code null}. */
    private static String nullToEmpty(String value) {
        return value == null ? "" : value;
    }
}

代码没什么难度,都是基本的元素解析。

posted on 2017-08-04 13:20  boonya  阅读(502)  评论(0)  编辑  收藏  举报

我有佳人隔窗而居,今有伊人明月之畔。
轻歌柔情冰壶之浣,涓涓清流梦入云端。
美人如娇温雅悠婉,目遇赏阅适而自欣。
百草层叠疏而有致,此情此思怀彼佳人。
念所思之唯心叩之,踽踽彳亍寤寐思之。
行云如风逝而复归,佳人一去莫知可回?
深闺冷瘦独自徘徊,处处明灯影还如只。
推窗见月疑是归人,阑珊灯火托手思忖。
庐居闲客而好品茗,斟茶徐徐漫漫生烟。

我有佳人在水之畔,瓮载渔舟浣纱归还。
明月相照月色还低,浅近芦苇深深如钿。
庐山秋月如美人衣,画堂春阁香气靡靡。
秋意幽笃残粉摇曳,轻轻如诉画中蝴蝶。
泾水潺潺取尔浇园,暮色黄昏如沐佳人。
青丝撩弄长裙翩翩,彩蝶飞舞执子手腕。
香带丝缕缓缓在肩,柔美体肤寸寸爱怜。
如水之殇美玉成欢,我有佳人清新如兰。
伊人在水我在一边,远远相望不可亵玩。