这里从车商网上抓取数据,请保持良好的职业道德,不要将数据用于商业用途。工信部官网有汽车方面的公告目录,那里有最全的 PDF 或 Word 数据;鉴于 Word 和 PDF 解析繁琐且耗时,我暂时用这个网站的数据进行测试。
Spider主要代码:
package tk.mybatis.springboot.util;

import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.mysql.jdbc.Connection;
import com.mysql.jdbc.PreparedStatement;

import tk.mybatis.springboot.model.AutobatchDirectory;
import tk.mybatis.springboot.service.AutobatchDirectoryService;

/**
 * Crawls vehicle announcement batches from {@link #web} and stores every parsed
 * record, either through plain JDBC or through an {@link AutobatchDirectoryService}.
 *
 * <p>The crawl is sequential and sleeps between requests (see the
 * {@code Thread.sleep} calls) so that the remote site is not hammered.
 */
public class AutoBatchSpider {

    // Original source: http://www.cn357.com/notice_list/
    public static final String web = "http://www.cn357.com";

    /** Connect/read timeout in milliseconds passed to jsoup. */
    private static final int timeOut = 30000;

    /** User agent sent with every request. */
    private static final String USER_AGENT = "Mozilla/5.0";

    /** Destination for parsed records; lets both public entry points share one crawl loop. */
    private interface RecordSink {
        void store(AutobatchDirectory record);
    }

    /**
     * Crawls every batch linked from the list page and stores each record via
     * {@link #saveByJdbc(AutobatchDirectory)}. A failed insert is printed and the
     * crawl continues with the next record.
     *
     * @param listurl URL of the page that lists all announcement batches
     * @throws InterruptedException if the thread is interrupted while pausing between requests
     * @throws IOException if a page cannot be fetched or the list page lacks the batch list
     */
    public static void getBatchFromUrl(String listurl) throws InterruptedException, IOException {
        crawl(listurl, new RecordSink() {
            @Override
            public void store(AutobatchDirectory record) {
                try {
                    saveByJdbc(record);
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        });
    }

    /**
     * Crawls every batch linked from the list page and stores each record via
     * {@code autobatchDirectoryService.save(...)}.
     *
     * @param autobatchDirectoryService service used to persist each parsed record
     * @param listurl URL of the page that lists all announcement batches
     * @throws InterruptedException if the thread is interrupted while pausing between requests
     * @throws IOException if a page cannot be fetched or the list page lacks the batch list
     */
    public static void getBatchFromUrl(AutobatchDirectoryService autobatchDirectoryService, String listurl)
            throws InterruptedException, IOException {
        final AutobatchDirectoryService service = autobatchDirectoryService;
        crawl(listurl, new RecordSink() {
            @Override
            public void store(AutobatchDirectory record) {
                service.save(record);
            }
        });
    }

    /** Shared crawl loop: list page -> batch pages -> detail pages -> sink. */
    private static void crawl(String listurl, RecordSink sink) throws InterruptedException, IOException {
        List<String> urls = collectBatchUrls(listurl);
        // Reverse the order in which the batches appear on the list page.
        Collections.reverse(urls);
        System.out.println("总批次数:" + urls.size());

        for (int j = 0, k = urls.size(); j < k; j++) {
            String url = urls.get(j);
            System.out.println("第" + batchLabel(url) + "批数据获取进度:" + (double) Math.round(j * 100 / k) / 100);

            // Collect the detail links of every page of this batch.
            List<Map<String, Object>> list = getDetailsPageFromBatchItems(url);
            for (Map<String, Object> map : list) {
                Object href = map.get("href");
                if (!(href instanceof String)) {
                    // An entry without a link cannot be fetched.
                    continue;
                }
                AutobatchDirectory autobatchDirectory = getDetailOfAutoBatchInfo((String) href);
                if (autobatchDirectory == null) {
                    // The detail page did not have a layout this parser understands.
                    System.out.println("Skipping unparsable detail page: " + href);
                    continue;
                }
                sink.store(autobatchDirectory);
            }
        }
    }

    /** Reads the batch links out of the {@code noticeList} element of the list page. */
    private static List<String> collectBatchUrls(String listurl) throws IOException {
        Document doc = fetch(listurl);
        Element batchElement = doc.getElementById("noticeList");
        if (batchElement == null) {
            throw new IOException("Element with id 'noticeList' not found at " + listurl);
        }

        List<String> urls = new ArrayList<String>();
        for (Element element : batchElement.children()) {
            String href = element.attr("href");
            if ("".equals(href)) {
                // A child without a link would otherwise be queued as the bare site root.
                continue;
            }
            String batchUrl = toAbsoluteUrl(href);
            System.out.println(element.text() + "\t\t" + batchUrl);
            urls.add(batchUrl);
        }
        return urls;
    }

    /** Returns the text after the first underscore of the URL, or the whole URL if there is none. */
    private static String batchLabel(String url) {
        String[] parts = url.split("_");
        return parts.length > 1 ? parts[1] : url;
    }

    /** Prefixes site-relative links with {@link #web}; absolute http/https links pass through. */
    private static String toAbsoluteUrl(String href) {
        if (href.startsWith("http://") || href.startsWith("https://")) {
            return href;
        }
        return web + href;
    }

    /** Fetches and parses one page with the shared user agent and timeout. */
    private static Document fetch(String url) throws IOException {
        return Jsoup.connect(url).userAgent(USER_AGENT).timeout(timeOut).get();
    }

    /**
     * Collects the detail-page entries of every page of one announcement batch.
     *
     * <p>Each returned map has the keys {@code href} (absolute detail URL) and
     * {@code href_text} (link text), plus {@code type_text} when the item has a
     * child with class {@code c} after its link.
     *
     * @param url URL of the batch; page {@code i} is requested as {@code url + "_" + i}
     * @return one map per item that carries a non-empty link; never {@code null}
     * @throws InterruptedException if the thread is interrupted while pausing between requests
     * @throws IOException if a page cannot be fetched
     */
    public static List<Map<String, Object>> getDetailsPageFromBatchItems(String url)
            throws InterruptedException, IOException {
        Document doc = fetch(url);
        Thread.sleep(1000);

        int totalBatchPage = maxPageNumber(doc);
        List<Map<String, Object>> list = new ArrayList<Map<String, Object>>();

        for (int i = 1; i <= totalBatchPage; i++) {
            System.out.println("分页数据获取进度:" + (double) Math.round(i * 100 / totalBatchPage) / 100);
            Thread.sleep(2000); // wait 2 seconds before each page request
            doc = fetch(url + "_" + i);
            Elements divs = doc.getElementsByAttributeValue("class", "noticeLotItem");
            for (Element div : divs) {
                Map<String, Object> item = parseLotItem(div);
                if (item != null) {
                    list.add(item);
                }
            }
        }
        System.out.println("Total rows:" + list.size());
        return list;
    }

    /** Returns the largest numeric pager link found in elements of class {@code page}; at least 1. */
    private static int maxPageNumber(Document doc) {
        int max = 1;
        for (Element pager : doc.getElementsByAttributeValue("class", "page")) {
            for (Element link : pager.getElementsByTag("a")) {
                String text = link.text().trim();
                if (text.equals("") || text.equals("下一页") || text.equals("上一页")) {
                    continue;
                }
                try {
                    int index = Integer.parseInt(text);
                    if (index > max) {
                        max = index;
                    }
                } catch (NumberFormatException ignored) {
                    // Any other non-numeric pager label carries no page number.
                }
            }
        }
        return max;
    }

    /**
     * Turns one {@code noticeLotItem} element into an entry map, or returns
     * {@code null} when the item has no usable link.
     */
    private static Map<String, Object> parseLotItem(Element div) {
        Map<String, Object> item = null;
        for (Element child : div.children()) {
            String claszType = child.attr("class");
            if ("m".equals(claszType)) {
                Elements links = child.getElementsByAttribute("href");
                if (links.isEmpty()) {
                    continue;
                }
                Element a = links.get(0);
                String href = a.attr("href");
                if ("".equals(href)) {
                    continue;
                }
                item = new HashMap<String, Object>();
                item.put("href", toAbsoluteUrl(href));
                item.put("href_text", a.text());
            } else if ("c".equals(claszType) && item != null) {
                // The type text is only recorded once a link has been seen for this item.
                item.put("type_text", child.text());
            }
        }
        return item;
    }

    /**
     * Parses the vehicle/manufacturer detail table of one announcement page.
     *
     * <p>Only tables with 22 rows (no engine parameters) or 23 rows (row 22 holds a
     * nested engine table) are understood; the last row is always the remark.
     *
     * @param url absolute URL of the detail page
     * @return the parsed record, or {@code null} when the expected table is missing
     *         or has a different number of rows
     * @throws IOException if the page cannot be fetched
     * @throws InterruptedException if the thread is interrupted during the pre-request pause
     */
    public static AutobatchDirectory getDetailOfAutoBatchInfo(String url) throws IOException, InterruptedException {
        Thread.sleep(2000);
        Document doc = fetch(url);

        Elements tables = doc.getElementsByAttributeValue("class", "noticeAttr mt5");
        if (tables.isEmpty()) {
            return null;
        }
        Elements bodies = tables.get(0).getElementsByTag("tbody");
        if (bodies.isEmpty()) {
            return null;
        }
        Elements tableElements = bodies.get(0).children();

        int rowCount = tableElements.size();
        if (rowCount != 22 && rowCount != 23) {
            return null;
        }
        boolean hasEngineRow = rowCount == 23;
        if (hasEngineRow) {
            System.out.println("......23行表示有发动机参数.....解析中.......");
        } else {
            System.out.println("......22行表示没有发动机参数.....解析中.......");
        }

        AutobatchDirectory autobatchDirectory = new AutobatchDirectory();
        for (int i = 1; i <= rowCount; i++) {
            Element element = tableElements.get(i - 1);
            if (hasEngineRow && i == 22) {
                applyEngineRow(autobatchDirectory, element);
            } else if (i == rowCount) {
                Elements children = element.children();
                if (children.size() > 1) {
                    String values = children.get(1).text();
                    autobatchDirectory.setfRemark(values);
                    System.out.println(values);
                }
            } else {
                setPropertyToObject(autobatchDirectory, i, element);
            }
        }
        return autobatchDirectory;
    }

    /**
     * Copies the engine values from the nested table of row 22. The values are read
     * from the cells of the nested table's second row, in the order model,
     * manufacturer, trademark, displacement, power. Missing structure leaves the
     * engine fields untouched.
     */
    private static void applyEngineRow(AutobatchDirectory autobatchDirectory, Element element) {
        Elements nestedTables = element.getElementsByTag("table");
        if (nestedTables.isEmpty()) {
            return;
        }
        Elements nestedBodies = nestedTables.get(0).getElementsByTag("tbody");
        if (nestedBodies.isEmpty()) {
            return;
        }
        Elements nestedRows = nestedBodies.get(0).children();
        if (nestedRows.size() < 2) {
            return;
        }
        Elements cells = nestedRows.get(1).children();
        for (int index = 0; index < cells.size(); index++) {
            String text = cells.get(index).text();
            switch (index) {
            case 0: // engine model
                autobatchDirectory.setfEngineType(text);
                break;
            case 1: // engine manufacturer
                autobatchDirectory.setfEnginePro(text);
                break;
            case 2: // engine trademark
                autobatchDirectory.setfEngineTrademark(text);
                break;
            case 3: // displacement
                autobatchDirectory.setfOutputVolume(text);
                break;
            case 4: // power
                autobatchDirectory.setfPower(text);
                break;
            default:
                break;
            }
        }
    }

    /**
     * Copies the two values of table row {@code i} (cells 1 and 3) into the record.
     * Rows with fewer than four cells are reported and skipped.
     */
    private static void setPropertyToObject(AutobatchDirectory autobatchDirectory, int i, Element element) {
        Elements children = element.children();
        if (children.size() < 4) {
            System.out.println("Skipping row " + i + ": expected 4 cells but found " + children.size());
            return;
        }
        String first = children.get(1).text();
        String second = children.get(3).text();
        System.out.println(first + " " + second);

        switch (i) {
        case 1: // announcement model / announcement batch
            autobatchDirectory.setfAnnouType(first);
            autobatchDirectory.setfAnnouBatch(second);
            break;
        case 2: // brand / vehicle type
            autobatchDirectory.setfVehicleBrand(first);
            autobatchDirectory.setfVehicleType(second);
            break;
        case 3: // rated mass (e.g. 32000,32700) / total mass
            autobatchDirectory.setfMaxMass(first);
            autobatchDirectory.setfTotalMass(second);
            break;
        case 4: // curb mass (e.g. 8000,7300) / fuel type
            autobatchDirectory.setfWholeMass(first);
            autobatchDirectory.setfFuelType(second);
            break;
        case 5: // emission standard / number of axles
            autobatchDirectory.setfBlowoffStandard(first);
            autobatchDirectory.setfAxleNumber(second);
            break;
        case 6: // wheelbase (e.g. 7250+1310+1310) / axle load
            autobatchDirectory.setfWheelbase(first);
            autobatchDirectory.setfAxleWeight(second);
            break;
        case 7: // leaf-spring count (e.g. -/8/8/8) / tyre count
            autobatchDirectory.setfSpringNumber(first);
            autobatchDirectory.setfTyreNumber(second);
            break;
        case 8: // tyre size (e.g. 11.00R20 12PR) / approach and departure angle
            autobatchDirectory.setfTyreSize(first);
            autobatchDirectory.setfDepartureAngle(second);
            break;
        case 9: // front/rear overhang (e.g. -/2080) / front track
            autobatchDirectory.setfFrearSuspension(first);
            autobatchDirectory.setfFrontGauge(second);
            break;
        case 10: // rear track (e.g. 1830/1830/1830) / identification code
            autobatchDirectory.setfBackGauge(first);
            autobatchDirectory.setfVinCode(second);
            break;
        case 11: // overall length (e.g. 13000) / overall width
            autobatchDirectory.setfVehicleLength(first);
            autobatchDirectory.setfVehicleWidth(second);
            break;
        case 12: // overall height (e.g. 2970,3030,2760) / cargo box length
            autobatchDirectory.setfVehicleHeight(first);
            autobatchDirectory.setfCargoLength(second);
            break;
        case 13: // cargo box width (e.g. 2400,2450,2470) / cargo box height
            autobatchDirectory.setfCargoWidth(first);
            autobatchDirectory.setfCargoHeight(second);
            break;
        case 14: // maximum speed / rated passenger count
            autobatchDirectory.setfMaxSpeed(first);
            autobatchDirectory.setfMaxPassenger(second);
            break;
        case 15: // cab seating capacity / steering type
            autobatchDirectory.setfCabNumber(first);
            autobatchDirectory.setfSteeringType(second);
            break;
        case 16: // permitted trailer total mass / load mass utilisation factor
            autobatchDirectory.setfTotalMassTrailer(first);
            autobatchDirectory.setfLoadMassFactor(second);
            break;
        case 17: // maximum semi-trailer saddle load (e.g. 16000,16150) / enterprise name
            autobatchDirectory.setfMaxSemitrailer(first);
            autobatchDirectory.setfEnterpriseName(second);
            break;
        case 18: // enterprise address / phone number
            autobatchDirectory.setfEnterpriseAddress(first);
            autobatchDirectory.setfEnterprisePhone(second);
            break;
        case 19: // fax number / postcode
            autobatchDirectory.setfEnterpriseFax(first);
            autobatchDirectory.setfPostcode(second);
            break;
        case 20: // chassis 1 / chassis 2
            autobatchDirectory.setfChassisOne(first);
            autobatchDirectory.setfChassisTwo(second);
            break;
        case 21: // chassis 3 / chassis 4
            autobatchDirectory.setfChassisThree(first);
            autobatchDirectory.setfChassisFour(second);
            break;
        default:
            break;
        }
    }

    /**
     * Inserts one record into {@code autobatch_directory} through the connection
     * returned by {@code MycatJdbc.getConnection()}.
     *
     * <p>A {@code null} record is reported and ignored. Engine fields and the remark
     * are written as empty strings when they are {@code null}.
     *
     * @param autobatchDirectory the record to insert
     * @throws SQLException if no connection is available or the insert fails
     */
    public static void saveByJdbc(AutobatchDirectory autobatchDirectory) throws SQLException {
        if (autobatchDirectory == null) {
            System.err.println("saveByJdbc: record is null, nothing to save");
            return;
        }

        String sql = "insert into autobatch_directory ("
                + "F_ANNOU_TYPE, F_ANNOU_BATCH, F_VEHICLE_BRAND, F_VEHICLE_TYPE,"
                + "F_MAX_MASS, F_TOTAL_MASS, F_WHOLE_MASS, F_FUEL_TYPE, "
                + "F_BLOWOFF_STANDARD, F_AXLE_NUMBER, F_WHEELBASE, F_AXLE_WEIGHT,"
                + "F_SPRING_NUMBER, F_TYRE_NUMBER,F_TYRE_SIZE, F_DEPARTURE_ANGLE, "
                + "F_FREAR_SUSPENSION, F_FRONT_GAUGE, F_BACK_GAUGE, F_VIN_CODE, "
                + "F_VEHICLE_LENGTH, F_VEHICLE_WIDTH, F_VEHICLE_HEIGHT,F_CARGO_LENGTH,"
                + "F_CARGO_WIDTH, F_CARGO_HEIGHT,F_MAX_SPEED, F_MAX_PASSENGER,"
                + "F_CAB_NUMBER, F_STEERING_TYPE, F_TOTAL_MASS_TRAILER,"
                + "F_LOAD_MASS_FACTOR, F_MAX_SEMITRAILER, F_ENTERPRISE_NAME, F_ENTERPRISE_ADDRESS, "
                + "F_ENTERPRISE_PHONE, F_ENTERPRISE_FAX, F_POSTCODE, F_CHASSIS_ONE,"
                + "F_CHASSIS_TWO, F_CHASSIS_THREE, F_CHASSIS_FOUR, F_ENGINE_TYPE,"
                + "F_ENGINE_PRO, F_ENGINE_TRADEMARK, F_OUTPUT_VOLUME, F_POWER, F_REMARK) values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,"
                + "?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)";

        // Values in exactly the column order of the statement above.
        String[] values = {
            autobatchDirectory.getfAnnouType(),
            autobatchDirectory.getfAnnouBatch(),
            autobatchDirectory.getfVehicleBrand(),
            autobatchDirectory.getfVehicleType(),
            autobatchDirectory.getfMaxMass(),
            autobatchDirectory.getfTotalMass(),
            autobatchDirectory.getfWholeMass(),
            autobatchDirectory.getfFuelType(),
            autobatchDirectory.getfBlowoffStandard(),
            autobatchDirectory.getfAxleNumber(),
            autobatchDirectory.getfWheelbase(),
            autobatchDirectory.getfAxleWeight(),
            autobatchDirectory.getfSpringNumber(),
            autobatchDirectory.getfTyreNumber(),
            autobatchDirectory.getfTyreSize(),
            autobatchDirectory.getfDepartureAngle(),
            autobatchDirectory.getfFrearSuspension(),
            autobatchDirectory.getfFrontGauge(),
            autobatchDirectory.getfBackGauge(),
            autobatchDirectory.getfVinCode(),
            autobatchDirectory.getfVehicleLength(),
            autobatchDirectory.getfVehicleWidth(),
            autobatchDirectory.getfVehicleHeight(),
            autobatchDirectory.getfCargoLength(),
            autobatchDirectory.getfCargoWidth(),
            autobatchDirectory.getfCargoHeight(),
            autobatchDirectory.getfMaxSpeed(),
            autobatchDirectory.getfMaxPassenger(),
            autobatchDirectory.getfCabNumber(),
            autobatchDirectory.getfSteeringType(),
            autobatchDirectory.getfTotalMassTrailer(),
            autobatchDirectory.getfLoadMassFactor(),
            autobatchDirectory.getfMaxSemitrailer(),
            autobatchDirectory.getfEnterpriseName(),
            autobatchDirectory.getfEnterpriseAddress(),
            autobatchDirectory.getfEnterprisePhone(),
            autobatchDirectory.getfEnterpriseFax(),
            autobatchDirectory.getfPostcode(),
            autobatchDirectory.getfChassisOne(),
            autobatchDirectory.getfChassisTwo(),
            autobatchDirectory.getfChassisThree(),
            autobatchDirectory.getfChassisFour(),
            emptyIfNull(autobatchDirectory.getfEngineType()),
            emptyIfNull(autobatchDirectory.getfEnginePro()),
            emptyIfNull(autobatchDirectory.getfEngineTrademark()),
            emptyIfNull(autobatchDirectory.getfOutputVolume()),
            emptyIfNull(autobatchDirectory.getfPower()),
            emptyIfNull(autobatchDirectory.getfRemark())
        };

        // NOTE(review): the connection is not closed here because it is unclear from this
        // file whether MycatJdbc hands out a shared connection - confirm and close if owned.
        Connection conn = MycatJdbc.getConnection();
        if (conn == null) {
            throw new SQLException("MycatJdbc.getConnection() returned no connection");
        }

        PreparedStatement pstm = null;
        try {
            System.out.println(sql);
            pstm = (PreparedStatement) conn.prepareStatement(sql);
            for (int index = 0; index < values.length; index++) {
                pstm.setString(index + 1, values[index]);
            }
            pstm.executeUpdate();
        } finally {
            if (pstm != null) {
                try {
                    pstm.close();
                } catch (SQLException e) {
                    // A close failure must not mask the outcome of the insert itself.
                    e.printStackTrace();
                }
            }
        }
    }

    /** Maps {@code null} to the empty string. */
    private static String emptyIfNull(String value) {
        return value == null ? "" : value;
    }
}
代码没什么难度,都是基本的元素解析。
纸上得来终觉浅,绝知此事要躬行。