人生第一个java脚本-jsoup实例
目的:
获取如下信息:
制作流程图
该方法的缺点:会抓取到多个重复的货号。
解决办法:将数据导出成 Excel 表格 -> 选中"货号"列 -> 删除重复值。
代码结构如下:
ToMain.java
package com.lnthz.main; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import com.lnthz.cookie.CookieUtil; import com.lnthz.jdbc.JdbcMain; import com.lnthz.pojo.ItemCas; import com.lnthz.pojo.TargetData; import com.lnthz.pojo.XDocDataPojo; /** * @Desc 主类 * @author lnthz * @param * */ public class ToMain { public static void main(String[] args) throws Exception{ ToMain.JueDDZ(441, 1000); //这两个参数是为了方便调试,有少量目标网页规则不一样,也可以用作开线程 } //此方法为了找到 每个货号对应的绝对地址 public static void JueDDZ(int aa,int bb) throws Exception{ ItemCas itemCas=new ItemCas(); int HH=100001; String aUrl="https://www.xfnano.com/Product/?1=1&key="; //找到规则循环货期地址 for (int i = aa; i<bb; ++i) { //空Url String nullUrl="https://www.xfnano.com/Product/comment.aspx?fk=0&kind=0&width=520&height=350&TB_iniframe=true&KeepThis=true&TB_iframe=true&modal=false"; int aHH=HH+i; //拼接字符串 String bUrl=aUrl+aHH; //得到整个目标页面源码 Document doc = Jsoup.connect(bUrl).get(); //得到货号所在的 div Element clasDoc=doc.select("div.pro_list_container").first(); //得到货号地址 Elements links = clasDoc.select("a[href]"); String casName = clasDoc.select("a[href]").text(); //得到绝对地址 删选出来空地址 String absHref = links.attr("abs:href");// if(absHref.equals(nullUrl)){ continue; }else{ /*ToMain.JueDDZ(absHref);*/ /* System.out.println(absHref);*/ System.out.println("当前i值:"+i+"当前地址:"+absHref); itemCas.setItem(aHH); itemCas.setCasName(casName); itemCas.setMaincasurl(absHref); JdbcMain.addItemCas(itemCas); ToMain.xTableData(absHref); ToMain.xDocData(aHH,absHref); } } System.out.println("最后"); JdbcMain.jdbcClose(); } /** * 此方法用于获取货号对应的详细介绍 * @param absHref */ private static void xDocData(int aHH,String absHref) throws Exception{ // TODO Auto-generated method stub XDocDataPojo xd=new XDocDataPojo(); String url=absHref; Connection conn=Jsoup.connect(url); conn.cookies(CookieUtil.getCookies()); 
Document doc_x=conn.get(); // Element doc_d=doc_x.getElementById("conn"); // System.out.println(doc_d.val()); // if (doc_d.val() != null) { // String x2doc=doc_d.select("div.other_r div.pro_detail").html(); // xd.setItem(aHH); // xd.setXdoc(x2doc); // JdbcMain.addXDocDataPojo(xd); // } else { Elements x1doc=doc_x.select("div.pro_contbox"); Elements x2doc=x1doc.select("div.other_r"); String x3doc=x2doc.select("div.pro_detail").html(); // System.out.println(""+x1doc); xd.setItem(aHH); xd.setXdoc(x3doc); JdbcMain.addXDocDataPojo(xd); // } } /** * 此方法用于获取表格详细数据 * @author lnthz * @param absHref */ private static void xTableData(String absHref) throws Exception{ // TODO Auto-generated method stub JdbcMain td=new JdbcMain(); List list = new ArrayList(); String url=absHref; Connection conn=Jsoup.connect(url); conn.cookies(CookieUtil.getCookies()); Document doc_t=conn.get(); Elements doc_table=doc_t.select("div.pro_contbox div.tablelist"); // 使用选择器选择该table内所有的<tr> <tr/> Elements trs = doc_table.select("tr"); /*System.out.println(trs);*/ //遍历表格 //i=0,带第一行标题; i=1 不带第一行标题 for (int i = 1; i < trs.size(); ++i) { // 获取一个tr Element tr = trs.get(i); // 获取该行的所有td节点 Elements tds = tr.select("td"); //遍历td数据 HashMap<Integer,String> map=new HashMap<Integer,String>(); for(int j=0; j<tds.size(); j++){ Element[] array=new Element[16]; array[j]= tds.get(j); map.put(j, array[j].text()); } list.add(map); /* System.out.println("-----------------"); */ } td.insertCas(list); } }
JdbcMain.java
package com.lnthz.jdbc; import java.sql.Connection; import java.sql.DriverManager; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; import java.util.Iterator; import java.util.List; import java.util.Map; import com.lnthz.pojo.ItemCas; import com.lnthz.pojo.XDocDataPojo; public class JdbcMain { public static final String URL = "jdbc:mysql://localhost:3307/webCas?useUnicode=true&characterEncoding=utf8"; public static final String USER = "root"; public static final String PASSWORD = "123456"; private static Connection conn = null; static{ try { //1.加载驱动程序 Class.forName("com.mysql.jdbc.Driver"); //2. 获得数据库连接 conn = DriverManager.getConnection(URL, USER, PASSWORD); } catch (ClassNotFoundException e) { e.printStackTrace(); } catch (SQLException e) { e.printStackTrace(); } } public static void jdbcClose(){ try { System.out.println("数据库已关闭(* ̄︶ ̄)"); conn.close(); } catch (SQLException e) { // TODO Auto-generated catch block e.printStackTrace(); } } public static Connection getConnection(){ return conn; } /** * 此方法用于增加表格详情 * @param list * @throws Exception */ public static void insertCas(List list)throws Exception{ Map map=null; Connection conn = JdbcMain.getConnection(); String str="insert into TargetData(id,itemnumber,casnumber,packnumber,parameter,instock,period,price,shu) values(?,?,?,?,?,?,?,?,?)"; PreparedStatement pstat = conn.prepareStatement(str); for(int h =0;h<list.size();h++){ map = (Map)list.get(h); Iterator<Map.Entry<Integer, String>> entries = map.entrySet().iterator(); while (entries.hasNext()) { Map.Entry<Integer, String> entry = entries.next(); int a=entry.getKey()+1; pstat.setString(a,entry.getValue()); } pstat.executeUpdate(); } System.out.println("TargetData插入成功(* ̄︶ ̄)"); } /** * 用于ItemCas数据表增加 * * @param i */ public static void addItemCas(ItemCas i) { // TODO Auto-generated method stub Connection conn = JdbcMain.getConnection(); String sql="insert into 
ItemCas(item,casName,maincasurl) values (?,?,?)"; PreparedStatement ptmt; try { ptmt = conn.prepareStatement(sql); ptmt.setInt(1,i.getItem()); ptmt.setString(2, i.getCasName()); ptmt.setNString(3, i.getMaincasurl()); System.out.println("ItemCas插入成功(* ̄︶ ̄)"); ptmt.executeUpdate(); } catch (SQLException e) { e.printStackTrace(); }finally { } } public static void addXDocDataPojo(XDocDataPojo xd) throws SQLException{ Connection conn=JdbcMain.getConnection(); PreparedStatement ptmt=null; String sql="insert into XDocDataPojo(item,xdoc) values(?,?)"; ptmt=conn.prepareStatement(sql); ptmt.setInt(1, xd.getItem()); ptmt.setString(2, xd.getXdoc()); System.out.println("XDocDataPojo插入成功(* ̄︶ ̄)"); ptmt.executeUpdate(); } }
ItemCas.java
package com.lnthz.pojo; public class ItemCas { public int item; public String casName; public String maincasurl; public String getCasName() { return casName; } public void setCasName(String casName) { this.casName = casName; } public int getItem() { return item; } public void setItem(int item) { this.item = item; } public String getMaincasurl() { return maincasurl; } public void setMaincasurl(String maincasurl) { this.maincasurl = maincasurl; } }
XDocDataPojo.java
package com.lnthz.pojo; public class XDocDataPojo { public int item; public String xdoc; public int getItem() { return item; } public void setItem(int aHH) { this.item = aHH; } public String getXdoc() { return xdoc; } public void setXdoc(String xdoc) { this.xdoc = xdoc; } }
CookieUtil.java
package com.lnthz.cookie; import java.util.HashMap; public class CookieUtil { static HashMap cookies; static{ HashMap cookie=new HashMap(); //目标网站需要登录,cookie表自行解决,put参数就行 cookie.put("Hm_lvt_d4e9a2b5f76697fc95880ee989b6b944", "1543460799,1543894953,1543987988,1543992054"); cookie.put("LXB_REFER", "www.baidu.com"); } public static HashMap getCookies(){ return cookies; } }
流年拓荒者