记一次爬取需要登录后才能获取数据的网站的 demo
一:工程概况
注意:
二:涉及到的类
package com.bigdata.crawler; import java.io.IOException; import java.util.List; import org.apache.commons.io.IOUtils; import org.apache.http.HttpEntity; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.openqa.selenium.By; import org.openqa.selenium.Keys; import org.openqa.selenium.WebElement; import org.openqa.selenium.chrome.ChromeDriver; import org.openqa.selenium.firefox.FirefoxDriver; import org.openqa.selenium.interactions.Actions; import com.bigdata.util.DriverCommon; public class CnzzCrawler { private String baseUrl ="http://new.cnzz.com/v1/login.php?siteid=1262437219"; private String password = "******";//查看密码 private ChromeDriver driver; public CnzzCrawler() { } public CnzzCrawler(ChromeDriver driver) { super(); this.driver = driver; } public void start(){ // 登入网站 driver.get(baseUrl); // 输入密码 driver.findElement(By.id("password")).sendKeys(password); try { Thread.sleep(1000); } catch (InterruptedException e) { e.printStackTrace(); } // 点击登入 html body div.pwdmain div.pwdcheck div.pwdcheck4 div form div img // body > div.pwdmain > div.pwdcheck > div.pwdcheck4 > div:nth-child(1) > form > div:nth-child(2) > img driver.findElement(By.cssSelector("div.pwdcheck4 > div:nth-child(1) > form > div:nth-child(2) > img")).click(); try { Thread.sleep(1000); } catch (InterruptedException e) { e.printStackTrace(); } Document doc = Jsoup.parse(driver.getPageSource()); //System.out.println(doc); //html.cnzz body div#userLoginHeader.userLoginHeader div.section div#rightContainer.rightContainer div#dashboardRootEl.dashboard ul#module_container.module.ui-sortable li.module_data0.moduleTwo table tbody tr.bg-white td.url div a.blue12 
Elements elements= doc.select("a.blue12"); for(Element e: elements ){ String string = e.attr("href"); System.out.println(string); } driver.close(); } public static void main(String[] args) throws IOException { System.setProperty("webdriver.chrome.driver", DriverCommon.getDriverName(DriverCommon.getOSType())); //System.setProperty("webdriver.firefox.driver", "D:/Program Files (x86)/Mozilla Firefox/firefox.exe"); ChromeDriver driver = new ChromeDriver(); //FirefoxDriver driver = new FirefoxDriver(); new CnzzCrawler(driver).start(); } }
package com.bigdata.util; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; public class DriverCommon { /** * 获取操作系统类型 */ public static String getOSType(){ String temp = System.getProperty("os.name"); if(temp.contains("Mac")){ return "mac"; }else if(temp.contains("Win")){ return "win"; }else{ try { Process process = Runtime.getRuntime().exec("getconf LONG_BIT"); BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(process.getInputStream())); String s = bufferedReader.readLine(); if(s.contains("64")){ return "linux64"; }else{ return "linux32"; } } catch (IOException e) { e.printStackTrace(); return "linux64"; //默认Linux64 } } } /** * 获取浏览器驱动 * @param os * @return * @throws IOException */ public static String getDriverName(String os) throws IOException{ if(os == null) return null; switch (os) { case "win": return "chromedriver.exe"; case "mac": return "chromedriver_mac"; case "linux_32": return "chromedriver_linux32"; case "linux_64": default: return "chromedriver_linux64"; } } }