package com.test.crawler.service;

import java.io.IOException;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;

import com.test.crawler.htmlHandler.CompanyDetailHtmlHandler;
import com.test.db.po.Tb_test_company_info;

/**
 * Fetches the detail page of every company in a list and hands each page to
 * {@link CompanyDetailHtmlHandler} for parsing and persistence.
 *
 * <p>The list is processed in batches of at most {@code MAX_THREAD_NUM} entries.
 * Each batch gets its own fixed-size thread pool (one thread per entry), and the
 * method pauses after every batch before starting the next one.
 */
public class ViewCompanyDetailService {

    /** Maximum number of pages fetched concurrently, i.e. the batch size. */
    private static final int MAX_THREAD_NUM = 100;

    /**
     * Pause after each finished batch, in milliseconds.
     * NOTE(review): presumably a politeness delay towards the crawled site - confirm.
     */
    private static final long BATCH_PAUSE_MILLIS = 15000L;

    /**
     * Downloads and processes the detail page of every company in {@code companyList}.
     *
     * <p>Entries that are {@code null} or whose URL cannot be turned into a request are
     * reported on standard output and skipped; they do not abort the remaining work.
     *
     * @param companyList companies to visit; {@code null} or empty makes this a no-op
     * @throws InterruptedException if the calling thread is interrupted while waiting
     *         for a batch to finish or while pausing between batches
     */
    public void ViewCompanyDetail(List<Tb_test_company_info> companyList) throws InterruptedException {
        if (companyList == null || companyList.isEmpty()) {
            return;
        }

        PoolingHttpClientConnectionManager connManager = new PoolingHttpClientConnectionManager();
        connManager.setMaxTotal(MAX_THREAD_NUM);
        // The pooling manager allows only 2 connections per route unless told otherwise,
        // which would serialise almost all workers when the URLs share a host.
        connManager.setDefaultMaxPerRoute(MAX_THREAD_NUM);
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(connManager).build();

        try {
            int total = companyList.size();
            for (int offset = 0; offset < total; offset += MAX_THREAD_NUM) {
                int batchSize = Math.min(MAX_THREAD_NUM, total - offset);
                runBatch(httpClient, companyList, offset, batchSize);
                System.out.println(MAX_THREAD_NUM + " Over !!");
                Thread.sleep(BATCH_PAUSE_MILLIS);
            }
        } finally {
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Runs one batch: submits a fetch task for each of the {@code count} entries starting
     * at {@code offset} and blocks until all of them have completed.
     */
    private void runBatch(CloseableHttpClient httpClient, List<Tb_test_company_info> companyList,
            int offset, int count) throws InterruptedException {
        ExecutorService pool = Executors.newFixedThreadPool(count);
        try {
            for (int i = 0; i < count; i++) {
                Tb_test_company_info company = companyList.get(offset + i);
                if (company == null) {
                    System.out.println("index " + (offset + i) + " - error: null company entry");
                    continue;
                }
                HttpGet request;
                try {
                    request = new HttpGet(company.getCompanyUrl());
                } catch (RuntimeException e) {
                    // HttpGet rejects null or malformed URLs; skip this entry only.
                    System.out.println(company.getId() + " - error: " + e);
                    continue;
                }
                pool.execute(new ViewCompanyDetailThread(httpClient, request, company.getId()));
            }
            pool.shutdown();
            while (!pool.awaitTermination(1, TimeUnit.SECONDS)) {
                // keep waiting until every submitted task has finished
            }
        } finally {
            // No-op after a normal run; on interruption or an unexpected error it stops the
            // workers so the pool's non-daemon threads cannot keep the JVM alive.
            pool.shutdownNow();
        }
    }

    /**
     * Task that fetches one company page and passes its content to
     * {@link CompanyDetailHtmlHandler#CompanyInfoParseAndSave(int, String)}.
     *
     * <p>Within this class, instances are only handed to an {@link ExecutorService}, which
     * invokes {@link #run()} on a pool thread; they are never started as threads here.
     * The {@code Thread} supertype is kept for compatibility with existing callers.
     */
    static class ViewCompanyDetailThread extends Thread {

        private final CloseableHttpClient httpClient;
        private final HttpContext context;
        private final HttpGet httpget;
        private final int shopId;

        /**
         * @param httpClient shared client used to execute the request
         * @param httpget    request for the company's detail page
         * @param shopId     identifier of the company the page belongs to
         */
        public ViewCompanyDetailThread(CloseableHttpClient httpClient, HttpGet httpget, int shopId) {
            this.httpClient = httpClient;
            // Each task gets its own execution context.
            this.context = new BasicHttpContext();
            this.httpget = httpget;
            this.shopId = shopId;
        }

        /**
         * Executes the request, decodes the body as UTF-8 and delegates to the HTML handler.
         * Any failure is reported on standard output and never propagates to the pool.
         */
        @Override
        public void run() {
            try {
                CloseableHttpResponse response = httpClient.execute(httpget, context);
                try {
                    HttpEntity entity = response.getEntity();
                    if (entity != null) {
                        String pageContent = EntityUtils.toString(entity, "UTF-8");
                        CompanyDetailHtmlHandler companyDetailHtmlHandler = new CompanyDetailHtmlHandler();
                        if (!companyDetailHtmlHandler.CompanyInfoParseAndSave(shopId, pageContent)) {
                            System.out.println(shopId + " - CompanyInfoParseAndSave Failure");
                        }
                    }
                } finally {
                    // Always close the response so its connection is released.
                    response.close();
                }
            } catch (Exception e) {
                System.out.println(shopId + " - error: " + e);
            }
        }
    }
}
package com.test.crawler.htmlHandler;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.test.db.dao.CompanyInfoDao;

/**
 * Extracts company details from a company detail page and stores them through
 * {@link CompanyInfoDao}.
 *
 * <p>NOTE(review): several selector literals read "省略..." ("omitted..."); they look like
 * placeholders for the real CSS selectors - confirm and fill in before use.
 */
public class CompanyDetailHtmlHandler {

    /** Matches a QQ chat link and captures the QQ number (the {@code uin} parameter). */
    private static final Pattern QQ_LINK =
            Pattern.compile("http://wpa.qq.com/msgrd\\?v=3\\&uin=(\\d*?)\\&site=qq\\&menu=yes");

    /** Everything that is not a decimal digit. */
    private static final Pattern NON_DIGIT = Pattern.compile("[^0-9]");

    /** Everything that is neither a decimal digit nor a hyphen. */
    private static final Pattern NON_PHONE_CHAR = Pattern.compile("[^0-9\\-]");

    /**
     * Parses company information from a page and saves it.
     *
     * <p>Extraction is best-effort: a field whose element is missing from the page is
     * stored as an empty string, except the contact person, which falls back to
     * {@code "-"}. A missing element never aborts the save.
     *
     * <p>The method synchronizes on this handler instance.
     *
     * @param shopId      identifier of the company record to update; must be positive
     * @param pageContent HTML of the company detail page
     * @return {@code false} if {@code shopId <= 0} or {@code pageContent} is {@code null};
     *         otherwise the result of {@code CompanyInfoDao.Update}
     */
    public synchronized boolean CompanyInfoParseAndSave(int shopId, String pageContent) {
        if (shopId <= 0 || pageContent == null) {
            return false;
        }

        Document doc = Jsoup.parse(pageContent);

        String companyPhone = "";
        String companyBoss = "";
        String companyMobil = "";
        String companyAddr = "";
        String qq = "";
        String jyms = "";
        String createDatetime = "";

        Element profile = doc.select("省略...").first();
        if (profile != null) {
            companyBoss = textAt(profile.select("省略..."), 0);

            Element qqLink = profile.select("省略...").first();
            if (qqLink != null) {
                Matcher qqMatcher = QQ_LINK.matcher(qqLink.attr("href"));
                if (qqMatcher.find()) {
                    qq = qqMatcher.group(1);
                }
            }

            // Each select is kept separate because the placeholder literals may stand
            // for different selectors.
            jyms = textAt(profile.select("省略..."), 0);
            companyAddr = textAt(profile.select("省略..."), 1);
            createDatetime = textAt(profile.select("省略..."), 2);
        }

        Elements contactLists = doc.select("div.wp-colsub div.wp-mdl div.wp-contact ul.contact-lst");
        if (!contactLists.isEmpty()) {
            Elements items = contactLists.select("li");
            // The second list item is reduced to digits (mobile), the third to digits and
            // hyphens (phone).
            companyMobil = NON_DIGIT.matcher(textAt(items, 1)).replaceAll("").trim();
            companyPhone = NON_PHONE_CHAR.matcher(textAt(items, 2)).replaceAll("").trim();
        }

        if (companyBoss.trim().isEmpty()) {
            companyBoss = "-";
        }

        CompanyInfoDao dao = new CompanyInfoDao();
        return dao.Update(shopId, companyPhone, companyBoss, companyMobil, companyAddr, qq, jyms, createDatetime);
    }

    /** Returns the text of the element at {@code index}, or {@code ""} if there is none. */
    private static String textAt(Elements elements, int index) {
        return index < elements.size() ? elements.get(index).text() : "";
    }
}
package com.test.crawler.main; import com.test.crawler.service.ViewCompanyDetailService; import com.test.db.dao.CompanyInfoDao; public class TestMain { public static void main(String[] args) { //关闭httpclient多余日志 System.setProperty("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.SimpleLog"); System.setProperty("org.apache.commons.logging.simplelog.showdatetime", "true"); System.setProperty("org.apache.commons.logging.simplelog.log.org.apache.commons.httpclient", "stdout"); try{ ViewCompanyDetailService ss = new ViewCompanyDetailService(); CompanyInfoDao dao = new CompanyInfoDao(); ss.ViewCompanyDetail(dao.ListForViewDetail()); }catch(Exception ex){ ex.printStackTrace(); } } }
// Over (end of listing)