A Simple Java Crawler for 51Jobs
This crawler uses Jsoup, an HTML parser that can fetch and parse a URL or a local HTML file directly. The parsed data can then be extracted and manipulated through DOM traversal, CSS selectors, and jQuery-like methods.
Official Jsoup documentation: https://jsoup.org/cookbook/introduction/parsing-a-document
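As a quick illustration of that API, here is a minimal, self-contained sketch (the target URL and selector are illustrative, not part of the crawler below):

    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;

    public class JsoupQuickStart {
        public static void main(String[] args) throws Exception {
            // Fetch and parse a page directly from a URL
            Document doc = Jsoup.connect("https://jsoup.org").get();
            // CSS selector, much like jQuery's $("a[href]")
            Elements links = doc.select("a[href]");
            for (Element link : links)
                System.out.println(link.attr("abs:href") + "  " + link.text());
        }
    }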
Note: if the output is garbled, check the page's declared character encoding and decode with that encoding. Also, when submitting Chinese form data, some sites require the text to be URL-encoded before it transfers correctly.
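For example, a small sketch of that encoding step (51job declared gbk at the time, hence that charset; the keyword is illustrative):

    import java.net.URLEncoder;

    public class EncodeDemo {
        public static void main(String[] args) throws Exception {
            // Encode with the charset the target page declares, not a fixed utf-8
            String keyword = URLEncoder.encode("java开发", "gbk");
            System.out.println(keyword); // prints something like java%BF%AA%B7%A2
        }
    }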
The main code is as follows:
package com.galoliy.spider.maven_spider.domain;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.jsoup.Connection;
import org.jsoup.Connection.Method;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class Cat5jobs {

    public Document getResultPage(String url, String keyword) throws UnsupportedEncodingException {
        Document doc = null;
        // The keyword must be URL-encoded as gbk before posting,
        // otherwise it arrives garbled in the POST form data.
        keyword = URLEncoder.encode(keyword, "gbk");
        try {
            // Get the index page
            Response resp = Jsoup.connect(url).method(Method.GET).execute();
            doc = resp.parse();
            // Get the action URL of the search form on the index page
            String actionPath = doc.select("form").attr("action");
            Connection con = Jsoup.connect(actionPath)
                    .data("keyword", keyword)
                    .userAgent("Mozilla")
                    .cookies(resp.cookies())
                    .header("Accept-Language", "zh-CN,zh;q=0.9")
                    .timeout(300000);
            // Get the query results page
            doc = con.method(Method.POST).execute().parse();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return doc;
    }

    public void getResult(String url, String keyword, String dir, String fileName) {
        Document doc = null;
        String htmlFilePath = dir + fileName + ".htm";
        String txtFilePath = dir + fileName + "2.txt";
        File htmlPath = new File(htmlFilePath);
        File txtPath = new File(txtFilePath);
        Map map = null;
        String printSrc = "";
        try {
            // If only the HTML file exists locally, parse it, print the contents
            // and store a txt file.
            if (!txtPath.exists() && htmlPath.exists()) {
                doc = Jsoup.parse(htmlPath, "utf-8");
                if (!doc.children().isEmpty())
                    System.out.println("File not empty");
                map = Screen51Jobs(doc);
                printSrc = printScreen(map);
                saveFile(printSrc, txtFilePath);
                System.out.println(printSrc);
            // If both the HTML and txt files exist locally, read the txt file;
            // otherwise throw an IOException.
            } else if (txtPath.exists() && htmlPath.exists()) {
                System.out.println("File not empty");
                printSrc = printScreen(txtPath);
                System.out.println(printSrc);
            } else {
                throw new IOException("NOT HTML FILE");
            }
        } catch (IOException e) {
            // No local cache: run the crawler in the catch block
            // and save the files locally.
            System.out.println("file not found");
            try {
                // Get the query results page from the web address
                doc = this.getResultPage(url, keyword);
                htmlPath.createNewFile();
                // Save the html file
                saveFile(doc.toString(), htmlFilePath);
                map = Screen51Jobs(doc);
                String printStr = printScreen(map);
                // Save the txt file
                saveFile(printStr, txtFilePath);
                System.out.println(printStr);
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }
    }

    private String printScreen(File path) throws IOException {
        StringBuilder printSrc = new StringBuilder();
        InputStream in = new FileInputStream(path);
        BufferedInputStream bis = new BufferedInputStream(in);
        int len = 0;
        byte[] bytes = new byte[1024 * 8];
        while ((len = bis.read(bytes, 0, bytes.length)) != -1) {
            // Only append the bytes actually read; appending the whole buffer
            // would include stale bytes from the previous pass.
            printSrc.append(new String(bytes, 0, len, "utf-8"));
        }
        bis.close();
        return printSrc.toString();
    }

    private String printScreen(Map<?, ?> screen) throws IOException {
        StringBuilder sb = new StringBuilder();
        String p = "\r\n";
        sb.append(p + " KeyWord:" + screen.get("keyword")
                + p + p + " Total query data:" + screen.get("totalquerydata")
                + p + p + " Recruitment info:");
        List list = (ArrayList) screen.get("recruitmentlist");
        for (Object o : list) {
            Map map = (HashMap<String, Object>) o;
            for (Object obj : map.entrySet()) {
                Map.Entry<String, Object> entry = (Map.Entry<String, Object>) obj;
                sb.append(p + entry.getKey() + " == " + entry.getValue());
            }
            sb.append(p);
        }
        return sb.toString();
    }

    @SuppressWarnings({ "rawtypes", "unchecked" })
    private Map<?, ?> Screen51Jobs(Document doc) {
        Map screen = new HashMap<String, Object>();
        // class=dw_table and id=resultList sit on the same element in 51job's markup
        Elements resultList = doc.select("div[class=dw_table]div[id=resultList]");
        Elements findKeyword = resultList.select("div[class=sbox]");
        Elements totalQueryData = resultList.select("div[class=rt]div:matchesOwn(^共)");
        Elements recruitmentInfo = resultList.select("div[class=el]");
        screen.put("keyword", findKeyword.text());
        screen.put("totalquerydata", totalQueryData.text());
        List recruitmentList = new ArrayList<Map<String, String>>();
        Map m = null;
        for (Element e : recruitmentInfo) {
            m = new HashMap<String, Object>();
            m.put("position", e.select("p[class~=^t1]").text());
            m.put("href", e.select("a").attr("href"));
            m.put("corporatename", e.select("a").text());
            m.put("address", e.select("span[class=t3]").text());
            m.put("salary", e.select("span[class=t4]").text());
            m.put("releasedate", e.select("span[class=t5]").text());
            recruitmentList.add(m);
        }
        screen.put("recruitmentlist", recruitmentList);
        return screen;
    }

    private void saveFile(String src, String path) throws IOException {
        OutputStream out = new FileOutputStream(path);
        BufferedOutputStream bos = new BufferedOutputStream(out);
        byte[] bytes = src.getBytes("utf-8");
        bos.write(bytes, 0, bytes.length);
        // Close the stream so the buffered bytes are flushed to disk
        bos.close();
    }
}