Jsoup爬取带登录验证码的网站
今天学完爬虫之后,想着爬一下我们学校的教务系统,可是发现登录的时候有验证码,因此研究了如何用Jsoup爬取带验证码的网站。
大体的思路是:(需要注意的是__VIEWSTATE一直变化,所以我们每个页面都需要重新获取并带着爬取下一个页面)
1.先爬取网站的主页,由于我们学校的网站是ASP.net,所以需要爬到每个网页的__VIEWSTATE。同时爬取主页也可以获得一个cookie(ASP.sessionId)
2.带着__VIEWSTATE和ASP.sessionId爬取验证码。(网上说有专门识别验证码的软件,在这里我只是把验证码下载到本地之后,需要用户输入验证码)获取验证码图片的时候需要带着cookie去获取,来标识是本次session请求的验证码,如果不带sessionid下载验证码之后输入验证码也无效。
3.输入用户名,密码和验证码登录系统,登录系统需要携带一些其他参数(值为空也需要携带)。
4.登录之后不能直接爬取成绩,需要先爬取登录成功之后的主页面,以获取该页面的__VIEWSTATE。
5.爬完登录成功的主页之后就可以进行爬取成绩,将爬到的成绩收集起来,最后输出到html页面中。
(在这个爬虫的过程中需要注意__VIEWSTATE,每个页面都需要重新获取这个值,这个值放在input隐藏域中。另外爬取过程中请求头需要携带Referer参数(也就是表示你从哪个页面跳转过来的),以通过网站的防盗链检查)
下面是代码:
1.爬虫的入口
package cn.qlq.craw.JsoupCrawJWXT; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Map.Entry; import java.util.Scanner; /** * 爬虫主的程序调度器(爬虫教务系统的入口) * * @author liqiang * */ public class MainClass { public static void main(String[] args) { // 输入学号和密码 System.out.print("请输入你要查询学号:"); Scanner sc = new Scanner(System.in); String xuehao = sc.next(); System.out.print("请输入密码:"); String password = sc.next(); // Console con = System.console(); // String pswd = new String(con.readPassword());// 因为读取的是字符数组,所以需要用new try { DownloadLoginfo downloadLoginfo = new DownloadLoginfo(); LoginClass loginClass = new LoginClass(); GradeOutput gradeOutput = new GradeOutput(); // 1.访问主页,获取验证码与viewstate downloadLoginfo.getLogInfo(); // 2.登录 loginClass.login(downloadLoginfo, xuehao, password); for (Entry<String, String> entry : loginClass.getCookies().entrySet()) { System.out.println("key:" + entry.getKey() + ";value" + entry.getValue()); } CrawGrade crawGrade = new CrawGrade(); //3. 爬取成绩的上一个页面 crawGrade.crawGradeLastPage(downloadLoginfo.getCookies(), downloadLoginfo.getViewState(), xuehao); List<String> condition = geneQueryCondition(); //4.循环分学年爬取成绩 for (String xuenian : condition) { String html_content = crawGrade.crawGrade(xuenian, "2", downloadLoginfo.getCookies(), // 4.1爬取成绩页面 downloadLoginfo.getViewState(), xuehao); gradeOutput.collectGrade(html_content); } //5.输出爬到的数据到html文件中 gradeOutput.outputDatas2Html(); } catch (IOException e) { System.out.println("无法连接学校服务器"); } catch (Exception e) { e.printStackTrace(); } } /** * 构造需要查询的年份和学期 * * @return */ public static List<String> geneQueryCondition() { List<String> condition = new ArrayList<String>(); condition.add("2014-2015"); condition.add("2015-2016"); condition.add("2016-2017"); condition.add("2017-2018"); return condition; } }
2.爬取学校主页获取__VIEWSTATE和cookie
package cn.qlq.craw.JsoupCrawJWXT; import java.util.HashMap; import java.util.Map; import java.util.Map.Entry; import org.jsoup.Connection; import org.jsoup.Connection.Method; import org.jsoup.Connection.Response; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; /** * url获取图片并且保存到本地 * * @author liqiang * */ public class DownloadLoginfo { /** * 第一次访问获取的cookie(查看发现就返回一个cookie:ASP.NET_SessionId) */ private Map<String, String> cookies = null; /** * __viewstate 教务系统用于验证的信息 */ private String viewState = null; public DownloadLoginfo() { this.cookies = new HashMap<String,String>();; this.viewState = ""; } /** * 获取登录信息 * 主要就是访问一下主页面,获取一个__viewstate与cookie */ public void getLogInfo() throws Exception { String urlLogin = "http://newjwc.tyust.edu.cn/"; Connection connect = Jsoup.connect(urlLogin); // 伪造请求头 connect.header("Accept", "application/json, text/javascript, */*; q=0.01").header("Accept-Encoding", "gzip, deflate"); connect.header("Accept-Language", "zh-CN,zh;q=0.9").header("Connection", "keep-alive"); connect.header("Content-Length", "213").header("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"); connect.header("Host", "newjwc.tyust.edu.cn").header("Referer", "http://newjwc.tyust.edu.cn/"); connect.header("User-Agent", "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36") .header("X-Requested-With", "XMLHttpRequest"); // 请求url获取响应信息 Response res = connect.ignoreContentType(true).method(Method.POST).execute();// 执行请求 // 获取返回的cookie this.cookies = res.cookies(); for (Entry<String, String> entry : cookies.entrySet()) { System.out.println(entry.getKey() + "-" + entry.getValue()); } // 获取响应体 String body = res.body(); // 调用下面方法获取__viewstate this.getViewState(body);// 获取viewState //调用下载验证码的工具类下载验证码 JsoupDoloadPicture.downloadImg("http://newjwc.tyust.edu.cn/CheckCode.aspx", cookies);; } /** * 获取viewstate * * @return */ public String getViewState(String 
htmlContent) { Document document = Jsoup.parse(htmlContent); Element ele = document.select("input[name='__VIEWSTATE']").first(); String value = ele.attr("value"); // 获取到viewState this.viewState = value; return value; } public Map<String, String> getCookies() { return cookies; } public void setCookies(Map<String, String> cookies) { this.cookies = cookies; } public String getViewState() { return viewState; } public void setViewState(String viewState) { this.viewState = viewState; } }
3.带着cookie爬取验证码,并下载到本地
package cn.qlq.craw.JsoupCrawJWXT; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.util.Map; import org.apache.commons.io.FileUtils; import org.jsoup.Connection; import org.jsoup.Jsoup; /** * Jsoup带着cookie下载验证码到本地(必须带着cookie下载验证码,否则下载的验证码无效) * * @author liqiang * */ public class JsoupDoloadPicture { /** * 带着cookie下载验证码图片 * * @param url * @param cookies * @throws IOException */ public static void downloadImg(String url, Map<String, String> cookies) throws IOException { // TODO Auto-generated method stub Connection connect = Jsoup.connect(url); connect.cookies(cookies);// 携带cookies爬取图片 connect.timeout(5 * 10000); Connection.Response response = connect.ignoreContentType(true).execute(); byte[] img = response.bodyAsBytes(); System.out.println(img.length); // 读取文件存储位置 String directory = ResourcesUtil.getValue("path", "file"); savaImage(img, directory, "yzm.png"); } /** * 保存图片到本地 * @param img * @param filePath * @param fileName */ public static void savaImage(byte[] img, String filePath, String fileName) { BufferedOutputStream bos = null; FileOutputStream fos = null; File file = null; File dir = new File(filePath); try { //判断文件目录是否存在 if(dir.exists() && !dir.isDirectory()){ FileUtils.deleteQuietly(dir); } dir.mkdir(); file = new File(filePath + "\\" + fileName); fos = new FileOutputStream(file); bos = new BufferedOutputStream(fos); bos.write(img); System.out.println("验证码已经下载到:"+filePath); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } finally { if (bos != null) { try { bos.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } if (fos != null) { try { fos.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } } }
4.登录类
package cn.qlq.craw.JsoupCrawJWXT; import java.util.Map; import java.util.Map.Entry; import java.util.Scanner; import org.jsoup.Connection; import org.jsoup.Connection.Method; import org.jsoup.Connection.Response; import org.jsoup.Jsoup; /** * 登录类(访问登录页面获取登录的cookie) * * @author liqiang * */ public class LoginClass { /** * 记录返回的cookie */ private Map<String, String> cookies = null; /** * 模拟登录获取cookie和sessionid * */ public void login(DownloadLoginfo downloadLoginfo, String xuehao, String mima) throws Exception { String urlLogin = "http://newjwc.tyust.edu.cn/default2.aspx"; Connection connect = Jsoup.connect(urlLogin); connect.timeout(5 * 100000); // 伪造请求头 connect.header("Content-Length", "213").header("Content-Type", "application/x-www-form-urlencoded"); connect.header("Host", "newjwc.tyust.edu.cn").header("Referer", "http://newjwc.tyust.edu.cn/xscjcx.aspx?xh=" + xuehao + "&xm=%C7%C7%C0%FB%C7%BF&gnmkdm=N121613"); connect.header("User-Agent", "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"); // 输入验证码 System.out.println("-----------请输入验证码---------"); Scanner sc = new Scanner(System.in); String yzm = sc.next(); sc.close(); // 携带登陆信息 connect.data("txtUserName", xuehao).data("__VIEWSTATE", downloadLoginfo.getViewState()).data("TextBox2", mima) .data("Textbox1", "").data("RadioButtonList1", "").data("Button1", "").data("lbLanguage", "") .data("hidPdrs", "").data("hidsc", "").data("txtSecretCode", yzm); connect.cookies(downloadLoginfo.getCookies()); // 请求url获取响应信息 Response res = connect.ignoreContentType(true).method(Method.POST).execute();// 执行请求 // 获取返回的cookie this.cookies = res.cookies(); for (Entry<String, String> entry : cookies.entrySet()) { System.out.println(entry.getKey() + "-" + entry.getValue()); } System.out.println("---------获取的登录之后的页面-----------"); String body = res.body();// 获取响应体 System.out.println(body); } public Map<String, String> getCookies() { return cookies; } public void 
setCookies(Map<String, String> cookies) { this.cookies = cookies; } }
5.爬取登录之后的主页和成绩
package cn.qlq.craw.JsoupCrawJWXT; import java.io.IOException; import java.util.Map; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; /** * 爬取成绩的类 * * @author liqiang * */ public class CrawGrade { private String viewState; /** * 全局获取viewstate的函数 * @param html * @return */ public String getViewState(String html){ Document document = Jsoup.parse(html); Element ele = document.select("input[name='__VIEWSTATE']").first(); String value = ele.attr("value"); this.viewState = value; // 获取到viewState return value; } /** * 爬取获取成绩的上一个页面(也就是刚登陆之后的页面) * @param cookies * @param viewStata * @param xuehao * @return * @throws IOException */ public String crawGradeLastPage(Map<String,String> cookies,String viewStata,String xuehao) throws IOException{ String urlLogin = "http://newjwc.tyust.edu.cn/xscjcx.aspx?xh="+xuehao+"&xm=%C7%C7%C0%FB%C7%BF&gnmkdm=N121613"; Connection connect = Jsoup.connect(urlLogin); connect.timeout(5 * 100000); // 伪造请求头 connect.header("Content-Length", "74642").header("Content-Type", "application/x-www-form-urlencoded"); connect.header("Host", "newjwc.tyust.edu.cn").header("Referer", "http://newjwc.tyust.edu.cn/xscjcx.aspx?xh=201420020123&xm=%C7%C7%C0%FB%C7%BF&gnmkdm=N121613"); connect.header("User-Agent", "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"); // 携带登陆信息 connect.data("xh","201420020123") .data("xm", viewStata) .data("hidLanguage", "") .data("gnmkdm", "N121613"); //设置cookie connect.cookies(cookies); Document document = connect.post(); System.out.println("-----------爬到的成绩的上一个页面--------------"); String html = document.toString(); System.out.println(html); // 重新获取到viewState this.getViewState(html); return html; } /** * 爬取成绩页面 */ public String crawGrade(String xuenian,String xueqi,Map<String,String> cookies,String viewStata,String xuehao) throws IOException{ String urlLogin = 
"http://newjwc.tyust.edu.cn/xscjcx.aspx?xh="+xuehao+"&xm=%C7%C7%C0%FB%C7%BF&gnmkdm=N121613"; Connection connect = Jsoup.connect(urlLogin); connect.timeout(5 * 100000); // 伪造请求头 connect.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8") .header("Accept-Encoding", "gzip, deflate"); connect.header("Accept-Language", "zh-CN,zh;q=0.9").header("Connection", "keep-alive"); connect.header("Content-Length", "74642").header("Content-Type", "application/x-www-form-urlencoded"); connect.header("Host", "newjwc.tyust.edu.cn").header("Referer", "http://newjwc.tyust.edu.cn/xscjcx.aspx?xh=201420020123&xm=%C7%C7%C0%FB%C7%BF&gnmkdm=N121613"); connect.header("User-Agent", "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"); // 携带登陆信息 connect.data("__EVENTTARGET","") .data("__EVENTARGUMENT", "") .data("__VIEWSTATE", this.viewState) .data("hidLanguage","") .data("ddlXN", xuenian) .data("ddlXQ", xueqi) .data("btn_xn", "") .data("ddl_kcxz", ""); connect.cookies(cookies); Document document = connect.post(); System.out.println("-----------爬到的成绩的页面--------------"); String html = document.toString(); //更新viewstate this.getViewState(html); System.out.println(html); return html; } public void setViewState(String viewState) { this.viewState = viewState; } }
6.收集成绩的类
package cn.qlq.craw.JsoupCrawJWXT; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; /** * 收集成绩与输出成绩 * * @author liqiang * */ @SuppressWarnings("all") public class GradeOutput { /** * 保存成绩的集合 */ private List<Map<String, Object>> datas; public GradeOutput() { this.datas = new ArrayList<Map<String, Object>>(); } /** * 收集成绩 * * @param html * @return */ public String collectGrade(String html) { // 解析html Document document = Jsoup.parse(html); // 获取成绩表格 Element table = document.select("#Datagrid1").first(); // 选择除表格表头之外的元素 Elements trs = table.select("tr:gt(0)"); for (Element ele : trs) { Map result = new LinkedHashMap(); Elements ele0 = ele.select("td:eq(0)");// 找到学年 result.put("xuenian", ele0.text()); Elements ele1 = ele.select("td:eq(1)");// 找到学期 result.put("xueqi", ele1.text()); Elements ele3 = ele.select("td:eq(3)");// 找到课程名称 result.put("kecheng", ele3.text()); Elements ele8 = ele.select("td:eq(8)");// 找到成绩 result.put("chengji", ele8.text()); this.datas.add(result); } return null; } /** * 输出成绩到控制台 */ public void outPutGrade() { if (this.datas == null || this.datas.size() == 0) { return; } System.out.println("-------下面是提取到的成绩--------"); for (Map result : datas) { System.out.println(result.get("xuenian") + "\t" + result.get("xueqi") + "\t" + result.get("kecheng") + "\t" + result.get("chengji") + "\t"); } } /** * 最后处理所有的数据,写出到html或者保存数据库 * * @throws IOException */ public void outputDatas2Html() throws IOException { if (datas != null && datas.size() > 0) { // 读取文件存储位置 String directory = ResourcesUtil.getValue("path", "file"); File file = new File(directory+"\\gradeOut.html"); // 如果文件不存在就创建文件 if (!file.exists()) { file.createNewFile(); } // 构造FileWriter用于向文件中输出信息(此构造方法可以接收file参数,也可以接收fileName参数) FileWriter fileWriter = 
new FileWriter(file); // 开始写入数据 fileWriter.write("<html>"); fileWriter.write("<head>"); fileWriter.write("<title>xxx成绩单</title>"); fileWriter .write("<style>table{width:100%;table-layout: fixed;word-break: break-all; word-wrap: break-word;}" + "table td{border:1px solid black;width:300px}</style>"); fileWriter.write("</head>"); fileWriter.write("<body>"); fileWriter.write("<table cellpadding='0' cellspacing='0' style='text-align:center;'>"); fileWriter.write( "<tr style='background-color:#95caca;font-size:20px'><td>学年</td><td>学期</td><td>课程名字</td><td>成绩</td></tr>"); for (Map<String, Object> data : datas) { String xuenian = (String) data.get("xuenian"); String xueqi = (String) data.get("xueqi"); String kecheng = (String) data.get("kecheng"); String chengji = (String) data.get("chengji"); fileWriter.write("<tr>"); fileWriter.write("<td>" + xuenian + "</td>"); fileWriter.write("<td>" + xueqi + "</td>"); fileWriter.write("<td>" + kecheng + "</td>"); fileWriter.write("<td>" + chengji + "</td>"); fileWriter.write("</tr>"); } fileWriter.write("</table>"); fileWriter.write("</body>"); fileWriter.write("</html>"); // 关闭文件流 fileWriter.close(); } } public List<Map<String, Object>> getDatas() { return datas; } public void setDatas(List<Map<String, Object>> datas) { this.datas = datas; } }
path.properties (设置验证码图片和最后的成绩单输出到哪个位置)
#fileToSave
#yzm
file=C:\\Users\\liqiang\\Desktop
读取上述配置文件的工具类:
package cn.qlq.craw.JsoupCrawJWXT;

import java.io.Serializable;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.ResourceBundle;
import java.util.Set;

/**
 * Helper for reading values from resource bundles (properties files on the
 * classpath).
 */
public class ResourcesUtil implements Serializable {

    private static final long serialVersionUID = -7657898714983901418L;

    /** Language used for every bundle lookup. */
    public static final String LANGUAGE = "zh";

    /** Country used for every bundle lookup. */
    public static final String COUNTRY = "CN";

    /** Builds the locale from {@link #LANGUAGE} and {@link #COUNTRY}. */
    private static Locale getLocale() {
        return new Locale(LANGUAGE, COUNTRY);
    }

    /**
     * Looks up one key in a resource bundle.
     *
     * @param baseName base name of the resource bundle
     * @param section  key to look up
     * @return the value, or the empty string when the lookup fails (the
     *         failure is printed as a stack trace)
     */
    private static String getProperties(String baseName, String section) {
        try {
            ResourceBundle bundle = ResourceBundle.getBundle(baseName, getLocale());
            return bundle.getString(section);
        } catch (Exception e) {
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Reads the value stored under a key of a resource bundle.
     *
     * @param fileName base name of the resource bundle
     * @param key      key to look up
     * @return the value, or the empty string when the lookup fails
     */
    public static String getValue(String fileName, String key) {
        return getProperties(fileName, key);
    }

    /**
     * Lists all keys of a resource bundle.
     *
     * @param baseName base name of the resource bundle
     * @return a new list holding every key of the bundle
     */
    public static List<String> gekeyList(String baseName) {
        ResourceBundle bundle = ResourceBundle.getBundle(baseName, getLocale());
        List<String> keys = new ArrayList<String>();
        keys.addAll(bundle.keySet());
        return keys;
    }

    /**
     * Reads the value stored under a key and formats it as a
     * {@link MessageFormat} pattern.
     *
     * @param fileName base name of the resource bundle
     * @param key      key to look up
     * @param objs     arguments for the pattern
     * @return the formatted value
     */
    public static String getValue(String fileName, String key, Object[] objs) {
        return MessageFormat.format(getValue(fileName, key), objs);
    }

    /** Small manual check: formats key 101 of bundle resources.messages and prints it. */
    public static void main(String[] args) {
        Object[] params = new Object[] { 100, 200 };
        String formatted = getValue("resources.messages", "101", params);
        System.out.println(formatted);
    }
}
git地址:https://github.com/qiao-zhi/javaCraw