java爬虫(八)使用node.js获取network中api接口内信息并用java的jsoup重写该方法
1.在电脑上安装node.js(官网:https://nodejs.org)
2.在浏览器中(我用了一个谷歌内核的浏览器)打开开发者工具的Network面板,找到目标api请求的url 右键-->Copy-->Copy as Node.js fetch
(打开网页的审查元素后如果找不到url,尝试刷新页面)
3.将代码粘贴进js文件中(我用的记事本)需要对代码进行简单的修改
修改的内容为:头部定义fetch变量,尾部输出结果,代码如下:
粘贴出来的代码:
// Request copied from the browser's DevTools network panel, reformatted for readability.
// It POSTs a form-encoded query (page 1, 10 items per page) to the consulting-list endpoint.
fetch("http://ehall.tjut.edu.cn/publicapp/sys/zxzxapp/consult/queryConsultingList.do", {
  headers: {
    accept: "*/*",
    "accept-language": "zh-CN,zh;q=0.9",
    "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
    "x-requested-with": "XMLHttpRequest",
    // Cookie header captured together with the request in the browser.
    cookie: "EMAP_LANG=zh; _WEU=0lGU9nUPyZ9qx*Rn4K9rs02ZG7l70bbzRFO5mUgwGPNPYxv1E*sNledwwPblS1lfd5Ik_YiYW3vIpD3LYICxSVT8oAgUTnE3MSHiJQzRFU7hbwhsp2gIy0OWEnvRY2eX8lSq0pRdz_2.; iPlanetDirectoryPro=kAXrTrPpkhEvxffxPsHPss; amp.locale=undefined; route=8da53839b22816a2e9746dc2f57870c1; MOD_AUTH_CAS=MOD_AUTH_ST-1453563-6C0UWnfbrqXCCeeFLV9u1609300648527-21wH-cas; zg_did=%7B%22did%22%3A%20%22174575a6eca420-0b1ea05958cdee-51a2f73-1fa400-174575a6ecb376%22%7D; zg_=%7B%22sid%22%3A%201609300649292%2C%22updated%22%3A%201609300649297%2C%22info%22%3A%201608798758280%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22ehall.tjut.edu.cn%22%2C%22cuid%22%3A%20%22203128301%22%2C%22zs%22%3A%200%2C%22sc%22%3A%200%2C%22firstScreen%22%3A%201609300649292%7D; asessionid=f025e83f-cdc2-4206-8bef-d521c5dfb7d2; JSESSIONID=8nexyyMRuvuF9w5FdY-wnfhPlVhKKLLKtmAHz5m3FQur3psQlbvJ!1969776676",
  },
  referrer: "http://ehall.tjut.edu.cn/publicapp/sys/zxzxapp/index.do",
  referrerPolicy: "no-referrer-when-downgrade",
  body: "consultZone=ALL&search=&consultState=0&pageNumber=1&pageSize=10",
  method: "POST",
  mode: "cors",
});
修改后的代码:
// node-fetch is a third-party package. require() only works with its CommonJS
// release (node-fetch 2.x); node-fetch 3.x is published as an ES module only.
const fetch = require('node-fetch');

// Replay the request captured in the browser and print the decoded JSON response.
fetch("http://ehall.tjut.edu.cn/publicapp/sys/zxzxapp/consult/queryConsultingList.do", {
  "headers": {
    "accept": "*/*",
    "accept-language": "zh-CN,zh;q=0.9",
    "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
    "x-requested-with": "XMLHttpRequest",
    // Cookie header captured together with the request in the browser.
    "cookie": "EMAP_LANG=zh; _WEU=0lGU9nUPyZ9qx*Rn4K9rs02ZG7l70bbzRFO5mUgwGPNPYxv1E*sNledwwPblS1lfd5Ik_YiYW3vIpD3LYICxSVT8oAgUTnE3MSHiJQzRFU7hbwhsp2gIy0OWEnvRY2eX8lSq0pRdz_2.; iPlanetDirectoryPro=kAXrTrPpkhEvxffxPsHPss; amp.locale=undefined; route=8da53839b22816a2e9746dc2f57870c1; MOD_AUTH_CAS=MOD_AUTH_ST-1453563-6C0UWnfbrqXCCeeFLV9u1609300648527-21wH-cas; zg_did=%7B%22did%22%3A%20%22174575a6eca420-0b1ea05958cdee-51a2f73-1fa400-174575a6ecb376%22%7D; zg_=%7B%22sid%22%3A%201609300649292%2C%22updated%22%3A%201609300649297%2C%22info%22%3A%201608798758280%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22ehall.tjut.edu.cn%22%2C%22cuid%22%3A%20%22203128301%22%2C%22zs%22%3A%200%2C%22sc%22%3A%200%2C%22firstScreen%22%3A%201609300649292%7D; asessionid=f025e83f-cdc2-4206-8bef-d521c5dfb7d2; JSESSIONID=8nexyyMRuvuF9w5FdY-wnfhPlVhKKLLKtmAHz5m3FQur3psQlbvJ!1969776676"
  },
  "referrer": "http://ehall.tjut.edu.cn/publicapp/sys/zxzxapp/index.do",
  "referrerPolicy": "no-referrer-when-downgrade",
  "body": "consultZone=ALL&search=&consultState=0&pageNumber=1&pageSize=10",
  "method": "POST",
  "mode": "cors"
})
  .then((res) => res.json())
  .then((json) => console.log(json))
  // Report network failures and non-JSON responses instead of leaving the rejection unhandled.
  .catch((err) => console.error(err));
4.运行代码
(初次使用会报找不到node-fetch这个包,直接在cmd中用命令安装即可:npm install node-fetch@2。注意:node-fetch 3.x只支持ES module,不能用require引入,所以这里安装2.x版本)
cmd命令行中有两种node.js的运行方式
第一种:用node命令进入环境 然后逐句编写运行
第二种:直接运行写好的node.js文件
在cmd中使用命令 node 文件名.js 即可
5.只用jsoup重写该方法
通过不停地注释代码查看运行结果,我们发现了很多冗余参数,经过删减后的node.js代码如下
// node-fetch is a third-party package. require() only works with its CommonJS
// release (node-fetch 2.x); node-fetch 3.x is published as an ES module only.
const fetch = require('node-fetch');

// Trimmed-down request: only the content type, the cookie and the paging fields are sent.
fetch("http://ehall.tjut.edu.cn/publicapp/sys/zxzxapp/consult/queryConsultingList.do", {
  "headers": {
    "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
    // Cookie header captured together with the request in the browser.
    "cookie": "EMAP_LANG=zh; _WEU=0lGU9nUPyZ9qx*Rn4K9rs02ZG7l70bbzRFO5mUgwGPNPYxv1E*sNledwwPblS1lfd5Ik_YiYW3vIpD3LYICxSVT8oAgUTnE3MSHiJQzRFU7hbwhsp2gIy0OWEnvRY2eX8lSq0pRdz_2.; iPlanetDirectoryPro=kAXrTrPpkhEvxffxPsHPss; amp.locale=undefined; route=8da53839b22816a2e9746dc2f57870c1; MOD_AUTH_CAS=MOD_AUTH_ST-1453563-6C0UWnfbrqXCCeeFLV9u1609300648527-21wH-cas; zg_did=%7B%22did%22%3A%20%22174575a6eca420-0b1ea05958cdee-51a2f73-1fa400-174575a6ecb376%22%7D; zg_=%7B%22sid%22%3A%201609300649292%2C%22updated%22%3A%201609300649297%2C%22info%22%3A%201608798758280%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22ehall.tjut.edu.cn%22%2C%22cuid%22%3A%20%22203128301%22%2C%22zs%22%3A%200%2C%22sc%22%3A%200%2C%22firstScreen%22%3A%201609300649292%7D; asessionid=f025e83f-cdc2-4206-8bef-d521c5dfb7d2; JSESSIONID=8nexyyMRuvuF9w5FdY-wnfhPlVhKKLLKtmAHz5m3FQur3psQlbvJ!1969776676"
  },
  "body": "consultState=0&pageNumber=1&pageSize=10",
  "method": "POST",
})
  .then((res) => res.json())
  .then((json) => console.log(json))
  // Report network failures and non-JSON responses instead of leaving the rejection unhandled.
  .catch((err) => console.error(err));
使用jsoup转写后如下:
转写过程中遇到的问题:
1.请求中的form data(即node.js代码里body中的各个参数)需要用.data( )逐项进行赋值
2.报错:Jsoup Unhandled content type 原因是响应的Content-Type(例如application/json)不属于jsoup默认支持解析的类型(text/*、application/xml等)
解决方法:在连接上调用 .ignoreContentType(true)(这是jsoup连接的设置项,并不是请求头)
package debug; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; public class Myhttpclient { public static void querryhtml(String loginUrl) throws Exception{ Document document = Jsoup.connect(loginUrl) // 手动设置cookies .header("Content-Type","application/x-www-form-urlencoded; charset=UTF-8") .ignoreContentType(true) .header("Cookie","EMAP_LANG=zh; _WEU=0lGU9nUPyZ9qx*Rn4K9rs02ZG7l70bbzRFO5mUgwGPNPYxv1E*sNledwwPblS1lfd5Ik_YiYW3vIpD3LYICxSVT8oAgUTnE3MSHiJQzRFU7hbwhsp2gIy0OWEnvRY2eX8lSq0pRdz_2.; iPlanetDirectoryPro=kAXrTrPpkhEvxffxPsHPss; amp.locale=undefined; route=8da53839b22816a2e9746dc2f57870c1; MOD_AUTH_CAS=MOD_AUTH_ST-1453563-6C0UWnfbrqXCCeeFLV9u1609300648527-21wH-cas; zg_did=%7B%22did%22%3A%20%22174575a6eca420-0b1ea05958cdee-51a2f73-1fa400-174575a6ecb376%22%7D; zg_=%7B%22sid%22%3A%201609300649292%2C%22updated%22%3A%201609300649297%2C%22info%22%3A%201608798758280%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22ehall.tjut.edu.cn%22%2C%22cuid%22%3A%20%22203128301%22%2C%22zs%22%3A%200%2C%22sc%22%3A%200%2C%22firstScreen%22%3A%201609300649292%7D; asessionid=f025e83f-cdc2-4206-8bef-d521c5dfb7d2; JSESSIONID=8nexyyMRuvuF9w5FdY-wnfhPlVhKKLLKtmAHz5m3FQur3psQlbvJ!1969776676") // .header("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198Safari/537.36") .data("consultState","0") .data("pageSize","10") .data("pageNumber","1") .post(); System.out.println(document); } }
运行结果:
6.目前唯一的问题就是cookies的有效时间问题,解决方法:通过定时登录主页获取最新的cookies,再传递给该api