Jsoup模拟表单提交爬取数据
package com.wl.getInfo;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import cn.hutool.core.text.StrBuilder;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.Connection.Method;
import org.jsoup.Connection.Response;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Scrapes the specification sheet of a phone model from phonedb.net with Jsoup.
 *
 * The flow is three HTTP requests: load the device list page to obtain the search
 * form and cookies, POST the form with the keyword filled in, then open the first
 * result's detail page and read its specification table.
 */
public class PhoneInfo {
    public static String LOGIN_URL = "https://phonedb.net/index.php?m=device&s=list";
    public static String PHONE_URL = "https://phonedb.net/";
    public static String USER_AGENT = "User-Agent";
    public static String USER_AGENT_VALUE = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0";

    public static void main(String[] args) throws Exception {
        simulateLogin("SHV47");
    }

    /**
     * Searches phonedb.net for {@code keyword} and extracts the first result's specifications.
     *
     * @param keyword model name or code typed into the site's {@code search_exp} field
     * @return a map with the keys Brand, Model, os, screen, ppi, RAM_GB, chipset, GPU and
     *         released for every row found on the detail page; empty when the search
     *         produced no result link
     * @throws Exception if any of the HTTP requests fails
     */
    public static Map<String, String> simulateLogin(String keyword) throws Exception {
        // First request: fetch the page that holds the search form (and the session cookies).
        Connection con = Jsoup.connect(LOGIN_URL);
        con.header(USER_AGENT, USER_AGENT_VALUE);
        Response rs = con.execute();
        Document d1 = Jsoup.parse(rs.body());

        // Collect every named form field. Each element is inspected individually:
        // calling attr() on the whole Elements collection only ever reads the first
        // element that carries the attribute, which would drop all other fields.
        Map<String, String> datas = new HashMap<>();
        for (Element form : d1.select("form")) {
            for (Element field : form.getAllElements()) {
                String name = field.attr("name");
                if (name.isEmpty()) {
                    continue; // skip elements that are not submittable fields
                }
                if (name.equals("search_exp")) {
                    field.attr("value", keyword);
                }
                datas.put(name, field.attr("value"));
            }
        }

        // Second request: POST the form data together with the cookies of the first response.
        Connection con2 = Jsoup.connect(LOGIN_URL);
        con2.header(USER_AGENT, USER_AGENT_VALUE);
        Response response = con2.ignoreContentType(true)
                .followRedirects(true)
                .method(Method.POST)
                .data(datas)
                .cookies(rs.cookies())
                .execute();
        Document d = Jsoup.parse(response.body());

        // Every result title links to a detail page. All links are printed, but only the
        // first one is followed; concatenating several hrefs would yield an invalid URL.
        String targetUrl = "";
        for (Element title : d.getElementsByClass("content_block_title")) {
            String href = title.getElementsByTag("a").eq(0).attr("href");
            System.out.println(href);
            if (targetUrl.isEmpty()) {
                targetUrl = href;
            }
        }

        Map<String, String> map = new HashMap<>();
        if (targetUrl.isEmpty()) {
            return map; // nothing matched, so there is no detail page to load
        }

        // Third request: load the detail page. The header must go on con3, not con2.
        Connection con3 = Jsoup.connect(PHONE_URL + targetUrl);
        con3.header(USER_AGENT, USER_AGENT_VALUE);
        Response rs3 = con3.execute();
        Document d3 = Jsoup.parse(rs3.body());

        // Each table row is identified by the text of its first <strong> element.
        for (Element row : d3.select("tr")) {
            String label = row.getElementsByTag("strong").eq(0).text();
            switch (label) {
                case "Brand":
                    map.put("Brand", secondLinkText(row));
                    break;
                case "Model":
                    map.put("Model", FilterText("Model", row.text()));
                    break;
                case "Platform":
                    map.put("os", secondLinkText(row));
                    break;
                case "Resolution":
                    map.put("screen", FilterText("Resolution", row.text()));
                    break;
                case "Pixel Density":
                    map.put("ppi", secondLinkText(row));
                    break;
                case "RAM Capacity (converted)":
                    map.put("RAM_GB", secondLinkText(row));
                    break;
                case "CPU":
                    map.put("chipset", secondLinkText(row));
                    break;
                case "Graphical Controller":
                    map.put("GPU", secondLinkText(row));
                    break;
                case "Released":
                    map.put("released", FilterText("Released", row.text()));
                    break;
                default:
                    break;
            }
        }
        return map;
    }

    /** Returns the text of the second anchor in the row, or an empty string if there is none. */
    private static String secondLinkText(Element row) {
        return row.getElementsByTag("a").eq(1).text();
    }

    /** Removes every literal occurrence of {@code target} from {@code text} and trims the result. */
    private static String FilterText(String target, String text) {
        // replace() treats target literally; replaceAll() would interpret it as a regex.
        return text.replace(target, "").trim();
    }
}
由于这个网站爬取的速度较慢,并且在爬取的过程中可能会出现异常导致数据丢失,于是我使用了多线程加上异常处理。
改进后的工具类如下:
package com.oasis.mdata.services
import com.oasis.mdata.entities.GameInfo
import com.oasis.mdata.entities.PhoneInfo
import org.jsoup.Connection
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import java.util.function.Consumer
/**
 * Jsoup helpers that look up a phone model on phonedb.net and copy its
 * specification sheet into a [PhoneInfo].
 *
 * @author 没有梦想的java菜鸟
 * @date 2022/01/21 11:05 AM
 */
class JsoupUtils {
    companion object {
        // Scrape targets and request headers.
        val Phone_Url = "https://phonedb.net/index.php?m=device&s=list"
        val PHONE_URL = "https://phonedb.net/"
        val Game_URL = "https://www.qimai.cn/rank/index/brand/grossing/device/iphone/country/us/genre/6014/date"
        val USER_AGENT = "User-Agent"

        // Alternative value kept for reference:
        // "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0"
        val USER_AGENT_VALUE =
            "Mozilla/5.0 (Windows; U; WindowsNT 5.1; en-US; rv1.8.1.6) Gecko/20070725 Firefox/2.0.0.6"

        /**
         * Searches phonedb.net for [keyword] and fills a [PhoneInfo] from the matching detail page.
         *
         * The returned object always has `to_link` set to [keyword]; `status` is
         * - `"0"` when the search returned exactly one result (its page is scraped),
         * - `"1"` when several results were returned and one whose link text contains
         *   [keyword] was found (that page is scraped),
         * - `"2"` when nothing was returned or none of the results matched.
         *
         * Network failures propagate as exceptions from Jsoup.
         */
        fun getPhoneinfo(keyword: String?): PhoneInfo {
            // First request: load the page carrying the search form and the cookies.
            val conn: Connection = Jsoup.connect(Phone_Url)
            conn.header(USER_AGENT, USER_AGENT_VALUE)
            val phoneInfo = PhoneInfo()
            val response = conn.execute()
            val dom: Document = Jsoup.parse(response.body())

            // Collect every named form field. Each element is inspected on its own:
            // attr() on the whole Elements collection only reads the first element
            // that carries the attribute, which would drop all other fields.
            val datas = HashMap<String, String>()
            for (form in dom.select("form")) {
                for (field in form.allElements) {
                    val name = field.attr("name")
                    if (name.isEmpty()) continue // not a submittable field
                    if (name == "search_exp") {
                        // A null keyword is submitted as an empty search expression.
                        field.attr("value", keyword.orEmpty())
                    }
                    datas[name] = field.attr("value")
                }
            }

            // Second request: submit the form with the cookies of the first response.
            val con2 = Jsoup.connect(Phone_Url)
            con2.header(USER_AGENT, USER_AGENT_VALUE)
            val response2 = con2.ignoreContentType(true)
                .followRedirects(true)
                .method(Connection.Method.POST)
                .data(datas)
                .cookies(response.cookies())
                .execute()
            val document = Jsoup.parse(response2.body())

            // Each result is a title block whose first anchor links to the detail page.
            val titles = document.getElementsByClass("content_block_title")
            val href: String? = when {
                titles.size == 1 -> firstLink(titles[0]).attr("href")
                // Several results: take the first one whose link text contains the keyword.
                // Every candidate is checked, not only the first.
                titles.size > 1 -> titles
                    .map { firstLink(it) }
                    .firstOrNull { link -> keyword != null && link.text().contains(keyword) }
                    ?.attr("href")
                else -> null
            }

            if (href != null) {
                Execute(PHONE_URL + href, phoneInfo)
                phoneInfo.status = if (titles.size == 1) "0" else "1"
            } else {
                phoneInfo.status = "2"
            }
            phoneInfo.to_link = keyword
            return phoneInfo
        }

        /** Returns the first anchor inside a result title block (possibly empty). */
        private fun firstLink(title: Element) = title.getElementsByTag("a").eq(0)

        /** Returns the text of the second anchor in a table row, or "" if there is none. */
        private fun secondLinkText(row: Element): String = row.getElementsByTag("a").eq(1).text()

        /**
         * Turns release text that starts with a four-character year into that year
         * followed by a quarter tag; for example "2020 Mar" becomes "2020Q1".
         * Text shorter than four characters is returned trimmed instead of throwing.
         */
        private fun CheckDate(values: String): String {
            if (values.length < 4) return values.trim()
            // Whatever follows the year is treated as the month part.
            val month = values.replace(values.substring(0, 4), "").trim()
            val sb = StringBuilder(values.replace(month, "").trim())
            CheckQuarter(month, sb)
            return sb.toString()
        }

        /** Removes every literal occurrence of [target] from [text] and trims the result. */
        private fun FilterText(target: String, text: String): String = text.replace(target, "").trim()

        /**
         * Third request: loads the detail page at [phoneUrl] and copies the recognised
         * table rows into [phoneInfo]. Rows are identified by the text of their first
         * `<strong>` element; unrecognised rows are ignored.
         */
        private fun Execute(phoneUrl: String, phoneInfo: PhoneInfo) {
            val conn3 = Jsoup.connect(phoneUrl)
            conn3.header(USER_AGENT, USER_AGENT_VALUE)
            val document3 = Jsoup.parse(conn3.execute().body())
            for (row in document3.select("tr")) {
                when (row.getElementsByTag("strong").eq(0).text()) {
                    "Brand" -> {
                        phoneInfo.brand = secondLinkText(row)
                    }
                    "Model" -> {
                        phoneInfo.model = FilterText("Model", row.text())
                    }
                    "Platform" -> {
                        phoneInfo.os = secondLinkText(row)
                    }
                    "Resolution" -> {
                        phoneInfo.screen = FilterText("Resolution", row.text())
                    }
                    "Pixel Density" -> {
                        phoneInfo.ppi = FilterText("PPI", secondLinkText(row))
                    }
                    "RAM Capacity (converted)" -> {
                        phoneInfo.ram_gb = FilterText("GiB RAM", secondLinkText(row))
                    }
                    "CPU" -> {
                        // Only the part before the first comma is kept.
                        phoneInfo.chipset = secondLinkText(row).split(",")[0]
                    }
                    "Graphical Controller" -> {
                        phoneInfo.GPU = secondLinkText(row)
                    }
                    "Released" -> {
                        phoneInfo.released = CheckDate(FilterText("Released", row.text()))
                    }
                }
            }
        }

        /**
         * Appends "Q1".."Q4" to [sb] according to the first English month abbreviation
         * found in [month]; appends nothing when no abbreviation is present.
         */
        private fun CheckQuarter(month: String, sb: StringBuilder) {
            val quarter = when {
                listOf("Jan", "Feb", "Mar").any { it in month } -> "Q1"
                listOf("Apr", "May", "Jun").any { it in month } -> "Q2"
                listOf("Jul", "Aug", "Sep").any { it in month } -> "Q3"
                listOf("Oct", "Nov", "Dec").any { it in month } -> "Q4"
                else -> return
            }
            sb.append(quarter)
        }
    }
}
我将需要爬取的关键词放到 phones.txt 中,然后用代码将其中的关键词分割成等份的 txt 文件,并把生成的 txt 文件名记录到 lastResolve.txt 中,再由多线程分别读取 lastResolve.txt 中列出的各个 txt 文件并处理其内容。
分割文件的类
package com.oasis.mdata.services
import java.io.BufferedReader
import java.io.FileReader
import java.io.FileWriter
import java.io.PrintWriter
import java.util.*
import kotlin.collections.ArrayList
/**
 * Splits a keyword file into several smaller chunk files so that each chunk can be
 * processed by its own worker.
 *
 * @author 没有梦想的java菜鸟
 * @date 2022/02/28 6:10 PM
 */
class SubFile {
    /**
     * Reads [filePath] line by line, splits the lines into chunks of `lines / 6`
     * lines each (at least one line per chunk), writes every chunk to a randomly
     * named `.txt` file under `src/main/resources/`, and lists the generated file
     * names, one per line, in `src/main/resources/lastResolve.txt`.
     *
     * @return the number of chunk files that were written (0 for an empty input)
     */
    fun subFile(filePath: String): Int {
        // use {} closes the reader even if reading fails.
        val lines = BufferedReader(FileReader(filePath)).use { it.readLines() }.toMutableList()

        // With fewer than 6 lines the integer division yields 0, which would divide
        // by zero further down; fall back to one line per chunk.
        val chunkSize = maxOf(1, lines.size / 6)
        val chunks = subList(lines, chunkSize)

        val chunkFileNames = chunks.map { chunk ->
            val fileName = UUID.randomUUID().toString().substring(0 until 10) + ".txt"
            PrintWriter(FileWriter("src/main/resources/$fileName"), true).use { writer ->
                chunk.forEach { writer.println(it) }
            }
            fileName
        }

        // Index of the chunk files that still have to be processed.
        PrintWriter(FileWriter("src/main/resources/lastResolve.txt"), true).use { writer ->
            chunkFileNames.forEach { writer.println(it) }
        }
        return chunkFileNames.size
    }

    /**
     * Partitions [subList] into consecutive slices of [subNumber] elements; the last
     * slice holds the remainder and is omitted when it would be empty. The slices are
     * views backed by [subList], not copies.
     *
     * @throws IllegalArgumentException if [subNumber] is not positive
     */
    fun subList(subList: MutableList<String>, subNumber: Int): MutableList<MutableList<String>> {
        require(subNumber > 0) { "subNumber must be positive, was $subNumber" }
        val slices = ArrayList<MutableList<String>>()
        var begin = 0
        while (begin < subList.size) {
            val end = minOf(begin + subNumber, subList.size)
            slices.add(subList.subList(begin, end))
            begin = end
        }
        return slices
    }
}
// Manual entry point: splits the sample keyword file into chunk files.
fun main() {
    val splitter = SubFile()
    splitter.subFile("src/main/resources/test.txt")
}
多线程处理的类
package com.oasis.mdata.services
import com.alibaba.excel.EasyExcel
import com.oasis.mdata.entities.PhoneInfo
import com.oasis.mdata.services.JsoupUtils
import java.io.BufferedReader
import java.io.File
import java.io.FileReader
import java.util.*
import java.util.concurrent.*
import kotlin.collections.ArrayList
/**
 * Empty placeholder class; the driver logic of this file lives in top-level functions.
 *
 * @author 没有梦想的java菜鸟
 * @date 2022/02/22 2:00 PM
 */
class Tests
// Shared worker pool: 8 core threads, at most 10 threads, 20 s keep-alive for the
// extra threads and a bounded queue of 10 tasks. AbortPolicy rejects further
// submissions with an exception once both the pool and the queue are full.
val threadPool: ThreadPoolExecutor = ThreadPoolExecutor(
    8, 10,
    20L, TimeUnit.SECONDS,
    LinkedBlockingQueue<Runnable>(10),
    Executors.defaultThreadFactory(),
    ThreadPoolExecutor.AbortPolicy()
)
/**
 * Driver: splits `phones.txt` into chunk files, scrapes every chunk on the shared
 * thread pool, merges the per-chunk Excel files into `手机信息.xlsx` and removes
 * all temporary files.
 */
fun main() {
    val beginTime = Date()
    // Split the main keyword file into several txt files, one per worker task.
    SubFile().subFile("src/main/resources/phones.txt")
    // use {} closes the reader even if reading fails.
    val chunkFiles = BufferedReader(FileReader("src/main/resources/lastResolve.txt")).use { it.readLines() }

    // Thread-safe list holding the paths of the temporary Excel files.
    val excelList = CopyOnWriteArrayList<String>()
    // One count per submitted task, so await() returns exactly when all of them are done.
    val countDownLatch = CountDownLatch(chunkFiles.size)
    try {
        for (chunkFile in chunkFiles) {
            // Stagger the submissions slightly, as the original driver did.
            Thread.sleep(10)
            read(chunkFile, excelList, countDownLatch)
        }
        countDownLatch.await()
    } finally {
        // Without shutdown() the non-daemon core threads keep the JVM alive forever.
        threadPool.shutdown()
    }

    // Delete the temporary chunk files.
    chunkFiles.forEach { File("src/main/resources/$it").delete() }

    val fileName = "src/main/resources/手机信息.xlsx"
    val listener = Listener()
    // Feed every temporary Excel file through the listener, then delete it.
    excelList.forEach { tempExcel ->
        EasyExcel.read(tempExcel, PhoneInfo::class.java, listener).sheet().doRead()
        File(tempExcel).delete()
    }
    println(excelList.size)
    EasyExcel.write(fileName, PhoneInfo::class.java).sheet("PhoneInfo").doWrite(listener.list)
    println("程序运行结束")
    val endTime = Date()
    val time = (endTime.time - beginTime.time) / 1000
    println("程序共耗时 $time 秒")
}
/**
 * Submits a task to the shared thread pool that scrapes every keyword listed in the
 * chunk file [txt] (resolved under `src/main/resources/`) and writes the results to
 * a temporary Excel file.
 *
 * Keywords whose lookup throws are retried in further passes until they succeed or
 * [maxRounds] passes have been made; by default the number of passes is unbounded.
 * Keywords still failing after the last pass are logged and left out.
 *
 * @param txt name of the chunk file to process
 * @param excelList receives the path of the temporary Excel file once it has been written
 * @param countDownLatch counted down exactly once when the task ends, even if it fails
 * @param maxRounds maximum number of passes over the keywords that are still failing
 */
fun read(
    txt: String?,
    excelList: MutableList<String>,
    countDownLatch: CountDownLatch,
    maxRounds: Int = Int.MAX_VALUE
) {
    threadPool.execute {
        try {
            val beginTime = Date()
            val path = "src/main/resources/"
            // The timestamp alone can repeat across workers; the UUID keeps names unique.
            val fileName = path + System.currentTimeMillis() + "-" + UUID.randomUUID() + "手机信息.xlsx"

            // use {} closes the reader even if reading fails.
            val pending = BufferedReader(FileReader("$path$txt")).use { it.readLines() }.toMutableList()
            val results = ArrayList<PhoneInfo>()

            // Retry loop: a keyword stays in `pending` until its lookup succeeds.
            var round = 0
            while (pending.isNotEmpty() && round < maxRounds) {
                round++
                val iterator = pending.iterator()
                while (iterator.hasNext()) {
                    val info = iterator.next()
                    try {
                        results.add(JsoupUtils.getPhoneinfo(info))
                        iterator.remove()
                        println(info)
                    } catch (e: Exception) {
                        // Keep the cause visible so permanent failures can be diagnosed.
                        println("$info 异常: $e")
                    }
                }
            }
            if (pending.isNotEmpty()) {
                println("giving up on $pending after $round rounds")
            }

            // Write this task's own temporary file, and only then publish its path, so
            // the caller never tries to merge a file that was not written.
            EasyExcel.write(fileName, PhoneInfo::class.java).sheet("PhoneInfo").doWrite(results)
            excelList.add(fileName)

            val endTime = Date()
            val time = (endTime.time - beginTime.time) / 1000
            println("读取$txt 共耗费 $time 秒")
        } finally {
            // Always release the latch; otherwise a failed task blocks the caller forever.
            countDownLatch.countDown()
        }
    }
}
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· winform 绘制太阳,地球,月球 运作规律
· AI与.NET技术实操系列(五):向量存储与相似性搜索在 .NET 中的实现
· 超详细:普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 上周热点回顾(3.3-3.9)