Live2D

Jsoup模拟表单提交爬取数据

package com.wl.getInfo;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import cn.hutool.core.text.StrBuilder;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.Connection.Method;
import org.jsoup.Connection.Response;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;


/**
 * Scrapes device specifications from phonedb.net by replaying the site's search form with jsoup.
 */
public class PhoneInfo {

    public static String LOGIN_URL = "https://phonedb.net/index.php?m=device&s=list";
    public static String PHONE_URL = "https://phonedb.net/";
    public static String USER_AGENT = "User-Agent";
    public static String USER_AGENT_VALUE = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0";

    public static void main(String[] args) throws Exception {
        simulateLogin("SHV47");
    }

    /**
     * Searches the device list for {@code keyword} and extracts the specification of the first hit.
     *
     * <p>Three requests are made: a GET that fetches the search form and cookies, a POST that
     * submits the form with {@code keyword} in the {@code search_exp} field, and a GET of the
     * first result's detail page.
     *
     * @param keyword search expression placed into the {@code search_exp} form field
     * @return the extracted values under the keys {@code Brand}, {@code Model}, {@code os},
     *         {@code screen}, {@code ppi}, {@code RAM_GB}, {@code chipset}, {@code GPU} and
     *         {@code released}; rows missing from the page are simply absent, and the map is
     *         empty when the search produced no result link
     * @throws Exception if any of the HTTP requests fails
     */
    public static Map<String,String> simulateLogin(String keyword) throws Exception {

        // First request: fetch the page that carries the search form (and the session cookies).
        Connection con = Jsoup.connect(LOGIN_URL);
        con.header(USER_AGENT, USER_AGENT_VALUE);   // pretend to be a browser
        Response rs = con.execute();
        Document d1 = Jsoup.parse(rs.body());

        // Collect every named form field. The attributes are read per element: calling attr()
        // on the whole Elements collection would only ever see the first named element and
        // would overwrite the value of every element in the form.
        Map<String, String> datas = new HashMap<>();
        for (Element form : d1.select("form")) {
            for (Element field : form.getAllElements()) {
                String name = field.attr("name");
                if (name.isEmpty()) {
                    continue;                       // skip elements without a name
                }
                if (name.equals("search_exp")) {
                    field.attr("value", keyword);
                }
                datas.put(name, field.attr("value"));
            }
        }

        // Second request: POST the form data together with the cookies of the first response.
        Connection con2 = Jsoup.connect(LOGIN_URL);
        con2.header(USER_AGENT, USER_AGENT_VALUE);
        Response response = con2.ignoreContentType(true)
                .followRedirects(true)
                .method(Method.POST)
                .data(datas)
                .cookies(rs.cookies())
                .execute();
        Document d = Jsoup.parse(response.body());

        // Print every result link, but follow only the first one: concatenating several hrefs
        // would not form a valid URL.
        String targetUrl = "";
        for (Element element : d.getElementsByClass("content_block_title")) {
            String href = element.getElementsByTag("a").eq(0).attr("href");
            System.out.println(href);
            if (targetUrl.isEmpty()) {
                targetUrl = href;
            }
        }

        Map<String, String> map = new HashMap<>();
        if (targetUrl.isEmpty()) {
            return map;                             // nothing found, nothing to fetch
        }

        // Third request: the detail page of the selected device.
        Connection con3 = Jsoup.connect(PHONE_URL + targetUrl);
        con3.header(USER_AGENT, USER_AGENT_VALUE);
        Document d3 = Jsoup.parse(con3.execute().body());

        // Each specification row is a <tr> whose first <strong> holds the label.
        for (Element row : d3.select("tr")) {
            String label = row.getElementsByTag("strong").eq(0).text();
            switch (label) {
                case "Brand":
                    map.put("Brand", secondLinkText(row));
                    break;
                case "Model":
                    map.put("Model", FilterText("Model", row.text()));
                    break;
                case "Platform":
                    map.put("os", secondLinkText(row));
                    break;
                case "Resolution":
                    map.put("screen", FilterText("Resolution", row.text()));
                    break;
                case "Pixel Density":
                    map.put("ppi", secondLinkText(row));
                    break;
                case "RAM Capacity (converted)":
                    map.put("RAM_GB", secondLinkText(row));
                    break;
                case "CPU":
                    map.put("chipset", secondLinkText(row));
                    break;
                case "Graphical Controller":
                    map.put("GPU", secondLinkText(row));
                    break;
                case "Released":
                    map.put("released", FilterText("Released", row.text()));
                    break;
                default:
                    break;
            }
        }
        return map;

    }

    /** Returns the text of the second {@code <a>} inside {@code row}, or "" if there is none. */
    private static String secondLinkText(Element row) {
        return row.getElementsByTag("a").eq(1).text();
    }

    /** Removes every literal occurrence of {@code target} from {@code text} and trims the rest. */
    private static String FilterText(String target, String text) {
        // replace() treats the target literally; replaceAll() would interpret it as a regex.
        return text.replace(target, "").trim();
    }

}

由于这个网站爬取的速度较慢,并且在爬取的过程中可能会出现异常导致数据丢失,于是我使用了多线程并加上了异常处理。
改进后的工具类如下:

package com.oasis.mdata.services

import com.oasis.mdata.entities.GameInfo
import com.oasis.mdata.entities.PhoneInfo
import org.jsoup.Connection
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import java.util.function.Consumer


/**
 *@author 没有梦想的java菜鸟
 * @date 2022/01/21 11:05 上午
 */
class JsoupUtils {
    companion object {
        // Scrape targets and request headers.
        val Phone_Url = "https://phonedb.net/index.php?m=device&s=list"
        val PHONE_URL = "https://phonedb.net/"
        val Game_URL="https://www.qimai.cn/rank/index/brand/grossing/device/iphone/country/us/genre/6014/date"
        val USER_AGENT = "User-Agent"

        //        val USER_AGENT_VALUE = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0"
        val USER_AGENT_VALUE =
            "Mozilla/5.0 (Windows; U; WindowsNT 5.1; en-US; rv1.8.1.6) Gecko/20070725 Firefox/2.0.0.6"

        // Month abbreviation -> quarter suffix, checked in this order.
        private val MONTH_QUARTERS = listOf(
            "Jan" to "Q1", "Feb" to "Q1", "Mar" to "Q1",
            "Apr" to "Q2", "May" to "Q2", "Jun" to "Q2",
            "Jul" to "Q3", "Aug" to "Q3", "Sep" to "Q3",
            "Oct" to "Q4", "Nov" to "Q4", "Dec" to "Q4",
        )

        /**
         * Searches phonedb.net for [keyword] and returns the scraped specification.
         *
         * `to_link` is always set to [keyword]. `status` is set to:
         * - `"0"` when the search returned exactly one result, which was scraped;
         * - `"1"` when it returned several results and one whose link text contains
         *   [keyword] was scraped;
         * - `"2"` when no result, or no result matching [keyword], was found.
         *
         * Exceptions raised by the HTTP requests are not caught here.
         */
        fun getPhoneinfo(keyword: String?): PhoneInfo {
            val phoneInfo = PhoneInfo()

            // First request: fetch the search form together with the session cookies.
            val response = Jsoup.connect(Phone_Url)
                .header(USER_AGENT, USER_AGENT_VALUE)
                .execute()
            val dom: Document = Jsoup.parse(response.body())
            val datas = collectFormData(dom, keyword)

            // Second request: submit the form.
            val response2 = Jsoup.connect(Phone_Url)
                .header(USER_AGENT, USER_AGENT_VALUE)
                .ignoreContentType(true)
                .followRedirects(true)
                .method(Connection.Method.POST)
                .data(datas)
                .cookies(response.cookies())
                .execute()
            val titles = Jsoup.parse(response2.body()).getElementsByClass("content_block_title")

            phoneInfo.to_link = keyword
            if (titles.size == 1) {
                val href = titles[0].getElementsByTag("a").eq(0).attr("href")
                execute(PHONE_URL + href, phoneInfo)
                phoneInfo.status = "0"
            } else {
                // Several (or zero) results: take the first whose link text contains the keyword.
                // Every result is examined, not just the first one.
                val match = titles
                    .map { it.getElementsByTag("a").eq(0) }
                    .firstOrNull { link -> keyword != null && link.text().contains(keyword) }
                if (match != null) {
                    execute(PHONE_URL + match.attr("href"), phoneInfo)
                    phoneInfo.status = "1"
                } else {
                    phoneInfo.status = "2"
                }
            }
            return phoneInfo
        }

        /**
         * Collects name/value pairs of every named element inside the page's forms, with the
         * `search_exp` field set to [keyword]. Attributes are read per element; reading them
         * from the whole element collection would only ever see the first named element.
         */
        private fun collectFormData(dom: Document, keyword: String?): HashMap<String, String> {
            val datas = HashMap<String, String>()
            for (form in dom.select("form")) {
                for (field in form.allElements) {
                    val name = field.attr("name")
                    if (name.isEmpty()) continue // skip elements without a name
                    if (name == "search_exp") {
                        field.attr("value", keyword)
                    }
                    datas[name] = field.attr("value")
                }
            }
            return datas
        }

        /**
         * Turns a release string into its first four characters followed by a quarter suffix,
         * e.g. a value starting with a four-digit year and containing "Sep" becomes that year
         * plus "Q3". No suffix is added when no month abbreviation is recognised; inputs
         * shorter than four characters are returned trimmed instead of throwing.
         */
        private fun checkDate(values: String): String {
            val year = values.take(4).trim()
            val month = values.drop(4).trim()
            return year + quarterOf(month)
        }

        /** Returns "Q1".."Q4" for the first month abbreviation found in [month], else "". */
        private fun quarterOf(month: String): String =
            MONTH_QUARTERS.firstOrNull { (abbr, _) -> month.contains(abbr) }?.second ?: ""

        /** Removes every occurrence of [target] from [text] and trims the remainder. */
        private fun filterText(target: String, text: String): String =
            text.replace(target, "").trim()

        /** Text of the second `<a>` inside [row], or "" when the row has fewer than two links. */
        private fun secondLinkText(row: Element): String =
            row.getElementsByTag("a").eq(1).text()

        /**
         * Third request: downloads the detail page at [phoneUrl] and copies the recognised
         * specification rows into [phoneInfo]. Rows are `<tr>` elements whose first `<strong>`
         * holds the label.
         */
        private fun execute(phoneUrl: String, phoneInfo: PhoneInfo) {
            val document3 = Jsoup.parse(
                Jsoup.connect(phoneUrl)
                    .header(USER_AGENT, USER_AGENT_VALUE)
                    .execute()
                    .body()
            )

            for (row in document3.select("tr")) {
                val label = row.getElementsByTag("strong").eq(0).text()
                when (label) {
                    "Brand" -> phoneInfo.brand = secondLinkText(row)
                    "Model" -> phoneInfo.model = filterText("Model", row.text())
                    "Platform" -> phoneInfo.os = secondLinkText(row)
                    "Resolution" -> phoneInfo.screen = filterText("Resolution", row.text())
                    "Pixel Density" -> phoneInfo.ppi = filterText("PPI", secondLinkText(row))
                    "RAM Capacity (converted)" ->
                        phoneInfo.ram_gb = filterText("GiB RAM", secondLinkText(row))
                    // Keep only the part before the first comma.
                    "CPU" -> phoneInfo.chipset = secondLinkText(row).split(",")[0]
                    "Graphical Controller" -> phoneInfo.GPU = secondLinkText(row)
                    "Released" -> phoneInfo.released = checkDate(filterText("Released", row.text()))
                }
            }
        }
    }
}

我将需要爬取的关键词放到 phones.txt 中,然后用代码将其中的关键词分割成若干个大小相近的 txt 文件,并把生成的 txt 文件名记录到 lastResolve.txt 中;多线程处理时,再读取 lastResolve.txt 中列出的各个 txt 文件的内容。

分割文件的类

package com.oasis.mdata.services

import java.io.BufferedReader
import java.io.FileReader
import java.io.FileWriter
import java.io.PrintWriter
import java.util.*
import kotlin.collections.ArrayList

/**
 *@author 没有梦想的java菜鸟
 * @date 2022/02/28 6:10 下午
 */
class SubFile {

    /**
     * Splits the lines of [filePath] into roughly six chunk files under `src/main/resources/`
     * and records the generated file names, one per line, in
     * `src/main/resources/lastResolve.txt`.
     *
     * The chunk size is `lines / 6`, but never less than 1, so inputs with fewer than six
     * lines produce one file per line instead of failing with a division by zero.
     *
     * @return the number of chunk files written (0 for an empty input file)
     */
    fun subFile(filePath: String):Int {
        // Read the whole source file; `use` closes the reader even if reading fails.
        val lines = BufferedReader(FileReader(filePath)).use { it.readLines() }

        val chunkSize = (lines.size / 6).coerceAtLeast(1)
        val chunks = subList(lines.toMutableList(), chunkSize)

        val uuidFileList = ArrayList<String>()
        for (chunk in chunks) {
            // A 10-character UUID prefix serves as the chunk's file name.
            val uuid = UUID.randomUUID().toString().substring(0 until 10)
            uuidFileList.add("$uuid.txt")
            PrintWriter(FileWriter("src/main/resources/$uuid.txt"), true).use { pw ->
                chunk.forEach { str -> pw.println(str) }
            }
        }

        // Index file listing every chunk that still has to be processed.
        PrintWriter(FileWriter("src/main/resources/lastResolve.txt"), true).use { pw ->
            uuidFileList.forEach { name -> pw.println(name) }
        }
        return chunks.size
    }


    /**
     * Partitions [subList] into consecutive views of at most [subNumber] elements; only the
     * last view may be shorter. The returned lists are `subList` views backed by the
     * argument, not copies.
     *
     * @throws IllegalArgumentException if [subNumber] is not positive
     */
    fun subList(subList: MutableList<String>, subNumber: Int): MutableList<MutableList<String>> {
        require(subNumber > 0) { "subNumber must be positive, was $subNumber" }
        val chunks = ArrayList<MutableList<String>>()
        var beginIndex = 0
        while (beginIndex < subList.size) {
            val endIndex = minOf(subList.size, beginIndex + subNumber)
            chunks.add(subList.subList(beginIndex, endIndex))
            beginIndex = endIndex
        }
        return chunks
    }
}

fun main() {
    // Manual smoke run: split the sample keyword file into chunk files.
    val splitter = SubFile()
    splitter.subFile("src/main/resources/test.txt")
}

多线程处理的类

package com.oasis.mdata.services

import com.alibaba.excel.EasyExcel
import com.oasis.mdata.entities.PhoneInfo
import com.oasis.mdata.services.JsoupUtils
import java.io.BufferedReader
import java.io.File
import java.io.FileReader
import java.util.*
import java.util.concurrent.*
import kotlin.collections.ArrayList

/**
 *@author 没有梦想的java菜鸟
 * @date 2022/02/22 2:00 下午
 */
class Tests {
    // No members are declared here; this file's logic lives in its top-level declarations.

}

// Shared worker pool used by read(). Arguments follow the ThreadPoolExecutor constructor order.
val threadPool = ThreadPoolExecutor(
    8,                                  // core pool size
    10,                                 // maximum pool size
    20,                                 // keep-alive time for threads above the core size
    TimeUnit.SECONDS,                   // unit of the keep-alive time
    LinkedBlockingQueue(10),            // bounded work queue holding at most 10 pending tasks
    Executors.defaultThreadFactory(),
    ThreadPoolExecutor.AbortPolicy()    // reject with an exception once pool and queue are full
)

/**
 * Splits `phones.txt` into chunk files, scrapes every chunk on the shared [threadPool],
 * then merges the per-chunk workbooks into `src/main/resources/手机信息.xlsx` and deletes
 * the temporary files.
 */
fun main() {
    val beginTime = Date()
    // Split the master keyword file into several txt files, one per worker.
    SubFile().subFile("src/main/resources/phones.txt")
    // Names of the temporary chunk files; `use` closes the reader.
    val temFileList = BufferedReader(FileReader("src/main/resources/lastResolve.txt")).use { it.readLines() }
    // Thread-safe list collecting the paths of the per-chunk workbooks.
    val excelList = CopyOnWriteArrayList<String>()
    // Sized from the tasks actually submitted, so await() can neither hang nor return early.
    val countDownLatch = CountDownLatch(temFileList.size)
    try {
        for (txt in temFileList) {
            // Stagger the submissions: read() derives its workbook name from the current millisecond.
            Thread.sleep(10)
            read(txt, excelList, countDownLatch)
        }
        countDownLatch.await()
    } finally {
        // Without this the pool's non-daemon core threads keep the JVM alive after main returns.
        threadPool.shutdown()
    }
    // Delete the temporary chunk files.
    temFileList.forEach { File("src/main/resources/$it").delete() }

    val fileName = "src/main/resources/手机信息.xlsx"
    val listener = Listener()
    excelList.forEach {
        // Load the rows of each temporary workbook through the listener, then delete the workbook.
        EasyExcel.read(it, PhoneInfo::class.java, listener).sheet().doRead()
        File(it).delete()
    }
    println(excelList.size)
    EasyExcel.write(fileName, PhoneInfo::class.java).sheet("PhoneInfo").doWrite(listener.list)


    println("程序运行结束")
    val endTime = Date()
    val time = (endTime.time - beginTime.time) / 1000
    println("程序共耗时 $time 秒")
}

/**
 * Submits a task to [threadPool] that scrapes every keyword listed in the chunk file [txt]
 * (resolved under `src/main/resources/`) and writes the results to a temporary workbook.
 *
 * The workbook path is added to [excelList]; it is removed again if the task fails, so the
 * list only names workbooks that were written. [countDownLatch] is counted down exactly once
 * when the task ends, whether it succeeded or not, so a waiting caller cannot block forever.
 */
fun read(txt: String?, excelList: MutableList<String>,countDownLatch:CountDownLatch) {
    threadPool.execute {
        val beginTime = Date()
        val path = "src/main/resources/"
        val fileName = path + System.currentTimeMillis() + "手机信息.xlsx"
        excelList.add(fileName)
        try {
            // Keywords still to be scraped; `use` closes the reader.
            val tempList = BufferedReader(FileReader("$path/$txt")).use { it.readLines() }.toMutableList()
            val list = ArrayList<PhoneInfo>()
            // Keep sweeping until every keyword has succeeded; a failed keyword stays in the
            // list and is retried on the next sweep. NOTE: there is no retry limit.
            do {
                val iterator = tempList.iterator()
                while (iterator.hasNext()) {
                    val info = iterator.next()
                    try {
                        val phoneInfo = JsoupUtils.getPhoneinfo(info)
                        list.add(phoneInfo)
                        iterator.remove()
                        println(info)
                    } catch (e: Exception) {
                        println("$info 异常")
                    }
                }
            } while (tempList.isNotEmpty())
            // Write this task's own temporary workbook.
            EasyExcel.write(fileName, PhoneInfo::class.java).sheet("PhoneInfo").doWrite(list)
            val endTime = Date()
            val time = (endTime.time - beginTime.time) / 1000
            println("读取$txt 共耗费 $time 秒")
        } catch (e: Exception) {
            // The workbook was not written; do not leave a dangling path for the caller.
            excelList.remove(fileName)
            throw e
        } finally {
            countDownLatch.countDown()
        }
    }


}
posted @ 2022-01-21 11:02  没有梦想的java菜鸟  阅读(473)  评论(0编辑  收藏  举报