Fork me on GitHub

BeautifulSoup解析H5——python爬虫

import requests
from bs4 import BeautifulSoup

"""
description:爬取東方語言學網
word:待查字
zu:閩語|吳語|平話|客家|贛語|官話
"""


def crawl_main(word: str, zu: str, timeout: float = 10) -> dict:
    """Look up a character on eastling.org and scrape the result table.

    Sends a POST form request to ``fangyan_word_go.php`` and parses the
    second ``<table>`` element of the returned HTML page.

    Args:
        word: The character to look up (sent as the ``word`` form field).
        zu: Dialect group name sent as the ``zu`` form field,
            e.g. 閩語, 吳語, 平話, 客家, 贛語 or 官話.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller forever.

    Returns:
        A dict with two keys:
        ``'heads'`` - the text of every ``<th>`` in the table's first row;
        ``'cols'`` - one list per remaining row, holding the text of that
        row's ``<td>`` cells.

    Raises:
        IndexError: If the response contains fewer than two tables.
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    url = "http://eastling.org/fangyan_word_go.php"
    payload = {
        'word': word,
        'zu': zu,
        'mode': 'word',
        'map': '查 詢',
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36",
    }

    ret = requests.post(url, headers=headers, data=payload, timeout=timeout)
    # Decode with the encoding detected from the page content rather than
    # the one guessed from the HTTP headers.
    ret.encoding = ret.apparent_encoding

    soup = BeautifulSoup(ret.text, 'html.parser')
    tables = soup.find_all('table')
    if len(tables) < 2:
        # Same exception type the bare ``[1]`` index used to raise, but with
        # a message that explains what went wrong.
        raise IndexError(
            "expected at least two <table> elements in the response, "
            "found %d" % len(tables)
        )

    # The results live in the second table on the page.
    trs = tables[1].find_all('tr')

    # Table header: the <th> cells of the first row only.
    heads = [th.get_text() for th in trs[0].find_all('th')] if trs else []
    # Data rows: every row after the first, one list of <td> texts per row.
    cols = [[td.get_text() for td in tr.find_all('td')] for tr in trs[1:]]

    return {'heads': heads, 'cols': cols}


if __name__ == '__main__':
    # Demo run: look up the character "好" in the 粵語 dialect group and
    # print the scraped table.
    result = crawl_main("好", "粵語")
    print(result)
posted @ 2022-11-03 20:30  壶小旭  阅读(53)  评论(0)  编辑  收藏  举报