BeautifulSoup 解析 HTML——Python 爬虫
import requests
from bs4 import BeautifulSoup
"""
description: 爬取東方語言學網（eastling.org）的方言查詢結果
word: 待查字
zu: 方言組別，如 閩語|吳語|平話|客家|贛語|官話
"""
def crawl_main(word, zu, timeout=30):
    """Query the eastling.org dialect lookup form and return the result table.

    Sends a POST request to ``fangyan_word_go.php`` and parses the second
    ``<table>`` element of the returned HTML page.

    Args:
        word: The character to look up (sent as the ``word`` form field).
        zu: Dialect group name sent as the ``zu`` form field.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` would block indefinitely on a stalled
            connection.

    Returns:
        A dict ``{'heads': [...], 'cols': [...]}``. ``heads`` holds the text
        of the ``<th>`` cells in the table's first row; ``cols`` holds, for
        every row after the first, the list of its ``<td>`` cell texts.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
        IndexError: If the response contains fewer than two tables.
    """
    url = "http://eastling.org/fangyan_word_go.php"
    payload = {
        'word': word,
        'zu': zu,
        'mode': 'word',
        'map': '查 詢',
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36",
    }

    ret = requests.post(url, headers=headers, data=payload, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    ret.raise_for_status()
    # Decode with the encoding detected from the body rather than the one
    # declared in the response headers.
    ret.encoding = ret.apparent_encoding

    soup = BeautifulSoup(ret.text, 'html.parser')
    tables = soup.find_all('table')
    if len(tables) < 2:
        # Same exception type the bare [1] index would raise, with context.
        raise IndexError(
            "expected at least two <table> elements in the response, "
            "found %d" % len(tables)
        )

    # The results live in the second table on the page.
    rows = tables[1].find_all('tr')

    # Only the first row is inspected for header cells.
    heads = [th.get_text() for th in rows[0].find_all('th')] if rows else []
    # The first row is skipped here; every later row contributes its <td> texts.
    cols = [[td.get_text() for td in tr.find_all('td')] for tr in rows[1:]]

    return {'heads': heads, 'cols': cols}
if __name__ == '__main__':
    # Demo run: look up one character in one dialect group and print the
    # parsed table.
    lookup_result = crawl_main("好", "粵語")
    print(lookup_result)
脚踏实地，注重基础。