Baike crawler
Feature: given a keyword plus a distinguishing feature string, find the Baidu Baike entry most relevant to that keyword.
==| Fighting ~~
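For example, the same keyword can resolve to different Baike entries depending on the feature string. A usage sketch of craw_bk (defined below); the actual entry returned depends on the live Baike pages:

info = craw_bk('朝阳区', feature='长春市')   # Chaoyang District, Changchun
print(info.get('name'), info.get('introduct', ''))
info = craw_bk('朝阳区', feature='北京市')   # Chaoyang District, Beijing (illustrative)
print(info.get('name'), info.get('introduct', ''))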
import re
from urllib.parse import quote, urljoin

import requests
from bs4 import BeautifulSoup
from lxml import etree

# Character ranges used when cleaning text: CJK ideographs and ASCII digits.
s1, s2 = '\u4e00', '\u9fa5'
d1, d2 = '0', '9'
# Punctuation whitelist (kept for reference; get_str_baike defines its own subset).
po = ",。、;():\n.-():-"
def get_str_baike(s):
    """Keep only CJK characters, digits, ASCII letters and a small set of punctuation."""
    ans = ''
    # Drop bracketed reference markers such as "[1]".
    s = re.sub(r'\[[^\[]*\]', '', s)
    pos = ',、;():.-():-'
    for ch in s:
        if (ch in pos or s1 <= ch <= s2 or d1 <= ch <= d2
                or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z'):
            ans += ch
    return ans
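# A quick illustration of what get_str_baike keeps and drops (the sample sentence is made up):
#   get_str_baike('北京[1],面积 16410.54平方千米。')  ->  '北京,面积16410.54平方千米'
# Bracketed reference markers, whitespace and punctuation outside the whitelist are removed.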
def craw_bk(key, feature=''):
    def rt_response(url):
        # Fetch a page with a desktop UA and return it as an lxml element tree.
        session = requests.session()
        session.headers['User-Agent'] = (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36'
        )
        resp = session.get(url)
        resp.encoding = 'utf-8'
        return etree.HTML(resp.text)
    def get_raw_html(url, code='UTF-8'):
        # Fetch a page with a mobile UA; return the raw HTML text, or '' on failure.
        head = {
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/72.0.3626.121 Mobile Safari/537.36'
        }
        try:
            r = requests.get(url, headers=head)
            r.encoding = code
            html = r.text
        except requests.RequestException:
            print('open error', url)
            return ''
        return html
    def get_key_val(html):
        # Parse the entry title, the infobox key/value pairs and the summary paragraphs.
        ans = dict()
        soup = BeautifulSoup(html, 'lxml')
        dd = soup.find('dd', class_='lemmaWgt-lemmaTitle-title')
        if dd:
            ans['name'] = dd.find('h1').get_text()
        dt = soup.find_all('dt', class_='basicInfo-item name')
        dd = soup.find_all('dd', class_='basicInfo-item value')
        for name, value in zip(dt, dd):
            k = name.get_text().strip('\n')
            v = value.get_text().strip('\n')
            # Keep only the CJK characters of each infobox key.
            k = ''.join([ch for ch in k if '\u4e00' <= ch <= '\u9fa5'])
            ans[k] = v
            # print(f'{k}: {v}')
        div = soup.find('div', class_='lemma-summary')
        if div:
            pa = div.find_all('div', class_='para')
            txt = '\n'.join([it.get_text() for it in pa])
            # Join the summary paragraphs and drop empty lines.
            txt = '\n'.join([it for it in txt.strip('\n').split('\n') if it.strip()])
            ans['introduct'] = txt
        return ans
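    # get_key_val returns a flat dict roughly shaped like (keys/values illustrative):
    #   {'name': '<entry title>', '中文名': '...', '面积': '...', 'introduct': '<summary text>'}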
    def search_find(key, feature):
        # Fall back to Baike search: take the first result link and parse that page.
        key = quote(key + feature)
        url = 'http://baike.baidu.com/search/none?word={}'.format(key)
        response = rt_response(url)
        hrefs = response.xpath('//a[@class="result-title"]/@href')
        if hrefs:
            href = urljoin(url, hrefs[0])
            url = href + '?noadapt=1'
            html = get_raw_html(url, code='UTF-8')
            ans = get_key_val(html)
            return ans
        else:
            return None
    # First try the direct entry URL for the keyword.
    s = quote(key)
    url = 'http://baike.baidu.com/item/' + s + '?noadapt=1'
    html = get_raw_html(url)
    soup = BeautifulSoup(html, 'lxml')
    s = soup.find('div', class_='main-content')
    if s and feature in s.get_text():  # the feature string appears on this page
        ans = get_key_val(html)
    else:
        ans = search_find(key, feature)  # otherwise fall back to Baike search
    if ans is None:  # no matching entry found
        return {}
    for key, val in ans.items():  # normalise the stored strings
        ans[key] = get_str_baike(val)
    return ans
if __name__ == '__main__':
    ans = craw_bk('朝阳区', feature='长春市')
    for key, val in ans.items():
        print(f'{key}:{val}')
Catch the tail of youth...