百度汉语释义爬取


import urllib.request as ur
import urllib.parse as up
from lxml import etree

def openUrl(url='http://dict.baidu.com/s?wd=apple'):
    """Fetch *url* and return the raw response body.

    A fixed desktop-Firefox ``User-Agent`` header is sent with the request.

    Args:
        url: Address to request. Defaults to the Baidu dictionary page for
            "apple".

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        Any exception raised by ``urllib.request.urlopen`` (for example
        ``urllib.error.URLError``) propagates to the caller unchanged.
    """
    request = ur.Request(url)
    request.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0',
    )
    # Use the response as a context manager so the underlying connection is
    # closed even if read() raises, instead of being left for the GC.
    with ur.urlopen(request) as response:
        return response.read()

def getMeaning(word):
    """Look up *word* on Baidu Hanyu and return the scraped definition text.

    The result page is probed with three XPath queries in priority order and
    the first one that yields any text wins:

    1. celebrity / author introduction (``poem-author-intro``),
    2. basic definition (``tab-content``),
    3. related entry (``poem-list-item-body check-red``).

    Args:
        word: The term to look up; may contain spaces and non-ASCII text.

    Returns:
        For queries 1 and 2, the list of matching text nodes. For query 3,
        only the first matching text node (a single string, not a list).
        An empty list when nothing matches.

    Raises:
        Whatever ``openUrl`` raises on a failed request propagates unchanged.
    """
    # quote_plus percent-encodes non-ASCII text, turns spaces into '+', and
    # keeps a literal '+' as '%2B' so it is not mistaken for a space.
    url = 'http://dict.baidu.com/s?wd=' + up.quote_plus(word)
    page = openUrl(url).decode('utf-8', errors='ignore')
    tree = etree.HTML(page)
    if tree is None:
        # NOTE(review): defensive guard for a parse that produced no root
        # element (e.g. an empty body) -- confirm against the lxml version used.
        return []

    list_queries = (
        # Celebrity / author introduction.
        '//div[@class="poem-author-intro"]//span/text()',
        # Baidu basic definition.
        '//div[@class="tab-content"]//p/text()',
    )
    for query in list_queries:
        fragments = tree.xpath(query)
        if fragments:
            return fragments

    # Related entry: only the first text node is used. Checking for an empty
    # result explicitly replaces the former catch-all ``except``.
    related = tree.xpath('//div[@class="poem-list-item-body check-red"]/text()')
    if related:
        return related[0]
    return []


def find_baidu(word):
    """Return whether a definition for *word* was found, plus its text.

    The fragments returned by ``getMeaning`` are concatenated with every
    whitespace character removed. ``getMeaning`` may return either a list of
    strings or a single string; iterating a string yields its characters, so
    both shapes collapse to the same whitespace-free text here.

    Args:
        word: The term to look up.

    Returns:
        ``(True, text)`` when non-empty definition text was found, otherwise
        ``(False, "没有合适的释义!")``.

    Side effects:
        Prints the heading ``释义:`` to stdout when a definition is found.
    """
    meanings = getMeaning(word)
    # split() with no argument drops all whitespace runs; joining the pieces
    # back together yields the fragment with whitespace removed.
    text = ''.join(''.join(fragment.split()) for fragment in meanings)
    if not text:
        # Covers both "no match" and "only whitespace text nodes matched".
        return False, "没有合适的释义!"
    print('释义:')
    return True, text


if __name__ == '__main__':
    # Interactive loop: prompt for a word, look it up, and show the outcome.
    while True:
        try:
            word = input('请输入单词: ')
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C at the prompt ends the session without a traceback.
            print()
            break
        # find_baidu returns (found, text); text is either the definition or
        # the "not found" message, so it is displayed in both cases.
        _found, text = find_baidu(word)
        print(text)
posted @ 2019-07-31 16:11  wxl106  阅读(554)  评论(0)  编辑  收藏  举报