# python爬虫：输入标题，从百度经验获取内容

## 原始诉求：通过标题获取内容，先翻译成英文，再翻译回中文，以提高原创度

import requests

import re

from lxml import etree

from translate import Translator

import urllib.request

import urllib.parse

import json

def first_url(url, timeout=10):
    """Download ``url`` with browser-like headers and return the body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds ``requests`` waits for the server before raising;
            without it a stalled connection would block forever.

    Returns:
        The response body decoded as UTF-8.
    """
    send_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36",
        "Connection": "keep-alive",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8",
    }

    # The second positional argument of requests.get() is ``params``; passing
    # the dict positionally would append it to the query string instead of
    # sending it as HTTP headers, so it must be given by keyword.
    res = requests.get(url, headers=send_headers, timeout=timeout)
    try:
        # Force UTF-8 rather than relying on the encoding guessed from headers.
        res.encoding = 'utf-8'
        return res.text
    finally:
        res.close()

def get_result(html):
    """Extract the ``<p>`` text of the first Baidu Jingyan article into f.txt.

    Every ``<p>`` element matched by the fixed XPath below contributes its
    leading text (``element.text``). The pieces are concatenated without a
    separator and written, UTF-8 encoded, to ``f.txt`` in the current working
    directory, overwriting any previous content.

    Args:
        html: Markup of the article page as a string.

    Returns:
        The concatenated text that was written to ``f.txt``.
    """
    datahtml = etree.HTML(html)
    html_data = datahtml.xpath('/html/body/section/div/div/article/div/div/div/ol/li/div/p')
    # Debug output retained from the original script.
    print(type(html_data))

    # ``element.text`` is None when a <p> begins with a child tag; coalesce to
    # an empty string so the join cannot fail with TypeError.
    finalre = ''.join(paragraph.text or '' for paragraph in html_data)

    with open('f.txt', 'w', encoding='utf-8') as f:
        f.write(finalre)
    return finalre

def youdao_translate(
    content,
    api_url='http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule',
    timeout=10,
):
    """Translate ``content`` through the Youdao web translation endpoint.

    Both ``from`` and ``to`` are sent as ``AUTO``, so the direction is chosen
    by the service; the script calls this same function for both legs of its
    round trip.

    Args:
        content: Text to translate.
        api_url: Endpoint that receives the form POST. Defaults to the Youdao
            URL the script has always used.
        timeout: Seconds to wait for the endpoint before giving up.

    Returns:
        The translated text. The response's ``translateResult`` is read as a
        list of paragraphs, each a list of segments carrying a ``tgt`` string;
        the segments of a paragraph are concatenated and every paragraph is
        terminated with a newline.

    Raises:
        KeyError: If the JSON reply has no ``translateResult`` entry.
    """
    # NOTE(review): salt/sign are fixed values carried over unchanged;
    # presumably this endpoint does not validate them -- confirm.
    form = {
        'i': content,
        'from': 'AUTO',
        'to': 'AUTO',
        'smartresult': 'dict',
        'client': 'fanyideskweb',
        'salt': '1525141473246',
        'sign': '47ee728a4465ef98ac06510bf67f3023',
        'doctype': 'json',
        'version': '2.1',
        'keyfrom': 'fanyi.web',
        'action': 'FY_BY_CLICKBUTTION',
        'typoResult': 'false',
    }
    payload = urllib.parse.urlencode(form).encode('utf-8')

    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(api_url, payload, timeout=timeout) as response:
        target = json.loads(response.read().decode('utf-8'))

    trans = target['translateResult']
    # Join *all* segments of a paragraph; the previous loop reassigned the
    # accumulator on every segment and so kept only the last one.
    return ''.join(
        ''.join(segment['tgt'] for segment in paragraph) + '\n'
        for paragraph in trans
    )

def _round_trip(text):
    """Translate ``text`` once and feed the result straight back in."""
    return youdao_translate(youdao_translate(text))


def main():
    """Search Baidu Jingyan, save the first article and print round-trip text.

    Steps: fetch the search page for a fixed query, follow the first
    ``/article/....html`` link, let ``get_result`` store the article text in
    ``f.txt``, then split that text into sentences on '。' and print each one
    after translating it twice through ``youdao_translate``.
    """
    searce_value = '为什么流鼻血'
    url = "http://jingyan.baidu.com/search?word=%s" % urllib.parse.quote(searce_value)
    first_html_url = first_url(url)  # HTML of the search result page

    # NOTE(security): the quoted link used to be unwrapped with eval(), which
    # executes text taken from a remote page. A capture group extracts the
    # same path without evaluating anything; ``[^"]*`` also keeps the match
    # inside a single attribute instead of running to the last ".html" on the
    # line.
    match = re.search(r'"(/article/[^"]*\.html)"', first_html_url)
    if match is None:
        raise SystemExit("no article link found in the search result page")
    second_resul_url = "http://jingyan.baidu.com%s" % match.group(1)

    se_html_resu = first_url(second_resul_url)  # HTML of the first article
    get_result(se_html_resu)  # writes the paragraph text to f.txt

    with open('f.txt', 'r', encoding='utf-8') as fif:
        linelist = fif.read().split('。')

    for line in linelist:
        if not line:
            continue
        if '？' in line:
            # Sentences containing a question are translated part by part.
            # They are no longer translated a second time as a whole, which
            # used to print the same content twice.
            for part in line.split('？'):
                if part:
                    print(_round_trip(part))
        else:
            print(line)
            print(_round_trip(line))


if __name__ == "__main__":
    main()

posted on 2020-06-11 13:21 游走的鱼阅读(246) 评论(0) 编辑收藏举报

刷新页面返回顶部

游走的鱼

导航

公告

python爬虫输入标题百度百科获取内容