游走的鱼

导航

python爬虫：输入标题，从百度经验获取内容

## 原始诉求：根据标题获取内容，先翻译成英文，再翻译回中文，以提高原创度
 
import requests
import re
from lxml import etree
from translate import Translator
import urllib.request  
import urllib.parse  
import json 
 
 

def first_url(url, timeout=10):
    """Fetch *url* with browser-like headers and return the body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    send_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36",
        "Connection": "keep-alive",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8"
        }
    # The second positional argument of requests.get() is `params`, not
    # `headers`; the headers must be passed by keyword or they end up in the
    # query string and are never sent as HTTP headers.
    res = requests.get(url, headers=send_headers, timeout=timeout)
    try:
        # Force UTF-8 instead of relying on the charset guessed by requests.
        res.encoding = 'utf-8'
        return res.text
    finally:
        res.close()

# Extract the text of the <p> tags from the first Baidu Jingyan article.
def get_result(html):
    """Write the concatenated <p> text of an article page to ``f.txt``.

    The paragraphs are selected with a fixed absolute XPath, their leading
    text is joined without a separator, and the result overwrites ``f.txt``
    (UTF-8) in the current working directory.

    Args:
        html: HTML source of the article page.

    Returns:
        None. The extracted text is only written to ``f.txt``.
    """
    datahtml = etree.HTML(html)
    html_data = datahtml.xpath('/html/body/section/div/div/article/div/div/div/ol/li/div/p')
    print(type(html_data))
    # Element.text is None when a <p> has no leading text (for example when it
    # starts with a child element); substitute '' so join() does not raise.
    finalre = ''.join(p.text or '' for p in html_data)
    with open('f.txt', 'w', encoding='utf-8') as f:
        f.write(finalre)
# Translate text with Youdao (originally labelled "Chinese to English"; the
# request sends from/to as AUTO).
def youdao_translate(content):
    """Translate *content* through the Youdao web translation endpoint.

    Args:
        content: Text to translate.

    Returns:
        The translated text, one line per entry of the response's
        ``translateResult`` list, each terminated by a newline.

    Raises:
        urllib.error.URLError: If the HTTP request fails.
        KeyError: If the JSON response has no ``translateResult`` entry.
    """
    youdao_url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
    # NOTE(review): salt and sign are fixed values, presumably captured from a
    # browser session -- confirm the endpoint still accepts them.
    data = {
        'i': content,
        'from': 'AUTO',
        'to': 'AUTO',
        'smartresult': 'dict',
        'client': 'fanyideskweb',
        'salt': '1525141473246',
        'sign': '47ee728a4465ef98ac06510bf67f3023',
        'doctype': 'json',
        'version': '2.1',
        'keyfrom': 'fanyi.web',
        'action': 'FY_BY_CLICKBUTTION',
        'typoResult': 'false',
    }
    payload = urllib.parse.urlencode(data).encode('utf-8')

    # Passing a body makes urlopen issue a POST; the context manager closes
    # the connection once the body has been read.
    with urllib.request.urlopen(youdao_url, payload, timeout=10) as youdao_response:
        youdao_html = youdao_response.read().decode('utf-8')
    target = json.loads(youdao_html)

    trans = target['translateResult']
    # Each entry of `trans` is a list of segments. Concatenate every segment's
    # 'tgt' so the whole paragraph is returned, not just its last segment.
    return ''.join(
        ''.join(segment['tgt'] for segment in paragraph) + '\n'
        for paragraph in trans
    )

if __name__ == "__main__":
    # Search Baidu Jingyan for a fixed question and download the result page.
    search_value = '为什么流鼻血'
    url = "http://jingyan.baidu.com/search?word=%s" % search_value
    first_html_url = first_url(url)

    # Take the first quoted "/article/....html" link on the result page. A
    # capture group extracts the path directly; eval() must never be applied
    # to text scraped from a remote page. [^"]* keeps the match inside a
    # single quoted attribute value.
    link_match = re.search(r'"(/article/[^"]*?\.html)"', first_html_url)
    if link_match is None:
        raise SystemExit("no Baidu Jingyan article link found in the search results")
    second_resul_url = "http://jingyan.baidu.com%s" % link_match.group(1)

    # Download the article; get_result() writes its paragraph text to f.txt.
    se_html_resu = first_url(second_resul_url)
    get_result(se_html_resu)

    # Read the text back and split it into sentences on the Chinese full stop.
    with open('f.txt', 'r', encoding='utf-8') as fif:
        linelist = fif.read().split('。')

    for line in linelist:
        # Sentences containing an ASCII '?' (not at index 0) are additionally
        # translated piece by piece; each piece goes through two translation
        # passes (there and back).
        if line.find('?') > 0:
            for piece in line.split('?'):
                if len(piece) > 0:
                    fi = youdao_translate(piece)
                    zhfi = youdao_translate(fi)
                    print(zhfi)
        # NOTE(review): this is `if`, not `elif`, so a sentence handled above
        # is translated and printed again as a whole -- confirm this is intended.
        if len(line) > 0:
            print(line)
            fi = youdao_translate(line)
            zhfi = youdao_translate(fi)
            print(zhfi)


posted on 2020-06-11 13:21  游走的鱼  阅读(246)  评论(0)  编辑  收藏  举报