百度联想:用Python抓取百度关键字联想信息

Python抓取百度关键字联想信息

参考:https://www.jianshu.com/p/dc1ec2456331?appinstall=0

MAC上运行经常得到乱码,只有偶尔非乱码;

该方法其实没什么实用价值。

#https://www.jianshu.com/p/dc1ec2456331?appinstall=0
#coding: utf-8

import random
import re
import sys
import urllib
import urllib.parse

import requests
from bs4 import BeautifulSoup

def getAssociate(keyword, savedFilePath):
    """Fetch Baidu search-suggestion keywords for *keyword* and save them.

    Queries Baidu's suggestion (JSONP) endpoint, extracts every
    double-quoted string from the response, and writes each string that
    follows the ``"s"`` marker (the key of the suggestion list in the
    JSONP payload) to *savedFilePath*, one per line followed by a blank
    line.

    :param keyword: search term; non-ASCII (e.g. Chinese) is allowed.
    :param savedFilePath: output text file path; the file is overwritten.
    """
    gjc = urllib.parse.quote(keyword)  # percent-encode the (possibly Chinese) keyword for the URL
    url = 'https://sp0.baidu.com/5a1Fazu8AA54nxGko9WTAnF6hhy/su?wd='+gjc+'&json=1&p=3&sid=&csor=2&pwd= &cb=jQuery110207361392755424963_1505220177752&_=1505220177757'
    # Rotate between several proxy IPs to reduce the chance of being blocked.
    # NOTE: the original code listed all three proxies under the same "http"
    # key, so only the last one was ever used; pick one at random instead.
    proxy_pool = ['http://119.5.0.53', 'http://140.250.170.110', 'http://221.229.46.81']
    proxies = {"http": random.choice(proxy_pool)}
    headers = {'GET':url,
                'HOST':'sp0.baidu.com',
                'Referer':'https://www.baidu.com/?tn=91694651_hao_pg',
                'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0'
               }
    print('url:')
    print(url)
    html = requests.get(url, headers=headers, proxies=proxies)  # fetch via the proxy
    html.encoding = 'utf-8'  # force UTF-8 so .text decodes correctly (fixes mojibake)
    print(html.content)
    print('=============')
    # Use the decoded .text (honours the encoding set above), not the raw
    # .content bytes — feeding bytes made BeautifulSoup guess the charset.
    soup = BeautifulSoup(html.text, 'html.parser')
    res = soup.get_text()  # strip any markup, keep plain text
    print(res)
    key_word = re.findall(r"\"(.*?)\"", res)  # every double-quoted string in the JSONP body
    with open(savedFilePath, 'w', encoding='utf-8') as f:
        # 'w' overwrites the file; use 'a+' instead to append.
        for pos, token in enumerate(key_word, start=1):
            if token == "s":  # by inspection, the strings after "s" are the suggestions
                for item in key_word[pos:]:
                    print(item)
                    f.write(item)
                    f.write('\n\n')
    # no explicit f.close(): the with-statement closes the file
if __name__ == '__main__':
    # Local example:
    #   getAssociate('王', r'C:\AH_TOOLS\AH_JavaCode\AhProjV5.1\temp\keyword')
    # When invoked from Java, argv[1] is a single "keyword,outputPath" string.
    args = sys.argv[1].split(",")
    getAssociate(args[0], args[1])
posted @ 2023-01-24 18:00  虎老狮  阅读(93)  评论(0编辑  收藏  举报