百度联想:用Python抓取百度关键字联想信息
Python抓取百度关键字联想信息
参考:https://www.jianshu.com/p/dc1ec2456331?appinstall=0
在 macOS 上运行时经常得到乱码输出,只有偶尔能得到正常结果;
该方法其实没什么实用价值。
#https://www.jianshu.com/p/dc1ec2456331?appinstall=0
#coding: utf-8
import random
import re
import sys
import urllib
import urllib.parse

import requests
from bs4 import BeautifulSoup
def getAssociate(keyword, savedFilePath):
    """Fetch Baidu search suggestions for *keyword* and write them to a file.

    Queries Baidu's suggestion ("su") JSONP endpoint, pulls every quoted
    token out of the reply, and writes the tokens that follow the first
    ``"s"`` key (the suggestion list) to *savedFilePath*, one per line
    separated by a blank line, overwriting any existing content.

    :param keyword: search term (Chinese or ASCII) to get suggestions for
    :param savedFilePath: path of the UTF-8 text file to create/overwrite
    :raises requests.RequestException: on network failure / HTTP error status
    """
    gjc = urllib.parse.quote(keyword)  # percent-encode the keyword for the URL
    url = 'https://sp0.baidu.com/5a1Fazu8AA54nxGko9WTAnF6hhy/su?wd='+gjc+'&json=1&p=3&sid=&csor=2&pwd= &cb=jQuery110207361392755424963_1505220177752&_=1505220177757'
    # Rotate among several proxy IPs to reduce the chance of being blocked.
    # BUG FIX: the original dict used the key "http" three times, so only the
    # last proxy ever took effect; pick one at random per call instead.
    proxy_pool = ['http://119.5.0.53', 'http://140.250.170.110', 'http://221.229.46.81']
    proxies = {"http": random.choice(proxy_pool)}
    headers = {'GET': url,
               'HOST': 'sp0.baidu.com',
               'Referer': 'https://www.baidu.com/?tn=91694651_hao_pg',
               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0'
               }
    print('url:')
    print(url)
    # timeout keeps a dead proxy from hanging the script forever
    html = requests.get(url, headers=headers, proxies=proxies, timeout=10)
    html.raise_for_status()  # fail loudly instead of parsing an error page
    # NOTE(review): the endpoint may actually reply in GBK — fall back to the
    # detected encoding rather than forcing UTF-8; this (plus passing decoded
    # text to BeautifulSoup below instead of raw bytes, which silently ignored
    # the encoding we set) is the likely fix for the mojibake mentioned above.
    html.encoding = html.apparent_encoding or 'utf-8'
    print(html.text)
    print('=============')
    soup = BeautifulSoup(html.text, 'html.parser')
    res = soup.get_text()  # strip any markup, leaving the JSONP payload text
    quoted = re.findall(r"\"(.*?)\"", res)  # all double-quoted tokens
    with open(savedFilePath, 'w', encoding='utf-8') as f:
        # 覆盖'w',追加'a+' (w = overwrite, a+ = append)
        # In the JSONP reply the tokens after the "s" key are the suggestion
        # list.  Take only the FIRST occurrence of "s" — the original rewrote
        # the remainder once per "s" token, duplicating output.
        if "s" in quoted:
            start = quoted.index("s") + 1
            for item in quoted[start:]:
                print(item)
                f.write(item)
                f.write('\n\n')
    # no explicit f.close(): the `with` block already closes the file
if __name__ == '__main__':
    # Invoked from Java with a single comma-separated argument:
    #   "<keyword>,<output file path>"
    # Standalone example:
    #   getAssociate('王', r'C:\AH_TOOLS\AH_JavaCode\AhProjV5.1\temp\keyword')
    parts = sys.argv[1].split(",")
    getAssociate(parts[0], parts[1])