# 百度词汇检索,计算PMI值 (Baidu term search and PMI computation)
"""Query Baidu for the reported hit count of a term and derive a PMI-style score."""
import re
import time
from typing import List, Optional
from urllib.parse import quote

import numpy as np
import pandas as pd  # not used by the code below; kept because it was imported originally
import requests
from bs4 import BeautifulSoup

# One or more digits optionally grouped by commas, e.g. "100,000,000".
_COUNT_RE = re.compile(r'\d[\d,]*')


class PMI():
    """Scrape Baidu result counts and turn them into a PMI-style score.

    The score of a term is ``log10(n_pf / (n_p * n_f))`` where ``n_f`` is the
    hit count of the term, ``n_pf`` the hit count of "<anchor> <term>" and
    ``n_p`` a fixed hit count assumed for the anchor word alone. No
    total-document-count factor appears in the formula, so the values are
    only meaningful relative to each other.
    """

    def __init__(self, anchor: str = '手机', anchor_count: int = 100000000,
                 timeout: float = 10):
        """Create a scorer.

        Args:
            anchor: Word every term is paired with in the joint query.
            anchor_count: Hit count assumed for ``anchor`` on its own; it is
                not fetched, the given constant is used as-is.
            timeout: Seconds to wait for Baidu before giving up on a request.
        """
        self.url = 'https://www.baidu.com/s?wd='
        self.anchor = anchor
        self.anchor_count = anchor_count
        self.timeout = timeout

    def getHtml(self, url: str) -> Optional[str]:
        """Download ``url`` and return its body decoded as UTF-8.

        A browser-like User-Agent is sent because, per the original author's
        note, Baidu does not return the expected content when the request
        identifies itself as coming from Python.

        Returns:
            The page text, or ``None`` (after printing a failure message)
            when the request fails or the server answers with an error status.
        """
        header = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36",
        }
        try:
            # Without a timeout a stalled connection would block forever.
            r = requests.get(url, headers=header, timeout=self.timeout)
            r.raise_for_status()
        except requests.RequestException:
            # Only network/HTTP failures are swallowed; KeyboardInterrupt and
            # programming errors now propagate instead of being hidden.
            print('爬取失败')
            return None
        r.encoding = 'utf-8'
        return r.text

    def getNum(self, html: Optional[str]) -> int:
        """Extract the result count from a Baidu result page.

        The count is read from ``<span class="nums_text">``, whose text looks
        like ``百度为您找到相关结果约100,000,000个``.

        Raises:
            TypeError: ``html`` is ``None`` (e.g. the download failed).
            IndexError: The page has no ``nums_text`` span or the span holds
                no number.
        """
        if html is None:
            raise TypeError('no HTML to parse (did the download fail?)')
        soup = BeautifulSoup(html, 'html.parser')
        span = soup.find('span', {'class': 'nums_text'})
        # get_text() also works when the span wraps nested tags, where
        # ``.string`` would be None.
        match = _COUNT_RE.search(span.get_text()) if span is not None else None
        if match is None:
            raise IndexError('result-count element not found in the Baidu page')
        # Drop the thousands separators before converting.
        return int(match.group().replace(',', ''))

    def retrieveNum(self, vocab: str) -> int:
        """Return the hit count Baidu reports for the query ``vocab``."""
        # Percent-encode the term so characters such as '&', '#' or '+'
        # cannot change the meaning of the query string.
        url = self.url + quote(vocab, safe='')
        return self.getNum(self.getHtml(url))

    def getPmi(self, vocab: str):
        """Return ``log10(n_pf / (n_p * n_f))`` for ``vocab``.

        Two searches are issued: one for ``vocab`` alone and one for
        "<anchor> <vocab>". When either count is zero the score is ``-inf``
        rather than a division error or a NumPy warning.
        """
        n_p = self.anchor_count
        n_f = self.retrieveNum(vocab)
        n_pf = self.retrieveNum(' '.join([self.anchor, vocab]))
        if n_f == 0 or n_pf == 0:
            return np.float64('-inf')
        return np.log10(n_pf / (n_p * n_f))

    def getPmiList(self, words_list) -> List:
        """Return the scores of all words in ``words_list``, in input order."""
        return [self.getPmi(word) for word in words_list]


def main():
    """Score a small sample word list and report the elapsed wall time."""
    time_start = time.time()
    d = PMI()
    a = ['快递', '傻子', '总体', '物流', '验机', '物流', '游戏']
    pmi = d.getPmiList(a)
    print('PMI:', pmi)
    time_end = time.time()
    print('耗时%s秒' % (time_end - time_start))


if __name__ == '__main__':
    main()