25 selenium的无头模式、进程池、随机UA、词云图
1.selenium的无头模式,后期有需要可直接提取
# Run Chrome without a visible browser window (headless mode).
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Build the Chrome options that switch on headless operation.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

# 1. Launch the browser.
# `webdriver` has to be imported explicitly (the original snippet used it
# without an import), and the options object is passed through `options=`:
# the older `chrome_options=` keyword was deprecated and is rejected by
# recent Selenium 4 releases.
driver = webdriver.Chrome(options=chrome_options)
2.随机UA
# Use the faker package to produce a random User-Agent string.
from faker import Factory

# Create a faker generator, then ask it for a single User-Agent value.
Fact = Factory.create()
ua = Fact.user_agent()
3.进程池(注意:multiprocessing.dummy 的 Pool 实际上是基于线程的线程池)
# Standard library.
import time
from time import sleep
# NOTE: multiprocessing.dummy exposes the Pool API backed by threads,
# not by separate processes.
from multiprocessing.dummy import Pool

# Third-party.
import requests
from faker import Factory

# Generate one random User-Agent with faker. It is created once at module
# level, so every call to collect() below sends the same value.
Fact = Factory.create()
ua = Fact.user_agent()
def collect(url):
    """Send a GET request to *url* and print the HTTP status code.

    The request carries the module-level ``ua`` string as its User-Agent
    header.

    Args:
        url: Address to request.

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
    """
    headers = {
        "User-Agent": ua,  # module-level random User-Agent
    }
    # requests applies no timeout by default; without one, a server that
    # never answers would block this pool worker indefinitely.
    resp = requests.get(url=url, headers=headers, timeout=10)
    print(resp.status_code)
if __name__ == '__main__':
    # Time the whole batch with perf_counter(): it is monotonic and keeps
    # sub-second resolution. Truncating two wall-clock readings to int
    # before subtracting (as the original did) can be off by a full second.
    stat = time.perf_counter()

    # Fifty identical requests, just to have something to fan out.
    urlLs = ['http://www.baidu.com'] * 50

    # multiprocessing.dummy.Pool is a *thread* pool despite the module name.
    # `processes` caps the number of workers; when omitted, the default
    # follows the machine's CPU count.
    pool = Pool(processes=4)
    try:
        # Blocks until every URL has been handled by a worker.
        pool.map(collect, urlLs)
    finally:
        # Always release the workers, even if one of the requests raised.
        pool.close()
        pool.join()

    end = time.perf_counter()
    # Elapsed seconds for the whole batch.
    print(round(end - stat, 2))
4.词云图
# Build a word-cloud image from a text file of danmaku (bullet comments).
# jieba splits the Chinese text into words; wordcloud renders the image.
import jieba
import wordcloud

# Set jieba's logger to INFO.
# NOTE(review): presumably this hides the red debug output jieba prints
# when it loads its dictionary; confirm against the jieba docs.
jieba.setLogLevel(jieba.logging.INFO)

# Read the whole input file. The context manager closes the handle even if
# reading fails (the original left the file open).
with open('弹幕.txt', encoding='utf-8') as f:
    txt = f.read()

# jieba.lcut returns a list of tokens; WordCloud.generate expects a single
# string, so the tokens are joined with spaces.
string = ' '.join(jieba.lcut(txt))
print(string)

wc = wordcloud.WordCloud(
    width=700,
    height=700,
    background_color='white',
    scale=15,
    # Font used for rendering; presumably chosen because it contains
    # Chinese glyphs. The file must be resolvable on this machine.
    font_path='msyh.ttc',
)
wc.generate(string)
wc.to_file('词云弹幕.png')
本文来自博客园,作者:__username,转载请注明原文链接:https://www.cnblogs.com/code3/p/16988365.html