25selenium的无头模式,进程池,随机ua、词云图

1.selenium的无头模式,后期有需要可直接提取

# Run Chrome without a visible window (headless mode).
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Configure headless operation.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

# 1. Launch the browser. The settings are passed as ``options``: the older
# ``chrome_options`` keyword is deprecated and rejected by newer Selenium 4
# releases.
driver = webdriver.Chrome(options=chrome_options)

2.随机UA

# Random User-Agent generation with the faker package.
from faker import Factory

# Build a faker generator, then ask it for a User-Agent string.
Fact = Factory.create()
ua = Fact.user_agent()

3.进程池(这里用的 multiprocessing.dummy.Pool 实际上是线程池)

import requests
from time import sleep
import time
from multiprocessing.dummy import Pool
# Random User-Agent generation with the faker package.
from faker import Factory

# Build a faker generator, then ask it for a User-Agent string.
# ``ua`` is assigned once here and reused by collect() below.
Fact = Factory.create()
ua = Fact.user_agent()

def collect(url, timeout=10):
    """Fetch ``url`` with a GET request and print the HTTP status code.

    Args:
        url: Address to request.
        timeout: Seconds to wait on the server before ``requests`` raises a
            timeout error. Without a timeout, a stalled connection would
            block the calling worker indefinitely.

    Note:
        The ``User-Agent`` header is taken from the module-level ``ua``,
        which is assigned once at module level, so every call within one run
        sends the same value.
    """
    headers = {"User-Agent": ua}
    resp = requests.get(url=url, headers=headers, timeout=timeout)
    print(resp.status_code)


if __name__ == '__main__':
    # Time the batch with perf_counter: it is monotonic and keeps sub-second
    # resolution, whereas subtracting two int-truncated time.time() values
    # can be off by almost a full second.
    started = time.perf_counter()
    url_list = ['http://www.baidu.com'] * 50

    # multiprocessing.dummy.Pool exposes the Pool API but is backed by
    # threads, not processes. ``processes`` caps the number of workers; when
    # omitted the default depends on the machine.
    pool = Pool(processes=4)
    try:
        # map() blocks until collect() has been called for every URL.
        pool.map(collect, url_list)
    finally:
        # Always stop accepting work and wait for the workers, even if a
        # request raised.
        pool.close()
        pool.join()

    # Elapsed wall-clock seconds for the whole batch, to two decimals.
    print(f'{time.perf_counter() - started:.2f}')

4.词云图

# Word segmentation (jieba) and word-cloud rendering (wordcloud).
import jieba
import wordcloud

# Original note: "red text appears". Setting the log level to INFO is
# presumably meant to hide jieba's start-up debug messages -- confirm.
jieba.setLogLevel(jieba.logging.INFO)

# Read the whole input file; ``with`` guarantees the handle is closed.
with open('弹幕.txt', encoding='utf-8') as f:
    txt = f.read()

# jieba.lcut returns a list of tokens; join them with spaces to form the
# single string that is passed to generate() below.
string = ' '.join(jieba.lcut(txt))
print(string)

wc = wordcloud.WordCloud(
    width=700,
    height=700,
    background_color='white',
    scale=15,
    font_path='msyh.ttc',  # font file used for rendering
)
wc.generate(string)
wc.to_file('词云弹幕.png')
posted @ 2022-12-16 21:46  __username  阅读(193)  评论(0)  编辑  收藏  举报

本文作者:DIVMonster

本文链接:https://www.cnblogs.com/guangzan/p/12886111.html

版权声明:本作品采用知识共享署名-非商业性使用-禁止演绎 2.5 中国大陆许可协议进行许可。