多线程爬虫

复制代码
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import requests
import re
import time
import pandas as pd
import math
import time
from multiprocessing.dummy import Pool

def get_soup(url):
    """Fetch *url* over HTTP GET and return the parsed BeautifulSoup tree.

    Sends a desktop-Chrome User-Agent so the target site does not reject
    the request as a bot.  Raises urllib.error.URLError / HTTPError on
    network failure.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'}
    req = urllib.request.Request(url=url, headers=headers, method='GET')
    # Context manager ensures the HTTP response/connection is always
    # released, even if read()/decode() raises (original leaked it).
    with urllib.request.urlopen(req) as response:
        text = response.read().decode('utf-8')
    return BeautifulSoup(text, "html.parser")

def fenleipa(label, n, url):
    """Crawl all listing pages of one category and collect its entries.

    label -- category name, prefixed onto every output line
    n     -- total item count for the category; the site shows 20 items
             per page, so ceil(n / 20) pages are fetched
    url   -- category base URL; page k lives at url + 'page/k'

    Returns one string with one line per item:
    '<label> <title> <href>\n'.
    """
    lines = []  # accumulate pieces and join once -- avoids O(n^2) `res +=`
    # math.ceil already returns an int in Python 3; no int() wrapper needed.
    for page in range(1, math.ceil(n / 20) + 1):
        soup = get_soup(url + 'page/' + str(page))
        items = soup.find_all("div", attrs={'class':'feature-box clearfix'})
        for item in items:
            # Follow each feature box to its detail page.
            soup2 = get_soup(item.div.a.attrs['href'])
            post = soup2.find("div", attrs={'class':'posts clearfix'}).find("ul", class_=False)
            for p in post.find_all("li"):
                txt = p.text
                # The <li> text is '<title><url>'; extract the URL first...
                href = re.findall(r'https?://[a-zA-Z0-9./-]+', txt)[0]
                # ...then remove it LITERALLY (the original re.sub used the
                # href as a regex pattern, where '.' etc. are metachars and
                # could eat unrelated characters of the title).
                title = re.sub(r'\s', '', txt.replace(href, ''))
                lines.append(label + ' ' + title + ' ' + href + '\n')
    return ''.join(lines)

def summary_process(a):
    """Pool worker for one category link: parse its label and item count,
    crawl the whole category, and append the result to 输出.txt.

    a -- a BeautifulSoup <a> tag whose aria-label looks like
         '<label> (<count>个项目)' and whose href is the category URL.
    """
    start = time.perf_counter()
    # Splitting on ' (' and '个项目)' yields [label, count, ''].
    x, y, z = re.split(r'(?: \(|个项目\))', a.attrs['aria-label'])
    y = int(y.replace(',', ''))  # counts may be formatted like '1,234'
    res = fenleipa(x, y, a.attrs['href'])
    # Explicit utf-8: the platform default (e.g. gbk on Chinese Windows)
    # may not be able to encode every title and would raise on write.
    with open('输出.txt', 'a', encoding='utf-8') as f:
        f.write(res)
    end = time.perf_counter()
    print('label:%s已完成,花费时间:%d'%(x, end-start))

# Entry point: fetch the tag-cloud page, collect every category link,
# and crawl three categories at a time with a thread pool
# (multiprocessing.dummy.Pool is thread-based -- appropriate for
# I/O-bound crawling).
#
# BUG FIX: the original line was `get_soup('') //网址` -- `//` is floor
# division in Python, not a comment, so it raised NameError at runtime.
soup = get_soup('')  # 网址: put the site's tag-cloud page URL here
summary = soup.find_all("a", attrs={'class':re.compile(r'tag-cloud-link tag-link-\d+ tag-link-position-\d+')})
# Context manager closes and joins the pool (the original never did).
with Pool(3) as pool:
    pool.map(summary_process, summary)
复制代码

 

posted @   徐钏  阅读(21)  评论(0编辑  收藏  举报
相关博文:
阅读排行:
· DeepSeek 开源周回顾「GitHub 热点速览」
· 物流快递公司核心技术能力-地址解析分单基础技术分享
· .NET 10首个预览版发布:重大改进与新特性概览!
· AI与.NET技术实操系列(二):开始使用ML.NET
· 单线程的Redis速度为什么快?
点击右上角即可分享
微信分享提示