import requests
from bs4 import BeautifulSoup
from multiprocessing import Pool

# Browser-like User-Agent so liepin.com serves the normal HTML pages.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}


def get_links(url):
    """Fetch one search-result page and process every job-detail link on it.

    :param url: absolute URL of a liepin.com search-result page.
    """
    wb_data = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('#sojob > div > div > div.job-content > div > ul > li > div > div.job-info > h3 > a')
    for link in links:
        href = link.get('href')
        get_info(href)


def get_info(url):
    """Fetch one job-detail page and print the job title(s) found on it.

    :param url: job-detail URL; may be site-relative (some listings link
        with a bare path), in which case the domain is prepended.
    """
    # Fix: the original used a bare ``except:`` that unconditionally
    # prepended the domain and recursed — an absolute URL that kept
    # failing recursed until RecursionError.  Test the real condition
    # (relative href) up-front instead.
    if not url.startswith('http'):
        url = 'https://www.liepin.com' + url
    try:
        wb_data = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        # Best-effort crawl: report and skip so the pool keeps running.
        print('request failed for {}: {}'.format(url, exc))
        return
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # The title sits under one of two page layouts, depending on whether
    # the job was posted by an enterprise or a head-hunter.
    names = soup.select('#job-view-enterprise > div.wrap.clearfix > div.clearfix > div.main > div.about-position > div.title-info > h1')
    names2s = soup.select('#job-hunter > div.wrap.clearfix > div.clearfix.content > div.main > div.about-position > div > div.title-info > h1')
    for name in names:
        print(name.get_text())
    for name2 in names2s:
        print(name2.get_text())


if __name__ == '__main__':
    # NOTE: the original query string contained '°radeFlag=1' — mojibake of
    # '&degradeFlag=1' ('&deg' rendered as the degree sign); restored here.
    urls = ['https://www.liepin.com/zhaopin/?pubTime=&ckid=c3f082db39687a2e&fromSearchBtn=2&compkind=&isAnalysis=&init=-1&searchType=1&dqs=180030&industryType=&jobKind=&sortFlag=15&degradeFlag=1&industries=&salary=&compscale=&clean_condition=&key=Python&headckid=8d8f20f3992526c8&d_pageSize=40&siTag=p_XzVCa5J0EfySMbVjghcw~EOtkr3bGdK0Q0sAdShkiWA&d_headId=f3347652c2a8fa3b5c5ae65088f2a6d2&d_ckId=3fd8626dc1bc73a61bef7a5da4c2bd96&d_sfrom=search_prime&d_curPage=4&curPage={}'.format(str(i)) for i in range(0, 5)]
    pool = Pool(processes=4)
    pool.map(get_links, urls)

# Summary: try/except originally handled two things:
# 1. Swallowing errors so the crawl kept running.
# 2. After noticing some results were missing, a second selector was added
#    because some detail links live under a different tag.
# 3. Known fatal flaw (fixed above): the bare except retried forever when a
#    URL could never succeed.
# 总有一个理由，会让我们开始变强。 (There is always a reason that makes us start getting stronger.)