import requests
from bs4 import BeautifulSoup
from multiprocessing import Pool

# Browser-like User-Agent so liepin.com serves the normal HTML pages.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}


def get_links(url):
    """Fetch one search-result page and process every job-detail link on it.

    :param url: absolute URL of a liepin.com search-result page.
    """
    wb_data = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('#sojob > div > div > div.job-content > div > ul > li > div > div.job-info > h3 > a')
    for link in links:
        href = link.get('href')
        get_info(href)


def get_info(url):
    """Fetch one job-detail page and print the job title(s) found on it.

    :param url: job-detail URL; may be site-relative (some listings link
        with a bare path), in which case the domain is prepended.
    """
    # Fix: the original used a bare ``except:`` that unconditionally
    # prepended the domain and recursed — an absolute URL that kept
    # failing recursed until RecursionError.  Test the real condition
    # (relative href) up-front instead.
    if not url.startswith('http'):
        url = 'https://www.liepin.com' + url
    try:
        wb_data = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        # Best-effort crawl: report and skip so the pool keeps running.
        print('request failed for {}: {}'.format(url, exc))
        return
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # The title sits under one of two page layouts, depending on whether
    # the job was posted by an enterprise or a head-hunter.
    names = soup.select('#job-view-enterprise > div.wrap.clearfix > div.clearfix > div.main > div.about-position > div.title-info > h1')
    names2s = soup.select('#job-hunter > div.wrap.clearfix > div.clearfix.content > div.main > div.about-position > div > div.title-info > h1')
    for name in names:
        print(name.get_text())
    for name2 in names2s:
        print(name2.get_text())


if __name__ == '__main__':
    # NOTE: the original query string contained '°radeFlag=1' — mojibake of
    # '&degradeFlag=1' ('&deg' rendered as the degree sign); restored here.
    urls = ['https://www.liepin.com/zhaopin/?pubTime=&ckid=c3f082db39687a2e&fromSearchBtn=2&compkind=&isAnalysis=&init=-1&searchType=1&dqs=180030&industryType=&jobKind=&sortFlag=15&degradeFlag=1&industries=&salary=&compscale=&clean_condition=&key=Python&headckid=8d8f20f3992526c8&d_pageSize=40&siTag=p_XzVCa5J0EfySMbVjghcw~EOtkr3bGdK0Q0sAdShkiWA&d_headId=f3347652c2a8fa3b5c5ae65088f2a6d2&d_ckId=3fd8626dc1bc73a61bef7a5da4c2bd96&d_sfrom=search_prime&d_curPage=4&curPage={}'.format(str(i)) for i in range(0, 5)]
    pool = Pool(processes=4)
    pool.map(get_links, urls)

# Summary: try/except originally handled two things:
# 1. Swallowing errors so the crawl kept running.
# 2. After noticing some results were missing, a second selector was added
#    because some detail links live under a different tag.
# 3. Known fatal flaw (fixed above): the bare except retried forever when a
#    URL could never succeed.
# 总有一个理由，会让我们开始变强。 (There is always a reason that makes us start getting stronger.)