爬取招聘具体要求

 1 from selenium import webdriver
 2 browser=webdriver.Chrome()
 3 import time
 4 from lxml import etree
 5 import requests
 6 import re
 7 import json
 8 import random
 9 
def search(max_pages=25):
    """Yield job-posting links from the Lagou search results, page by page.

    Loads the hard-coded search URL in the module-level ``browser``, parses
    the rendered HTML of each results page with lxml, yields every
    ``position_link`` href found on it, and then clicks the "next page"
    button to advance.

    Args:
        max_pages: Highest results page to read. The default of 25 is the
            last page the original hard-coded loop reached.

    Yields:
        str: The ``href`` attribute of each job link on the current page.
    """
    browser.get('https://www.lagou.com/jobs/list_%E7%88%AC%E8%99%AB?labelWords=&fromSearch=true&suginput=')
    time.sleep(2)  # fixed wait after the initial page load
    for page in range(1, max_pages + 1):
        # Parse the page that is currently displayed BEFORE paging forward;
        # clicking "next" first would silently drop the first page's links.
        html = etree.HTML(browser.page_source)
        links = html.xpath(
            '//ul[@class="item_con_list"]/li[@class="con_list_item default_list"]//a[@class="position_link"]/@href')
        yield from links

        if page == max_pages:
            break

        # Scroll to the bottom before locating the pager button.
        browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        # The locator strategy is passed as a plain string so the call works
        # on old Selenium releases and on Selenium 4, where the
        # find_element_by_* helpers no longer exist.
        button = browser.find_element(
            'css selector',
            '#s_position_list > div.item_con_pager > div > span.pager_next')
        button.click()
        browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        time.sleep(2)  # fixed wait for the next page of results
# Pool of User-Agent strings; one is picked at random for every request.
# Built once, outside the loop. Every entry is followed by a comma: without
# it Python concatenates adjacent string literals into one malformed value.
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]

# Captures the text inside the <div> that follows the "职位描述:" heading.
# Raw string so that \s is a regex escape rather than an invalid Python one;
# compiled once instead of on every iteration.
pattern = re.compile(r'class="description">职位描述:</h3>.*?<div>\s+(.*?)\s+</div', re.S)

try:
    for link in search():
        time.sleep(3)  # pause before each detail-page request
        headers = {'User-Agent': random.choice(user_agent_list)}
        try:
            # A timeout keeps one unresponsive page from hanging the run.
            r = requests.get(link, headers=headers, timeout=10)
        except requests.RequestException as exc:
            # One unreachable posting should not abort the whole crawl.
            print('skipping %s: %s' % (link, exc))
            continue

        job = pattern.findall(r.text)
        if job:
            # Join the captured text itself. str(job) would write the list
            # repr (brackets, quotes, escaped "\n"), which the newline
            # replacement below could never match.
            content = ''.join(job)
            for token in ('<p>', '</p>', '\n'):
                content = content.replace(token, '')
            # Append one line per posting.
            with open('1.txt', 'a', encoding='utf-8') as f:
                f.write(content + '\n')
        time.sleep(10)  # additional delay between postings
finally:
    # Always shut the Selenium-driven browser down, even after an error.
    browser.quit()

 

posted @ 2017-12-05 23:05  不可叽叽歪歪  阅读(221)  评论(0)  编辑  收藏  举报