Python 2.7 爬取51job 全国java岗位

 

 每页有50条数据,一共2000页;分页通过GET请求实现(页码直接写在URL中)。

 

#!/usr/bin/python
# encoding: utf-8
import requests
import threading
from lxml import etree
import sys
import os
import datetime
import re
import random
import time

# Python 2 only: make UTF-8 the implicit str/unicode conversion codec so the
# Chinese text handled by this script can be concatenated and written without
# explicit encode() calls.  reload() is required because Python 2's site
# module removes sys.setdefaultencoding during startup.  Python 3 has neither
# name (and already defaults to UTF-8), so the hack is skipped there instead
# of failing with a NameError at import time.
if sys.version_info[0] == 2:
    reload(sys)

    sys.setdefaultencoding('utf-8')

def log(context):
    """Append one line of text to the crawl log at ./log/log.txt.

    The log directory is created on first use, and the file is opened in a
    ``with`` block so the handle is released even if the write fails.

    :param context: message text; a trailing newline is added automatically.
    """
    txtName = "./log/log.txt"

    # Create ./log on demand so a fresh checkout does not fail on the first call.
    log_dir = os.path.dirname(txtName)
    if log_dir and not os.path.isdir(log_dir):
        os.makedirs(log_dir)

    with open(txtName, "a") as f:
        f.write(context + "\n")

def _first_text(node, path, default=None):
    """Return the first result of ``node.xpath(path)``, or ``default`` if there is none."""
    found = node.xpath(path)
    return found[0] if found else default


def xin(start_page=1, end_page=2000, timeout=30):
    """Crawl 51job "java" search-result pages and append the postings to ./file/java.txt.

    For every page in ``start_page..end_page`` (both inclusive) the result
    list is downloaded, each ``div.el`` row inside ``div.dw_table`` is parsed,
    and one line per posting is appended to the output file in the form::

        name=============company=============city=============salary=============url

    Progress is reported through ``log()``, and the function sleeps a random
    1-5 seconds after each page.

    :param start_page: first result page to fetch (1-based).
    :param end_page: last result page to fetch, inclusive.  The defaults cover
        all 2000 pages; pass a sub-range to split the crawl across machines.
    :param timeout: seconds to wait for the server before giving up on a page.
    """
    # Request headers sent with every page fetch.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9'
    }

    # The page number sits in the middle of the URL path.  Plain concatenation
    # is used because the query string contains a literal "%2C".
    url_prefix = "https://search.51job.com/list/000000,000000,0000,00,9,99,java,2,"
    url_suffix = ".html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="

    separator = "============="
    txtName = "./file/java.txt"

    # Create ./file on demand so the first write does not fail.
    out_dir = os.path.dirname(txtName)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    # end_page is inclusive, so the last page is fetched as well.
    for page in range(start_page, end_page + 1):
        page_url = url_prefix + str(page) + url_suffix

        try:
            response = requests.get(page_url, headers=header, timeout=timeout)
        except requests.RequestException as err:
            # One unreachable page must not abort the whole crawl.
            log(str(page) + "页请求失败--" + str(err))
            rows = []
        else:
            # The site serves GBK; undecodable bytes are replaced rather than
            # raising, so a single bad byte cannot lose the page.
            html = response.content.decode("gbk", "replace")
            selector = etree.HTML(html)
            rows = selector.xpath('//div[@class="dw_table"]/div[@class="el"]')
            log("" + str(page) + "页了--" + str(len(rows)) + "条数据")

        records = []
        skipped = 0
        for row in rows:
            company = _first_text(row, 'span[@class="t2"]/a/text()')
            job_url = _first_text(row, 'p/span/a/@href')
            name = _first_text(row, 'p/span/a/text()')
            city = _first_text(row, 'span[@class="t3"]/text()')

            # A row lacking any of the mandatory fields is skipped, not fatal.
            if None in (company, job_url, name, city):
                skipped += 1
                continue

            # Some postings carry no salary; "0" is the placeholder for those.
            salary = _first_text(row, 'span[@class="t4"]/text()', "0")

            # Remove all whitespace (spaces and the line breaks that surround
            # the link text) so each posting stays on a single line.
            company = "".join(company.split())
            name = "".join(name.split())
            city = "".join(city.split())

            records.append(
                name + separator + company + separator + city + separator
                + str(salary) + separator + job_url + "\n"
            )

        if skipped:
            log(str(page) + "页跳过" + str(skipped) + "条不完整数据")

        # One open/close per page instead of one per posting.
        if records:
            with open(txtName, "a") as f:
                f.write("".join(records))

        # Random pause between pages to keep the request rate down.
        sui = random.randint(1, 5)
        log("休眠" + str(sui))
        time.sleep(sui)

# Script entry point: start the crawl only when this file is executed directly.
if __name__ == "__main__":
    xin()

   

 日志文件

爬取的数据

但是爬取的速度有点慢,

于是乎采用了多线程爬取,

但是51job 立刻就把IP段给封掉了,

于是用了4台服务器,每台爬取500页数据,最后再合并到一起导入数据库中

人生苦短,我用Python!!!

posted @ 2018-09-01 17:57  万隆  阅读(265)  评论(0)  编辑  收藏  举报