智联招聘上的工作抓取 后端编写

#!/bin/python
#encoding=utf-8
import urllib,lxml.html,lxml.html.soupparser,re,pymongo
# Zhaopin search-result URL: Fuzhou (jl=福州), selected industries (in=...),
# posted within the last day (pd=1); the page number is appended to `url`.
url="http://sou.zhaopin.com/jobs/SearchResult.ashx?in=210500%3b160400%3b160000&pd=1&jl=%E7%A6%8F%E5%B7%9E&sm=0&et=2&p="
# Page 1 of the results; derived from `url` instead of repeating the literal.
url1=url+"1"


def getDocument(url,code='utf-8'):
    """Fetch *url* and parse the response body into an lxml HTML document.

    Bytes that cannot be decoded with *code* are silently dropped
    ('ignore') so a stray byte never aborts a crawl.
    """
    raw = urllib.urlopen(url).read()
    text = raw.decode(code, 'ignore')
    return lxml.html.fromstring(text)
def getLastPageNumber(url):
    """Return the total page count read from the pager of a result page.

    The 4th-from-last <li> in the pagesDown widget holds the highest
    page number on this site's layout.
    """
    pager_nodes = getDocument(url).xpath(
        "//div[@class='pagesDown']/ul/li[last()-3]")
    return int(pager_nodes[0].text_content())
def getTodayTime(url):
    """Return the release time shown on the first job row of *url*,
    with surrounding whitespace stripped."""
    release_times = getDocument(url).xpath("//td[@class='releasetime']/text()")
    first = release_times[0]
    return first.strip()
def getPageListAndUrl():
    """Crawl every result page and return a dict of job title -> detail URL.

    The page count is read from page 1 (url1); pages are 1-based.
    NOTE: duplicate titles collapse to the last-seen URL (dict semantics).
    """
    titleXpath="//tr[@class='showTR']/td[@class='Jobname']/a/text()"
    titleLinkXpath="//tr[@class='showTR']/td[@class='Jobname']/a/@href"
    title=[]
    titleUrls=[]
    for i in range(1,getLastPageNumber(url1)+1):
        doc=getDocument(url+str(i))
        title+=doc.xpath(titleXpath)
        titleUrls+=doc.xpath(titleLinkXpath)
    # removed a dead commented-out duplicate of this return statement
    return dict(zip(title,titleUrls))
def filterWord(jobList):
    """Drop entries whose title contains none of the whitelisted keywords.

    Mutates *jobList* in place and also returns it (original contract).
    Keywords are matched case-sensitively as plain substrings.
    """
    AccessWord=[u'前端',u'web',u'程序',u'工程',u'技术',u'\\']
    # Iterate over a snapshot of the keys: deleting from a dict while
    # iterating a live view raises RuntimeError on Python 3 (the original
    # only worked because Python 2's .items() returned a list).
    for title in list(jobList.keys()):
        if not any(word in title for word in AccessWord):
            del jobList[title]
    return jobList
def sort(url):
    """Compute a desirability weight for one job detail page.

    Scrapes headcount, salary and company name, then blends them with the
    company's Baidu result count: 40% headcount + 40% salary + 20% index.
    Returns 0 when a mandatory field (company name, headcount) is missing.

    NOTE(review): the name shadows the builtin `sort` idiom; kept for
    caller compatibility.
    """
    peopleNumberXpath="//table[@class='terminalpage-table']//tr[3]/td[2]/text()"
    moneyXpath="//table[@class='terminalpage-table table-margin']//tr[3]/td[2]/text()"
    companyNameXpath="//td/h2/a/text()"
    doc=getDocument(url)
    # Company name is mandatory: without it no Baidu index can be queried.
    # Narrowed from a bare `except:` — only a missing node is expected here.
    try:
        companyName=doc.xpath(companyNameXpath)[0].strip()
    except IndexError:
        return 0
    # Headcount is mandatory as well; the page may omit it or hold no digits.
    try:
        people=int(re.findall(r'\d+',doc.xpath(peopleNumberXpath)[0])[0])
    except (IndexError, ValueError):
        return 0
    # Salary is optional: fall back to the 3000 default when unparsable.
    try:
        money=int(re.findall(r'\d+',doc.xpath(moneyXpath)[0])[0])
    except (IndexError, ValueError):
        money=3000
    companyIndex=getBaiduIndex(companyName)
    weight=int((0.4*people+0.4*money+0.2*companyIndex))
    return weight

def getBaiduIndex(keyword):
    """Return Baidu's result count for an exact-phrase search of *keyword*.

    Returns 0 when the result-count element is absent from the page.
    """
    # Percent-encode the UTF-8 bytes: raw multi-byte characters in a URL
    # are invalid and were previously sent unescaped.
    quoted=urllib.quote(keyword.encode('utf-8'))
    indexXpath="//span[@class='nums']/text()"
    url='http://www.baidu.com/s?wd="'+quoted+'"'
    doc=getDocument(url)
    index=doc.xpath(indexXpath)
    if not index:
        return 0
    # e.g. "百度为您找到相关结果约1,230,000个" -> strip commas, keep digits
    digits=re.findall(r'\d+',index[0].replace(',',''))
    return int(digits[0])

def writeDataIntoDb(document):
    collection=pymongo.Connection('localhost',27017).job.list
    try:
        collection.insert(document)
    except:
        print 'Error'

def generateRecord(newList):
    """Persist one weighted record per (job title, detail URL) pair.

    Uses the module-level `time` (today's release time) as the timestamp.
    """
    for occupy, link in newList.items():
        writeDataIntoDb({
            'occupy': occupy,
            'url': link,
            'weight': sort(link),
            'time': time,
        })

# Script entry point: runs at import time (no __main__ guard).
# NOTE: `time` here is a module-level string (today's release time read
# from page 1) consumed by generateRecord — it shadows the stdlib module name.
time=getTodayTime(url1)
jobList=getPageListAndUrl()   # title -> detail URL for every result page
newList=filterWord(jobList)   # keep only dev/tech-related titles
generateRecord(newList)       # score each job and persist it into MongoDB

 

posted @ 2013-11-10 12:32  Epirus  阅读(497)  评论(0编辑  收藏  举报