智联招聘上的工作抓取 后端编写
#!/bin/python
#encoding=utf-8
"""Scrape job postings from sou.zhaopin.com and store them in MongoDB.

The script walks every result page of a fixed search, keeps the postings
whose title contains one of a few keywords, computes a weight for each
posting from its detail page and a Baidu lookup of the company name, and
inserts one record per posting into the ``job.list`` collection of a
MongoDB server on localhost.

This is Python 2 code (it relies on ``urllib.urlopen``).
"""
import re
import urllib

import lxml.html
import lxml.html.soupparser
import pymongo

# First result page of the search; used to read the page count and the
# release time.
url1 = "http://sou.zhaopin.com/jobs/SearchResult.ashx?in=210500%3b160400%3b160000&pd=1&jl=%E7%A6%8F%E5%B7%9E&sm=0&et=2&p=1"
# Same search with an empty page parameter; the page number is appended.
url = "http://sou.zhaopin.com/jobs/SearchResult.ashx?in=210500%3b160400%3b160000&pd=1&jl=%E7%A6%8F%E5%B7%9E&sm=0&et=2&p="

# Lazily created MongoDB collection handle shared by all inserts.
_collection = None


def getDocument(url, code='utf-8'):
    """Download *url* and return it parsed as an lxml HTML tree.

    The response body is decoded with *code*; bytes that cannot be
    decoded are dropped.
    """
    response = urllib.urlopen(url)
    try:
        raw = response.read()
    finally:
        # Release the connection even when the read fails.
        response.close()
    return lxml.html.fromstring(raw.decode(code, 'ignore'))


def getLastPageNumber(url):
    """Return the page count read from the pager of the page at *url*.

    Raises IndexError when no pager item matches, and ValueError when the
    matched item's text is not an integer.
    """
    lastPageXpath = "//div[@class='pagesDown']/ul/li[last()-3]"
    doc = getDocument(url)
    return int(doc.xpath(lastPageXpath)[0].text_content())


def getTodayTime(url):
    """Return the stripped text of the first ``releasetime`` cell at *url*.

    Raises IndexError when the page contains no such cell.
    """
    timeXpath = "//td[@class='releasetime']/text()"
    doc = getDocument(url)
    return doc.xpath(timeXpath)[0].strip()


def getPageListAndUrl():
    """Collect job titles and their links from every result page.

    Returns a dict mapping title text to the posting URL. Postings that
    share the same title collapse into a single entry (the last one wins).
    """
    titleXpath = "//tr[@class='showTR']/td[@class='Jobname']/a/text()"
    titleLinkXpath = "//tr[@class='showTR']/td[@class='Jobname']/a/@href"
    title = []
    titleUrls = []
    for page in range(1, getLastPageNumber(url1) + 1):
        doc = getDocument(url + str(page))
        title += doc.xpath(titleXpath)
        titleUrls += doc.xpath(titleLinkXpath)
    return dict(zip(title, titleUrls))


def filterWord(jobList):
    """Remove entries whose title contains none of the accepted keywords.

    *jobList* is modified in place and also returned.
    """
    AccessWord = [u'前端', u'web', u'程序', u'工程', u'技术', u'\\']
    # Iterate over a snapshot of the keys so that deleting entries is safe.
    for jobTitle in list(jobList):
        if not any(word in jobTitle for word in AccessWord):
            del jobList[jobTitle]
    return jobList


def sort(url):
    """Return an integer weight for the posting detail page at *url*.

    The weight is ``int(0.4 * people + 0.4 * money + 0.2 * companyIndex)``
    where ``people`` and ``money`` are the first integers found in two
    cells of the detail tables (presumably head count and salary -- verify
    against the page layout) and ``companyIndex`` comes from
    ``getBaiduIndex``. Returns 0 when the company name or the ``people``
    figure cannot be read; ``money`` falls back to 3000.
    """
    peopleNumberXpath = "//table[@class='terminalpage-table']//tr[3]/td[2]/text()"
    moneyXpath = "//table[@class='terminalpage-table table-margin']//tr[3]/td[2]/text()"
    companyNameXpath = "//td/h2/a/text()"
    doc = getDocument(url)

    try:
        companyName = doc.xpath(companyNameXpath)[0].strip()
    except IndexError:
        return 0

    # Important number: without it the posting gets no weight at all.
    try:
        people = int(re.findall(r'\d+', doc.xpath(peopleNumberXpath)[0])[0])
    except (IndexError, ValueError):
        return 0

    try:
        money = int(re.findall(r'\d+', doc.xpath(moneyXpath)[0])[0])
    except (IndexError, ValueError):
        money = 3000

    companyIndex = getBaiduIndex(companyName)
    return int(0.4 * people + 0.4 * money + 0.2 * companyIndex)


def getBaiduIndex(keyword):
    """Return the number Baidu shows in its ``nums`` span for *keyword*.

    *keyword* is searched wrapped in double quotes. Returns 0 when the
    span is missing or contains no digits.
    """
    keyword = keyword.encode('utf-8')
    indexXpath = "//span[@class='nums']/text()"
    # Percent-encode the query so characters such as '&' or '#' in the
    # keyword cannot break the URL.
    searchUrl = 'http://www.baidu.com/s?wd=' + urllib.quote('"' + keyword + '"')
    doc = getDocument(searchUrl)
    index = doc.xpath(indexXpath)
    if not index:
        return 0
    digits = re.findall(r'\d+', index[0].replace(',', ''))
    if not digits:
        return 0
    return int(digits[0])


def _getCollection():
    """Return the ``job.list`` collection, connecting on first use."""
    global _collection
    if _collection is None:
        # NOTE(review): pymongo.Connection only exists in PyMongo < 3.0;
        # newer releases require MongoClient -- confirm the deployed version.
        _collection = pymongo.Connection('localhost', 27017).job.list
    return _collection


def writeDataIntoDb(document):
    """Insert *document* into the ``job.list`` MongoDB collection.

    A failed insert is reported by printing 'Error' and is otherwise
    ignored, so one bad record does not stop the run.
    """
    collection = _getCollection()
    try:
        collection.insert(document)
    except Exception:
        print('Error')


def generateRecord(newList):
    """Build one record per posting in *newList* and store it.

    *newList* maps job title to posting URL. Each record also carries the
    weight from ``sort`` and the module-level ``time`` value.
    """
    for jobTitle, jobUrl in newList.items():
        record = {'occupy': jobTitle, 'url': jobUrl, 'weight': sort(jobUrl), 'time': time}
        writeDataIntoDb(record)


# Script entry: scrape, filter and store. ``time`` must stay a module-level
# name because generateRecord reads it.
time = getTodayTime(url1)
jobList = getPageListAndUrl()
newList = filterWord(jobList)
generateRecord(newList)