Python小爬虫
自己琢磨写了一个Python的小爬虫,用来爬学校的招聘信息,以下是代码。
# -*- coding: utf-8 -*-
"""Small crawler that prints job postings from the USTC careers site.

Originally written for Python 2 (``urllib2`` / ``print`` statement). The
import shims below keep it working there and also let it run on Python 3.
"""
__author__ = 'WCQ'

import re
import time
import urllib

try:
    import urllib2  # Python 2
except ImportError:  # Python 3: the same API lives in urllib.request
    import urllib.request as urllib2

try:
    import thread  # Python 2
except ImportError:  # Python 3 renamed the module to _thread
    import _thread as thread


# ----------- Load job postings -----------
class Spider_Model:
    """Fetch listing pages from job.ustc.edu.cn and print every posting."""

    # re.S makes '.' match newlines too, so a block may span several lines.
    _LIST_BLOCK_RE = re.compile(
        '<div class="Joplistone">(.*?)</div>', re.S)
    _JOB_ITEM_RE = re.compile(
        '<li><a href="(.*?)" style="color:#">(.*?)</a>'
        '<span class="zhiwei">(.*?)</span>'
        '<span class="zhuanye">(.*?)</span></li>', re.S)
    _DETAIL_RE = re.compile('<div class="textone">(.*?)</div>', re.S)

    def __init__(self):
        self.page = 1         # first listing page visited by Start()
        self.enable = False   # loop switch; Start() sets it to True
        self.endPage = 2      # exclusive upper bound for the page number

    def GetHTML(self, myUrl):
        """Download ``myUrl`` and return its body decoded from GBK.

        Errors raised by ``urlopen`` and ``UnicodeDecodeError`` from the
        decode step are not caught here and propagate to the caller.
        """
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        req = urllib2.Request(myUrl, headers=headers)
        myResponse = urllib2.urlopen(req)
        # The Python 2 response object is not a context manager, so close it
        # explicitly instead of using ``with``.
        try:
            myPage = myResponse.read()
        finally:
            myResponse.close()
        # decode() turns the raw GBK bytes into a unicode string.
        return myPage.decode("GBK")

    def GetPage(self, page):
        """Return the postings found on listing page number ``page``.

        Each posting is a list ``[company, absolute_url, position, extra]``.
        The original author describes the last field as the publication
        date, although the matched span's class is "zhuanye" -- TODO confirm
        against the live page.

        Returns an empty list when the page has no ``Joplistone`` block.
        """
        myUrl = ("http://www.job.ustc.edu.cn/list.php?trans=7&page="
                 + str(page) + "&MenuID=002002")
        unicodePage = self.GetHTML(myUrl)
        # Locate the <div class="Joplistone"> block(s); only the first one
        # is parsed for postings.
        jobList = self._LIST_BLOCK_RE.findall(unicodePage)
        if not jobList:
            # No listing block on this page: report "no postings" instead of
            # failing with IndexError on jobList[0].
            return []
        jobItems = self._JOB_ITEM_RE.findall(jobList[0])
        # Regex groups arrive as (link, company, position, extra); the link
        # is relative, so prefix the site root and put the company first.
        return [
            [company, "http://www.job.ustc.edu.cn/" + link, position, extra]
            for link, company, position, extra in jobItems
        ]

    def getJobDetail(self, joburl):
        """Return the contents of every ``<div class="textone">`` block on
        the posting page at ``joburl`` (a list of strings, possibly empty).
        """
        jobHtml = self.GetHTML(joburl)
        return self._DETAIL_RE.findall(jobHtml)

    def getJobDetailList(self, jobs):
        """Return a copy of each posting in ``jobs`` with its detail list
        (see ``getJobDetail``) appended as a fifth element.

        One extra HTTP request is made per posting.
        """
        return [
            [job[0], job[1], job[2], job[3], self.getJobDetail(job[1])]
            for job in jobs
        ]

    def showJob(self, page):
        """Print every field of every posting on listing page ``page``."""
        jobs = self.GetPage(page)
        for jobDetail in self.getJobDetailList(jobs):
            for item in jobDetail:
                # Single-argument print(...) behaves the same on Python 2
                # and 3 without needing a __future__ import.
                print(item)

    def Start(self):
        """Show pages from ``self.page`` up to (not including)
        ``self.endPage``, stopping early if ``self.enable`` becomes falsy.
        """
        self.enable = True
        page = self.page
        # Logical ``and`` rather than bitwise ``&``: with ``&`` a truthy
        # non-bool flag such as 2 evaluates ``2 & True == 0`` and would end
        # the loop unexpectedly.
        while self.enable and page < self.endPage:
            self.showJob(page)
            page += 1


if __name__ == '__main__':
    # Only crawl when executed as a script, not when imported.
    print(u'招聘内容:')
    myModel = Spider_Model()
    myModel.Start()