用 Python 实现爬取极客网数据。
# -*- coding: utf-8 -*-
"""Scrape paginated course listings from jikexueyuan.com into a text file."""
import io
import re
import sys

if sys.version_info[0] < 3:
    # Python 2 only: keep the original default-encoding override so implicit
    # str/unicode conversions use UTF-8 instead of ASCII.  Python 3 has no
    # ``reload`` builtin and does not need this.
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding("utf-8")


class spider(object):
    """Regex-based scraper for paginated course listing pages."""

    # Matches the page-number query parameter; group 1 is the number itself.
    _PAGE_RE = re.compile(r'pageNum=(\d+)')

    def __init__(self):
        print(u'开始爬取内容。。。。')

    def getsource(self, url, timeout=30):
        """Download *url* and return the response body as text.

        ``timeout`` (seconds) is forwarded to ``requests.get`` so that a
        stalled server cannot block the crawl forever.
        """
        # Imported here rather than at module level so the pure parsing
        # helpers stay importable when the third-party ``requests`` package
        # is not installed.
        import requests

        response = requests.get(url, timeout=timeout)
        return response.text

    def changepage(self, url, total_page):
        """Return the URLs for every page from the one in *url* to *total_page*.

        The starting page is read from the ``pageNum=<n>`` parameter of
        *url*; the result covers ``n..total_page`` inclusive and is empty
        when ``n > total_page``.

        Raises:
            AttributeError: if *url* contains no ``pageNum=<digits>`` part.
        """
        now_page = int(self._PAGE_RE.search(url).group(1))
        # NOTE: the original passed ``re.S`` as the 4th positional argument of
        # ``re.sub``, which is ``count`` (not ``flags``) and therefore capped
        # the number of replacements at 16.
        return [
            self._PAGE_RE.sub('pageNum=%s' % page, url)
            for page in range(now_page, total_page + 1)
        ]

    def geteveryclass(self, source):
        """Return the raw text of every ``<li id="...">...</li>`` in *source*.

        Each returned string starts right after ``<li id="`` and stops
        before the first following ``</li>``.
        """
        return re.findall(r'<li id="(.*?)</li>', source, re.S)

    def getinfo(self, eachclass):
        """Extract the course fields from one list-item fragment.

        Returns a dict with the keys ``title``, ``content``, ``classtime``,
        ``classlevel`` and ``learnnum``.

        Raises:
            AttributeError: if the title, content or learn-number pattern
                is not found in *eachclass*.
            IndexError: if fewer than two plain ``<em>...</em>`` elements
                are present.
        """
        info = {}
        # NOTE(review): this captures everything between ``title="`` and
        # ``alt="``, so any text between the two attributes (e.g. a closing
        # quote and a space) ends up in the title -- confirm against the
        # live markup before tightening the pattern.
        info['title'] = re.search(
            r'title="(.*?)alt="', eachclass, re.S).group(1)
        info['content'] = re.search(
            r'display: none;">(.*?)</p>', eachclass, re.S).group(1)
        # The first two attribute-less <em> elements are taken as the class
        # time and the class level, in that order.
        timeandlevel = re.findall(r'<em>(.*?)</em>', eachclass, re.S)
        info['classtime'] = timeandlevel[0]
        info['classlevel'] = timeandlevel[1]
        info['learnnum'] = re.search(
            r'"learn-number">(.*?)</em>', eachclass, re.S).group(1)
        return info

    def saveinfo(self, classinfo, filename='info.txt'):
        """Append every record in *classinfo* to *filename* as UTF-8 text.

        Each record is written as five ``key:value`` lines followed by a
        blank line.  The file is opened in append mode and is closed even
        if a record is malformed.
        """
        with io.open(filename, 'a', encoding='utf-8') as f:
            for each in classinfo:
                for key in ('title', 'content', 'classtime', 'classlevel'):
                    f.write(u'%s:%s\n' % (key, each[key]))
                f.write(u'learnnum:%s\n\n' % each['learnnum'])


def main():
    """Crawl listing pages 1 through 20 and append all courses to info.txt."""
    classinfo = []
    url = 'http://www.jikexueyuan.com/course/?pageNum=1'
    jikespider = spider()
    for link in jikespider.changepage(url, 20):
        print(u'正在处理页面:' + link)
        html = jikespider.getsource(link)
        classinfo.extend(
            jikespider.getinfo(each)
            for each in jikespider.geteveryclass(html)
        )
    jikespider.saveinfo(classinfo)


if __name__ == '__main__':
    main()
爬取结果如下: