抓取 oschina 代码分享中 Python 版块下的标题及其对应的 URL
# -*- coding=utf-8 -*-
"""Scrape snippet titles and their URLs from oschina's Python code listing.

Every listing page of the code-sharing section (filtered to Python) is
fetched, and each snippet title is appended to ``res.txt`` together with
the ``href`` of its own link.
"""
import io
import re  # unused here; kept because the original script imported it
import sys

import requests
from lxml import etree

if sys.version_info[0] == 2:
    # Python 2 only: make implicit str/unicode conversions use UTF-8, as the
    # original script did. Neither name exists / is needed on Python 3.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding("utf-8")

# Listing URL without the page number; the number is appended per page.
_LIST_URL = 'http://www.oschina.net/code/list?lang=python&catalog=&show=time&sort=&p='
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36'
)
# Pager entry whose text is read as the total number of pages.
_LAST_PAGE_XPATH = '//*[@id="OSC_Content"]/div[1]/ul/li[11]/a/text()'
# Text of each snippet's <a> tag (the title).
_TITLE_XPATH = '//*[@id="OSC_Content"]/div[1]/div[3]/ul/li/h3/a/text()'
# href attribute of each snippet's <a> tag.
_HREF_XPATH = '//*[@id="OSC_Content"]/div[1]/div[3]/ul/li/h3/a/@href'
_RESULT_FILE = 'res.txt'


def _format_entries(titles, hrefs):
    """Return the output text pairing each title with its own URL.

    Titles and hrefs are matched by position; if the two sequences differ
    in length the surplus items of the longer one are ignored.
    """
    return u''.join(
        title + u' ——>对应的URL是:' + href + u'\n \n --------------------------\n'
        for title, href in zip(titles, hrefs)
    )


class spiders_oschina:
    """Crawler for the Python section of oschina's code-sharing listing."""

    def __init__(self):
        print(u'开始运行')

    def get_html_obj(self, url=_LIST_URL + '1', timeout=30):
        """Download *url* and return it parsed into an lxml element tree.

        Args:
            url: Page to fetch; defaults to the first listing page.
            timeout: Seconds passed to ``requests.get`` so a stalled
                connection cannot block the crawl forever.

        Returns:
            The object produced by ``etree.HTML`` for the response body,
            which callers query with ``.xpath(...)``.
        """
        headers = {'User-Agent': _USER_AGENT}
        response = requests.get(url, headers=headers, timeout=timeout)
        return etree.HTML(response.content)

    def get_page(self):
        """Return the URLs of all listing pages, in page order.

        The total page count is read from the pager on the first page.

        Raises:
            IndexError: If the pager entry is not found on the first page.
            ValueError: If the pager text is not an integer.
        """
        first_page = self.get_html_obj()
        page_count = int(first_page.xpath(_LAST_PAGE_XPATH)[0])
        return [_LIST_URL + str(number) for number in range(1, page_count + 1)]

    def get_result(self, obj):
        """Append every title/URL pair found in *obj* to ``res.txt``.

        Args:
            obj: Parsed page as returned by :meth:`get_html_obj` (anything
                exposing a compatible ``xpath`` method).
        """
        titles = obj.xpath(_TITLE_XPATH)
        hrefs = obj.xpath(_HREF_XPATH)
        # Each title is paired with the href at the same position. The
        # original reset its index on every iteration, so every title was
        # written with the first URL of the page.
        text = _format_entries(titles, hrefs)
        # Write the results to the text file; ``with`` closes it even if the
        # write fails.
        with io.open(_RESULT_FILE, 'a', encoding='utf-8') as out:
            out.write(text)


def main():
    """Crawl every listing page and report progress after each one."""
    crawler = spiders_oschina()
    for page_number, page_url in enumerate(crawler.get_page(), 1):
        crawler.get_result(crawler.get_html_obj(page_url))
        print(u'第%d页爬取完成' % page_number)


if __name__ == "__main__":
    main()
貌似生成的 URL 有问题,待优化……