python——爬虫
利用空余时间写了个简单的python爬虫程序——获取博海拾贝的标题和封面图
这里使用到的技术比较简单,可以供一些python入门的新手做参考。
知道需要采集的东西,那接下来就是动手写代码了。
首先应该分析爬取数据需要使用的函数或者程序包,在这里使用的是urllib2+lxml.etree。
下面是全部代码:(或许在代码中会出现一些新手们不了解的函数。不要怕麻烦,自己多在网上查找下资料可以很好的了解新知识)

#! /usr/bin/env python # coding:utf-8 import urllib2 import lxml.etree import sys import os from MyHelper import MyHelper class bssb: reload(sys) sys.setdefaultencoding('UTF-8') type = sys.getfilesystemencoding() def getHtml(self,_url): #获取网页内容 _headers = MyHelper().getHeaders() request = urllib2.Request(url=_url,headers=_headers) try: page=urllib2.urlopen(request) html=page.read() return html except urllib2.HTTPError as e: print 'HTTPError=',e.code except urllib2.URLError as e: print 'URLError=',e.reason def content(self,html,_xpath): #获取需要抓取的内容 content = lxml.etree.HTML(html.lower().decode('utf-8')) result = content.xpath(_xpath) return result def HtmlforPage(self,htmlurl,titles,imgs): html = self.getHtml(htmlurl) articles = self.content(html,'//article') nextpage = self.content(html,'//li[@class="next-page"]/a') for item in crticles: #抓取标题 title = item.findall('./header//a')[0].text #抓取图片 img = item.findall('./p[@class="focus"]//img')[0].attrib['src'] titles.append(title) imgs.append(img) if len(nextpage)!=0: self.HtmlForPage(nextpage[0].attrib['href'],titles,imgs) else: #将标题写入txt文件 MyHelper().save_txt(titles,'./Bohai/titles.txt','wb+') #将图片保存到本地 MyHelper().save_file(imgs) if __name__ == "__main__": bohai = bssb() url = 'https://bohaishibei.com/post/category/main/' titles=[] imgs=[] bohai.HtmlforPage(url,titles,imgs)
下面是配套的简单自定义帮助类:

#! /usr/bin/env python # coding:utf-8 import os import urllib import re import requests from PIL import Image from io import BytestIO class MyHelper: def __init__(self,language='zh-CN,en;q=0.9',control='max-age=0') self.language = language self.control = control def getAgent(self): user_agent=['Mozilla/5.0(Windows NT 10.0; WOW64)','Mozilla/5.0 (Windows NT 6.3;WOW64)','Opera/9.27 (Windows NT 5.2; U; zh-cn)'] return user_agent def getHeaders(self): headers = {'Accept-Language':self.language, 'cache-control':self.control, 'User-Agent':random.choice(self.getAgent()) } return headers def save_txt(self,contents,txtPath,model): _path = self.GetPath(txtPath) with open(_path,model) as fo: if isinstance(contents,list): for item in contents: fo.write(item+'\n') else: fo.write(contents+'\n') fo.close() def save_file(self,_path): file_path='./imgData' try: file_path = self.GetPath(file_path) if isinstance(_path,list): count=1 for item in _path: file_suffix = os.path.splitext(item)[1] if file_suffix.__contains__('&'): file_suffix = file_suffix.aplit('&')[0] response = requests.get(item) image = Image.open(BytesIO(response.content)) _img = file_path+'/bohai'+str(count)+file_suffix image.save(_img) count+=1 else: file_suffix = os.path.splitext(_path)[1] if file_suffix.__contains__('&'): file_suffix = file_suffix.split('&')[0] response = requests.get(item) image = Image.Open(BytesIO(response.content)) image.save(file_path+'/bohai'+file_suffix) except IOError as e: print 'Error:没有找到文件或者读取文件失败' def GetPath(self,_path): #将文件路劲分割出目录和文件 file_path = os.path.split(_path) if not os.path.isdir(file_path[0]): print '目录不存在,新建', file_path[0] os.system(r'touch %s' % _path) return _path
代码亲测可以运行。
如有问题欢迎留言。。。。。