Python3 爬虫实战一之爬取糗事百科段子
# -*- coding:utf-8 -*-
"""Step 1: download the raw HTML of the first "hot" page of qiushibaike.com."""
import urllib.request
import urllib.error


def main():
    """Request page 1 with a browser-like User-Agent and print the raw HTML."""
    page = 1
    url = 'http://www.qiushibaike.com/hot/page/' + str(page)
    # The site rejects urllib's default User-Agent, so spoof a browser one.
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    try:
        request = urllib.request.Request(url, headers=headers)
        # Context manager guarantees the connection is closed
        # (the original never closed the response).
        with urllib.request.urlopen(request) as response:
            print(response.read())
    except urllib.error.URLError as e:
        # URLError is defined in urllib.error; reaching it through
        # urllib.request only works by accidental re-export.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)


if __name__ == "__main__":
    main()
打印出第一页的html代码。
------------------------------继续--------------------------------
用正则表达式取出 用户名:
# -*- coding:utf-8 -*-
"""Step 2: pull author names out of the page HTML with a regular expression."""
import re
import urllib.request
import urllib.error

# Each story lives in a <div class="... author clearfix"> block.
# Group 1: the author name from the title="..." attribute.
# Group 2: the joke text inside the first following <span>...</span>.
# re.S lets '.' match newlines because one story spans several lines of HTML.
STORY_PATTERN = re.compile(
    '<div.*?author clearfix">.*?title="(.*?)">.*?<span>(.*?)</span>', re.S)


def extract_authors(content):
    """Return a list of (author, text) tuples found in HTML *content*."""
    return re.findall(STORY_PATTERN, content)


def main():
    """Fetch page 1 and print one author name per story."""
    page = 1
    url = 'http://www.qiushibaike.com/hot/page/' + str(page)
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    try:
        request = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(request) as response:
            content = response.read().decode('utf-8')
        for item in extract_authors(content):
            print(item[0])
    except urllib.error.URLError as e:  # correct home of URLError
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)


if __name__ == "__main__":
    main()
------------------------------继续--------------------------------
取出其他信息
# -*- coding:utf-8 -*-
"""Step 3: extract author, text, image src and vote count for each story."""
import re
import urllib.request
import urllib.error

# One story block yields four groups:
#   1: author name (title attribute)  2: joke text (span contents)
#   3: image src attribute            4: vote count (<i class="number">)
STORY_PATTERN = re.compile(
    '<div.*?author clearfix">.*?title="(.*?)">'
    + '.*?span>(.*?)</span>'
    + '.*?<img src="(.*?)" alt'
    + '.*?<i class="number">(.*?)</i>', re.S)


def extract_stories(content):
    """Return (author, text, src, votes) tuples for text-only stories.

    A story whose captured src contains the substring "img" is treated as a
    picture post and skipped — same heuristic as the original script.
    """
    items = re.findall(STORY_PATTERN, content)
    return [item for item in items if not re.search("img", item[2])]


def main():
    """Fetch page 1 and print author, text and vote count per text story."""
    page = 1
    url = 'http://www.qiushibaike.com/hot/page/' + str(page)
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    try:
        request = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(request) as response:
            content = response.read().decode('utf-8')
        for item in extract_stories(content):
            print(item[0], item[1], item[3])
    except urllib.error.URLError as e:  # correct home of URLError
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)


if __name__ == "__main__":
    main()
------------------------------继续--------------------------------
完善交互界面
# -*- coding:utf-8 -*-
"""Step 4: interactive reader — press Enter for the next joke, Q to quit."""
import re
import urllib.request
import urllib.error


class QSBK:
    """Interactive crawler for the qiushibaike.com "hot" pages.

    Keeps a small buffer of parsed pages (``self.stories``) and refills it
    in the background of the read loop so the user never waits on a fetch.
    """

    # One story block yields four groups:
    #   1: author name  2: joke text  3: image src  4: vote count
    STORY_PATTERN = re.compile(
        '<div.*?author clearfix">.*?title="(.*?)">'
        + '.*?span>(.*?)</span>'
        + '.*?<img src="(.*?)" alt'
        + '.*?<i class="number">(.*?)</i>', re.S)

    def __init__(self):
        self.pageIndex = 1          # next page number to fetch
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent': self.user_agent}
        self.stories = []           # buffered pages, each a list of stories
        self.enable = False         # True while the read loop is running

    def getPage(self, pageIndex):
        """Download one hot page; return its HTML text or None on failure."""
        try:
            url = 'http://www.qiushibaike.com/hot/page/' + str(pageIndex)
            request = urllib.request.Request(url, headers=self.headers)
            # 'with' closes the connection (the original leaked it).
            with urllib.request.urlopen(request) as response:
                return response.read().decode('utf-8')
        except urllib.error.URLError as e:  # correct home of URLError
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print("糗事百科连接失败原因:", e.reason)
            return None

    @staticmethod
    def _parse_stories(pageCode):
        """Parse page HTML into [author, text, src, votes] lists.

        Picture posts (captured src containing "img") are dropped, and
        ``<br/>`` tags in the joke text are turned into real newlines.
        """
        pageStories = []
        for item in re.findall(QSBK.STORY_PATTERN, pageCode):
            if re.search("img", item[2]):
                continue  # picture post — this is a text-only reader
            text = re.sub('<br/>', "\n", item[1])
            pageStories.append(
                [item[0].strip(), text.strip(), item[2].strip(), item[3].strip()])
        return pageStories

    def getPageItems(self, pageIndex):
        """Fetch and parse one page; return None when the download failed."""
        pageCode = self.getPage(pageIndex)
        if not pageCode:
            print("页面加载失败")
            return None
        return self._parse_stories(pageCode)

    def loadPage(self):
        """Top up the story buffer when fewer than two pages remain."""
        if self.enable and len(self.stories) < 2:
            pageStories = self.getPageItems(self.pageIndex)
            if pageStories:
                self.stories.append(pageStories)
                self.pageIndex += 1

    def getOneStory(self, pageStories, page):
        """Print one story per Enter key press; 'Q' stops the reader.

        Renamed from the original typo ``getOneStor4y``.
        """
        for story in pageStories:
            input1 = input()
            self.loadPage()  # keep the buffer warm while the user reads
            if input1 == "Q":
                self.enable = False
                return
            # NOTE(review): story[2] is the image src, yet the original
            # labels it "发布时间" (publish time) — kept as-is to preserve
            # the original output; verify against the page layout.
            print("第%d页\t发布人:%s\t发布时间:%s\t赞:%s\n%s"
                  % (page, story[0], story[2], story[3], story[1]))

    def start(self):
        """Run the interactive read loop until the user quits."""
        print("正在读取糗事百科,回车-继续,Q-退出 !")
        self.enable = True
        self.loadPage()
        nowPage = 0
        while self.enable:
            if len(self.stories) > 0:
                pageStories = self.stories[0]
                nowPage += 1
                del self.stories[0]
                self.getOneStory(pageStories, nowPage)


if __name__ == "__main__":
    # Guard the entry point so importing this module does not start
    # fetching pages and blocking on input() (the original ran unguarded).
    spider = QSBK()
    spider.start()
参考文档:崔庆才python2.7 Python爬虫实战一之爬取糗事百科段子