原生爬虫实例
# coding=utf-8
"""Regex-based spider that ranks the streams listed on one category page.

The page at ``Spider.url`` is downloaded, every ``video-info`` block is
extracted with regular expressions, and the streamer names are printed in
descending order of their audience figure.
"""
from urllib import request  # noqa: F401  (kept from the original script)
import re


class Spider:
    """Fetch a category page, extract (name, audience) pairs and rank them.

    The class attributes below are the configuration: the page URL and the
    regular expressions used to cut the HTML apart.
    """

    url = 'https://www.panda.tv/cate/lol'
    # Non-greedy so each match stops at the first closing </div>.
    root_pattern = r'<div class="video-info">([\s\S]*?)</div>'
    name_pattern = r'</i>([\s\S]*?)</span>'
    # The attribute name keeps its original spelling so existing references
    # to ``Spider.number_patter`` continue to work.
    number_patter = r'<span class="video-number">([\s\S]*?)</span>'

    def __fetch_content(self):
        """Download the page at ``url`` and return it decoded as UTF-8 text."""
        # Imported here because this is the only method that needs the
        # third-party package; the parsing helpers stay usable without it.
        import requests

        # Without a timeout requests waits indefinitely on a stalled server.
        response = requests.get(self.url, timeout=10)
        return response.content.decode('utf-8')

    def __analysis(self, htmls):
        """Split the page into blocks and collect raw name/number matches.

        Returns a list of dicts ``{'name': [...], 'number': [...]}`` where
        each value is the list of regex captures found inside one block.
        Either list may be empty when the block lacks that element.
        """
        return [
            {
                'name': re.findall(self.name_pattern, block),
                'number': re.findall(self.number_patter, block),
            }
            for block in re.findall(self.root_pattern, htmls)
        ]

    def __refine(self, list_renqi):
        """Reduce each raw entry to its first match, stripped of whitespace.

        Entries with no name or no number are skipped; indexing them with
        ``[0]`` would raise ``IndexError``.
        """
        return [
            {
                'name': entry['name'][0].strip(),
                'number': entry['number'][0].strip(),
            }
            for entry in list_renqi
            if entry['name'] and entry['number']
        ]

    def __sort(self, list_renqi):
        """Return the entries ordered from largest to smallest audience."""
        return sorted(list_renqi, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, dic_renqi):
        """Convert an entry's ``'number'`` text into a float sort key.

        The first integer or decimal number in the text is used, and it is
        multiplied by 10000 when the text contains the unit '万'. Text
        without any digits yields 0.0.
        """
        text = dic_renqi['number']
        # Accept a fractional part so that "1.5万" is read as 1.5, not 1.
        found = re.search(r'\d+(?:\.\d+)?', text)
        if found is None:
            return 0.0
        number = float(found.group())
        if '万' in text:
            number *= 10000
        return number

    def __show(self, list_renqi):
        """Print one ``rank N:name number`` line per entry, ranks from 1."""
        for rank, entry in enumerate(list_renqi, start=1):
            print('rank ' + str(rank)
                  + ':' + entry['name']
                  + ' ' + entry['number'])

    def go(self):
        """Run the pipeline: fetch, analyse, refine, sort, then display."""
        htmls = self.__fetch_content()
        list_renqi = self.__analysis(htmls)
        list_renqi = self.__refine(list_renqi)
        list_renqi = self.__sort(list_renqi)
        self.__show(list_renqi)


# Guarded so that importing this module does not trigger a network request.
if __name__ == '__main__':
    spider = Spider()
    spider.go()