[Development Reference/Python] A Python Web Crawler Framework
The source code in this article comes from https://github.com/Holit/Web-Crawler-Framwork
一、The crawler framework code
import urllib.request
from bs4 import BeautifulSoup
import re
import time
import _thread
import os

# Input your URL here ####################################
BaseURL = '127.0.0.1/'
#########################################################
TaxURL = ".html"

# Input your data-saving path ############################
SavePath = ""
#########################################################

# Input your thread count ################################
thread_count = 1
#########################################################

# Set how many pages each spider thread will fetch #######
thread_spy_count_each = 5
#########################################################

def mkdir(path):
    # Create the directory if it does not exist
    path = path.strip()
    path = path.rstrip("\\")
    if not os.path.exists(path):
        os.makedirs(path)
        return True
    return False

def download(start, count):
    # Spider main loop
    for i in range(start, start + count):
        try:
            # DEBUG ##########################################
            # print("[INFO] Connecting to page #" + str(i) + "...")
            ##################################################

            # Used to record elapsed time
            time_start = time.time()

            # Construct the URL. This only works for patterns like
            #   https://127.0.0.1/articles/00001.html
            #   https://127.0.0.1/articles/00002.html
            #   https://127.0.0.1/articles/00003.html
            TargetURL = BaseURL + str(i) + TaxURL

            # Create the Request object
            req = urllib.request.Request(TargetURL)
            # Create headers using a generic header; you can capture real ones
            # with Fiddler(R) or the Chrome(R) developer tools
            req.add_header('Host', '')            # Your Host, usually the base URL
            req.add_header('Referer', TargetURL)  # Your Referer, usually the target URL
            req.add_header('User-Agent', 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19')
            # Finished creating the Request object

            # Fetch the page
            res = urllib.request.urlopen(req)
            # Parse the HTML
            soup = BeautifulSoup(res, "html.parser")
            ##########################################################
            # Add your own processing here, e.g.:
            # operate_data(data)
            # soup makes it easy to work with HTML tags via soup.find(...)
            ##########################################################

            # Change the saving path here
            savetarget = SavePath

            # Try to save the file
            try:
                # Create the directory if it does not exist
                mkdir(SavePath)
                f = open(savetarget, 'w')
                # Edit this: write the extracted data
                f.write("data")
                f.close()
            except Exception as e:
                time_end = time.time()
                print(" [Failed] - #" + str(i) + " Error : " + str(e))
            else:
                time_end = time.time()
                print(" [Succeed] - #" + str(i) + " has been saved. (" + str(time_end - time_start) + "s)")

        except Exception as e:
            print(" [Global Failure] - #" + str(i) + " Error : " + str(e))

if __name__ == '__main__':
    try:
        # Multithreading
        print("Spidering website...")
        print("Current configuration :")
        print("--Will create " + str(thread_count) + " threads to access.")
        print("--Will save to " + SavePath)
        print("-------------START---------------------------")
        # Press any key to continue; this won't work under Linux
        os.system('pause')
        try:
            for i in range(0, thread_count):
                print("[Thread #" + str(i) + "] started successfully")
                _thread.start_new_thread(download, (thread_spy_count_each * i, thread_spy_count_each))
        except Exception as e:
            print("[Threading@" + str(i) + "] Error:" + str(e))
    except Exception as e:
        print("[Global Failure] Error:" + str(e))
    # Keep the main thread alive; _thread workers die when it exits
    while 1:
        pass
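The launcher above splits the work so that thread i handles pages [i * thread_spy_count_each, (i + 1) * thread_spy_count_each), uses the low-level _thread module, and keeps the process alive with a busy "while 1: pass" loop. A minimal alternative sketch using the standard threading module (not part of the original repository; run_all is a hypothetical helper, while thread_count, thread_spy_count_each, and download come from the framework above) replaces the busy wait with join():

import threading

def run_all():
    threads = []
    for i in range(thread_count):
        # Thread i handles pages [i * thread_spy_count_each,
        #                         (i + 1) * thread_spy_count_each)
        t = threading.Thread(target=download,
                             args=(thread_spy_count_each * i, thread_spy_count_each))
        t.start()
        threads.append(t)
    # Wait for every worker to finish instead of spinning in "while 1: pass"
    for t in threads:
        t.join()

if __name__ == '__main__':
    run_all()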
二、Example implementations of the framework's processing hooks
1. Text extraction
Text extraction here means pulling the content out of the page's <div class='content'>...</div> block; that structure is the prerequisite. If your pages are structured differently, adjust accordingly.
(1) Approach
After BeautifulSoup parses the HTML you get a decoded document, for example:
<div class="content" style="text-align: left">
    Sample content
</div>
Now select that block using soup.find:
(2) Basic code
passages_div = soup.find('div')
passages_set = passages_div.findAll(attrs={"class": "content"})
for passages in passages_set:
    article = str(passages)
    # Text clean-up
    article = article.replace('<div class="content" style="text-align: left">', '')
    article = article.replace(u'\ue505', u' ')  # Replace private-use Unicode spaces; otherwise gbk cannot encode them
    article = article.replace(u'\ue4c6', u' ')
    article = article.replace(u'\xa0', u' ')
    article = article.replace('<br/>', '\n')
    article = article.replace('</div>', '')
    savetarget = 'D:\\test\\test.txt'
    try:
        mkdir('D:\\test\\')
        f = open(savetarget, 'w')
        f.write(article)
        f.close()
    except Exception as e:
        print(" [Failed] - " + str(e))
    else:
        time_end = time.time()
        print(" [Succeed] - saved to path.")
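To wire this into the framework, one option is to wrap the extraction in a function and call it at the "add your functions here" placeholder inside download(). A minimal sketch, assuming the same <div class="content"> layout; operate_data is only hinted at in the framework's comments, and get_text() is used here in place of the manual replace() chain:

# Sketch: wrap the extraction above as the framework's processing hook.
def operate_data(soup, save_path):
    for passages in soup.findAll('div', attrs={"class": "content"}):
        # get_text() drops the tags; <br/> becomes the newline separator
        article = passages.get_text(separator='\n')
        article = article.replace(u'\xa0', u' ')  # keep the text gbk-safe, as above
        with open(save_path, 'w', encoding='utf-8') as f:
            f.write(article)

# Inside download(), after soup is built, call:
#     operate_data(soup, savetarget)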
2. Image retrieval
Image retrieval generally means downloading the resource referenced by the src attribute of an <img> tag on the page, e.g. <img src="127.0.0.1/png.png">.
Several approaches work here, for example urllib.request.urlretrieve; a brief sketch follows.
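A minimal sketch of the idea, assuming `soup` and `SavePath` come from the framework above and that the src values are absolute URLs (relative URLs would first need urllib.parse.urljoin):

import os
import urllib.request

# Sketch: download every <img> on the parsed page with urlretrieve.
for idx, img in enumerate(soup.find_all('img')):
    src = img.get('src')
    if not src:
        continue
    # Name the file after its index, keeping the original extension
    filename = os.path.join(SavePath, str(idx) + os.path.splitext(src)[1])
    try:
        urllib.request.urlretrieve(src, filename)
        print(" [Succeed] - image saved to " + filename)
    except Exception as e:
        print(" [Failed] - " + src + " Error : " + str(e))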
The techniques, ideas, and tools involved in any article published or reposted by the author are intended solely for security-oriented learning and exchange, in strict compliance with the Cybersecurity Law of the People's Republic of China, the Data Security Law of the People's Republic of China, and other network security laws and regulations.
No one may use these techniques for illegal or for-profit purposes; the author accepts no responsibility for any unauthorized use.
This article is released under the CC BY-NC-SA 3.0 license. You may copy and redistribute this work in any medium or format, and may remix, transform, or build upon it.
You must give appropriate credit, provide a link to this article, and indicate whether changes were made to the original. You may do so in any reasonable manner, but not in any way that suggests the author endorses you or your use.
This article may not be used for commercial purposes. If you remix, transform, or build upon this work, you must distribute your contributions under the same license (CC BY-NC-SA 3.0).
If you have any questions, feel free to reach out by email.