My First Taste of Engineering Code
Under the influence of my senior 天天, I fell into the Python rabbit hole. After slacking off for more than half a day plus a whole evening, I finally got a crawler working. Feels like NOIP is doomed qaq
I learned it from a MOOC. Originally I wanted to crawl the Baidu Baike entries about OI, but being naive I made the target pattern too simple, and the crawl went off course right from the start...
The output page turned out ugly too; I'll build a nicer one next time OvO
I picked up a lot of experience along the way. The most useful tip: in the main method, write the except clause for failed crawls as
except Exception as f:
    print 'craw failed: ', f
and you can then see roughly where a failure happened.
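If the rough direction isn't enough, the standard-library traceback module prints the full stack trace down to the exact line that raised. A minimal sketch (not part of the MOOC template; risky_call is just a hypothetical stand-in for a crawl step such as downloading a page):

import traceback

try:
    risky_call()  # hypothetical stand-in for something like downloader.download(new_url)
except Exception as f:
    print 'craw failed: ', f
    traceback.print_exc()  # full stack trace: file, line number, and the offending statement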
Python won't always complain when you mistype something, and I still keep making the classic beginner mistake: leaving out self or mistyping it.
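For example, with a toy class (made up purely for illustration, not part of the crawler), the typo sails through until the method actually runs, and then it surfaces as exactly the kind of message the except clause above prints:

class Spider(object):
    def __init__(self):
        self.urls = []

    def craw(self):
        self.url.append('http://baike.baidu.com')  # typo: should be self.urls

try:
    Spider().craw()
except Exception as f:
    print 'craw failed: ', f  # craw failed:  'Spider' object has no attribute 'url'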
Here is the crawler code (copied almost entirely from the MOOC template 一,一):
Main

import url_manager, html_downloader, html_parser, html_outputer

class SpiderMain(object):
    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print 'craw %d : %s' % (count, new_url)
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
            except Exception as f:  # printing the exception shows roughly where a crawl failed
                print 'craw failed: ', f

            if count == 100:  # stop after 100 pages
                break

            count = count + 1

        self.outputer.output_html()


if __name__ == "__main__":
    root_url = "http://baike.baidu.com/item/oi/74020"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)


Outputer (html_outputer)

class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open('output.html', 'w')

        fout.write("<html>")
        fout.write("<body>")
        fout.write("<meta charset='utf-8'>")
        fout.write("<table>")

        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'].encode('utf-8'))
            fout.write("<td>%s</td>" % data['summary'].encode('utf-8'))  # old error: didn't have encode
            fout.write("</tr>")

        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")

        fout.close()


URL manager (url_manager)

class UrlManager(object):

    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already crawled

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        # move one URL from the pending set to the crawled set
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url


Parser (html_parser)

from bs4 import BeautifulSoup
import re
import urlparse

class HtmlParser(object):

    def _get_new_urls(self, page_url, soup):  # old error: didn't have self
        new_urls = set()

        links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))  # old error: re.complie
        for link in links:
            new_url = link['href']
            new_full_url = urlparse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)

        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}

        # url
        res_data['url'] = page_url

        # <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_data['title'] = title_node.get_text()

        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()

        return res_data

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return

        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)  # old error: new_urls =
        return new_urls, new_data


Downloader (html_downloader)

import urllib2

class HtmlDownloader(object):

    def download(self, url):
        if url is None:
            return None

        response = urllib2.urlopen(url)

        if response.getcode() != 200:
            return None

        return response.read()
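About the crawl drifting off target: the root URL already uses the newer /item/... style, while the regex in _get_new_urls only matches the old /view/NNNN.htm links. A rough sketch of a wider link pattern (the /item/ part is only a guess from the root URL, not verified against the real page source, so check before trusting it):

import re
import urlparse

def get_new_urls(page_url, soup):
    # soup is a BeautifulSoup object, as in HtmlParser above;
    # follow both the old /view/NNNN.htm links and the newer /item/... links
    new_urls = set()
    for link in soup.find_all('a', href=re.compile(r"/(view/\d+\.htm|item/)")):
        new_urls.add(urlparse.urljoin(page_url, link['href']))
    return new_urls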