Python practice: a web crawler
It has been a long time since I last updated this blog. I've been teaching myself Python recently, so I wrote a crawler that searches Baidu for solutions to algorithm problems. It's my first crawler, so it's purely for practice.
Taking it one step at a time..
First, entering Baidu search: main.py builds the query URL for the target problem and kicks off the crawl.

#coding:utf-8
'''
Created on 2016-11-22
@author: liyinggang
'''
from link_crawler_baidu import link_crawler_baidu
import urllib

baseUrl = 'http://www.baidu.com/s'
page = 1  # which page of search results to fetch
ojname = 'hdu'
problemnum = '1006'
word = ojname + problemnum
# wd is the search keyword, pn the result offset (10 results per page);
# tn, ie and bsst are Baidu-specific query parameters
data = {'wd': word, 'pn': str(page - 1) + '0', 'tn': 'baidurt', 'ie': 'utf-8', 'bsst': '1'}
data = urllib.urlencode(data)
url = baseUrl + '?' + data
print url
link_crawler_baidu(url, 'lyg', ojname, problemnum)
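For illustration only, here is a small standalone sketch of the query string urllib.urlencode produces for the hdu1006 search above (the parameter order in the output may vary, since dicts are unordered):

#coding:utf-8
# Standalone check of the query string main.py builds; values are illustrative.
import urllib

params = {'wd': 'hdu1006', 'pn': '0', 'tn': 'baidurt', 'ie': 'utf-8', 'bsst': '1'}
print 'http://www.baidu.com/s?' + urllib.urlencode(params)
# prints something like:
# http://www.baidu.com/s?tn=baidurt&ie=utf-8&pn=0&wd=hdu1006&bsst=1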
Next I filter the search results down to cnblogs and CSDN links (the encoding issues were painful to sort out; in the end I found a crude workaround on Baidu).
#coding:utf-8
'''
Created on 2016-11-22
@author: liyinggang
'''
import re
import urlparse
import lxml.html
from download import download
from link_crawler_cnblogs import link_crawler_cnblogs
from link_crawler_csdn import link_crawler_csdn

def link_crawler_baidu(seed_url, user_agent=None, ojname='hdu', problemnum='1001'):
    html = download(seed_url, user_agent=user_agent)
    url1 = "http://www.cnblogs.com/"
    url2 = "http://blog.csdn.net/"
    regex1 = re.compile(url1)
    regex2 = re.compile(url2)
    cnt1 = 0
    cnt2 = 0
    # keep only the result links that point to cnblogs or CSDN
    links = [link for link in get_links(html)
             if re.match(regex1, link) or re.match(regex2, link)]
    for link in links:
        link = union(seed_url, link)
        html = download(link, user_agent=user_agent)
        html = unicode(html, "utf-8")  # crude workaround for the encoding problem
        tree = lxml.html.fromstring(html)
        title = tree.cssselect('title')[0].text_content()
        # the page title must mention the OJ name, the problem number and the site name, in that order
        title_regex1 = re.compile(u'%s([\s\S]*)%s([\s\S]*)博客园' % (ojname, problemnum), re.IGNORECASE)
        title_regex2 = re.compile(u'%s([\s\S]*)%s([\s\S]*)CSDN.NET' % (ojname, problemnum), re.IGNORECASE)
        if re.search(title_regex1, title):
            filename = ojname + problemnum + '_' + str(cnt1)
            if link_crawler_cnblogs(link, user_agent=user_agent, filename=filename):
                cnt1 += 1
        if re.search(title_regex2, title):
            filename = ojname + problemnum + '_' + str(cnt2)
            if link_crawler_csdn(link, user_agent=user_agent, filename=filename):
                cnt2 += 1

def union(seed_url, link):
    """Join link with seed_url into an absolute URL."""
    link, _ = urlparse.urldefrag(link)  # split off and discard the fragment
    return urlparse.urljoin(seed_url, link)

def get_links(html):
    """Return every href value found in html."""
    webpage_regex = re.compile('href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)
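To make the title filter easier to follow, here is a small standalone check with a made-up page title; the regex only requires that the OJ name, the problem number and the site name appear in that order:

#coding:utf-8
# Standalone check of the title filter; the title string below is only an example.
import re

title = u'HDU 1006 Tick and Tick - liyinggang - 博客园'
title_regex = re.compile(u'%s([\s\S]*)%s([\s\S]*)博客园' % ('hdu', '1006'), re.IGNORECASE)
print bool(re.search(title_regex, title))                   # True
print bool(re.search(title_regex, u'unrelated page title')) # False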
Grabbing the code from a cnblogs post:
#coding:utf-8
'''
Created on 2016-11-22
@author: liyinggang
'''
from download import download
import lxml.html
import re

def link_crawler_cnblogs(url, user_agent=None, filename='filename'):
    html = download(url, user_agent=user_agent)
    html = unicode(html, "utf-8")
    tree = lxml.html.fromstring(html)
    texts = tree.cssselect('pre')
    # if a <pre> block really is code it starts with #include or import and contains main(
    regex = re.compile(r'^(#include|import)([\s\S]*)main\s*\(')
    flag = False
    for text in texts:
        text = text.text_content()
        if re.search(regex, text):
            flag = True
            f = open("D:\\download\\cnblogs\\%s.txt" % filename, "w")
            f.write(text.encode('utf-8'))  # text is unicode, encode before writing
            f.close()
            break
    return flag
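The "looks like a solution" heuristic can be tried on its own; a tiny sketch with made-up snippets:

#coding:utf-8
# Standalone check of the code-detection heuristic used above; samples are made up.
import re

regex = re.compile(r'^(#include|import)([\s\S]*)main\s*\(')
code_sample = '#include <cstdio>\nint main() { return 0; }'
prose_sample = 'This post explains the recurrence in words only.'
print bool(re.search(regex, code_sample))   # True
print bool(re.search(regex, prose_sample))  # False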
Grabbing the code from a CSDN post:
#coding:utf-8
'''
Created on 2016-11-22
@author: liyinggang
'''
from download import download
import lxml.html
import re

def link_crawler_csdn(url, user_agent=None, filename='filename'):
    html = download(url, user_agent=user_agent)
    html = unicode(html, "utf-8")
    tree = lxml.html.fromstring(html)
    texts = tree.cssselect('pre')
    texts.extend(tree.cssselect('p > textarea.cpp'))  # some CSDN pages put code in a textarea instead of a pre
    flag = False
    # if a block really is code it starts with #include or import and contains main(
    regex = re.compile(r'^(#include|import)([\s\S]*)main\s*\(')
    for text in texts:
        text = text.text_content()
        if re.search(regex, text):
            flag = True
            f = open("D:\\download\\csdn\\%s.txt" % filename, "w")
            f.write(text.encode('utf-8'))  # text is unicode, encode before writing
            f.close()
            break
    return flag
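For illustration, a standalone sketch of how the two CSS selectors pick up code blocks; the HTML fragment below is made up and only meant to show lxml's cssselect in action:

#coding:utf-8
# Standalone illustration of the selectors used above, on a made-up HTML fragment.
import lxml.html

html = u'<html><body><p><textarea class="cpp">#include &lt;cstdio&gt;\nint main(){ return 0; }</textarea></p></body></html>'
tree = lxml.html.fromstring(html)
nodes = tree.cssselect('pre') + tree.cssselect('p > textarea.cpp')
print len(nodes)               # 1
print nodes[0].text_content()  # the code inside the textarea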
The code for downloading a page:
#coding:utf-8
'''
Created on 2016-11-20
@author: admin
'''
import urllib2
import urlparse

def download(url, user_agent='lyg', proxy=None, retest=5):
    """Download the page source.

    user_agent sets the User-agent header, proxy is an optional proxy to use,
    retest is how many retries are left.
    """
    print 'Downloading:', url
    # set a user agent: the default Python-urllib/2.7 is blocked by some sites
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
        if retest > 0:
            # retry only on 5xx server errors; a 404 and the like is not worth retrying
            if hasattr(e, 'code') and 500 <= e.code <= 600:
                html = download(url, user_agent=user_agent, proxy=proxy, retest=retest - 1)
    return html
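A quick usage sketch of download(); the URL here is only an example:

#coding:utf-8
# Minimal usage sketch for download(); the URL is illustrative.
from download import download

html = download('http://www.cnblogs.com/', user_agent='lyg')
if html:
    print 'fetched %d bytes' % len(html)
else:
    print 'download failed'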
The results: