Python Practice: Writing a Crawler

It's been a long time since I updated this blog. I've been teaching myself Python lately, so I wrote a crawler that searches Baidu for solutions to algorithm problems. It's my first crawler, purely for practice.

Taking it one step at a time..

#coding:utf-8
'''
Created on 2016-11-22

@author: liyinggang
'''
from link_crawler_baidu import link_crawler_baidu
import urllib
baseUrl = 'http://www.baidu.com/s'
page = 1  # which page of search results to fetch
ojname = 'hdu'
problemnum = '1006'
word = ojname + problemnum  # search keyword, e.g. 'hdu1006'
# 'pn' is Baidu's result offset: (page-1)*10, since results come 10 per page
data = {'wd':word,'pn':str(page-1)+'0','tn':'baidurt','ie':'utf-8','bsst':'1'}
data = urllib.urlencode(data)
url = baseUrl+'?'+data
print url
link_crawler_baidu(url,'lyg',ojname,problemnum)

The code above is main.py, which builds the Baidu search request and kicks off the crawl.
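For reference, here is a minimal sketch of what the assembled search URL looks like for a different result page. This is illustrative only: the page number 2 is made up, and urlencode walks a dict, so the parameter order can vary between runs.

#coding:utf-8
# Illustrative sketch, not the author's exact output: builds the same kind of
# Baidu search URL, but for the second page of results.
import urllib

page = 2
params = {'wd': 'hdu1006', 'pn': str((page - 1) * 10),
          'tn': 'baidurt', 'ie': 'utf-8', 'bsst': '1'}
print 'http://www.baidu.com/s?' + urllib.urlencode(params)
# e.g. http://www.baidu.com/s?wd=hdu1006&pn=10&tn=baidurt&ie=utf-8&bsst=1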

Next, I filter out the cnblogs and CSDN links from the search results (the encoding issues were a real pain; in the end I settled on a clumsy workaround I found on Baidu).

#coding:utf-8
'''
Created on 2016-11-22

@author: liyinggang
'''
import re,lxml.html,urlparse
from download import download
import sys
from link_crawler_cnblogs import link_crawler_cnblogs
from link_crawler_csdn import link_crawler_csdn
reload(sys)
def link_crawler_baidu(seed_url,user_agent=None,ojname='hdu',problemnum='1001'):
    html = download(seed_url,user_agent=user_agent)
    url1 = "http://www.cnblogs.com/"
    url2 = "http://blog.csdn.net/"
    regex1 = re.compile(url1)
    regex2 = re.compile(url2)
    cnt1 = 0
    cnt2 = 0
    links = [link for link in get_links(html)
             if re.match(regex1, link) or re.match(regex2, link)]
    for link in links:
        link = union(seed_url,link)
        html = download(link,user_agent=user_agent)
        html = unicode(html, "utf-8")  # had to fall back on this clumsy decode
        tree = lxml.html.fromstring(html)
        text = tree.cssselect('title')[0].text_content()
        # the page title must contain the OJ name, the problem number and the site name
        regex1 = re.compile(u'%s([\s\S]*)%s([\s\S]*)博客园' % (ojname, problemnum), re.IGNORECASE)
        regex2 = re.compile(u'%s([\s\S]*)%s([\s\S]*)CSDN.NET' % (ojname, problemnum), re.IGNORECASE)
        if(re.search(regex1,text)):
            filename = ojname+problemnum+'_'+str(cnt1) 
            if(link_crawler_cnblogs(link,user_agent=user_agent,filename=filename)):
                cnt1+=1
        if(re.search(regex2,text)): 
            filename = ojname+problemnum+'_'+str(cnt2) 
            if(link_crawler_csdn(link,user_agent=user_agent,filename=filename)):
                cnt2+=1
        
def union(seed_url,link):
    """
          将seed_url 与 link 拼接
    """
    link, _ = urlparse.urldefrag(link)  #将link分解成去掉fragment的新url和去掉的fragment的二元组
    return urlparse.urljoin(seed_url, link)

def get_links(html):
    """
          获取html中的外链链接
    """
    webpage_regex = re.compile('href=["\'](.*?)["\']', re.IGNORECASE) #忽略大小写
    # return all links list
    return webpage_regex.findall(html)
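To make the two helpers concrete, here is a quick illustrative check. The sample HTML and URLs are invented, and the import assumes the module file is named link_crawler_baidu.py, as the import in main.py suggests.

#coding:utf-8
# Illustrative check of get_links() and union(); the sample HTML and URLs are made up.
from link_crawler_baidu import get_links, union

html = '<a href="/problem/1006">relative</a> <A HREF="http://www.cnblogs.com/foo">blog</A>'
print get_links(html)
# ['/problem/1006', 'http://www.cnblogs.com/foo']
print union('http://acm.hdu.edu.cn/', '/problem/1006#top')
# 'http://acm.hdu.edu.cn/problem/1006'  (the #top fragment is stripped before joining)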

Fetching the solution code from a cnblogs page:

#coding:utf-8
'''
Created on 2016-11-22

@author: liyinggang
'''
from download import download
import lxml.html
import re
def link_crawler_cnblogs(url,user_agent=None,filename='filename'):
    html = download(url,user_agent=user_agent)
    html = unicode(html,"utf-8")
    tree = lxml.html.fromstring(html)
    texts = tree.cssselect('pre')
    regex = re.compile(r'^(#include|import)([\s\S]*)main\s*\(')  # real solution code is assumed to start with #include/import and call a main() function
    flag = False
    for text in texts:
        text = text.text_content()
        if(re.search(regex, text)):
            flag = True
            f = open("D:\\download\\cnblogs\\%s.txt"%filename,"w")
            f.write(text)
            f.close()
            break
    return flag
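The regex that decides whether a <pre> block really contains code can be sanity-checked on its own. A small illustrative test follows; the snippets are invented.

#coding:utf-8
# Illustrative test of the code-detection regex; the snippets below are made up.
import re

regex = re.compile(r'^(#include|import)([\s\S]*)main\s*\(')
cpp = '#include <cstdio>\nint main() { return 0; }'
java = 'import java.util.*;\npublic class Main { public static void main(String[] a) {} }'
prose = 'This post only explains the idea, no code.'
print bool(regex.search(cpp))    # True  - a C/C++ solution
print bool(regex.search(java))   # True  - a Java solution
print bool(regex.search(prose))  # False - plain prose is skipped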
    
    

Fetching the solution code from a CSDN page:

#coding:utf-8
'''
Created on 2016-11-22

@author: liyinggang
'''
from download import download
import lxml.html
import re
def link_crawler_csdn(url,user_agent=None,filename='filename'):
    html = download(url,user_agent=user_agent)
    html = unicode(html,"utf-8")
    tree = lxml.html.fromstring(html)
    texts = tree.cssselect('pre')
    texts.extend(tree.cssselect('p > textarea.cpp'))  # some CSDN posts wrap code in <textarea class="cpp"> instead of <pre>
    flag = False
    regex = re.compile(r'^(#include|import)([\s\S]*)main\s*\(')  # real solution code is assumed to start with #include/import and call a main() function
    for text in texts:
        text = text.text_content()
        if(re.search(regex, text)):
            flag = True
            f = open("D:\\download\\csdn\\%s.txt"%filename,"w")
            f.write(text)
            f.close()
            break
    return flag
    

The code that downloads a page:

#coding:utf-8
'''
Created on 2016-11-20

@author: admin
'''
import urllib2
import urlparse
def download(url,user_agent='lyg',proxy=None,retest=5):
    """
    Download the page source for url.
    user_agent sets the User-Agent header, proxy optionally routes the request
    through a proxy, and retest is how many times to retry on server errors.
    """
    print 'Downloading:',url
    headers = {'User-agent':user_agent}  # set the User-Agent; the default Python-urllib/2.7 can get blocked by some sites
    request = urllib2.Request(url,headers=headers)
    
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:    
        html = opener.open(request).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
        if retest > 0:
            if hasattr(e, 'code') and 500 <= e.code <= 600:
                # only retry on 5xx server errors; a 404 is not worth retrying
                html = download(url, user_agent=user_agent, proxy=proxy, retest=retest-1)
    return html
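A minimal usage sketch of download() follows; the proxy address below is just a placeholder, not something from the original post.

#coding:utf-8
# Illustrative usage of download(); the proxy address is a placeholder.
from download import download

html = download('http://www.baidu.com', user_agent='lyg')
if html is not None:
    print 'fetched', len(html), 'bytes'

# route the request through an HTTP proxy instead of connecting directly
html = download('http://www.baidu.com', user_agent='lyg', proxy='http://127.0.0.1:8080')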

The result:
