抓取新闻的爬虫

此爬虫基于 http://www.cnblogs.com/rails3/archive/2012/08/14/2636780.html 修改而成。因自己也在学习过程中，略微加入了一些注释，并更改了部分代码。原博主过滤非文本内容时用的正则表达式很不错，特此记录一下。

#coding=utf-8
import sys
import urllib2
import re
import os

def extract_url(info):
    """Return every QQ news article URL that appears in ``info``.

    An article URL has the form
    ``http://news.qq.com/a/<8 digits>/<6 digits>.htm``.

    Args:
        info: Text (typically the HTML of the news front page) to scan.

    Returns:
        A list of the matched URL strings, in order of appearance.
        Duplicates are kept; the list is empty when nothing matches.
    """
    # The dots are escaped so they match a literal "." only; unescaped, "."
    # would accept any character (e.g. "newsXqq.com" or "012345_htm").
    rege = r"http://news\.qq\.com/a/\d{8}/\d{6}\.htm"
    return re.findall(rege, info)

def extract_sub_web_title(sub_web):
    """Return every ``<title>...</title>`` element found in ``sub_web``.

    Args:
        sub_web: HTML text of an article page.

    Returns:
        A list of matched strings, each including the surrounding
        ``<title>`` and ``</title>`` tags. The list is empty when no
        non-empty, single-line title element is present.
    """
    # Non-greedy ".+?" stops at the first closing tag. The greedy form would
    # run to the LAST "</title>" on the line and swallow everything between.
    re_key = r"<title>.+?</title>"
    return re.findall(re_key, sub_web)

def extract_sub_web_content(sub_web):
    """Return the article-body markup found in ``sub_web``.

    The match starts at the ``<div id="Cnt-Main-Article-QQ"`` opening tag
    and, because ``.*`` is greedy and does not cross newlines, extends to
    the last ``</div>`` on that same line.

    Args:
        sub_web: HTML text of an article page.

    Returns:
        A list of matched markup strings; empty when the container div is
        not present.
    """
    article_div = re.compile('<div id="Cnt-Main-Article-QQ".*</div>')
    return article_div.findall(sub_web)

# Patterns used by filter_tags, compiled once at import time instead of on
# every call. Block-level patterns use a non-greedy ".*?" with re.S so that
# bodies containing "<", ">" or newlines are still matched as a whole.
_RE_CDATA = re.compile(r'//<!\[CDATA\[.*?//\]\]>', re.I | re.S)
_RE_SCRIPT = re.compile(r'<\s*script[^>]*>.*?<\s*/\s*script\s*>', re.I | re.S)
_RE_STYLE = re.compile(r'<\s*style[^>]*>.*?<\s*/\s*style\s*>', re.I | re.S)
_RE_COMMENT = re.compile(r'<!--.*?-->', re.S)
# "\b" keeps "<pre>", "<param>" etc. from being treated as paragraphs, while
# "[^>]*" accepts attributes and the self-closing "<p/>" form.
_RE_P = re.compile(r'<p\b[^>]*>', re.I)
# Any remaining opening or closing tag: "<", optional "/", a tag name, then
# everything up to the next ">".
_RE_TAG = re.compile(r'</?\w+[^>]*>')
# A run of two or more line endings (LF or CRLF); group 1 is the first one.
_RE_BLANK_LINES = re.compile(r'(\r?\n)(?:\r?\n)+')


def filter_tags(htmlstr):
    """Strip markup from ``htmlstr`` and return the remaining text.

    Processing order:
      1. ``//<![CDATA[ ... //]]>`` sections are removed.
      2. ``<script>`` and ``<style>`` elements are removed with their bodies.
      3. HTML comments are removed (before tag stripping, so a comment that
         contains tags or ``>`` is dropped as a whole).
      4. Every opening ``<p ...>`` tag, in any letter case and with or
         without attributes, is replaced by ``"\\r\\n"``.
      5. All remaining tags are removed; their text content is kept.
      6. Runs of consecutive line endings are collapsed to the first line
         ending of the run.

    Args:
        htmlstr: HTML text to clean.

    Returns:
        The text with markup removed.
    """
    s = _RE_CDATA.sub('', htmlstr)
    s = _RE_SCRIPT.sub('', s)
    s = _RE_STYLE.sub('', s)
    s = _RE_COMMENT.sub('', s)
    s = _RE_P.sub('\r\n', s)
    s = _RE_TAG.sub('', s)
    # Collapse blank lines; this also covers the "\r\n" sequences inserted
    # for paragraphs above, which a plain "\n+" pattern would never merge.
    s = _RE_BLANK_LINES.sub(r'\1', s)
    return s

# Download the news front page.
content = urllib2.urlopen('http://news.qq.com').read()
c = 1  # running number of the articles actually written to the file
# De-duplicate with a set: the front page links the same article many times.
get_url = list(set(extract_url(content)))
# "with" guarantees news.txt is closed even if a download or write raises.
with open('news.txt', 'w') as f:
    # Slice rather than index a fixed range: the first URL is no longer
    # skipped, and a page with fewer than 50 links cannot raise IndexError.
    for url in get_url[:50]:
        try:
            sub_web = urllib2.urlopen(url).read()
        except urllib2.URLError:
            # One unreachable article should not abort the whole crawl.
            continue
        sub_title = extract_sub_web_title(sub_web)
        sub_content = extract_sub_web_content(sub_web)
        # Only pages where both a title and an article body were found
        # are written out.
        if sub_title and sub_content:
            print('The %d news' % c)
            # Write the article number on its own line.
            f.write(str(c) + "\r\n")
            c += 1
            finnal = filter_tags(sub_title[0] + "\r\n" + sub_content[0])
            f.write(finnal)
            f.write("\r\n")

　注：

　　1.这种方法对正则表达式要求较高，虽能起到效果，但总感觉不那么智能，待学习过BeautifulSoup后再做抓取练习。

posted @ 2014-08-12 17:22 hi_net 阅读(267) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

hi_net

抓取新闻的爬虫

公告