下载新浪博客文章,保存成文本文件(python)

今天用Python写了一个下载韩寒新浪博客文章的下载器,嗯,基本功能如下:

1、从新浪博客上批量下载文章,并按文章标题创建文件

2、对下载的文章进行格式化。

已知Bug:长篇文章格式会错乱

 1 #!/usr/bin/python
 2 #-*- coding:utf-8 -*-
 3 
 4 import urllib
 5 import os
 6 import re
 7 
 8 def article_format(usock,basedir):    
 9     title_flag=True
10     context_start_flag=True
11     context_end_flag=True
12     for line in usock:
13         if title_flag:
14             title=re.findall(r'(<title>.+?<)',line)
15             if title:
16                 title=title[0][7:-1]
17                 filename=basedir+title
18                 print filename
19                 try:
20                     fobj=open(filename,'w+')
21                     fobj.write(title+'\n')
22                     title_flag=False
23                 except IOError,e:
24                     print "Open %s error:%s"%(filename,e)
25             else:
26                 #print "Title has not found,drop it"
27                 pass
28         elif context_start_flag:
29             results1=re.findall(r'(<.+?正文开始.+?>)',line)
30             if results1:
31                 context_start_flag=False
32         elif context_end_flag:
33             results2=re.findall(r'(<.+?正文结束.+?)',line)
34             if results2:
35                 context_end_flag=False
36                 fobj.write('\nEND')
37                 fobj.close()
38                 break
39             else:    
40                 if 'div' in line or 'span' in line or  '<p>' in line:
41                     pass
42                 else:    
43                     line=re.sub('&#65292;',',',line)
44                     line=re.sub('&#65306;',':',line)
45                     line=re.sub('&#65281;','!',line)
46                     line=re.sub('&#65288;','(',line)
47                     line=re.sub('&#65289;',')',line)
48                     line=re.sub('&#8943;','...',line)
49                     line=re.sub('&#65311;','?',line)
50                     line=re.sub('&#65307;',';',line)
51                     line=re.sub(r'<wbr>','',line)
52                     line=re.sub(r'&nbsp;','',line)
53                     line=re.sub(r'<br\s+?/>','',line)
54                     fobj.write(line)
55         else:
56             pass
57 
if __name__ == '__main__':
    # Script entry point: read the article list page, then download and
    # format every article it links to.
    basedir = '/home/tmyyss/article/'
    if not os.path.exists(basedir):
        os.makedirs(basedir)

    usock = urllib.urlopen("http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html")
    try:
        context = usock.read()
    finally:
        # Release the connection as soon as the index page is read.
        usock.close()

    # Each hit runs from "<a title..." up to the end of the article URL;
    # the URL itself is pulled out of the hit in a second step.
    raw_url_list = re.findall(r'(<a\s+title.+?href="http.+?html)', context)
    for url in raw_url_list:
        url = re.findall('(http.+?html)', url)[0]
        article_usock = urllib.urlopen(url)
        try:
            article_format(article_usock, basedir)
        finally:
            # One connection per article; close it before the next one.
            article_usock.close()
View Code

 

posted @ 2014-12-17 17:40  tmyyss  阅读(756)  评论(0)  编辑  收藏  举报