一个抓取网站转而发布的例子

#coding=utf-8
#需要BeautifulSoup(美丽的汤)支持:http://crummy.com/software/BeautifulSoup
import urllib
import urllib2
from xml.sax.saxutils import unescape
from BeautifulSoup import BeautifulSoup # For processing HTML
def formalize(text):
    """Normalize a block of text into blank-line-separated paragraphs.

    The input is split on newline characters, every line is stripped of
    leading and trailing whitespace, and lines that end up empty are
    dropped.  Each surviving line is emitted followed by two newlines.

    Args:
        text: The text to normalize (the caller passes a unicode string).

    Returns:
        The concatenation of every non-empty stripped line, each followed
        by ``u'\\n\\n'``.  An empty string if no line has content.
    """
    stripped = (line.strip() for line in text.split(u'\n'))
    # One join instead of repeated ``+=`` avoids quadratic string building.
    return ''.join(line + u'\n\n' for line in stripped if line)
# Fetch pages 1..100 of the listing URL below, pull the text out of every
# <div> matched by findAll('div', "content"), and append each story to
# qiushi.txt preceded by a numbered separator line.
count = 0
# ``with`` guarantees the output file is flushed and closed even if a
# request or a parse step raises part-way through the run.
with open("qiushi.txt", "w") as outfile:
    for i in range(1, 101):
        url = "http://qiushibaike.com/qiushi/best/all/page/%d" % i
        response = urllib2.urlopen(url)
        try:
            data = response.read()
        finally:
            # Release the HTTP connection whether or not the read succeeded.
            response.close()
        soup = BeautifulSoup(data)
        contents = soup.findAll('div', "content")
        stories = [str(text) for text in contents]
        for story in stories:
            count += 1
            print("processing page %d, %d items added" % (i, count))
            minisoup = BeautifulSoup(story)
            # Keep only the text nodes; tags and other markup are dropped.
            text = ''.join([e for e in minisoup.recursiveChildGenerator()
                            if isinstance(e, unicode)])
            # unescape() handles &amp;, &lt; and &gt; by default; &quot; has
            # to be supplied explicitly.
            # NOTE(review): the published source showed the mapping as
            # {'"': '"'} (a no-op), presumably because the page renderer
            # decoded the entity -- confirm against the original script.
            text = urllib.unquote(unescape(text, {'&quot;': '"'}))
            text = formalize(text).encode("utf-8")
            # Separator line followed by a blank line.
            outfile.write('-' * 20 + " %05d " % count + '-' * 20 + "\n\n")
            # Story text, then CRLF and one more newline before the next item.
            outfile.write(text + "\r\n\n")

posted @ 2010-01-05 04:30  真功夫  阅读(481)  评论(1)  编辑  收藏  举报