Python爬虫——小说

#encoding:utf8

import re

import urllib2

# Scrape a novel from 23us.com: fetch the index page, follow every link found
# in a <td class="L"> cell, and append each linked page's title and text to a
# local UTF-8 file.

# Index page; every href captured from it is appended to this URL.
BASE_URL = 'http://www.23us.com/html/55/55304/'

# Encoding tried when a page declares no charset or the declared one fails.
FALLBACK_ENCODING = 'gb2312'

# File that receives the extracted text (opened in append mode).
OUTPUT_PATH = '小说.txt'

# Patterns are compiled once here instead of on every loop iteration.
# Captures the href of each link inside a <td class="L"> cell.
LINK_PATTERN = re.compile('<td class=\"L"\><a href=\"(.*?)"\>.*?</a></td>',re.S)
# Captures the charset declared in the page's content-type meta attribute.
CHARSET_PATTERN = re.compile('.*?content="text.html; charset=(.*?)".*?',re.S)
# Captures (title, body) from <title> and <dd id="contents">.
PAGE_PATTERN = re.compile('<title>(.*?)</title>.*?<dd id="contents">(.*?)</dd>',re.S)

response = urllib2.urlopen(urllib2.Request(BASE_URL))
try:
    # The index page is decoded as GBK unconditionally.
    index_html = response.read().decode('gbk')
finally:
    response.close()

for href in LINK_PATTERN.findall(index_html):
    print(href)

    response = urllib2.urlopen(urllib2.Request(BASE_URL + href))
    try:
        raw_page = response.read()
    finally:
        response.close()

    # Use the charset the page declares; if none is declared, go straight to
    # the fallback instead of failing with an IndexError.
    charset_match = CHARSET_PATTERN.search(raw_page)
    declared = charset_match.group(1) if charset_match else FALLBACK_ENCODING

    try:
        page = raw_page.decode(declared)
    except (LookupError, ValueError, TypeError):
        # Unknown/garbled encoding name, or bytes invalid for that encoding.
        try:
            page = raw_page.decode(FALLBACK_ENCODING)
        except UnicodeDecodeError:
            # Page cannot be decoded at all: skip it (best effort).
            continue

    sections = PAGE_PATTERN.findall(page)
    if not sections:
        # Nothing matched: report the failure and dump the page for inspection.
        print('采集失败')
        print(page)
        continue

    # The context manager closes the file even if a write fails.
    with open(OUTPUT_PATH, 'a+') as out:
        for title, body in sections:
            # Drop non-breaking-space entities; turn double <br/> into newlines.
            body = body.replace('&nbsp;','').replace('<br/> <br/>','\n')
            text = '\n\n\t\t' + title + '\n\n' + '\t' + body
            out.write(text.encode('utf-8'))

posted @ 2017-03-13 20:08  海上生明月~  阅读(908)  评论(0)  编辑  收藏  举报