BeautifulSoup Practice in Python 3, Part 1 (Scraping a Novel)

  It has been two months since my last post. With some free time today, I decided to act on an idea I had when I first started playing with Python crawlers: scrape a novel from a website so I can read it offline and save a lot of mobile data (^_^).

  I wrote this in my spare time, so please bear with me; comments and corrections are very welcome ///^_^.......

  Alright, here comes the code ‘(*>﹏<*)′

from bs4 import BeautifulSoup
import urllib.request
import re
import time

def getUrls(url):
    """Collect the chapter links from the novel's table-of-contents page."""
    urls = []
    #url = 'http://www.qu.la/book/1258/'
    req = urllib.request.Request(url)
    page = urllib.request.urlopen(req)

    html = page.read()

    soup = BeautifulSoup(html, 'html.parser')

    i = 0
    # every chapter link on the index page ends with '.html'
    for k in soup.find_all(href=re.compile(r'\.html')):
        #print('www.qu.la' + k['href'], k.get_text())
        if i != 0:          # skip the first matched link (not a chapter in the list)
            urls.append('http://www.qu.la' + k['href'])
        i = i + 1
    return urls


def getContent(url):
    """Fetch one chapter page and return its title and body text."""
    #url = 'http://www.qu.la/book/1258/759251.html'
    headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    html = opener.open(url).read()

    soup = BeautifulSoup(html, 'html.parser')
    content = soup.find('div', id='content')    # chapter body lives in <div id="content">
    title = soup.find('h1')                     # chapter title is the page's <h1>
    return title.get_text(), content.get_text()

if __name__ == '__main__':
    urls = getUrls('http://www.qu.la/book/1258/')
    #print(urls)
    fp = open("异界之魔武流氓.txt", "w", encoding="utf-8")
    for url in urls:
        print(url)
        title, content = getContent(url)
        fp.write(title + "\n")
        # the site pads paragraphs with runs of spaces; turn them into line breaks
        fp.write(content.replace('        ', '\n') + "\n")
        time.sleep(2)       # be polite: pause between requests
    fp.close()
    print("Done")

  getUrls() extracts the link to each chapter from the novel's table-of-contents page (this step could also pick up the chapter titles (⊙o⊙) -- a small sketch of that idea follows below); getContent() takes a chapter link and pulls the chapter title and body out of the page.
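
  Since the anchor text of each index link already carries the chapter title, a variant of getUrls() could return (title, url) pairs instead of bare URLs. This is only a sketch living in the same module as the code above; the helper name getUrlsWithTitles, and the assumption that every matched link's text is the chapter title, are mine rather than part of the original code.

def getUrlsWithTitles(url):
    """Sketch: return (chapter_title, chapter_url) pairs from the index page."""
    pairs = []
    html = urllib.request.urlopen(urllib.request.Request(url)).read()
    soup = BeautifulSoup(html, 'html.parser')
    for i, k in enumerate(soup.find_all(href=re.compile(r'\.html'))):
        if i != 0:  # skip the first matched link, as in getUrls()
            # assumption: the anchor text of each chapter link is the chapter title
            pairs.append((k.get_text().strip(), 'http://www.qu.la' + k['href']))
    return pairs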

  The site probably has some anti-crawler protection: during testing, the code above could not fetch every chapter ......^_^|||

  Adding a "user_agents" list and picking one at random for each request, so the requests appear to come from several different browsers, solves the problem above. I also changed what gets written to the txt file, stripping the ad text from the fetched chapter content. The revised code is below.

from bs4 import BeautifulSoup
import urllib.request
import re
import time, random

def getUrls(url):
    """Collect the chapter links from the novel's table-of-contents page."""
    urls = []
    #url = 'http://www.qu.la/book/1258/'
    req = urllib.request.Request(url)
    page = urllib.request.urlopen(req)

    html = page.read()

    soup = BeautifulSoup(html, 'html.parser')

    i = 0
    for k in soup.find_all(href=re.compile(r'\.html')):
        #print('www.qu.la' + k['href'], k.get_text())
        if i != 0:          # skip the first matched link (not a chapter in the list)
            urls.append('http://www.qu.la' + k['href'])
        i = i + 1
    return urls


def getContent(url):
    """Fetch one chapter page with a random User-Agent and return its title and body text."""
    #url = 'http://www.qu.la/book/1258/759251.html'
    user_agents = [
                    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
                    'Opera/9.25 (Windows NT 5.1; U; en)',
                    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
                    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
                    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
                    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
                    "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
                    "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0",
                  ]
    agent = random.choice(user_agents)          # look like a different browser on every request
    opener = urllib.request.build_opener()
    opener.addheaders = [("User-agent", agent), ("Accept", "*/*")]
    html = opener.open(url).read()
    '''
    the original fetch without a User-Agent, kept for reference:
    req = urllib.request.Request(url)
    page = urllib.request.urlopen(req)
    html = page.read()
    '''

    soup = BeautifulSoup(html, 'html.parser')
    content = soup.find('div', id='content')    # chapter body lives in <div id="content">
    title = soup.find('h1')                     # chapter title is the page's <h1>
    return title.get_text(), content.get_text()

if __name__ == '__main__':
    urls = getUrls('http://www.qu.la/book/1258/')
    #print(urls)
    fp = open("异界之魔武流氓.txt", "w", encoding="utf-8")
    for url in urls:
        print(url)
        title, content = getContent(url)
        fp.write(title + "\n")
        content = content.replace('    ', '\n')
        fp.write(content[0:-71] + "\n")         # drop the last 71 characters: the ad text appended to every chapter
        #time.sleep(2)
    fp.close()
    print("Done")
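
  The content[0:-71] slice assumes the appended ad text is always exactly 71 characters long, which is brittle. A more robust alternative is to delete the ad nodes from the parse tree with BeautifulSoup's decompose() before extracting text. The sketch below is only an idea, not the code I ran: it assumes the ads are carried by <script> and <a> tags inside the content div, which may not match the actual page.

def cleanContent(content_div):
    """Sketch: strip ad markup from the chapter <div> before extracting text.
    Assumes (unverified) that ads live in <script> and <a> tags inside the div."""
    for tag in content_div.find_all(['script', 'a']):
        tag.decompose()                 # remove the node from the tree entirely
    return content_div.get_text()

  With something like this, getContent() would return cleanContent(content) instead of content.get_text(), and the main loop could write the result directly without the magic-number slice.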

 
