爬取某网站图书信息

小试牛刀!爬取某网站图书信息

#!/usr/bin/env python
# -*- coding:utf-8 -*-


from bs4 import BeautifulSoup
import urllib.request
import time
import random


def generateUrlList(reString="%E5%BF%83%E7%90%86%E5%AD%A6", start=0, end=40, jumperNumber=20):
    """Build the list of paginated tag-listing URLs to crawl.

    Args:
        reString: Tag segment inserted into the URL path exactly as given
            (the default value is already percent-encoded).
        start: First value used for the ``start`` query parameter.
        end: Exclusive upper bound for the ``start`` query parameter.
        jumperNumber: Step between consecutive ``start`` values.

    Returns:
        A list of URL strings, one for each offset in
        ``range(start, end, jumperNumber)``.
    """
    template = "https://xxx/tag/%s?start=%s&type=T"
    return [template % (reString, offset) for offset in range(start, end, jumperNumber)]


def getInfo(webAddress):
    """Download one listing page and collect the per-book HTML elements.

    Args:
        webAddress: URL of the listing page to fetch.

    Returns:
        A 4-tuple ``(bookName, pub, ratingName, intro)`` holding the elements
        found inside the page's ``<ul class="subject-list">`` container:
        the ``div`` elements with class ``info``, ``pub`` and
        ``star clearfix``, and the ``li`` elements with class
        ``subject-item``. All four are empty lists when the page has no
        such container.

    Raises:
        Any exception raised by ``urllib.request.urlopen`` (for example
        ``urllib.error.URLError``) propagates to the caller.
    """
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4)'
    req = urllib.request.Request(webAddress)
    req.add_header('User-Agent', user_agent)

    # The context manager closes the HTTP response even if parsing fails.
    with urllib.request.urlopen(req) as html:
        bsObj = BeautifulSoup(html, 'html5lib')

    bsO = bsObj.find("ul", {"class": "subject-list"})
    if bsO is None:
        # No book list on this page: return empty results instead of
        # failing with AttributeError on the lookups below.
        return [], [], [], []

    bookName = bsO.find_all("div", {'class': 'info'})
    pub = bsO.find_all("div", {'class': 'pub'})
    ratingName = bsO.find_all("div", {'class': 'star clearfix'})
    intro = bsO.find_all("li", {"class": "subject-item"})

    return bookName, pub, ratingName, intro


def foreachUrl(urlList):
    """Crawl each URL in order and turn the books found into text records.

    Before every request the function sleeps for a random whole number of
    seconds between 1 and 3 and prints a progress line.

    Args:
        urlList: Iterable of listing-page URLs, fetched via ``getInfo``.

    Returns:
        A list of strings of the form ``title|publication|rating|link``,
        one per book, in crawl order. Whitespace runs in the title and
        publication text are collapsed to single spaces; all whitespace is
        removed from the rating text.
    """
    records = []

    for pageUrl in urlList:
        delay = random.randint(1, 3)
        time.sleep(delay)
        print("After %s s, Crawling %s" % (delay, pageUrl))

        infoDivs, pubDivs, ratingDivs, itemNodes = getInfo(pageUrl)
        # itemNodes is not read, but it stays in the zip so the shortest of
        # the four lists still bounds how many records are produced.
        for infoDiv, pubDiv, ratingDiv, _ in zip(infoDivs, pubDivs, ratingDivs, itemNodes):
            titleLink = infoDiv.h2.a
            fields = (
                ' '.join(titleLink.text.split()),
                ' '.join(pubDiv.text.split()),
                ''.join(ratingDiv.text.split()),
                titleLink["href"],
            )
            records.append("|".join(fields))

    return records


if __name__ == "__main__":
    # The tag and paging range are fixed here; replace these arguments with
    # input() prompts if interactive use is wanted.
    urlList = generateUrlList(reString="%E5%BF%83%E7%90%86%E5%AD%A6", start=0, end=200, jumperNumber=20)

    # Crawl before opening the output file so that a failed crawl does not
    # leave behind a truncated, empty result.txt.
    records = foreachUrl(urlList)

    # Explicit UTF-8 so non-ASCII book titles are written regardless of the
    # platform's default encoding; ``with`` closes the file even on error.
    with open("result.txt", 'w', encoding="utf-8") as fh:
        fh.writelines(record + "\n" for record in records)

 

posted @ 2017-05-04 21:37  Ly_Python  阅读(409)  评论(0)  编辑  收藏  举报