# Crawl book information from a book-listing website (a first scraping exercise).
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import urllib.request
import time
import random
def generateUrlList(reString="%E5%BF%83%E7%90%86%E5%AD%A6", start=0, end=40, jumperNumber=20):
    """Build the list of paginated tag-page URLs to crawl.

    Parameters
    ----------
    reString : str
        URL-encoded tag keyword (default is the percent-encoding of a
        Chinese keyword).
    start, end : int
        First offset (inclusive) and stop offset (exclusive) for pagination.
    jumperNumber : int
        Page size, i.e. the step between consecutive ``start`` offsets.

    Returns
    -------
    list[str]
        One URL per result page.
    """
    return [
        f"https://xxx/tag/{reString}?start={n}&type=T"
        for n in range(start, end, jumperNumber)
    ]
def getInfo(webAddress):
    """Download one listing page and extract the raw book fragments.

    Parameters
    ----------
    webAddress : str
        Full URL of a tag result page.

    Returns
    -------
    tuple
        ``(bookName, pub, ratingName, intro)`` — four parallel lists of
        BeautifulSoup tags: the ``div.info`` blocks, the ``div.pub``
        blocks, the ``div.star.clearfix`` blocks and the
        ``li.subject-item`` entries.  All four are empty when the page
        has no ``ul.subject-list`` container.
    """
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4)'
    req = urllib.request.Request(webAddress)
    # A browser-like User-Agent; many sites reject the default Python one.
    req.add_header('User-Agent', user_agent)
    # Context manager closes the HTTP response instead of leaking the socket.
    with urllib.request.urlopen(req) as html:
        bsObj = BeautifulSoup(html, 'html5lib')
    bsO = bsObj.find("ul", {"class": "subject-list"})
    if bsO is None:
        # Page layout changed or result set is empty — return empty lists
        # rather than crashing with AttributeError; callers zip() over them.
        return [], [], [], []
    bookName = bsO.findAll("div", {'class': 'info'})
    pub = bsO.findAll("div", {'class': 'pub'})
    ratingName = bsO.findAll("div", {'class': 'star clearfix'})
    intro = bsO.findAll("li", {"class": "subject-item"})
    return bookName, pub, ratingName, intro
def foreachUrl(urlList):
    """Crawl every URL in *urlList* and format the scraped book records.

    Each record is a single ``|``-separated line:
    ``title|publisher info|rating|detail-page URL``.

    Parameters
    ----------
    urlList : list[str]
        Result-page URLs, e.g. from ``generateUrlList``.

    Returns
    -------
    list[str]
        One formatted line per book found across all pages.
    """
    records = []
    for pageUrl in urlList:
        # Random 1-3 s pause between requests to stay polite to the server.
        delay = random.randint(1, 3)
        time.sleep(delay)
        print("After %s s, Crawling %s" % (delay, pageUrl))
        infoTags, pubTags, ratingTags, itemTags = getInfo(pageUrl)
        for info, pubTag, ratingTag, _item in zip(infoTags, pubTags, ratingTags, itemTags):
            # Collapse runs of whitespace inside the scraped text.
            title = ' '.join(info.h2.a.text.split())
            publisher = ' '.join(pubTag.text.split())
            rating = ''.join(ratingTag.text.split())
            detailUrl = info.h2.a["href"]
            records.append("|".join((title, publisher, rating, detailUrl)))
    return records
if __name__ == "__main__":
"""
"""
# restring = str(input("Please input your stigma:"))
# startN = int(input("Please input your start number:"))
# endN = int(input("Please input your end number:"))
# jumperN = int(input("Please input your jumper number:"))
urlList = generateUrlList(reString="%E5%BF%83%E7%90%86%E5%AD%A6", start=0, end=200, jumperNumber=20)
fh = open("result.txt", 'w')
for c in foreachUrl(urlList):
fh.write(c + "\n")
fh.close()