【菜鸟学Python】使用Xpath爬取豆瓣读书
方法一:使用面向过程爬取1.0
# Version 1.0: procedural scrape of one page of the Douban doulist,
# saving each book's title/author/year/rating to book.json.
import json

import requests
from lxml import etree

# Browser-like User-Agent: Douban rejects the default requests UA
# (the 2.0 and OOP versions of this script already send one).
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/74.0.3729.108 Safari/537.36',
}

url = "https://www.douban.com/doulist/1264675/?start=0"
response = requests.get(url, headers=HEADERS).content.decode()
dom = etree.HTML(response)
# One div.doulist-item per book, all under div.article.
books = dom.xpath('//div[@class="article"]/div[@class="doulist-item"]')

items = []
for book in books:
    # normalize-space(...) makes xpath() return a whitespace-trimmed
    # string (empty string when the node is missing) instead of a list.
    items.append({
        'title': book.xpath('normalize-space(.//div/div[2]/div[3]/a/text())'),
        'author': book.xpath('normalize-space(.//div/div[2]/div[5]/text()[1])'),
        'year': book.xpath('normalize-space(.//div/div[2]/div[5]/text()[3])'),
        'rank': book.xpath('normalize-space(.//div/div[2]/div[4]/span[@class="rating_nums"]/text())'),
    })

# BUG FIX: the original wrote one json.dumps(item) per line followed by
# ',\n', which produces a file no JSON parser can read back. Dumping the
# whole list once yields a valid JSON document.
with open('book.json', 'w', encoding='utf-8') as f:
    json.dump(items, f, ensure_ascii=False, indent=2)
方法一:使用面向过程爬取2.0(使用函数封装爬取)
# Version 2.0: the same scrape, decomposed into functions, paging
# through the whole list (10 pages x 25 books).
import json
import time

import requests
from lxml import etree
from requests.exceptions import RequestException

# Base list URL; pages are selected with the ?start= query parameter.
BASE_URL = 'https://www.douban.com/doulist/1264675/'


def get_one_page(url):
    """Fetch *url* and return its HTML text, or None on any failure."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/74.0.3729.108 Safari/537.36',
    }
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    """Yield one dict (title/author/year/rank) per book found in *html*.

    Yields nothing when *html* is None (i.e. the fetch failed).
    """
    if html is None:
        # The original passed None straight into etree.HTML -> TypeError.
        return
    dom = etree.HTML(html)
    nodes = dom.xpath('//div[@class="article"]/div[@class="doulist-item"]')
    # Iterate over what was actually found; the original used range(25)
    # and raised IndexError whenever a page held fewer than 25 entries.
    for book in nodes:
        yield {
            'title': book.xpath('normalize-space(.//div/div[2]/div[3]/a/text())'),
            'author': book.xpath('normalize-space(.//div/div[2]/div[5]/text()[1])'),
            'year': book.xpath('normalize-space(.//div/div[2]/div[5]/text()[3])'),
            'rank': book.xpath('normalize-space(.//div/div[2]/div[4]/span[@class="rating_nums"]/text())'),
        }


def write_to_file(content):
    """Append *content* as one JSON line to result.json."""
    with open('result.json', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(start):
    """Scrape the page at offset *start* and persist every book on it."""
    # BUG FIX: the original built '?start=0' + str(start), yielding
    # offsets like start=025 / start=050 instead of start=25 / start=50.
    url = BASE_URL + '?start=' + str(start)
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    for i in range(0, 250, 25):
        main(start=i)
        time.sleep(1)  # be polite: pause between page requests
方法二:使用面向对象爬取
# Object-oriented version: the scrape wrapped in a class.
import json
import time

import requests
from lxml import etree
from requests.exceptions import RequestException


class DouBanReadBook():
    """Scraper for the Douban reading ranking list (doulist 1264675)."""

    def __init__(self):
        # Base list URL; pages are selected with the ?start= parameter.
        self.url = 'https://www.douban.com/doulist/1264675/'
        # Browser-like UA so Douban does not reject the request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/74.0.3729.108 Safari/537.36',
        }

    def get_one_page(self, url):
        """Fetch *url* and return its HTML text, or None on any failure."""
        try:
            response = requests.get(url, headers=self.headers)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            return None

    def process_data(self, html):
        """Yield one dict (title/author/year/rank) per book in *html*.

        Yields nothing when *html* is None (i.e. the fetch failed).
        """
        if html is None:
            # The original fed None into etree.HTML -> TypeError.
            return
        dom = etree.HTML(html)
        nodes = dom.xpath('//div[@class="article"]/div[@class="doulist-item"]')
        # Iterate over the actual matches; the original's range(25)
        # raised IndexError on any page with fewer than 25 books.
        for book in nodes:
            yield {
                'title': book.xpath('normalize-space(.//div/div[2]/div[3]/a/text())'),
                'author': book.xpath('normalize-space(.//div/div[2]/div[5]/text()[1])'),
                'year': book.xpath('normalize-space(.//div/div[2]/div[5]/text()[3])'),
                'rank': book.xpath('normalize-space(.//div/div[2]/div[4]/span[@class="rating_nums"]/text())'),
            }

    def save_file(self, content):
        """Append *content* as one JSON line to result.json."""
        with open('result.json', 'a', encoding='utf-8') as f:
            f.write(json.dumps(content, ensure_ascii=False) + '\n')

    def main(self, start):
        """Scrape the page at offset *start* and persist every book."""
        # BUG FIX: the original hard-coded '?start=0' + str(start),
        # producing start=025 / start=050 ... and ignored self.url.
        url = self.url + '?start=' + str(start)
        html = self.get_one_page(url)
        for item in self.process_data(html):
            print(item)
            self.save_file(item)


if __name__ == '__main__':
    read_book = DouBanReadBook()
    for i in range(0, 250, 25):
        read_book.main(start=i)
        time.sleep(1)  # be polite: pause between page requests