[Python for Beginners] Scraping Douban Books with XPath

Method 1: Procedural scraping, version 1.0

import json
import requests
from lxml import etree

url = "https://www.douban.com/doulist/1264675/?start=0"
response = requests.get(url).content.decode()
dom = etree.HTML(response)
books = dom.xpath('//div[@class="article"]/div[@class="doulist-item"]')
with open('book.json', 'w', encoding='utf-8') as f:
    for book in books:
        item = {}
        item['title'] = book.xpath('normalize-space(.//div/div[2]/div[3]/a/text())')
        item['author'] = book.xpath('normalize-space(.//div/div[2]/div[5]/text()[1])')
        item['year'] = book.xpath('normalize-space(.//div/div[2]/div[5]/text()[3])')
        item['rank'] = book.xpath('normalize-space(.//div/div[2]/div[4]/span[@class="rating_nums"]/text())')
        f.write(json.dumps(item, ensure_ascii=False) + ',\n')
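
Note that the 1.0 script appends a comma and a newline after each json.dumps call, so book.json ends up holding one JSON object per line rather than a single valid JSON document. A minimal sketch for reading it back, using the same field names written above:

import json

# Read back book.json as written above: one JSON object per line,
# each followed by a trailing comma that must be stripped before decoding.
with open('book.json', encoding='utf-8') as f:
    books = [json.loads(line.rstrip().rstrip(',')) for line in f if line.strip()]

for book in books[:3]:
    print(book['title'], book['rank'])

Dropping the trailing comma and writing one object per line (JSON Lines style) would let each line be decoded with a plain json.loads.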

Method 1: Procedural scraping, version 2.0 (the same logic wrapped in functions)

import time
import json
import requests
from lxml import etree
from requests.exceptions import RequestException


def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    dom = etree.HTML(html)
    node = dom.xpath('//div[@class="article"]/div[@class="doulist-item"]')
    # Iterate over every matched book node rather than assuming exactly 25 per page
    for book_value in node:
        yield {
            'title': book_value.xpath('normalize-space(.//div/div[2]/div[3]/a/text())'),
            'author': book_value.xpath('normalize-space(.//div/div[2]/div[5]/text()[1])'),
            'year': book_value.xpath('normalize-space(.//div/div[2]/div[5]/text()[3])'),
            'rank': book_value.xpath('normalize-space(.//div/div[2]/div[4]/span[@class="rating_nums"]/text())')
        }


def write_to_file(content):
    with open('result.json', 'a', encoding='utf-8') as f:
        print(type(json.dumps(content)))
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(start):
    # start is the paging offset: 0, 25, 50, ...
    url = 'https://www.douban.com/doulist/1264675/?start=' + str(start)
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    for i in range(0, 250, 25):
        main(start=i)
        time.sleep(1)
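
The start query parameter is simply the page offset (0, 25, 50, ..., 225 for this 250-item list). Rather than concatenating it into the URL by hand, requests can build the query string itself; a minimal sketch that reuses the headers from get_one_page above (fetch_page is only an illustrative name, not part of the script):

import requests

# Sketch: let requests encode the start offset instead of concatenating strings.
def fetch_page(start):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'}
    response = requests.get('https://www.douban.com/doulist/1264675/',
                            params={'start': start}, headers=headers, timeout=10)
    response.raise_for_status()  # raise for 4xx/5xx instead of silently returning None
    return response.text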

Method 2: Object-oriented scraping

import time
import json
import requests
from lxml import etree
from requests.exceptions import RequestException


class DouBanReadBook():
    """豆瓣读书排行"""

    # Initialize the base URL and request headers
    def __init__(self):
        self.url = 'https://www.douban.com/doulist/1264675/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
        }

    # Request the URL and return the HTML text
    def get_one_page(self, url):
        try:
            response = requests.get(url, headers=self.headers)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            return None

    def process_data(self, html):
        dom = etree.HTML(html)
        print(dom)
        node = dom.xpath('//div[@class="article"]/div[@class="doulist-item"]')
        # Iterate over every matched book node rather than assuming exactly 25 per page
        for book_value in node:
            yield {
                'title': book_value.xpath('normalize-space(.//div/div[2]/div[3]/a/text())'),
                'author': book_value.xpath('normalize-space(.//div/div[2]/div[5]/text()[1])'),
                'year': book_value.xpath('normalize-space(.//div/div[2]/div[5]/text()[3])'),
                'rank': book_value.xpath('normalize-space(.//div/div[2]/div[4]/span[@class="rating_nums"]/text())')
            }

    def save_file(self, content):
        with open('result.json', 'a', encoding='utf-8') as f:
            print(type(json.dumps(content)))
            f.write(json.dumps(content, ensure_ascii=False) + '\n')

    def main(self, start):
        # start is the paging offset: 0, 25, 50, ...
        url = 'https://www.douban.com/doulist/1264675/?start=' + str(start)
        html = self.get_one_page(url)
        for item in self.process_data(html):
            print(item)
            self.save_file(item)


if __name__ == '__main__':
    read_book = DouBanReadBook()
    for i in range(0, 250, 25):
        # print(i)
        read_book.main(start=i)
        time.sleep(1)
