静态web网页的爬取

以爬取某论坛数据为例

  1. 数据表的设计:
from peewee import *

# MySQL connection handle for the spider database; all models bind to it via BaseModel.
# NOTE(review): credentials are hard-coded — move to config/env for real use.
db = MySQLDatabase("spider", host="127.0.0.1", port=3306, user="root", password="123456")


class BaseModel(Model):
    """Abstract base that binds every model in this module to the shared MySQL db."""
    class Meta:
        database = db


class Topic(BaseModel):
    """One forum topic row as scraped from the static list page."""
    topic_id = IntegerField(primary_key=True)   # topic id, primary key
    title = CharField()                         # topic title
    author = CharField()                        # topic author
    publish_time = DateField()                  # publish date
    click_num = IntegerField(default=0)         # click count
    answer_num = IntegerField(default=0)        # number of replies
    final_answer_author = CharField()           # author of the last reply
    final_answer_time = DateTimeField()         # time of the last reply


if __name__ == '__main__':
    # Create the table(s) for the models above if they do not already exist.
    db.create_tables([Topic])


  2. 单线程版本,脚本如下:
import re
import time
from datetime import datetime

import requests
from scrapy import Selector

from models import *

# Desktop-browser User-Agent so the forum serves the normal static pages.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'
}


def parse_url(url):
    """Fetch one static forum list page and upsert every topic row found on it.

    :param url: absolute URL of a forum list page (gb2312-encoded HTML)
    """
    res = requests.get(url, headers=headers)
    res.encoding = 'gb2312'  # the forum's static pages are encoded as gb2312
    sel = Selector(text=res.text)

    for item in sel.xpath('//div[@class="threadbit1"]'):
        # The topic id comes from the title link's href; without it the row
        # cannot be keyed, so skip such items entirely.
        href_lst = item.xpath(".//div[@class='thread-row openTitle']/a/@href").extract()
        if not href_lst:
            continue
        id_match = re.search(r'(\d+)', href_lst[0])
        if not id_match:
            continue

        title_lst = item.xpath(".//div[@class='thread-row openTitle']/a/font/text()").extract()
        author_lst = item.xpath(".//div[4]/a[1]/text()").extract()
        publish_time_lst = item.xpath(".//div[4]/a[2]/text()").extract()
        click_answer_lst = item.xpath(".//div[@style='float:right;width:90px;']/text()").extract()
        final_answer_author_lst = item.xpath(".//div[2]/a[1]/text()").extract()
        final_answer_time_lst = item.xpath(".//div[2]/a[2]/text()").extract()
        # All of these fields are required by the Topic model. The original
        # code referenced unbound locals (NameError) or indexed empty lists
        # (IndexError) when any extraction came back empty; skip malformed
        # rows instead.
        if not (title_lst and author_lst and publish_time_lst
                and click_answer_lst and final_answer_author_lst
                and final_answer_time_lst):
            continue

        # The counter cell reads "answers/clicks".
        answer_click = click_answer_lst[0].strip().split('/')
        answer_num = int(answer_click[0])
        if answer_click[1] == '':
            # Click count rendered in a separate <font> element instead.
            click_num = int(item.xpath(".//div[3]/font/text()").extract()[0])
        else:
            click_num = int(answer_click[1])

        topic = Topic()
        topic.topic_id = int(id_match.group(1))
        topic.title = title_lst[0].strip()
        topic.author = author_lst[0]
        topic.publish_time = datetime.strptime(publish_time_lst[0], r'%Y-%m-%d')
        topic.click_num = click_num
        topic.answer_num = answer_num
        topic.final_answer_author = final_answer_author_lst[0]
        topic.final_answer_time = datetime.strptime(final_answer_time_lst[0], r'%Y-%m-%d %H:%M')

        # Update the existing row, or insert a new one. force_insert is
        # required by peewee for new rows with an explicitly set primary key.
        if Topic.select().where(Topic.topic_id == topic.topic_id).exists():
            topic.save()
        else:
            topic.save(force_insert=True)

        print("start download topic: " + str(topic.topic_id))

        time.sleep(1)  # throttle: be polite to the server between topics

if __name__ == "__main__":
    # Fetch the first list page to discover the total page count from the pager.
    res = requests.get("https://xxx/viewforum-21-1.html", headers=headers)
    res.encoding = 'gb2312'
    sel = Selector(text=res.text)

    # Pager text contains "current/total"; the second number is the page total.
    td_str = sel.xpath("//div[@class='pagenav']//td[@class='pagenav']/text()").extract()[0]
    match = re.search(r'(\d+)/(\d+)', td_str)
    if match:
        total_page = int(match.group(2))
    else:
        # The original left total_page unbound here (NameError on the loop
        # below); fall back to the one page we already know exists.
        total_page = 1

    for page in range(1, total_page + 1):
        parse_url("https://xxx/viewforum-21-{0}.html".format(page))

下载某移动端APP里的小说

  1. 下载书名为特种兵王纵横都市的小说,结果保存到工作目录下的特种兵王纵横都市.txt中
import time
import requests


def download_file():
    """Download every chapter of the book and write them to <book_name>.txt."""
    headers = {
        'User-Agent': 'okhttp/3.2.0'
    }
    # First request: the volume/chapter listing for the book.
    ret = requests.get('xxx/v2/book/xxx/volumes?app_key=4037465544', headers=headers)
    data = ret.json().get('data')

    book_name = data.get('book_name')
    with open(book_name + '.txt', 'w', encoding='utf-8') as f:
        for chapter in data.get('volumes')[0].get('chapters'):
            url = 'xxx/v2/book/xxx/chapter/{0}/content?app_key=4037465544'.format(chapter.get('chapter_id'))
            print(url)
            # Parse the chapter response once, instead of calling .json()
            # three separate times per chapter as the original did. A falsy
            # payload (empty or missing) is skipped; the original raised
            # TypeError on len(None) when 'data' was absent.
            chapter_data = requests.get(url, headers=headers).json().get('data')
            if chapter_data:
                chapter_name = chapter_data.get('name')
                f.write(chapter_name)
                f.write('\n')
                f.write(chapter_data.get('content'))
                f.write('\n')

                print('正在下载{},'.format(chapter_name))

            time.sleep(1)  # throttle requests between chapters


if __name__ == '__main__':
    # Script entry point: download the whole book in one run.
    download_file()

自动答题

  1. 静态网页为例,示例如下:
# 导入浏览器对象
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from scrapy import Selector

# 导入数据请求模块
import requests
import time

# 指定chromedriver的位置
# Location of the local chromedriver binary.
service = Service("D:/chrome/chromedriver-win64/chromedriver-win64/chromedriver.exe")
options = Options()
# Location of the Chrome browser executable itself.
options.binary_location = "D:/chrome/Google/Chrome/Application/chrome.exe"

chrome_browser = webdriver.Chrome(options=options, service=service)
chrome_browser.get('xxx')

# Maximize so all option elements are rendered and clickable.
chrome_browser.maximize_window()

time.sleep(2)  # give the page time to render the question list

lis = chrome_browser.find_elements(By.CSS_SELECTOR, '.Content>li')

for li in lis:
    time.sleep(0.2)
    rid = li.get_attribute('c')

    # Each question's correct answer lives on its own static page.
    url = f'xxx/Post/{rid}.htm'
    response = requests.get(url=url).text
    sel = Selector(text=response)
    answer = sel.css('#question u::text').get()

    if not answer:
        # Selector found no answer text; the original crashed on ord(None).
        continue

    if answer in ('A', 'B', 'C', 'D'):
        # Multiple choice: option buttons are ./b[1] .. ./b[4] for A .. D.
        option_index = ord(answer) - ord('A') + 1
    elif answer == '对':
        option_index = 1  # true/false: first button
    else:
        option_index = 2  # true/false: second button

    li.find_element(By.XPATH, f'./b[{option_index}]').click()

# Submit the finished exam.
chrome_browser.find_element(By.CSS_SELECTOR, '.btnJJ').click()

# Block here so the browser stays open until the user presses Enter.
input()

# Close the browser.
chrome_browser.quit()