Scraping static web pages
Scraping data from a forum, as an example
- Designing the data table (a small usage sketch follows the model code):
from peewee import *

db = MySQLDatabase("spider", host="127.0.0.1", port=3306, user="root", password="123456")

class BaseModel(Model):
    class Meta:
        database = db

class Topic(BaseModel):
    topic_id = IntegerField(primary_key=True)  # topic id, primary key
    title = CharField()                        # topic title
    author = CharField()                       # topic author
    publish_time = DateField()                 # publish date
    click_num = IntegerField(default=0)        # click count
    answer_num = IntegerField(default=0)       # reply count
    final_answer_author = CharField()          # author of the last reply
    final_answer_time = DateTimeField()        # time of the last reply

if __name__ == '__main__':
    # create the table
    db.create_tables([Topic])
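As a quick sanity check of the schema, here is a minimal usage sketch; the field values are invented. peewee's replace() issues a MySQL REPLACE, a one-call alternative to the select-then-save upsert used in the scraper below:

from datetime import datetime

# hypothetical values, just to exercise the model
Topic.replace(
    topic_id=1,
    title="Hello",
    author="alice",
    publish_time=datetime(2024, 1, 1),
    click_num=10,
    answer_num=2,
    final_answer_author="bob",
    final_answer_time=datetime(2024, 1, 2, 12, 30),
).execute()  # inserts the row, or overwrites the row with this primary key

print(Topic.get_by_id(1).title)  # -> Hello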
- Single-threaded version; the script is as follows (a thread-pool sketch is given after it):
import re
import time
from datetime import datetime

import requests
from scrapy import Selector

from models import *

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'
}

def parse_url(url):
    res = requests.get(url, headers=headers)
    res.encoding = 'gb2312'  # the forum's static pages are encoded as gb2312
    html_text = res.text
    sel = Selector(text=html_text)
    items = sel.xpath('//div[@class="threadbit1"]')
    for item in items:
        title_lst = item.xpath(".//div[@class='thread-row openTitle']/a/font/text()").extract()
        if title_lst:
            title = title_lst[0].strip()
        author_lst = item.xpath(".//div[4]/a[1]/text()").extract()
        if author_lst:
            author = author_lst[0]
        publish_time_lst = item.xpath(".//div[4]/a[2]/text()").extract()
        if publish_time_lst:
            publish_time = datetime.strptime(publish_time_lst[0], r'%Y-%m-%d')
        # the "replies/clicks" column, e.g. "12/345"
        click_answer_lst = item.xpath(".//div[@style='float:right;width:90px;']/text()").extract()
        if click_answer_lst:
            click_answer_str = click_answer_lst[0].strip().split('/')
            answer_num = int(click_answer_str[0])
            if click_answer_str[1] == '':
                # large click counts are rendered inside a separate <font> tag
                click_num = int(item.xpath(".//div[3]/font/text()").extract()[0])
            else:
                click_num = int(click_answer_str[1])
        final_answer_author = item.xpath(".//div[2]/a[1]/text()").extract()[0]
        final_answer_time = item.xpath(".//div[2]/a[2]/text()").extract()
        if final_answer_time:
            final_answer_time = datetime.strptime(final_answer_time[0], r'%Y-%m-%d %H:%M')
        id_lst = item.xpath(".//div[@class='thread-row openTitle']/a/@href").extract()
        topic = Topic()
        if id_lst:
            topic.topic_id = int(re.search(r'(\d+)', id_lst[0]).group(1))
        topic.title = title
        topic.author = author
        topic.publish_time = publish_time
        topic.click_num = click_num
        topic.answer_num = answer_num
        topic.final_answer_author = final_answer_author
        topic.final_answer_time = final_answer_time
        if Topic.select().where(Topic.topic_id == topic.topic_id).exists():
            topic.save()  # row already exists: UPDATE
        else:
            topic.save(force_insert=True)  # new row: INSERT
        print("start download topic: " + str(topic.topic_id))
    time.sleep(1)  # throttle between page requests

if __name__ == "__main__":
    res = requests.get("https://xxx/viewforum-21-1.html", headers=headers)
    res.encoding = 'gb2312'
    html_text = res.text
    sel = Selector(text=html_text)
    # get the total number of pages from the pager label, e.g. "1/37"
    td_str = sel.xpath("//div[@class='pagenav']//td[@class='pagenav']/text()").extract()[0]
    match = re.search(r'(\d+/\d+)', td_str)
    if match:
        total_page = int(match.group(1).split('/')[1])
        # total_page = 1
        for i in range(0, total_page):
            parse_url("https://xxx/viewforum-21-{0}.html".format(i + 1))
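Since the heading calls this the single-threaded version, the sketch below shows one way the page loop could be parallelized with a thread pool. It assumes the definitions from the script above; the worker count is arbitrary, and each task opens its own MySQL connection via peewee's connection_context(), because a single connection must not be shared across threads.

from concurrent.futures import ThreadPoolExecutor

def parse_url_in_thread(url):
    # give each task its own connection; a single peewee connection
    # is not safe to share between threads
    with db.connection_context():
        parse_url(url)

if __name__ == "__main__":
    total_page = 10  # assume this was read from the pager, as in the single-threaded main
    urls = ["https://xxx/viewforum-21-{0}.html".format(i + 1) for i in range(total_page)]
    with ThreadPoolExecutor(max_workers=4) as executor:  # 4 workers chosen arbitrarily
        list(executor.map(parse_url_in_thread, urls))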
Downloading a novel from a mobile app
- Download the novel titled 特种兵王纵横都市 and save the result to 特种兵王纵横都市.txt in the working directory (a retry sketch follows the script):
import time

import requests

def download_file():
    headers = {
        'User-Agent': 'okhttp/3.2.0'  # mimic the app's HTTP client
    }
    # fetch the volume/chapter listing for the book
    ret = requests.get('xxx/v2/book/xxx/volumes?app_key=4037465544', headers=headers)
    data = ret.json().get('data')
    book_name = data.get('book_name')
    with open(book_name + '.txt', 'w', encoding='utf-8') as f:
        for chapter in data.get('volumes')[0].get('chapters'):
            url = 'xxx/v2/book/xxx/chapter/{0}/content?app_key=4037465544'.format(chapter.get('chapter_id'))
            print(url)
            ret = requests.get(url, headers=headers)
            chapter_data = ret.json().get('data')
            if len(chapter_data) == 0:
                continue  # skip chapters that return no content
            content = chapter_data.get('content')
            chapter_name = chapter_data.get('name')
            f.write(chapter_name)
            f.write('\n')
            f.write(content)
            f.write('\n')
            print('Downloading {}'.format(chapter_name))
            time.sleep(1)  # throttle between chapter requests

if __name__ == '__main__':
    download_file()
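A flaky mobile API can drop requests mid-download. One common hardening step, sketched below, is to mount urllib3's Retry policy on a requests Session and use session.get() in place of requests.get() inside download_file(); the retry counts and backoff factor here are arbitrary choices, not values from the original script.

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
retry = Retry(
    total=3,           # retry each request up to 3 times
    backoff_factor=1,  # wait roughly 1s, 2s, 4s between attempts
    status_forcelist=[500, 502, 503, 504],  # retry on these server errors
)
session.mount('http://', HTTPAdapter(max_retries=retry))
session.mount('https://', HTTPAdapter(max_retries=retry))

# then call session.get(url, headers=headers) wherever the script uses requests.get()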
Automated quiz answering
- Using a static page as the example (an explicit-wait sketch follows the script):
# import the browser driver
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from scrapy import Selector
# import the HTTP request module
import requests
import time

# point Selenium at the local chromedriver
service = Service("D:/chrome/chromedriver-win64/chromedriver-win64/chromedriver.exe")
options = Options()
# point Selenium at the Chrome binary
options.binary_location = "D:/chrome/Google/Chrome/Application/chrome.exe"
chrome_browser = webdriver.Chrome(options=options, service=service)
chrome_browser.get('xxx')
# maximize the window
chrome_browser.maximize_window()
time.sleep(2)
# chrome_browser.implicitly_wait(10)

lis = chrome_browser.find_elements(By.CSS_SELECTOR, '.Content>li')
for li in lis:
    time.sleep(0.2)
    rid = li.get_attribute('c')
    # fetch the page that contains the correct answer
    url = f'xxx/Post/{rid}.htm'
    response = requests.get(url=url).text
    sel = Selector(text=response)
    answer = sel.css('#question u::text').get()
    if not answer:
        continue  # no answer found for this question
    # multiple-choice answers A-D
    if 65 <= ord(answer) <= 68:
        # A -> ./b[1], B -> ./b[2], C -> ./b[3], D -> ./b[4]
        li.find_element(By.XPATH, './b[{}]'.format(ord(answer) - 64)).click()
    # true/false answers (对/错)
    else:
        if answer == '对':
            li.find_element(By.XPATH, './b[1]').click()
        else:
            li.find_element(By.XPATH, './b[2]').click()

# submit the answer sheet
chrome_browser.find_element(By.CSS_SELECTOR, '.btnJJ').click()
# block so the browser window stays open until Enter is pressed
input()
# close the browser
chrome_browser.quit()
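The fixed time.sleep(2) after page load is fragile. Selenium's explicit waits poll until a condition holds, so the sleep could be replaced with a sketch like this (the 10-second timeout is an arbitrary choice):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for the question list to render, instead of sleeping blindly
lis = WebDriverWait(chrome_browser, 10).until(
    EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.Content>li'))
)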