正则爬取实例

import os
import re

import requests


# Index page that lists the chapter links of the novel to download.
# NOTE(review): the space inside the path looks like an extraction artifact
# (possibly a lost separator character) -- confirm the real URL before running.
url = 'https://b.faloo.com/1183478 1.html'


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}

# Directory that receives one .txt file per chapter.
OUTPUT_DIR = './novel'

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10

# The closing quote after the group is required: a trailing lazy `(.*?)` with
# nothing after it always matches the empty string.
CHAPTER_LINK_RE = re.compile(r'<a class="c_con_li_datail" href="(.*?)"')
TITLE_RE = re.compile(r'<h1>(.*?)</h1>')
CONTENT_RE = re.compile(
    r'<div class="noveContent">([\s\S]*?)<b><font color=red>'
)
# Characters that are not allowed (or are troublesome) in file names.
UNSAFE_FILENAME_CHARS_RE = re.compile(r'[\\/:*?"<>|\r\n\t]')


def extract_chapter_links(html):
    """Return the absolute URL of every chapter link found in *html*.

    Scheme-relative links (``//host/path``) are prefixed with ``https:``;
    any other link is returned as found.
    """
    return [
        'https:' + link if link.startswith('//') else link
        for link in CHAPTER_LINK_RE.findall(html)
    ]


def extract_title(html):
    """Return the text of the first ``<h1>`` in *html*, or None if absent."""
    match = TITLE_RE.search(html)
    return match.group(1).strip() if match else None


def extract_content(html):
    """Return the chapter text from *html*, or None if it cannot be found.

    Each ``<p>`` becomes a newline and each ``</p>`` is dropped; no other
    markup is touched.
    """
    match = CONTENT_RE.search(html)
    if match is None:
        return None
    return match.group(1).replace('<p>', '\n').replace('</p>', '')


def safe_filename(title):
    """Return *title* with characters unsuitable for a file name replaced."""
    return UNSAFE_FILENAME_CHARS_RE.sub('_', title).strip() or 'untitled'


def main():
    """Download every chapter linked from ``url`` into ``OUTPUT_DIR``."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Without the index page there is nothing to do, so fail loudly.
    res.raise_for_status()

    # Match the link of every chapter, then fetch each chapter in full.
    for new_url in extract_chapter_links(res.text):
        resp = requests.get(new_url, headers=headers, timeout=REQUEST_TIMEOUT)
        title = extract_title(resp.text)
        data = extract_content(resp.text)
        if not resp.ok or title is None or data is None:
            # One unparsable chapter should not abort the whole download.
            print('跳过--{}--'.format(new_url))
            continue

        path = os.path.join(OUTPUT_DIR, '{}.txt'.format(safe_filename(title)))
        with open(path, 'w', encoding='utf-8') as f1:
            f1.write(data)
        print('正在写入--{}--'.format(title))


if __name__ == '__main__':
    main()

posted @ 2022-08-19 14:48 冬天不下雨 阅读(17) 评论(0) 编辑 收藏 举报

刷新页面 返回顶部

longwanghzx

正则爬取实例

公告