乐之之

知而行乐,乐而行之,天道酬勤,学无止境。
ssr2

  ssr2

  重点:关闭证书验证和警告提示

  ssr2和ssr1的网站类型相似,只是在请求的基础上加了ssl验证,只需要在requests的请求中把证书验证关掉就可以了。

  • verify=False

  关掉验证后再发送请求,控制台会出现 InsecureRequestWarning 警告。虽然它不会影响代码的运行,但从美观度来看总归不太好,我们可以利用 warnings 或 urllib3 模块关掉警告。

  • warnings
import warnings
# Ignore warnings: with no category or message filter given, this suppresses every warning.
warnings.filterwarnings("ignore")
  • urllib3
from requests.packages import urllib3
# Turn off urllib3's warnings (this covers the InsecureRequestWarning emitted for verify=False requests).
urllib3.disable_warnings()

  接下来,关于数据的解析就不再赘述,这样就可以获取该类型的电影网站数据了。

  具体代码如下:

ssr2
import requests
from lxml import etree
import warnings
from requests.packages import urllib3
# Turn off urllib3's warnings so that requests sent with verify=False do not
# print an InsecureRequestWarning on every call.
urllib3.disable_warnings()
# Alternative that silences every Python warning instead of only urllib3's:
# warnings.filterwarnings("ignore")
"""
    1. Use the URLs found on the first-level listing page to request the
       second-level pages, then parse them and save the result to txt files.
"""

# Browser-like request headers sent with every request made by this script.
# The Referer names a page on ssr2.scrape.center, the site being scraped, so
# that it agrees with the "same-origin" value of Sec-Fetch-Site.
headers = {
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Pragma": "no-cache",
    "Referer": "https://ssr2.scrape.center/page/1",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57",
}


# Page 1: movie listing page
def get_parse_page1(i=None):
    """Fetch a listing page of ssr2.scrape.center and process every movie linked on it.

    Args:
        i: Optional listing page number. When omitted (``None``) the site's
            home page is requested, otherwise ``/page/{i}`` is requested.

    Returns:
        None. The detail link of each movie card found on the page is passed
        to ``get_parse_page2``.
    """
    base_url = 'https://ssr2.scrape.center'
    url = f'{base_url}/' if i is None else f'{base_url}/page/{i}'
    # verify=False disables TLS certificate verification for this request;
    # the timeout keeps a stalled connection from blocking the script forever.
    resp = requests.get(url=url, headers=headers, verify=False, timeout=10)
    html = etree.HTML(resp.text)
    if html is None:
        # Nothing parseable came back (e.g. an empty body).
        return
    content_list = html.xpath('.//div[@class="p-h el-col el-col-24 el-col-xs-9 el-col-sm-13 el-col-md-16"]')
    for data in content_list:
        hrefs = data.xpath('.//a[@class="name"]/@href')
        if not hrefs:
            # A card without a detail link cannot be followed; skip it.
            continue
        get_parse_page2(hrefs[0])


# Page 2: movie detail page
def get_parse_page2(href):
    """Fetch one movie detail page from ssr2.scrape.center, save its data and print a summary.

    Args:
        href: Site-relative path of the detail page, as read from the
            listing page (it is appended directly to the site root).

    Returns:
        None. For each info column found, the title, categories, area/info
        spans, synopsis and score are appended to ``<title>.txt`` (UTF-8),
        one field per line, and a summary line is printed.
    """
    # Same host as the listing page, and certificate verification is disabled
    # here as well; the timeout keeps a stalled connection from hanging the run.
    url = f'https://ssr2.scrape.center{href}'
    resp = requests.get(url, headers=headers, verify=False, timeout=10)
    html = etree.HTML(resp.text)
    if html is None:
        # Nothing parseable came back (e.g. an empty body).
        return
    # The score is queried on the whole document rather than on the info
    # column, so it does not change between loop iterations: look it up once.
    score = _first_text(html.xpath('.//div[@class="el-col el-col-24 el-col-xs-8 el-col-sm-4"]/p/text()'))
    content_list = html.xpath('.//div[@class="p-h el-col el-col-24 el-col-xs-16 el-col-sm-12"]')
    for data in content_list:
        titles = data.xpath('.//a/h2/text()')
        if not titles:
            # The title doubles as the output file name; without it there is
            # nothing sensible to write.
            continue
        title = titles[0]
        category = data.xpath('.//div[@class="categories"]//button/span/text()')
        area = data.xpath('./div[@class="m-v-sm info"]//span/text()')
        content = _first_text(data.xpath('.//div[@class="drama"]/p/text()'))
        with open(f'{title}.txt', 'a', encoding='utf-8') as f:
            # The file is opened in append mode, so end the record with a
            # newline to keep a later record from being glued onto the score.
            f.write('\n'.join([title, str(category), str(area), content, score]) + '\n')
        print(title, category, area, score)


def _first_text(nodes):
    """Return the first XPath text result stripped of whitespace, or '' if there is none."""
    return str(nodes[0]).strip() if nodes else ''


# Script entry point: run get_parse_page1() once with its default argument.
if __name__ == '__main__':
    get_parse_page1()
    # Disabled variant: call get_parse_page1 once for each index in range(10).
    # for i in range(10):
    #     get_parse_page1(i)

posted on 2023-07-10 12:59  乐之之  阅读(94)  评论(0)  编辑  收藏  举报