爬取电影天堂最新电影栏目下各部电影的标题和磁力链接

import re
from urllib.parse import urljoin

import requests

# Scrape the "latest movies" listing: collect every detail-page link from the
# index page, fetch each detail page, and write one "name,magnet" line per
# match to data.txt.

# Index page to scrape.
# NOTE(review): 'xxx' looks like a redacted host -- presumably the same site
# as DETAIL_BASE_URL below; confirm before running.
url = 'xxx/index2.htm'

# Base that the relative detail-page links are resolved against.
DETAIL_BASE_URL = 'https://dy.dytt8.net/'

# Seconds to wait for a server before giving up; without a timeout a stalled
# connection would block the script forever.
REQUEST_TIMEOUT = 10

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.56'
}

# Captures the href that directly follows a "最新电影下载" anchor on the index page.
obj = re.compile(r"最新电影下载</a>]<a href='(?P<link>.*?)'>", re.S)
# Captures the translated title (text after the "译  名" label, up to the first
# '/' or '<') and the contents of the next <blockquote>, which is written out
# as the magnet field.
obj1 = re.compile(r"译  名 (?P<name>.*?)[/<]"
                  r".*?<blockquote>(?P<magnet>.*?)</blockquote>", re.S)

# The context manager releases the connection even if decoding raises.
with requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT) as resp:
    # Fail loudly on an HTTP error instead of parsing an error page and
    # reporting success with an empty output file.
    resp.raise_for_status()
    # The page source declares charset=gb2312 in its header, so decode with it.
    # NOTE(review): if titles come out garbled, the page may use characters
    # outside GB2312; 'gbk' might be needed -- confirm against real pages.
    resp.encoding = 'gb2312'
    main_page_text = resp.text

# Every detail-page link found on the index page, in page order.
lst = [it.group('link') for it in obj.finditer(main_page_text)]

# `with` guarantees data.txt is flushed and closed even if a request fails
# part-way through the loop.
with open('data.txt', mode='w', encoding='utf-8') as f:
    for link in lst:
        # urljoin copes with hrefs that do or do not start with '/', avoiding
        # the '//' that plain string concatenation produces.
        under_url = urljoin(DETAIL_BASE_URL, link)
        with requests.get(url=under_url, headers=headers,
                          timeout=REQUEST_TIMEOUT) as under:
            under.encoding = 'gb2312'
            under_page_text = under.text

        # finditer is used instead of search(...).group(...): on a page that
        # does not match, search returns None and .group would raise
        # AttributeError, whereas finditer simply yields nothing.
        for it in obj1.finditer(under_page_text):
            name = it.group('name').strip()  # drop surrounding whitespace
            f.write(f"{name},{it.group('magnet')}\n")

print("完成!!!")

 

posted @ 2023-02-28 13:06  0x1e61  阅读(391)  评论(0)  编辑  收藏  举报