
80s movie site crawler with Scrapy
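The spider below imports EightysItem from the project's items.py, which the post does not show. A minimal sketch of what that item definition might look like, assuming the field layout below: second_url is taken from how the spider uses it, while name and img are assumed extra fields.

import scrapy


class EightysItem(scrapy.Item):
    # second_url is the only field the spider actually fills in parse()
    second_url = scrapy.Field()
    # name and img are assumed fields for the movie title and poster URL
    name = scrapy.Field()
    img = scrapy.Field()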

import requests
from lxml import etree
import pandas as pd
import scrapy

from 技术提升.scrapy.eightys.eightys.items import EightysItem


class MovieSpider(scrapy.Spider):
    name = 'movie'
    allowed_domains = ['www.i80s.cc']
    start_urls = ['http://www.i80s.cc/']

    # Earlier requests + pandas version, kept for reference: it walked the
    # list page synchronously and saved movie names and detail links to Excel.
    # def parse(self, response):
    #     tree = etree.HTML(response.text)
    #     hrefs = tree.xpath('//h3[@class="h3"]/a//@href')
    #     names = []
    #     second_urls = []
    #     for href in hrefs:
    #         second_url = 'https://www.i80s.cc/' + href
    #         second_urls.append(second_url)
    #         html2 = requests.get(second_url).content.decode()
    #         tree2 = etree.HTML(html2)
    #         name2 = tree2.xpath('//h1[@class="font14w"]//text()')[0]
    #         names.append(name2)
    #     data = {'电影名': names, '详情页连接': second_urls}
    #     print(data)
    #     s = pd.DataFrame(data)
    #     s.to_excel('80s电影.xlsx', index=False)  # index=False belongs to to_excel, not DataFrame()
    #     print('存储完成')

    sum = 1  # leftover counter; not used anywhere below

    def parse(self, response):
        """Parse the list page and follow every movie's detail link."""
        tree = etree.HTML(response.text)
        hrefs = tree.xpath('//h3[@class="h3"]/a//@href')
        for href in hrefs:
            second_url = 'https://www.i80s.cc/' + href
            # The item is filled here but never yielded; the actual work
            # (downloading the poster) happens in parse2.
            item = EightysItem()
            item['second_url'] = second_url
            print('second_url', second_url)
            yield scrapy.Request(second_url, callback=self.parse2)

    def parse2(self, response):
        """Parse a detail page: read the title and poster URL, then save the image."""
        tree2 = etree.HTML(response.text)
        item = EightysItem()  # created but never used; kept from the original
        name = tree2.xpath('//h1[@class="font14w"]//text()')[0]
        img = tree2.xpath('//div[@class="img"]//img//@src')[0]
        r = requests.get(img)
        # Raw f-string so the backslashes in the Windows path are kept literally.
        with open(rf'D:\pycharm项目\技术提升\scrapy\eightys\eightys\spiders\图片\{name}.jpg', 'wb') as f:
            f.write(r.content)
        print(f'******{name}*****下载成功')
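To try the spider, run it from the project root (the folder containing scrapy.cfg). Depending on the site you may also need ROBOTSTXT_OBEY = False and a browser User-Agent in settings.py; that is an assumption about this project, not something the post states.

scrapy crawl movie

Downloading the poster with requests.get() inside parse2 works, but it blocks the spider. A more Scrapy-idiomatic sketch (not the author's code) hands the image download back to the scheduler instead:

    # sketch: let Scrapy fetch the poster instead of calling requests.get()
    def parse2(self, response):
        name = response.xpath('//h1[@class="font14w"]//text()').get()
        img = response.xpath('//div[@class="img"]//img/@src').get()
        yield scrapy.Request(img, callback=self.save_image, cb_kwargs={'name': name})

    def save_image(self, response, name):
        # assumes the 图片 folder already exists next to the spider
        with open(f'图片/{name}.jpg', 'wb') as f:
            f.write(response.body)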

Author: 布都御魂

Original post: https://www.cnblogs.com/wolvies/p/16379161.html

License: this work is licensed under a Creative Commons Attribution-NonCommercial-NoDerivatives 2.5 China Mainland license.
