80s movie site crawler with Scrapy

import requests
from lxml import etree
import pandas as pd
import scrapy

from 技术提升.scrapy.eightys.eightys.items import EightysItem


class MovieSpider(scrapy.Spider):
    name = 'movie'
    allowed_domains = ['www.i80s.cc']
    start_urls = ['http://www.i80s.cc/']

    # Earlier version (kept for reference): requests + pandas, writing movie
    # names and detail-page links to an Excel file instead of yielding items.
    # def parse(self, response):
    #     html = response.text
    #     # print(html)
    #     tree = etree.HTML(html)
    #     detail_hrefs = tree.xpath('//h3[@class="h3"]/a//@href')
    #     names = []
    #     second_urls = []
    #     for href in detail_hrefs:
    #         second_url = 'https://www.i80s.cc/' + href
    #         second_urls.append(second_url)
    #         html2 = requests.get(second_url).content.decode()
    #         tree2 = etree.HTML(html2)
    #         name2 = tree2.xpath('//h1[@class="font14w"]//text()')[0]
    #         names.append(name2)
    #     data = {'movie_name': names, 'detail_page_url': second_urls}
    #     print(data)
    #     s = pd.DataFrame(data)
    #     s.to_excel('80s电影.xlsx', index=False)  # index=False belongs to to_excel, not DataFrame()
    #     print('Saved.')
    sum = 1  # unused counter

    def parse(self, response):
        html = response.text
        # print(html)
        tree = etree.HTML(html)
        # hrefs of the detail pages linked from the list page
        detail_hrefs = tree.xpath('//h3[@class="h3"]/a//@href')
        for href in detail_hrefs:
            second_url = 'https://www.i80s.cc/' + href
            item = EightysItem()
            item['second_url'] = second_url  # filled here but not yielded
            print('second_url', second_url)
            # follow each detail page; parse2 extracts the title and poster
            yield scrapy.Request(second_url, callback=self.parse2)

    def parse2(self, response):
        html2 = response.text
        tree2 = etree.HTML(html2)
        item = EightysItem()
        name = tree2.xpath('//h1[@class="font14w"]//text()')[0]
        img = tree2.xpath('//div[@class="img"]//img//@src')[0]
        # synchronous download of the poster (bypasses Scrapy's downloader)
        r = requests.get(img)
        # raw f-string avoids backslash-escape problems in the Windows path
        with open(rf'D:\pycharm项目\技术提升\scrapy\eightys\eightys\spiders\图片\{name}.jpg', 'wb') as f:
            f.write(r.content)
        print(f'****** {name} ***** downloaded successfully')
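
For completeness, here is a minimal sketch of the items.py this spider imports. Only the second_url field is confirmed by the spider code above; the name and img fields are assumptions added for illustration.

# eightys/items.py -- minimal sketch; only `second_url` is used by the spider
# above, `name` and `img` are assumed extra fields.
import scrapy


class EightysItem(scrapy.Item):
    second_url = scrapy.Field()  # detail-page URL, set in parse()
    name = scrapy.Field()        # movie title (assumed)
    img = scrapy.Field()         # poster URL (assumed)

With the project laid out this way, the spider is started from the directory containing scrapy.cfg with:

    scrapy crawl movie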
