80s 电影网站爬虫（Scrapy 实现）
import requests
from lxml import etree
import pandas as pd
import scrapy
from 技术提升.scrapy.eightys.eightys.items import EightysItem
class MovieSpider(scrapy.Spider):
    """Spider for www.i80s.cc.

    Crawls the start page for movie detail-page links, then for each detail
    page saves the poster image to disk and yields an EightysItem carrying
    the detail-page URL.
    """
    # NOTE(review): removed the unused class attribute `sum = 1` — it shadowed
    # the builtin `sum` and was never read anywhere in this spider.
    name = 'movie'
    allowed_domains = ['www.i80s.cc']
    start_urls = ['http://www.i80s.cc/']

    def parse(self, response):
        """Parse the listing page and schedule one request per movie.

        Yields:
            scrapy.Request for each detail page, handled by parse2.
        """
        tree = etree.HTML(response.text)
        # Relative hrefs of the detail pages, taken from the listing headers.
        hrefs = tree.xpath('//h3[@class="h3"]/a//@href')
        for href in hrefs:
            # NOTE(review): start page is fetched over http but detail links
            # are built with https — presumably the site serves both; confirm
            # if requests start failing.
            second_url = 'https://www.i80s.cc/' + href
            print('second_url', second_url)
            yield scrapy.Request(second_url, callback=self.parse2)

    def parse2(self, response):
        """Parse a movie detail page.

        Side effects:
            Writes the poster image as {name}.jpg under the hard-coded
            spiders/图片 directory.

        Yields:
            EightysItem with the detail-page URL, so configured pipelines
            actually receive data (the original created items but never
            yielded them).
        """
        tree2 = etree.HTML(response.text)
        name = tree2.xpath('//h1[@class="font14w"]//text()')[0]
        img = tree2.xpath('//div[@class="img"]//img//@src')[0]
        # NOTE(review): a blocking requests.get inside a Scrapy callback
        # stalls the reactor; consider yielding a scrapy.Request for the
        # image, or using Scrapy's ImagesPipeline instead.
        r = requests.get(img)
        # Raw string: the original literal contained invalid escape
        # sequences (e.g. '\p' in '\pycharm') in this Windows path.
        out_path = rf'D:\pycharm项目\技术提升\scrapy\eightys\eightys\spiders\图片\{name}.jpg'
        with open(out_path, 'wb') as f:
            f.write(r.content)
        print(f'******{name}*****下载成功')
        item = EightysItem()
        # 'second_url' is the one field the original code demonstrably set.
        item['second_url'] = response.url
        yield item
本文作者:布都御魂
本文链接:https://www.cnblogs.com/wolvies/p/16379161.html
版权声明:本作品采用知识共享署名-非商业性使用-禁止演绎 2.5 中国大陆许可协议进行许可。
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
2021-06-15 读写csv
2021-06-15 python爬虫,使用requests设置代理