lxml webdriver 抓取街拍
案例
import os
from hashlib import md5

import requests
from lxml import etree
from selenium import webdriver

# Desktop Chrome User-Agent so the toutiao search API serves the normal
# JSON payload instead of a bot-detection page.
HEADERS = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36"
    )
}

# Search API for keyword 街拍 (street snap); {} is the paging offset.
SEARCH_URL = (
    "https://www.toutiao.com/api/search/content/?aid=24&app_name=web_search"
    "&offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true"
    "&count=20&en_qc=1&cur_tab=1&from=search_tab"
)


def get_response(url):
    """GET *url* with a browser-like User-Agent and return the Response."""
    return requests.get(url, headers=HEADERS)


def get_article_title_url(text, i):
    """Return ``(article_url, title)`` for the i-th entry of the search result.

    *text* is the requests.Response of the search API.  Entries that are not
    photo galleries lack these keys and raise KeyError/TypeError, which the
    caller treats as "skip this entry".
    """
    # Parse the JSON once instead of once per field.
    entry = text.json()["data"][i]
    return entry["article_url"], entry["title"]


def parse_article_url(article_url):
    """Render *article_url* in Chrome and return the gallery image URLs."""
    driver = webdriver.Chrome(r"D:\python\com\zxsoft\python\chromedriver.exe")
    try:
        driver.get(article_url)
        html = etree.HTML(driver.page_source)
        return html.xpath(
            '//div[@class="article-content"]//div[@class="pgc-img"]'
            '//img[@class="syl-page-img"]//@src'
        )
    finally:
        # quit() (unlike close()) also terminates the chromedriver process,
        # so a parsing error cannot leak browser instances.
        driver.quit()


def save_jpg(title, href):
    """Download *href* and save it as ./<title>/<md5-of-content>.jpg.

    Hashing the content gives a stable, collision-free file name and makes
    re-downloads of the same image overwrite rather than duplicate.
    """
    res = requests.get(href)
    file_path = '{}/{}.{}'.format(title, md5(res.content).hexdigest(), 'jpg')
    with open(file_path, 'wb') as f:
        f.write(res.content)


def main():
    """Crawl 20 pages of search results and mirror every gallery locally."""
    os.chdir(r"E:/ntmssFile/nv/")
    for page in range(20):
        r = get_response(SEARCH_URL.format(page * 20))
        # "data" may be null/absent when the API rate-limits; treat as empty.
        entries = r.json().get("data") or []
        for idx in range(len(entries)):
            # Not every search result is a photo gallery; best-effort skip
            # (the original silently swallowed all errors — at least log).
            try:
                article_url, title_text = get_article_title_url(r, idx)
                os.makedirs(title_text, exist_ok=True)
                for href in parse_article_url(article_url):
                    save_jpg(title_text, href)
            except Exception as exc:
                print('skip entry {}: {}'.format(idx, exc))
                continue


if __name__ == "__main__":
    main()
import os
import re
from hashlib import md5

import redis
import requests
from lxml import etree

HEADERS = {
    'Cookie': 'Hm_lvt_c8263f264e5db13b29b03baeb1840f60=1632291839,1632373348; Hm_lpvt_c8263f264e5db13b29b03baeb1840f60=1632373697',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
}

# Category pages look like https://www.3gbizhi.com/meinv/<id>.html;
# group(1) is the short category id, reused as the local folder name.
# Compiled once instead of once per link.
CATEGORY_RE = re.compile(r'https://www.3gbizhi.com/meinv/(.*?).html', flags=re.DOTALL)


def main():
    """Mirror 3gbizhi galleries locally and register each saved file's
    public URL in the redis set ``img_list``."""
    rdb = redis.StrictRedis(host='172.16.xx.xx', port=6379, db=2,
                            decode_responses=True)
    os.chdir(r"E:\home\webContainer\ntmssResource")

    index = requests.get("https://www.3gbizhi.com/meinv/", headers=HEADERS)
    index_html = etree.HTML(index.text)
    category_links = index_html.xpath('//div[@class="cl r"]//ul[@class="cl"]/li/a/@href')

    for link in category_links:
        match = CATEGORY_RE.search(link)
        if match is None:
            # Not a category page; the original crashed here (AttributeError).
            continue
        short = match.group(1)
        os.makedirs(short, exist_ok=True)

        # NOTE: only page 2 is fetched, matching the original range(2, 3).
        for page in range(2, 3):
            page_url = 'https://www.3gbizhi.com/meinv/' + short + '_' + str(page) + '.html'
            page_res = requests.get(page_url, headers=HEADERS)
            page_html = etree.HTML(page_res.text)
            detail_hrefs = page_html.xpath('//div[@class="contlistw mtw"]//ul[@class="cl"]/li/a/@href')
            detail_titles = page_html.xpath('//div[@class="contlistw mtw"]//ul[@class="cl"]/li/a/@title')

            for href, title in zip(detail_hrefs, detail_titles):
                detail_res = requests.get(href, headers=HEADERS)
                detail_html = etree.HTML(detail_res.text)
                img_urls = detail_html.xpath('//div[@class="picimglist pos"]/ul/li/a/img/@src')

                for img_url in img_urls:
                    # Strip the thumbnail marker to get the full-size image.
                    full_url = ''.join(img_url.split('thumb_200_0_'))
                    content = requests.get(full_url, headers=HEADERS).content
                    # Content hash as file name: stable and de-duplicating.
                    file_path = '{}/{}.{}'.format(short, md5(content).hexdigest(), 'jpg')
                    url_local = "http://192.168.31.155:8889/ntmssResource/" + file_path
                    rdb.sadd("img_list", url_local)
                    print(f'正在下载 {title} {url_local}!!!!')
                    with open(file_path, 'wb') as f:
                        f.write(content)


if __name__ == "__main__":
    main()
故乡明
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· go语言实现终端里的倒计时
· 如何编写易于单元测试的代码
· 10年+ .NET Coder 心语,封装的思维:从隐藏、稳定开始理解其本质意义
· .NET Core 中如何实现缓存的预热?
· 从 HTTP 原因短语缺失研究 HTTP/2 和 HTTP/3 的设计差异
· 分享一个免费、快速、无限量使用的满血 DeepSeek R1 模型,支持深度思考和联网搜索!
· 基于 Docker 搭建 FRP 内网穿透开源项目(很简单哒)
· ollama系列01:轻松3步本地部署deepseek,普通电脑可用
· 按钮权限的设计及实现
· 25岁的心里话