1 Scraping PearVideo
"""
优化:爬取梨视频
1.封装
2.多线程
"""
import json
import os
import random
from concurrent.futures import ThreadPoolExecutor

import redis
import requests
from bs4 import BeautifulSoup


def get_proxy():
    """Pick a random proxy from the Redis-backed proxy pool."""
    proxies = []
    pool = redis.ConnectionPool(host='127.0.0.1', port=6379)
    conn = redis.Redis(connection_pool=pool)
    # Each field in the 'use_proxy' hash maps "ip:port" to a JSON blob
    # whose 'https' flag tells us which scheme the proxy supports.
    for address, raw in conn.hscan_iter('use_proxy', count=10):
        value = json.loads(raw.decode('utf-8'))
        address = address.decode('utf-8')
        if value.get('https'):
            proxy = {"https": "https://{}".format(address)}
        else:
            proxy = {"http": "http://{}".format(address)}
        proxies.append(proxy)
    return random.choice(proxies)


def get_page(url, proxy=None):
    print('GET: %s' % url)
    try:
        response = requests.get(url, proxies=proxy, timeout=10)
        if response.status_code == 200:
            return response
    except Exception:
        # Network errors are swallowed; the caller receives None.
        pass


def parse_page(res):
    soup = BeautifulSoup(res.text, 'html.parser')
    a_list = soup.find_all(name='a', class_='vervideo-lilink actplay')
    video_url_list = []
    for a in a_list:
        # The numeric content id is the tail of hrefs like 'video_1234567'.
        content_id = a.attrs['href'].split('_')[-1]
        video = {'content_id': content_id,
                 'video_url': f'https://www.pearvideo.com/videoStatus.jsp?contId={content_id}'}
        video_url_list.append(video)
    return video_url_list


def save(video_url_list, proxy=None):
    os.makedirs('./pear_video', exist_ok=True)  # make sure the output dir exists
    for video in video_url_list:
        content_id = video.get('content_id')
        video_url = video.get('video_url')
        # videoStatus.jsp rejects requests without a matching Referer header.
        header = {'Referer': 'https://www.pearvideo.com/video_' + content_id}
        res = requests.get(video_url, headers=header, proxies=proxy)
        mp4_url = res.json().get('videoInfo').get('videos').get('srcUrl')
        # The returned srcUrl carries a throwaway timestamp segment; swap it
        # for 'cont-<id>' to get the real downloadable address.
        mp4_url = mp4_url.replace(mp4_url.split('/')[-1].split('-')[0], f'cont-{content_id}')
        res_video = requests.get(mp4_url, stream=True)
        with open(f'./pear_video/{content_id}.mp4', 'wb') as f:
            for chunk in res_video.iter_content(1024):
                f.write(chunk)


def main(start=0):
    proxy = get_proxy()
    category_url = f'https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=9&start={start}'
    res = get_page(category_url, proxy)
    if res is None:
        return
    video_list = parse_page(res)
    save(video_list, proxy)


if __name__ == '__main__':
    with ThreadPoolExecutor(10) as p:
        # Each category page holds 12 items, so step the offset by 12.
        for i in range(0, 80, 12):
            p.submit(main, i)
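Note that get_proxy() assumes a Redis hash named use_proxy is already populated, with "ip:port" fields whose JSON values carry an https flag (the layout written by a typical proxy-pool service). For a quick local test you could seed a couple of entries by hand; a minimal sketch, with placeholder addresses:

import json
import redis

conn = redis.Redis(host='127.0.0.1', port=6379)
# Hypothetical sample fields: "ip:port" -> JSON with the 'https' flag
# that get_proxy() reads back. These addresses are placeholders.
conn.hset('use_proxy', '1.2.3.4:8118', json.dumps({'https': False}))
conn.hset('use_proxy', '5.6.7.8:443', json.dumps({'https': True}))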
2 Scraping Autohome
import requests
from bs4 import BeautifulSoup
import pymysql

res = requests.get('https://www.autohome.com.cn/news/1/#liststart')
soup = BeautifulSoup(res.text, 'lxml')

# One connection for the whole run instead of reconnecting per row.
conn = pymysql.connect(
    host='127.0.0.1',
    port=3306,
    user='root',
    password='123456',
    database='test',
    charset='utf8'
)
cursor = conn.cursor()
sql = """
create table if not exists autohome_news(
    id int primary key auto_increment,
    title varchar(64),
    url varchar(128),
    img_url varchar(128),
    abstract varchar(128)
);
"""
cursor.execute(sql)
conn.commit()
ul_list = soup.find_all(name='ul', class_='article')
for ul in ul_list:
    # id=False skips <li> elements that carry an id attribute (non-article slots).
    li_list = ul.find_all(name='li', id=False)
    for li in li_list:
        a = li.find(name='a')
        if not a:
            continue
        url = 'https:' + a.attrs['href']
        img_url = a.find('img')['src']
        if 'https:' not in img_url:
            img_url = 'https:' + img_url
        title = a.find('h3')
        if not title:
            continue  # skip entries without a headline
        title = title.text
        abstract = a.find('p').text
        sql = "insert into autohome_news (title, url, img_url, abstract) values(%s, %s, %s, %s)"
        cursor.execute(sql, args=(title, url, img_url, abstract))
conn.commit()
conn.close()
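To confirm the inserts landed, a minimal read-back using the same connection settings:

import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='123456', database='test', charset='utf8')
cursor = conn.cursor()
cursor.execute("select count(*), max(id) from autohome_news")
print(cursor.fetchone())  # -> (row_count, max_id)
conn.close()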