Week 7 Review
Mzitu (妹子图)
# Nothing tricky here; just mind the image hotlink protection and send browser-like request headers
import time
import requests
from bs4 import BeautifulSoup
import os
if not os.path.exists(r'🐍🤮'):
    os.mkdir(r'🐍🤮')
tag = input('''
Enter a category in pinyin,
e.g. 黑丝 heisi
     湿身 shishen
''')
page = input('Enter the page number you want (pace yourself): ')
url = 'https://www.mzitu.com/tag/' + tag + '/page/%s/' % page
res = requests.get(url,
                   headers={'Referer': 'https://www.mzitu.com/',
                            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'})
# print(res.text)
soup = BeautifulSoup(res.text, 'lxml')
li_list = soup.select('ul#pins>li')
for li in li_list:
    a_link = li.find('a').get('href')
    # print(a_link)
    print('Fetching the next image...')
    res2 = requests.get(a_link,
                        headers={'Referer': 'https://www.mzitu.com/',
                                 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'})
    # print(res2.text)
    soup1 = BeautifulSoup(res2.text, 'lxml')
    img_link = soup1.find('img', attrs={'class': 'blur'}).get('src')
    title = soup1.find('h2', attrs={'class': 'main-title'}).text + '.jpg'
    # print(img_link)
    res3 = requests.get(img_link,
                        headers={'Referer': 'https://www.mzitu.com/',
                                 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'})  # the Referer header is what gets past the hotlink check
    img_path = os.path.join(r'🐍🤮', title)
    with open(img_path, 'wb') as f:
        f.write(res3.content)
    time.sleep(1)
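Side note: all three requests.get calls above repeat the same Referer / User-Agent pair. A requests.Session can carry those headers once and send them on every request; a minimal sketch (the session usage below is my own illustration, not part of the original script):

# Sketch only: hold the anti-hotlink Referer and the browser User-Agent in one Session
import requests

session = requests.Session()
session.headers.update({
    'Referer': 'https://www.mzitu.com/',  # the hotlink check looks at this header
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'})
# every request made through the session now carries both headers automatically, e.g.
# res = session.get('https://www.mzitu.com/tag/heisi/page/1/')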
Pear Video (梨视频)
import time
import requests
import os
from bs4 import BeautifulSoup
if not os.path.exists(r'梨视频'):
    os.mkdir(r'梨视频')
for n in range(0, 120):
    res = requests.get('https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=4&start=%s' % n)
    # print(res.text)
    soup = BeautifulSoup(res.text, 'lxml')
    video_link_list = soup.findAll('a', attrs={'class': 'actplay'})
    for video_link in video_link_list:
        video_id = video_link.get('href').split('_')[1]
        # print(video_id)
        url = 'https://www.pearvideo.com/videoStatus.jsp?contId=' + video_id
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
            'Referer': 'https://www.pearvideo.com/video_%s' % video_id}
        res1 = requests.get(url, headers=headers)  # the body is JSON, a string that just looks like a dict
        net = res1.json()['videoInfo']['videos']['srcUrl']
        system_num = res1.json()['systemTime']
        # print(net, system_num)
        real_url = net.replace(system_num, 'cont-%s' % video_id)  # swap the timestamp for the content id to get the playable URL
        # print(real_url)
        res2 = requests.get(real_url, headers=headers)
        file_path = os.path.join(r'梨视频', system_num) + '.mp4'
        with open(file_path, 'wb') as f:
            f.write(res2.content)
        time.sleep(1)
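The whole trick in this script is rewriting srcUrl: videoStatus.jsp puts the systemTime timestamp into the path, while the playable file lives at the same path with 'cont-<video_id>' in that spot. A standalone illustration of the string surgery (the URL and ids below are made up):

# Made-up values, shown only to illustrate the replace() step
fake_src = 'https://video.pearvideo.com/mp4/third/20210901/1630480000000-12345678-1.mp4'
system_num = '1630480000000'
video_id = '1742158'
real_url = fake_src.replace(system_num, 'cont-%s' % video_id)
print(real_url)  # .../20210901/cont-1742158-12345678-1.mp4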
Douban Top 250 (豆瓣top250)
import requests
import re
import time
from bs4 import BeautifulSoup
from openpyxl import Workbook
wb = Workbook()  # create the workbook
wb1 = wb.create_sheet('豆瓣表格', 0)  # create the worksheet at index 0
wb1.append(['电影名', '导演', '主演', '评分', '评价人数', '短评'])  # insert the header row
def movie_rank(n):
    url = 'https://movie.douban.com/top250'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'}
    res = requests.get(url,
                       headers=headers,
                       params={'start': n})
    # print(res.text)
    net = res.text
    soup = BeautifulSoup(net, 'lxml')
    title_list = re.findall('<img width="100" alt="(.*?)" src=', net)
    direct_list = re.findall('导演: (.*?) ', net)
    actor_list = re.findall(' 主演: (.*?) /...', net)
    score_list = re.findall('property="v:average">(.*?)</span>', net)
    comment_count_list = re.findall('<span>(.*?)人评价</span>', net)
    li_list = soup.select('ol.grid_view>li')
    inq_list = []
    for li in li_list:
        li = str(li)  # stringify first: the li from select() is a Tag, not a string, and that cost me a long time last round
        if '<span class="inq">' in li:  # this entry has a one-line review
            inq_part = re.findall('<span class="inq">(.*?)</span>', li)  # pull it out with a regex
            inq_list.append(inq_part[0])
        else:
            inq_part = '等你评价'  # no review yet, fill in a placeholder
            inq_list.append(inq_part)
    full_info = zip(title_list, direct_list, actor_list, score_list, comment_count_list, inq_list)
    # zip pairs the lists up element by element, in order; worth reviewing
    for i in full_info:
        wb1.append(list(i))
    time.sleep(10)
for n in range(0, 226, 25):
    movie_rank(n)
wb.save(r'豆瓣top250.xlsx')
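Quick zip() refresher, since the rows depend on the six lists lining up: zip pairs the lists positionally and stops at the shortest one, so if any regex misses an entry the rows silently misalign or drop off the end. A standalone example with made-up data:

# Made-up data; only two rows come out because zip stops at the shorter list
titles = ['肖申克的救赎', '霸王别姬', '阿甘正传']
scores = ['9.7', '9.6']
for row in zip(titles, scores):
    print(list(row))
print(len(titles), len(scores))  # comparing lengths catches the mismatch early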
Lianjia (链家)
import time
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
'''The HTML source of this site is laid out in a way that is very unfriendly to regex,
so we give up on re and scrape it with the bs4 parser instead.'''
wb = Workbook()
wb1 = wb.create_sheet('二手房数据', 0)
wb1.append(['房屋名称', '详情链接', '小区名称', '区域名称', '房型', '面积', '朝向', '装修', '楼层', '竣工时间', '房屋属性', '关注人数', '发布时间', '总价', '单价'])
# build the header row to match the fields extracted below
for n in range(0, 101):
    res = requests.get('https://sh.lianjia.com/ershoufang/pg%s/' % n)  # page number filled in with the placeholder to paginate
    soup = BeautifulSoup(res.text, 'lxml')
    li_list = soup.select('ul.sellListContent>li.clear')
    for li in li_list:  # take the li tags out one by one and filter
        a_tag = li.select('div.title>a')[0]
        title = a_tag.text
        detail_link = a_tag['href']
        location = li.select('div.positionInfo>a')
        if len(location) == 2:  # handle listings whose location info may be incomplete
            estate = location[0].text.strip()
            section = location[1].text.strip()
        elif len(location) == 1:
            estate = section = location[0].text
        else:  # no location info at all: fill in a default
            estate = section = '暂无信息'
        info = li.select('div.houseInfo')[0].text
        info_list = info.split('|')
        room = info_list[0]
        area = info_list[1]
        towards = info_list[2]
        decoration = info_list[3]
        floor = info_list[4]
        if len(info_list) == 7:  # the first five fields are always there; only the trailing ones may be missing
            finish_time = info_list[5]
            kind = info_list[6]
        elif len(info_list) == 6:
            finish_time = kind = info_list[5]
        else:
            finish_time = kind = None
        followInfo = li.find(name='div', attrs={'class': 'followInfo'}).text  # find one div by its attributes; use find() once in a while so the method doesn't get neglected
        focus_num, publish_time = followInfo.split('/')  # unpack into the two fields
        full_price = li.select('div.totalPrice')[0].text  # total price, same pattern as above
        part_price = li.select('div.unitPrice')[0].text  # price per square metre
        wb1.append(
            [title, detail_link, estate, section, room, area, towards, decoration, floor, finish_time, kind, focus_num.strip(),
             publish_time.strip(), full_price, part_price])  # append the row to the sheet
    time.sleep(1)
wb.save(r'二手房数据改.xlsx')
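The if/elif chain above covers houseInfo strings that split into 5, 6 or 7 fields. An alternative sketch (the helper name and sample string are my own, not from the script) that pads the split result to a fixed width so every field unpacks in one go:

# Sketch: pad the variable-length split instead of branching on its length
def parse_house_info(info, width=7):
    parts = [p.strip() for p in info.split('|')]
    parts += [None] * (width - len(parts))  # missing trailing fields become None
    return parts[:width]

# made-up sample in the same shape as Lianjia's houseInfo text
print(parse_house_info('2室1厅 | 75平米 | 南 | 精装 | 中楼层(共18层) | 2005年建 | 板楼'))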
Autohome (汽车之家)
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
wb = Workbook()
wb1 = wb.create_sheet(r'汽车之家新闻', 0)
wb1.append(['标题', '链接', '图片链接', '更新时间', '详情', '浏览人数', '评论人数'])
for n in range(0, 10):
    url = 'https://www.autohome.com.cn/all/%s/' % n
    res = requests.get(url,
                       headers={
                           'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'})
    res.encoding = 'gbk'  # print the raw text first to check for mojibake, and look at the charset declared at the top of the page source
    soup = BeautifulSoup(res.text, 'lxml')
    li_list = soup.select('div#auto-channel-lazyload-article li')
    for li in li_list:
        a_tag = li.find('a')
        if not a_tag:  # skip filler li tags that carry no article
            continue
        link = 'https:' + a_tag.get('href')  # the usual protocol-relative URL patch-up
        res1 = requests.get(link,
                            headers={
                                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'})
        # send a browser-like User-Agent to get past the anti-scraping checks
        '''
        Since the data ends up in a spreadsheet, the long article body is not scraped here,
        but the detail page is reachable, so pulling any other field
        is just a matter of adding one more selector.
        '''
        soup1 = BeautifulSoup(res1.text, 'lxml')
        title = soup1.select('div#articlewrap>h1')[0].text
        img_link = 'https:' + li.select('div.article-pic>img')[0].get('src')
        publish_time = soup1.select('div.article-info>span.time')[0].text
        detail = li.find('p').text
        view_count = li.select('span.fn-right>em')[0].text
        comment_count = li.select('span.fn-right>em')[1].text
        wb1.append([title.strip(), link, img_link, publish_time.strip(), detail, view_count, comment_count])  # append the row to the sheet
wb.save(r'汽车之家.xlsx')
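One more note on res.encoding = 'gbk': requests can also guess the charset from the body instead of you hard-coding it. A small illustrative sketch (the URL and the shortened User-Agent are placeholders):

# Sketch: let requests detect the charset instead of hard-coding 'gbk'
import requests

res = requests.get('https://www.autohome.com.cn/all/1/',
                   headers={'user-agent': 'Mozilla/5.0'})
print(res.encoding, res.apparent_encoding)  # declared charset vs detected charset
res.encoding = res.apparent_encoding        # then res.text decodes with the detected one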