Week 7 Review
Mzitu (妹子图)
# Nothing tricky here; just mind the image hotlink protection and send browser-like request headers
import time
import requests
from bs4 import BeautifulSoup
import os
if not os.path.exists(r'🐍🤮'):
    os.mkdir(r'🐍🤮')
tag = input('''
Enter a category in pinyin,
e.g. 黑丝 heisi
     湿身 shishen
''')
page = input('Enter the page number you want (pace yourself): ')
url = 'https://www.mzitu.com/tag/' + tag + '/page/%s/' % page
res = requests.get(url,
                   headers={'Referer': 'https://www.mzitu.com/',
                            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'})
# print(res.text)
soup = BeautifulSoup(res.text, 'lxml')
li_list = soup.select('ul#pins>li')
for li in li_list:
    a_link = li.find('a').get('href')
    # print(a_link)
    print('Fetching the next image...')
    res2 = requests.get(a_link,
                        headers={'Referer': 'https://www.mzitu.com/',
                                 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'})
    # print(res2.text)
    soup1 = BeautifulSoup(res2.text, 'lxml')
    img_link = soup1.find('img', attrs={'class': 'blur'}).get('src')
    title = soup1.find('h2', attrs={'class': 'main-title'}).text + '.jpg'
    # print(img_link)
    res3 = requests.get(img_link,
                        headers={'Referer': 'https://www.mzitu.com/',
                                 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'})  # the Referer header is what gets past the hotlink check
    img_path = os.path.join(r'🐍🤮', title)
    with open(img_path, 'wb') as f:
        f.write(res3.content)
    time.sleep(1)
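Side note: all three requests.get calls above repeat the same Referer / User-Agent pair. A requests.Session can carry those headers once and send them on every request; a minimal sketch (the session usage below is my own illustration, not part of the original script):

# Sketch only: hold the anti-hotlink Referer and the browser User-Agent in one Session
import requests

session = requests.Session()
session.headers.update({
    'Referer': 'https://www.mzitu.com/',  # the hotlink check looks at this header
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'})
# every request made through the session now carries both headers automatically, e.g.
# res = session.get('https://www.mzitu.com/tag/heisi/page/1/')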
Pear Video (梨视频)
import time
import requests
import os
from bs4 import BeautifulSoup
if not os.path.exists(r'梨视频'):
    os.mkdir(r'梨视频')
for n in range(0, 120):
    res = requests.get('https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=4&start=%s' % n)
    # print(res.text)
    soup = BeautifulSoup(res.text, 'lxml')
    video_link_list = soup.findAll('a', attrs={'class': 'actplay'})
    for video_link in video_link_list:
        video_id = video_link.get('href').split('_')[1]
        # print(video_id)
        url = 'https://www.pearvideo.com/videoStatus.jsp?contId=' + video_id
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
            'Referer': 'https://www.pearvideo.com/video_%s' % video_id}
        res1 = requests.get(url, headers=headers)  # the body is JSON, a string that just looks like a dict
        net = res1.json()['videoInfo']['videos']['srcUrl']
        system_num = res1.json()['systemTime']
        # print(net, system_num)
        real_url = net.replace(system_num, 'cont-%s' % video_id)  # swap the timestamp for the content id to get the playable URL
        # print(real_url)
        res2 = requests.get(real_url, headers=headers)
        file_path = os.path.join(r'梨视频', system_num) + '.mp4'
        with open(file_path, 'wb') as f:
            f.write(res2.content)
        time.sleep(1)
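The whole trick in this script is rewriting srcUrl: videoStatus.jsp puts the systemTime timestamp into the path, while the playable file lives at the same path with 'cont-<video_id>' in that spot. A standalone illustration of the string surgery (the URL and ids below are made up):

# Made-up values, shown only to illustrate the replace() step
fake_src = 'https://video.pearvideo.com/mp4/third/20210901/1630480000000-12345678-1.mp4'
system_num = '1630480000000'
video_id = '1742158'
real_url = fake_src.replace(system_num, 'cont-%s' % video_id)
print(real_url)  # .../20210901/cont-1742158-12345678-1.mp4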
Douban Top 250 (豆瓣top250)
import requests
import re
import time
from bs4 import BeautifulSoup
from openpyxl import Workbook
wb = Workbook()  # create the workbook
wb1 = wb.create_sheet('豆瓣表格', 0)  # create the worksheet at index 0
wb1.append(['电影名', '导演', '主演', '评分', '评价人数', '短评'])  # insert the header row
def movie_rank(n):
    url = 'https://movie.douban.com/top250'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'}
    res = requests.get(url,
                       headers=headers,
                       params={'start': n})
    # print(res.text)
    net = res.text
    soup = BeautifulSoup(net, 'lxml')
    title_list = re.findall('<img width="100" alt="(.*?)" src=', net)
    direct_list = re.findall('导演: (.*?) ', net)
    actor_list = re.findall(' 主演: (.*?) /...', net)
    score_list = re.findall('property="v:average">(.*?)</span>', net)
    comment_count_list = re.findall('<span>(.*?)人评价</span>', net)
    li_list = soup.select('ol.grid_view>li')
    inq_list = []
    for li in li_list:
        li = str(li)  # stringify first: the li from select() is a Tag, not a string, and that cost me a long time last round
        if '<span class="inq">' in li:  # this entry has a one-line review
            inq_part = re.findall('<span class="inq">(.*?)</span>', li)  # pull it out with a regex
            inq_list.append(inq_part[0])
        else:
            inq_part = '等你评价'  # no review yet, fill in a placeholder
            inq_list.append(inq_part)
    full_info = zip(title_list, direct_list, actor_list, score_list, comment_count_list, inq_list)
    # zip pairs the lists up element by element, in order; worth reviewing
    for i in full_info:
        wb1.append(list(i))
    time.sleep(10)
for n in range(0, 226, 25):
    movie_rank(n)
wb.save(r'豆瓣top250.xlsx')
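Quick zip() refresher, since the rows depend on the six lists lining up: zip pairs the lists positionally and stops at the shortest one, so if any regex misses an entry the rows silently misalign or drop off the end. A standalone example with made-up data:

# Made-up data; only two rows come out because zip stops at the shorter list
titles = ['肖申克的救赎', '霸王别姬', '阿甘正传']
scores = ['9.7', '9.6']
for row in zip(titles, scores):
    print(list(row))
print(len(titles), len(scores))  # comparing lengths catches the mismatch early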
Lianjia (链家)
import time
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
'''The HTML source of this site is laid out in a way that is very unfriendly to regex,
so we give up on re and scrape it with the bs4 parser instead.'''
wb = Workbook()
wb1 = wb.create_sheet('二手房数据', 0)
wb1.append(['房屋名称', '详情链接', '小区名称', '区域名称', '房型', '面积', '朝向', '装修', '楼层', '竣工时间', '房屋属性', '关注人数', '发布时间', '总价', '单价'])
# build the header row to match the fields extracted below
for n in range(0, 101):
    res = requests.get('https://sh.lianjia.com/ershoufang/pg%s/' % n)  # page number filled in with the placeholder to paginate
    soup = BeautifulSoup(res.text, 'lxml')
    li_list = soup.select('ul.sellListContent>li.clear')
    for li in li_list:  # take the li tags out one by one and filter
        a_tag = li.select('div.title>a')[0]
        title = a_tag.text
        detail_link = a_tag['href']
        location = li.select('div.positionInfo>a')
        if len(location) == 2:  # handle listings whose location info may be incomplete
            estate = location[0].text.strip()
            section = location[1].text.strip()
        elif len(location) == 1:
            estate = section = location[0].text
        else:  # no location info at all: fill in a default
            estate = section = '暂无信息'
        info = li.select('div.houseInfo')[0].text
        info_list = info.split('|')
        room = info_list[0]
        area = info_list[1]
        towards = info_list[2]
        decoration = info_list[3]
        floor = info_list[4]
        if len(info_list) == 7:  # the first five fields are always there; only the trailing ones may be missing
            finish_time = info_list[5]
            kind = info_list[6]
        elif len(info_list) == 6:
            finish_time = kind = info_list[5]
        else:
            finish_time = kind = None
        followInfo = li.find(name='div', attrs={'class': 'followInfo'}).text  # find one div by its attributes; use find() once in a while so the method doesn't get neglected
        focus_num, publish_time = followInfo.split('/')  # unpack into the two fields
        full_price = li.select('div.totalPrice')[0].text  # total price, same pattern as above
        part_price = li.select('div.unitPrice')[0].text  # price per square metre
        wb1.append(
            [title, detail_link, estate, section, room, area, towards, decoration, floor, finish_time, kind, focus_num.strip(),
             publish_time.strip(), full_price, part_price])  # append the row to the sheet
    time.sleep(1)
wb.save(r'二手房数据改.xlsx')
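The if/elif chain above covers houseInfo strings that split into 5, 6 or 7 fields. An alternative sketch (the helper name and sample string are my own, not from the script) that pads the split result to a fixed width so every field unpacks in one go:

# Sketch: pad the variable-length split instead of branching on its length
def parse_house_info(info, width=7):
    parts = [p.strip() for p in info.split('|')]
    parts += [None] * (width - len(parts))  # missing trailing fields become None
    return parts[:width]

# made-up sample in the same shape as Lianjia's houseInfo text
print(parse_house_info('2室1厅 | 75平米 | 南 | 精装 | 中楼层(共18层) | 2005年建 | 板楼'))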
Autohome (汽车之家)
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
wb = Workbook()
wb1 = wb.create_sheet(r'汽车之家新闻', 0)
wb1.append(['标题', '链接', '图片链接', '更新时间', '详情', '浏览人数', '评论人数'])
for n in range(0, 10):
    url = 'https://www.autohome.com.cn/all/%s/' % n
    res = requests.get(url,
                       headers={
                           'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'})
    res.encoding = 'gbk'  # print the raw text first to check for mojibake, and look at the charset declared at the top of the page source
    soup = BeautifulSoup(res.text, 'lxml')
    li_list = soup.select('div#auto-channel-lazyload-article li')
    for li in li_list:
        a_tag = li.find('a')
        if not a_tag:  # skip filler li tags that carry no article
            continue
        link = 'https:' + a_tag.get('href')  # the usual protocol-relative URL patch-up
        res1 = requests.get(link,
                            headers={
                                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'})
        # send a browser-like User-Agent to get past the anti-scraping checks
        '''
        Since the data ends up in a spreadsheet, the long article body is not scraped here,
        but the detail page is reachable, so pulling any other field
        is just a matter of adding one more selector.
        '''
        soup1 = BeautifulSoup(res1.text, 'lxml')
        title = soup1.select('div#articlewrap>h1')[0].text
        img_link = 'https:' + li.select('div.article-pic>img')[0].get('src')
        publish_time = soup1.select('div.article-info>span.time')[0].text
        detail = li.find('p').text
        view_count = li.select('span.fn-right>em')[0].text
        comment_count = li.select('span.fn-right>em')[1].text
        wb1.append([title.strip(), link, img_link, publish_time.strip(), detail, view_count, comment_count])  # append the row to the sheet
wb.save(r'汽车之家.xlsx')
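One more note on res.encoding = 'gbk': requests can also guess the charset from the body instead of you hard-coding it. A small illustrative sketch (the URL and the shortened User-Agent are placeholders):

# Sketch: let requests detect the charset instead of hard-coding 'gbk'
import requests

res = requests.get('https://www.autohome.com.cn/all/1/',
                   headers={'user-agent': 'Mozilla/5.0'})
print(res.encoding, res.apparent_encoding)  # declared charset vs detected charset
res.encoding = res.apparent_encoding        # then res.text decodes with the detected one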