Web scraping: hands-on requests, bs4, and CSS selectors

Detailed notes

1 Scraping videos with requests

        # requests: a library for sending HTTP requests---it only sends requests and has no parser---so pair it with re, bs4, or lxml
        # requests-html: sends the request + parses the HTML in one package
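
        # A minimal requests-html sketch of the combo above (pip3 install requests-html);
        # the target URL is just the example site used below:
        # from requests_html import HTMLSession
        # session = HTMLSession()
        # r = session.get('https://www.pearvideo.com/')
        # print(r.html.find('a', first=True).text)  # CSS-selector find on the parsed page
        # print(r.html.links)  # all links on the page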

        # m3u8-format video is segmented---a members-only "6-minute preview" simply loads only the first 6 minutes of segments
        # paid videos: this is what "video parsing" services work around



        # Removing watermarks from video--->ffmpeg--->can also add watermarks, concatenate, crop, key out, transcode, etc.
        # Install it, then drive it from Python
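
        # A minimal sketch of driving ffmpeg from Python with subprocess
        # (assumes ffmpeg is installed and on PATH; the file names are placeholders):
        # import subprocess
        # subprocess.run([
        #     'ffmpeg', '-y',                      # -y: overwrite output without asking
        #     '-i', 'input.mp4',                   # source video
        #     '-i', 'logo.png',                    # watermark image
        #     '-filter_complex', 'overlay=10:10',  # draw the logo 10px from the top-left
        #     'output.mp4',
        # ], check=True)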



        # Use re to parse out the data we want
        # import requests
        # res=requests.get("https://www.pearvideo.com/")
        # print(res.text)


        # https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=8&start=24
        import requests
        import re
        import os
        res=requests.get('https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=8&start=24')
        # print(res.text)
        # Parse all the video page links out of the HTML

        video_list=re.findall('<a href="(.*?)" class="vervideo-lilink actplay">',res.text)
        # print(video_list)
        for video in video_list:
            video_url='https://www.pearvideo.com/'+video
            video_id=video_url.split('_')[-1]
            header={
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36',
                'Referer': video_url
            }
            # First layer of anti-scraping: the Referer header must be set
            res_video=requests.get('https://www.pearvideo.com/videoStatus.jsp?contId=%s&mrd=0.7113776105084832'%video_id,headers=header)
            mp4_url=res_video.json()['videoInfo']['videos']['srcUrl']

            # Second layer of anti-scraping: rewrite the unplayable URL the API returns into the playable one
            mp4_url = mp4_url.replace(mp4_url.split('/')[-1].split('-')[0], 'cont-%s' % video_id)
            print(mp4_url)
            # Download the video to local disk
            os.makedirs('./video', exist_ok=True)  # make sure the target directory exists
            res_video_detail = requests.get(mp4_url, stream=True)  # stream=True so iter_content reads in chunks
            with open('./video/%s.mp4' % video_id, 'wb') as f:
                for line in res_video_detail.iter_content(1024):
                    f.write(line)


        # Single-threaded downloading is slow---it is almost all I/O---so multithreading speeds it up significantly: use it to download videos site-wide
        # Thread-pool whole-site crawl (see the sketch below)
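
        # A minimal thread-pool sketch; download_video is a hypothetical helper
        # that would wrap the per-video request/fix-up/save logic shown above:
        # from concurrent.futures import ThreadPoolExecutor
        # def download_video(video_url):
        #     ...  # fetch videoStatus.jsp, rewrite the mp4 url, save to ./video/
        # with ThreadPoolExecutor(max_workers=10) as pool:
        #     for video in video_list:
        #         pool.submit(download_video, 'https://www.pearvideo.com/' + video)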


        # URL returned by videoStatus.jsp (not playable as-is):
        # https://video.pearvideo.com/mp4/third/20220314/1652060493892-10097838-231626-hd.mp4
        # Playable URL: the leading timestamp is swapped for cont-<video_id>:
        # https://video.pearvideo.com/mp4/third/20220314/    cont-1754713     -10097838-231626-hd.mp4

        # mp4_url='https://video.pearvideo.com/mp4/third/20220314/      1652060493892   -10097838-231626-hd.mp4'
        # mp4_url=mp4_url.replace(mp4_url.split('/')[-1].split('-')[0],'cont-%s'%video_id)

2 requests + bs4: scraping a website

        import requests
        import os
        # pip3 install beautifulsoup4
        from bs4 import BeautifulSoup

        res = requests.get('https://www.autohome.com.cn/news/1/#liststart')
        # print(res.text)
        # html.parser is bs4's default parser
        soup = BeautifulSoup(res.text, 'html.parser')
        # Use bs4's search methods
        ul_list = soup.find_all(name='ul', class_='article')
        # print(len(ul_list))
        for ul in ul_list:
            # find all li tags under this ul
            li_list = ul.find_all(name='li')
            for li in li_list:
                h3 = li.find(name='h3')
                if h3:
                    title = h3.text  # text content of the h3 tag
                    desc = li.find(name='p').text
                    img = li.find(name='img')['src']
                    if not img.startswith('http'):
                        img='https:'+img
                    url = 'https:' + li.find('a')['href']
                    print('''
                    News title:   %s
                    News summary: %s
                    News image:   %s
                    News URL:     %s
                    ''' % (title, desc, img, url))
                    # Save the image to local disk
                    os.makedirs('./img', exist_ok=True)  # make sure the target directory exists
                    res_img = requests.get(img)
                    img_name = img.split('/')[-1]
                    with open('./img/%s' % img_name, 'wb') as f:
                        for line in res_img.iter_content(1024):
                            f.write(line)

                    # To store the data in a database with pymysql: create the database and table ---> cursor.execute('insert ...') ---> conn.commit(); see the sketch below
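
                    # A minimal pymysql sketch of the step above (assumes a local MySQL
                    # server with database `news` and a matching `article` table; the
                    # credentials and table/column names are placeholders):
                    # import pymysql
                    # conn = pymysql.connect(host='127.0.0.1', user='root', password='123',
                    #                        database='news', charset='utf8mb4')
                    # cursor = conn.cursor()
                    # cursor.execute(
                    #     'insert into article (title, summary, img, url) values (%s, %s, %s, %s)',
                    #     (title, desc, img, url))
                    # conn.commit()  # pymysql does not autocommit by default
                    # conn.close()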


3 bs4: traversing the document tree


        from bs4 import BeautifulSoup

        html_doc = """
        <html><head><title>The Dormouse's story</title></head>
        <body>
        <p class="title" id="id_p">lqz<b>The Dormouse's story</b></p>

        <p class="story">Once upon a time there were three little sisters; and their names were
        <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
        <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
        <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
        and they lived at the bottom of a well.</p>

        <p class="story">...</p>
        """
        # html.parser: built in, average speed, fault tolerant
        # lxml: third-party, fast, fault tolerant
        # soup=BeautifulSoup(html_doc,'html.parser')
        # pip3 install lxml
        soup=BeautifulSoup(html_doc,'lxml')

        # print(soup.prettify())  # pretty-print the HTML

        #1 Traversing the document tree with  .  (dot access)---fast
        # print(soup.title)
        # print(soup.body.p)
        # print(soup.body.p.b)

        #2. Get the tag's name
        # print(soup.title.name)
        # print(soup.body.name)
        #3. Get the tag's attributes
        # print(soup.body.p)
        # print(soup.p['class'])  # class can hold multiple values, so this is a list
        # print(soup.p['id'])
        # print(soup.p.attrs)  # all attributes in a dict

        #4. Get the tag's content---its text
        # print(soup.p.text)  # text of this tag and all its descendants joined together
        # print(soup.p.string)  # only returned when the tag has exactly one text child; with multiple descendants it is None
        # print(list(soup.p.strings))  # a generator over the text of every descendant

        #5. Nested selection
        # dot access can be chained
        # print(soup.head.title.string)

        #6. Children and descendants
        # print(soup.p.contents)  # all direct children of p, in a list
        # print(list(soup.p.children))  # an iterator over p's direct children; same as contents, but saves memory
        # print(list(soup.p.descendants))  # every descendant of p: children, grandchildren, ...


        # for i,child in enumerate(soup.p.children):
        #     print(i,child)
        # for i,child in enumerate(soup.p.descendants):
        #     print(i,child)
        #7. Parents and ancestors
        # print(soup.a.parent)  # the a tag's parent node
        # print(list(soup.a.parents))  # all ancestors of the a tag: parent, grandparent, ...
        #8. Siblings

        print(soup.a.next_sibling)  # the next sibling
        print(soup.a.previous_sibling)  # the previous sibling

        print(list(soup.a.next_siblings))  # all following siblings => a generator, so wrap it in list
        print(list(soup.a.previous_siblings))  # all preceding siblings => a generator, so wrap it in list



        # traversal:  .  (dot access)
        # attributes: [] or attrs.get()
        # text:       text, string, strings

4 bs4: searching the document tree

        from bs4 import BeautifulSoup

        html_doc = """
        <html><head><title>The Dormouse's story</title></head>
        <body>
        <p class="title" id="id_p">lqz<b>The Dormouse's story</b></p>

        <p class="story">Once upon a time there were three little sisters; and their names were
        <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
        <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
        <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
        and they lived at the bottom of a well.</p>

        <p class="story">...</p>
        """
        soup = BeautifulSoup(html_doc, 'lxml')


        # 1. Five kinds of filters: string, regular expression, list, True, method
        # find: first match      find_all: all matches

        # String ---> the filter value is a string
        # res=soup.find_all(name='p')
        # res=soup.find(id='id_p')
        # res=soup.find_all(class_='story')
        # res=soup.find_all(name='p',class_='story')  # keyword filters are ANDed together
        # res=soup.find(name='a',id='link2').text
        # res=soup.find(name='a',id='link2').attrs.get('href')
        # res=soup.find(attrs={'id':'link2','class':'sister'}).attrs.get('href')
        # print(res)


        # Regular expression ---> the filter value is a compiled regex
        # import re
        #
        # # res=soup.find_all(name=re.compile('^b'))
        # # res=soup.find_all(href=re.compile('^http'))
        # res=soup.find_all(class_=re.compile('^s'))
        # print(res)


        # List ---> the filter value is a list
        # res=soup.find_all(name=['body','a'])
        # res=soup.find_all(class_=['sister','story'])
        # res=soup.find_all(id=['link2','link3'])
        # print(res)

        # True ---> the filter value is True (matches any tag that has that name/attribute)
        # res=soup.find_all(name=True)
        # res=soup.find_all(id=True)
        # res=soup.find_all(href=True)
        # print(res)


        # Method ---> the filter value is a function that takes a tag and returns a bool

        # def has_class_but_no_id(tag):
        #     return tag.has_attr('class') and not tag.has_attr('id')
        #
        # print(soup.find_all(name=has_class_but_no_id))  # tags that have a class but no id


        #1 Anything present in the HTML page can be extracted with bs4
        #2 Tree traversal and searching can be mixed freely
        # def has_class_but_no_id(tag):
        #     return tag.has_attr('class') and not tag.has_attr('id')
        # print(soup.find(name=has_class_but_no_id).a.text)

        # 3 Other find_all parameters---limit: cap the number of results; recursive: whether to search recursively

        # def has_class_but_no_id(tag):
        #     return tag.has_attr('class') and not tag.has_attr('id')
        # res=soup.find_all(name=has_class_but_no_id,limit=1)
        #
        # print(res)
        #
        # res=soup.find_all(name='a',recursive=False)  # non-recursive: searches only direct children (one level), so it is faster
        # print(res)

5 CSS selectors


        ### CSS and XPath selectors are universal---essentially every parsing library (bs4, lxml, pyquery, selenium's) supports CSS selectors---and CSS is already the front end's common language

        from bs4 import BeautifulSoup

        html_doc = """
        <html><head><title>The Dormouse's story</title></head>
        <body>
        <p class="title" id="id_p">lqz<b>The Dormouse's story</b></p>

        <p class="story">Once upon a time there were three little sisters; and their names were
        <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
        <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
        <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
        and they lived at the bottom of a well.</p>

        <p class="story">...</p>
        """
        soup = BeautifulSoup(html_doc, 'lxml')

        # soup.select()  # find all matches
        # soup.select_one()  # find the first match

        '''
        div       matches div tags
        div>a     matches a tags that are direct children of a div
        div a     matches a tags anywhere under a div (any depth)
        .sister   matches tags with class sister
        #id_p     matches the tag with id id_p
        '''

        # res=soup.select('#id_p')
        # res=soup.select('.sister')
        # res=soup.select_one('.story>a').attrs.get('href')
        # print(res)
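
        # The same selector string works across parsing libraries, as noted at the
        # top of this section; a minimal lxml sketch reusing html_doc from above
        # (requires: pip3 install lxml cssselect):
        # from lxml import etree
        # tree = etree.HTML(html_doc)
        # for a in tree.cssselect('.story>a'):  # the same selector bs4's select() accepts
        #     print(a.get('href'))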

        # The ultimate trick: copy the selector straight from the browser (right-click the element in devtools ---> Copy ---> Copy selector)
        import requests
        response=requests.get('https://www.runoob.com/cssref/css-selectors.html')
        soup=BeautifulSoup(response.text,'lxml')
        res=soup.select_one('#content > table > tbody > tr:nth-child(2) > td:nth-child(3)').text
        print(res)

        # Anything present in the page can be parsed out with bs4

posted @ 2022-05-22 22:55  风花雪月*