Hands-on Mini Examples

1. Scraping Autohome News

import requests
from bs4 import BeautifulSoup

# Analyze the page first -- don't scrape the site root, which is cluttered with unrelated content.
# Crawl by pagination, by category, or by date archive: drill down to the specific URL where the data lives before scraping.

# 1. Inspect with the browser devtools: the data sits in ul -> li tags, so parse it with bs4
res = requests.get('https://www.autohome.com.cn/news/1/#liststart')
soup = BeautifulSoup(res.text, 'lxml')

# 2. Find the div with class article-wrapper --> the ul inside it --> then the li tags under that ul.
#    Narrowing the search like this is more accurate, because other parts of the page also contain li tags.
div = soup.find(class_='article-wrapper')             # find the div by class
div = soup.find(id='auto-channel-lazyload-article')   # or find the div by id
ul = soup.find(class_='article')                      # or find the ul directly
# Keep searching for all li tags under the ul; the ul returned above is a Tag object, so the soup methods still apply.
# The li tags carry no class or id, so search by tag name; name is the first positional argument, so name= can be omitted.
li_list = ul.find_all(name='li')
for li in li_list:
    title = li.find(name='h3')  # find the headline; title is still a Tag object, .text converts it to plain text
    if title:                   # ads are interleaved with the news items and have no h3 tag, which would raise an error, so skip them
        title = title.text
        url = 'https:' + li.find(name='a').attrs.get('href')   # article link
        desc = li.find(name='p').text                          # summary
        img = 'https:' + li.find(name='img').attrs.get('src')  # image URL
        print('''
        News title:   %s
        News URL:     %s
        News summary: %s
        News image:   %s

        ''' % (title, url, desc, img))
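If you also want to save the thumbnails locally rather than only printing their URLs, a minimal sketch along these lines should work (the imgs/ directory and the filename scheme are my own choices, not part of the original script):

import os
import requests

def download_image(img_url, save_dir='imgs'):
    # Download one image URL into save_dir, naming the file after the last path segment.
    os.makedirs(save_dir, exist_ok=True)
    filename = img_url.split('/')[-1].split('?')[0] or 'image.jpg'
    resp = requests.get(img_url, stream=True)
    resp.raise_for_status()
    with open(os.path.join(save_dir, filename), 'wb') as f:
        for chunk in resp.iter_content(chunk_size=8192):
            f.write(chunk)
    return filename

# Usage inside the loop above: download_image(img)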

 

2. Scraping Qiushibaike Jokes

# https://www.qiushibaike.com/text/page/2/
import requests
from bs4 import BeautifulSoup

ret = requests.get('https://www.qiushibaike.com/text/page/2/')
# print(ret.text)

soup = BeautifulSoup(ret.text, 'html.parser')

# Each joke sits inside a block whose class is 'article'; grab all of them
article_list = soup.find_all(class_='article')
# print(article_list)
for article in article_list:
    content = article.find(class_='content').text  # the joke text itself
    print(content)
    print('-------')
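The URL in the comment above shows that paging is just a path segment (/text/page/N/), so looping over several pages is straightforward. A minimal sketch, assuming the first five pages (the page count is arbitrary):

import requests
from bs4 import BeautifulSoup

for page in range(1, 6):  # pages 1-5; adjust as needed
    ret = requests.get('https://www.qiushibaike.com/text/page/{}/'.format(page))
    soup = BeautifulSoup(ret.text, 'html.parser')
    for article in soup.find_all(class_='article'):
        content = article.find(class_='content')
        if content:  # skip blocks that contain no joke text
            print(content.text.strip())
            print('-------')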

 

3. Scraping the Novel Dream of the Red Chamber

# http://www.shicimingju.com/book/hongloumeng.html

import requests
from bs4 import BeautifulSoup

ret = requests.get('https://www.shicimingju.com/book/hongloumeng.html')
# print(ret.text)

soup = BeautifulSoup(ret.text, 'lxml')
# The table of contents lives in the div with class 'book-mulu'; each chapter is an li containing a link
li_list = soup.find(class_='book-mulu').find('ul').find_all('li')
with open('hlm.txt', 'w', encoding='utf-8') as f:
    for li in li_list:
        content = li.find('a').text                                      # chapter title
        url = 'https://www.shicimingju.com' + li.find('a').get('href')   # chapter detail page

        f.write(content)
        f.write('\n')
        res_content = requests.get(url)
        soup2 = BeautifulSoup(res_content.text, 'lxml')
        content_detail = soup2.find(class_='chapter_content').text       # full chapter text
        f.write(content_detail)
        f.write('\n')
        print(content, 'written')
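Fetching every chapter page back to back can hit transient network errors or get the client blocked. A small hedged variation that reuses one requests.Session and retries with a short pause (the retry count and delay are my own choices):

import time
import requests

session = requests.Session()  # reuse one connection pool for all chapter requests
session.headers.update({'User-Agent': 'Mozilla/5.0'})

def fetch_chapter(url, retries=3, delay=0.5):
    # Fetch one chapter page, retrying a few times and pausing between attempts.
    for attempt in range(retries):
        try:
            resp = session.get(url, timeout=10)
            resp.raise_for_status()
            return resp.text
        except requests.RequestException:
            time.sleep(delay * (attempt + 1))
    return ''

# In the loop above: pass fetch_chapter(url) to BeautifulSoup instead of requests.get(url).text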

 

4. Scraping the Douban Top 250 Movie List

# Scrape the Douban Top 250 movie list
# Store the data in Redis:
#   1. Build a connection pool POOL
#   2. Serialize each record to a JSON string
#   3. Store it in a Hash
# Pagination: the start query parameter grows by 25 per page

import requests
from lxml import etree
import json
import redis
from redis_pool import POOL   # local helper module that builds the Redis connection pool

header = {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
          'Accept': '*/*',
          'Connection': 'keep-alive',
          'Accept-Language': 'zh-CN,zh;q=0.8',
          }

count = 0
for i in range(10):
    url = 'https://movie.douban.com/top250?start={}&filter='.format(count)
    res = requests.get(url, headers=header)
    html = etree.HTML(res.text)
    li_list = html.xpath('//*[@id="content"]/div/div[1]/ol/li')

    for li in li_list:
        try:
            data_dic = {}
            index = li.xpath('./div/div[1]/em/text()')[0]
            title = li.xpath('./div/div[2]/div[1]/a/span[1]/text()')[0]
            team = li.xpath('./div/div[2]/div[2]/p[1]/text()')[0].strip().replace("\xa0\xa0\xa0", "、")
            release_time = li.xpath('./div/div[2]/div[2]/p[1]/text()')[1].strip().split('/')[0].strip()
            country = li.xpath('./div/div[2]/div[2]/p[1]/text()')[1].strip().split('/')[1].strip()
            genre = li.xpath('./div/div[2]/div[2]/p[1]/text()')[1].strip().split('/')[2].strip()
            score = li.xpath('./div/div[2]/div[2]/div/span[2]/text()')[0]
            comment = li.xpath('./div/div[2]/div[2]/div/span[4]/text()')[0]
            quote = li.xpath('./div/div[2]/div[2]/p[2]/span/text()')
            pic_link = li.xpath('./div/div[1]/a/img/@src')
            detail_link = li.xpath('./div/div[2]/div[1]/a/@href')
            if quote:                # some entries have no tagline
                foreword = quote[0]
            else:
                foreword = ''

            data_dic['rank'] = index
            data_dic['title'] = title
            data_dic['crew'] = team
            data_dic['release_date'] = release_time
            data_dic['country'] = country
            data_dic['genre'] = genre
            data_dic['score'] = score
            data_dic['comment_count'] = comment
            data_dic['quote'] = foreword
            data_dic['poster_url'] = pic_link
            data_dic['detail_url'] = detail_link

            json_payload = json.dumps(data_dic)
            conn = redis.Redis(connection_pool=POOL)
            conn.hset('db_film', index, json_payload)
        except Exception as e:
            print(e)

    count += 25

print('Done scraping')
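redis_pool is the author's own helper module and is not shown in the post. A minimal sketch of what it might contain, assuming a local Redis instance (host, port, db, and pool size are all assumptions):

# redis_pool.py -- hypothetical helper imported above; adjust the connection
# parameters to your own Redis instance
import redis

POOL = redis.ConnectionPool(
    host='127.0.0.1',     # assumed local Redis server
    port=6379,
    db=0,
    max_connections=100,
    decode_responses=True,
)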

 

5. Scraping JD Product Info

from selenium import webdriver
import time
from selenium.webdriver.common.keys import Keys  # for simulating keyboard input

# Selenium 3 style API; chromedriver.exe sits next to this script
bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.implicitly_wait(10)  # implicit wait

def get_goods_info(bro):
    goods = bro.find_elements_by_css_selector('.gl-item')
    for good in goods:
        try:
            price = good.find_element_by_css_selector('.p-price i').text
            name = good.find_element_by_css_selector('.p-name em').text
            url = good.find_element_by_css_selector('.p-img a').get_attribute('href')
            commits = good.find_element_by_css_selector('.p-commit strong>a').text
            photo_url = good.find_element_by_css_selector('.p-img img').get_attribute('src')

            print('''
            Product name:          %s
            Product price:         %s
            Product URL:           %s
            Product comment count: %s
            Product image URL:     %s

            ''' % (name, price, url, commits, photo_url))
        except Exception as e:
            continue  # some list items are ads with a different structure and raise errors; catch and move on to the next item

    next_button = bro.find_element_by_partial_link_text('下一页')  # the "next page" link text on JD
    time.sleep(1)
    next_button.click()
    get_goods_info(bro)  # recurse into the next page; on the last page the lookup raises, the exception is caught below and the browser is closed

try:
    bro.get('https://www.jd.com/')
    input_k = bro.find_element_by_id('key')
    input_k.send_keys('手机')      # search keyword ("mobile phone"); could be pulled at random from a database instead
    input_k.send_keys(Keys.ENTER)  # simulate the Enter key, so there is no need to locate and click the search button
    get_goods_info(bro)
except Exception as e:
    print(e)
finally:
    bro.close()
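The element-lookup helpers above (find_element_by_id, find_elements_by_css_selector, and so on) belong to the Selenium 3 API and were removed in Selenium 4, where lookups go through find_element/find_elements with a By locator and the driver path moves into a Service object. A rough Selenium 4 equivalent of the setup and search step, assuming the same local chromedriver path:

# Selenium 4+ equivalents of the calls used above (the chromedriver path is an assumption)
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

bro = webdriver.Chrome(service=Service('./chromedriver.exe'))
bro.implicitly_wait(10)
try:
    bro.get('https://www.jd.com/')
    input_k = bro.find_element(By.ID, 'key')                 # was find_element_by_id('key')
    input_k.send_keys('手机', Keys.ENTER)
    goods = bro.find_elements(By.CSS_SELECTOR, '.gl-item')   # was find_elements_by_css_selector('.gl-item')
    print(len(goods), 'items on the first result page')
finally:
    bro.quit()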

 

 
