Practical Mini-Cases
1. Scraping Autohome news
import requests
from bs4 import BeautifulSoup

# Analyze the page first: don't start scraping from the site root, which is full of unrelated content.
# Crawl by pagination, by category, or by date archive; drill down to the URL that actually holds the data.
# 1. Browser dev tools show the data lives in ul -> li tags; parse them with bs4.
res = requests.get('https://www.autohome.com.cn/news/1/#liststart')
soup = BeautifulSoup(res.text, 'lxml')
# 2. Find the div with class 'article-wrapper' -> the ul under it -> the li tags under that ul.
#    Narrowing down like this is more accurate, because other parts of the page also contain li tags.
div = soup.find(class_='article-wrapper')            # find the div by class
div = soup.find(id='auto-channel-lazyload-article')  # or find the same div by id
ul = soup.find(class_='article')                     # or find the ul directly
# Keep searching under the ul: find() returns a Tag object, which supports the same methods as the soup object.
# The li tags have no class or id, so search by tag name; name is the first positional argument, so it can be omitted.
li_list = ul.find_all(name='li')
for li in li_list:
    title = li.find(name='h3')  # the title is still a Tag object; .text converts it to plain text
    if title:  # ads interleaved with the news items have no h3 tag and would raise an error, so skip them
        title = title.text
        url = 'https:' + li.find(name='a').attrs.get('href')   # article link
        desc = li.find(name='p').text                           # summary
        img = 'https:' + li.find(name='img').attrs.get('src')  # image
        print('''
        News title: %s
        News URL: %s
        News summary: %s
        News image: %s
        ''' % (title, url, desc, img))
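
The comments above suggest crawling page by page rather than from the root. A minimal pagination sketch, assuming later pages follow the same /news/<n>/ path pattern as the page-1 URL used above (only page 1 is confirmed in the original):

import requests
from bs4 import BeautifulSoup

# Assumption: pages 2, 3, ... use the same /news/<n>/#liststart pattern as page 1.
for page in range(1, 4):
    res = requests.get('https://www.autohome.com.cn/news/%s/#liststart' % page)
    soup = BeautifulSoup(res.text, 'lxml')
    ul = soup.find(class_='article')
    print('page %s: %s list items' % (page, len(ul.find_all(name='li'))))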
2. Scraping Qiushibaike posts
# https://www.qiushibaike.com/text/page/2/
import requests
from bs4 import BeautifulSoup

ret = requests.get('https://www.qiushibaike.com/text/page/2/')
# print(ret.text)
soup = BeautifulSoup(ret.text, 'html.parser')
article_list = soup.find_all(class_='article')
# print(article_list)
for article in article_list:
    content = article.find(class_='content').text
    print(content)
    print('-------')
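
The page number is part of the URL path, so the same parsing loop extends naturally to multiple pages. A minimal sketch, assuming listing pages follow the /text/page/<n>/ pattern of the URL above:

import requests
from bs4 import BeautifulSoup

# Assumption: every listing page matches the /text/page/<n>/ pattern shown above.
for page in range(1, 4):
    ret = requests.get('https://www.qiushibaike.com/text/page/%s/' % page)
    soup = BeautifulSoup(ret.text, 'html.parser')
    for article in soup.find_all(class_='article'):
        print(article.find(class_='content').text.strip())
        print('-------')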
3. Scraping the novel Hongloumeng (Dream of the Red Chamber)
# http://www.shicimingju.com/book/hongloumeng.html
import requests
from bs4 import BeautifulSoup

ret = requests.get('https://www.shicimingju.com/book/hongloumeng.html')
# print(ret.text)
soup = BeautifulSoup(ret.text, 'lxml')
li_list = soup.find(class_='book-mulu').find('ul').find_all('li')
with open('hlm.txt', 'w', encoding='utf-8') as f:
    for li in li_list:
        content = li.find('a').text  # chapter title
        url = 'https://www.shicimingju.com' + li.find('a').get('href')
        f.write(content)
        f.write('\n')
        res_content = requests.get(url)
        soup2 = BeautifulSoup(res_content.text, 'lxml')
        content_detail = soup2.find(class_='chapter_content').text
        f.write(content_detail)
        f.write('\n')
        print(content, 'written')
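
Fetching every chapter back to back can overload the site or trigger rate limiting. A small helper sketch with a fixed delay and retries; the delay, timeout, and retry count are illustrative choices, not from the original:

import time
import requests

def fetch(url, retries=3, delay=1.0):
    # Hypothetical helper: retry failed requests and pause between attempts.
    for attempt in range(retries):
        try:
            res = requests.get(url, timeout=10)
            res.raise_for_status()
            return res
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(delay)

Swapping requests.get(url) in the chapter loop for fetch(url) keeps one flaky chapter request from aborting the whole download.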
4. Scraping Douban Top 250 movie info
# Scrape Douban Top 250 movie info.
# Store the data in redis:
#   1. build a connection pool POOL
#   2. serialize each record to a JSON string
#   3. store the records in a hash
# Pagination: the start parameter increases by 25 per page.
import requests
from lxml import etree
import json
import redis
from redis_pool import POOL  # local module holding the connection pool (see sketch below)

header = {
    'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Accept': '*/*',
    'Connection': 'keep-alive',
    'Accept-Language': 'zh-CN,zh;q=0.8',
}
count = 0
for i in range(10):
    url = 'https://movie.douban.com/top250?start={}&filter='.format(count)
    res = requests.get(url, headers=header)
    html = etree.HTML(res.text)
    li_list = html.xpath('//*[@id="content"]/div/div[1]/ol/li')
    for li in li_list:
        try:
            data_dic = {}
            index = li.xpath('./div/div[1]/em/text()')[0]
            title = li.xpath('./div/div[2]/div[1]/a/span[1]/text()')[0]
            team = li.xpath('./div/div[2]/div[2]/p[1]/text()')[0].strip().replace("\xa0\xa0\xa0", "、")
            release_time = li.xpath('./div/div[2]/div[2]/p[1]/text()')[1].strip().split('/')[0].strip()
            country = li.xpath('./div/div[2]/div[2]/p[1]/text()')[1].strip().split('/')[1].strip()
            genre = li.xpath('./div/div[2]/div[2]/p[1]/text()')[1].strip().split('/')[2].strip()
            score = li.xpath('./div/div[2]/div[2]/div/span[2]/text()')[0]
            comment = li.xpath('./div/div[2]/div[2]/div/span[4]/text()')[0]
            quote = li.xpath('./div/div[2]/div[2]/p[2]/span/text()')
            pic_link = li.xpath('./div/div[1]/a/img/@src')
            detail_link = li.xpath('./div/div[2]/div[1]/a/@href')
            if quote:
                foreword = quote[0]
            else:
                foreword = ''
            data_dic['rank'] = index
            data_dic['title'] = title
            data_dic['crew'] = team
            data_dic['release_date'] = release_time
            data_dic['country'] = country
            data_dic['genre'] = genre
            data_dic['score'] = score
            data_dic['comment_count'] = comment
            data_dic['quote'] = foreword
            data_dic['pic_link'] = pic_link
            data_dic['detail_link'] = detail_link
            json_payload = json.dumps(data_dic)
            conn = redis.Redis(connection_pool=POOL)
            conn.hset('db_film', index, json_payload)
        except Exception as e:
            print(e)
    count += 25
print('Scraping finished')
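
The script imports POOL from a local redis_pool module whose contents are not shown. A minimal sketch of what that module might look like, assuming a Redis server on localhost; host, port, and pool size are assumptions:

# redis_pool.py (hypothetical contents; host, port, and max_connections are assumptions)
import redis

POOL = redis.ConnectionPool(
    host='127.0.0.1',
    port=6379,
    max_connections=10,
    decode_responses=True,
)

A stored record can then be read back with conn.hget('db_film', rank) and decoded with json.loads.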
5. Scraping JD product listings with Selenium
from selenium import webdriver
import time
from selenium.webdriver.common.keys import Keys  # for simulating keyboard input

bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.implicitly_wait(10)  # set an implicit wait

def get_goods_info(bro):
    goods = bro.find_elements_by_css_selector('.gl-item')
    for good in goods:
        try:
            price = good.find_element_by_css_selector('.p-price i').text
            name = good.find_element_by_css_selector('.p-name em').text
            url = good.find_element_by_css_selector('.p-img a').get_attribute('href')
            commits = good.find_element_by_css_selector('.p-commit strong>a').text
            photo_url = good.find_element_by_css_selector('.p-img img').get_attribute('src')
            print('''
            Product name: %s
            Product price: %s
            Product URL: %s
            Product review count: %s
            Product image URL: %s
            ''' % (name, price, url, commits, photo_url))
        except Exception as e:
            # Some pages interleave ads whose markup differs from the normal li tags,
            # which raises here; catch the exception and continue the loop.
            continue
    next_button = bro.find_element_by_partial_link_text('下一页')  # '下一页' means "next page"
    time.sleep(1)
    next_button.click()
    # Recurse into the next page; on the last page the link lookup raises,
    # the except below catches it, and the browser is closed.
    get_goods_info(bro)

try:
    bro.get('https://www.jd.com/')
    input_k = bro.find_element_by_id('key')
    input_k.send_keys('手机')  # search keyword ("mobile phone"); could be pulled at random from a database
    input_k.send_keys(Keys.ENTER)  # simulate the Enter key instead of locating and clicking the search button
    get_goods_info(bro)
except Exception as e:
    print(e)
finally:
    bro.close()
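
The flow above leans on one implicit wait plus a fixed time.sleep(1) before clicking. A sketch of the same first steps using explicit waits instead, keeping the Selenium 3 style setup from above; the 10-second timeouts are illustrative:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

bro = webdriver.Chrome(executable_path='./chromedriver.exe')
try:
    bro.get('https://www.jd.com/')
    # Wait up to 10 seconds for the search box instead of relying on an implicit wait.
    input_k = WebDriverWait(bro, 10).until(
        EC.presence_of_element_located((By.ID, 'key'))
    )
    input_k.send_keys('手机\n')  # trailing newline acts like the Enter key
    # Wait until the next-page link is clickable before clicking it.
    next_button = WebDriverWait(bro, 10).until(
        EC.element_to_be_clickable((By.PARTIAL_LINK_TEXT, '下一页'))
    )
    next_button.click()
finally:
    bro.close()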