Web Scraping: Basic Usage
1. Basics
import requests

# requests fakes a browser-style HTTP request
response = requests.get(url_to_visit)

from bs4 import BeautifulSoup

# BeautifulSoup parses the HTML text into a searchable object
bs4 = BeautifulSoup(response.text, 'html.parser')

# Searching: find() returns the first matching tag
bs4.find(name='tag_name', attrs={'attribute_name': 'attribute_value'})
# find_all() returns every match

# Reading the response body:
#   content - raw bytes, for binary data (images, video)
#   text    - the decoded text (str)
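To make find()/find_all() and the text/content distinction concrete, here is a minimal self-contained sketch. The URL https://example.com and the h1/p tag names are illustrative placeholders, not part of the notes above:

import requests
from bs4 import BeautifulSoup

# Placeholder URL for illustration; swap in the page you actually want
response = requests.get('https://example.com')

soup = BeautifulSoup(response.text, 'html.parser')

# find() returns the first matching tag, or None if nothing matches
first_h1 = soup.find(name='h1')
if first_h1:
    print(first_h1.text)

# find_all() returns a list of every matching tag
for p in soup.find_all(name='p'):
    print(p.text)

# text: the decoded string, for HTML/JSON pages
print(type(response.text))     # <class 'str'>

# content: raw bytes, for images, video, and other binary downloads
print(type(response.content))  # <class 'bytes'>

Because find() can return None, code that walks a page should guard its result, which is exactly why the example in the next section checks `if not h3: continue`.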
2. Example
import requests
from bs4 import BeautifulSoup
import os

# Directory where the downloaded images will be saved
path = os.path.join(os.getcwd(), 'img')
os.makedirs(path, exist_ok=True)  # make sure the directory exists before writing

# 1. Fake a browser request
response = requests.get("......")
response.encoding = 'gbk'

# 2. response.text now holds the page's HTML
# print(response.text)

# 3. Parse the HTML into a BeautifulSoup object
bs4 = BeautifulSoup(response.text, 'html.parser')

div = bs4.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
li_list = div.find_all(name='li')

for li in li_list:
    print('=' * 120)

    h3 = li.find(name='h3')
    if not h3:
        continue
    print(h3.text)

    a = li.find(name='a')
    href = a.get('href')
    print('https:{}'.format(href))

    img = li.find(name='img')
    src = img.get('src')
    src = 'https:{}'.format(src)
    print(src)

    file_name = src.rsplit('/', maxsplit=1)[1]
    file_path = os.path.join(path, file_name)

    # src is just a URL, so issue another fake browser request for the image itself
    ret = requests.get(src)

    # ret.content holds the raw bytes of the image; save them to disk
    with open(file_path, 'wb') as f:
        f.write(ret.content)
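One caveat: by default requests sends its own User-Agent, so the request is not fully disguised as a browser, and some sites reject it. A common refinement, sketched here as an assumption rather than something the original example does, is to pass browser-like headers and fail fast on bad status codes:

import requests

# A browser-like User-Agent string; the exact value is illustrative
headers = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/120.0 Safari/537.36'),
}

response = requests.get('https://example.com', headers=headers, timeout=10)
response.raise_for_status()  # raise an exception on 4xx/5xx responses
print(response.status_code)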