1. bs4 rules
from bs4 import BeautifulSoup

soup = BeautifulSoup(open('test_bs4.html', encoding='utf-8'), 'lxml')

'''1. Look up a tag'''
# print(soup.a)

'''2. Get attributes'''
# print(soup.a['title'])         # get a single attribute
# print(soup.a.attrs)            # get all attributes as a dict
# print(soup.a.attrs['title'])   # get a single attribute

'''3. Get the text content'''
# print(soup.a.text)
# print(soup.a.string)       # only the tag's own string; None if the tag has child tags
# print(soup.a.get_text())

'''4. The find method'''
# print(soup.find('a'))                  # first <a> tag
# print(soup.find('a', title='qin'))     # filter by an attribute
# print(soup.find('a', class_='nu'))     # class is a keyword, so add a trailing underscore
# print(soup.find('a', id='feng'))

'''5. Find a child tag under a given tag'''
# div = soup.find('div', class_='tang')
# print(div.find('a'))   # the <a> inside the div with class='tang'

'''6. The find_all method'''
# div = soup.find('div', class_='tang')
# print(div.find_all('a'))
# print(div.find_all('a', limit=2))   # only the first two matches
# print(soup.find_all(['a', 'b']))    # search several tag types at once

'''select - find via CSS selectors'''
# print(soup.select('.tang .nu'))
# print(soup.select('#feng'))
# print(soup.select('.tang .nu')[0].text)
# print(soup.select('.tang .nu')[0]['href'])
'''select always returns a list'''

div = soup.find('div', class_='tang')
print(div.select('#feng'))
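All of these lookups assume a test_bs4.html file containing a div with class tang, plus <a> tags with class nu, id feng and a title of qin. The original file is not shown; a minimal self-contained sketch with made-up markup of that shape:

from bs4 import BeautifulSoup

# Hypothetical markup that matches the selectors above (not the original test_bs4.html)
html = '''
<div class="tang">
  <ul>
    <li><a href="http://example.com/du" class="nu" title="qin">du fu</a></li>
    <li><a href="http://example.com/li" id="feng">li bai</a></li>
    <li><b>wang wei</b></li>
  </ul>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
print(soup.find('a', class_='nu'))            # first <a> with class="nu"
print(soup.select('.tang .nu')[0]['href'])    # http://example.com/du
print(soup.select('#feng')[0].text)           # li bai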
2. bs4 example
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
import json


class ZhiLianSpider(object):
    url = 'https://ty.fang.anjuke.com/loupan/'

    def __init__(self, qu, num, start_page, end_page):
        self.qu = qu
        self.num = num
        self.start_page = start_page
        self.end_page = end_page
        self.items = []

    # build the url and create the request object
    def handle_request(self, page):
        '''join the url pieces'''
        url = self.url + self.qu + '/' + 'h' + self.num + '/'
        if page > 1:
            # pagination segment; the pN/ form is an assumption about the site's url scheme
            url += 'p' + str(page) + '/'
        # print(url)
        '''create the request object'''
        header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                                ' Chrome/73.0.3683.86 Safari/537.36',
                  }
        req = urllib.request.Request(url=url, headers=header)
        return req

    # parse the page content
    def parse_content(self, content):
        soup = BeautifulSoup(content, 'lxml')
        '''Idea: first find the box for every listing, then look inside each box for the details'''
        # find the boxes
        title_list = soup.select('.key-list > .item-mod')[1:]
        print(len(title_list))
        # print(title_list)
        # walk through the boxes
        for box in title_list:
            # defaults, in case a selector matches nothing for this box
            title = address = price = huxing = ''
            # title
            if box.select('.infos .lp-name h3 span'):
                title = box.select('.infos .lp-name h3 span')[0].text
            # address
            if box.select('.infos .address span'):
                address = box.select('.infos .address span')[0].text
            # price
            if box.select('.favor-pos p span'):
                price = box.select('.favor-pos p span')[0].text
            # layout
            if box.select('.huxing span'):
                huxing = box.select('.huxing span')[0].text
            # put the fields into a dict
            item = {'title': title, 'address': address, 'price': price, 'layout': huxing}
            # then append the dict to the list
            self.items.append(item)

    def run(self):
        # crawl every page in the requested range
        for page in range(self.start_page, self.end_page + 1):
            # create the request
            request = self.handle_request(page)
            # print(request)
            # send the request and read the content
            content = urllib.request.urlopen(request).read().decode()
            # print(content)
            # with open('fang.html', 'w', encoding='utf8') as fp:
            #     fp.write(content)
            # parse the content and collect the items
            self.parse_content(content)
        # write everything out
        string = json.dumps(self.items, ensure_ascii=False)
        with open('fang.csv', 'w', encoding='utf8') as fp:
            fp.write(string)


def main():
    # ask for the search parameters
    qu = input('District: ')
    num = input('Number of rooms: ')
    start_page = int(input('Start page: '))
    end_page = int(input('End page: '))
    # create the spider and start crawling
    spider = ZhiLianSpider(qu, num, start_page, end_page)
    spider.run()


if __name__ == '__main__':
    main()
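run() writes the collected items as one JSON string into a file named fang.csv. If a real CSV file is the goal, a minimal sketch with the standard csv module could look like this (the helper name save_items_csv is made up; the field names are the dict keys built in parse_content above):

import csv

# Minimal sketch: write a list of item dicts like self.items to an actual CSV file
def save_items_csv(items, path='fang.csv'):
    fieldnames = ['title', 'address', 'price', 'layout']
    with open(path, 'w', encoding='utf8', newline='') as fp:
        writer = csv.DictWriter(fp, fieldnames=fieldnames)
        writer.writeheader()    # header row with the field names
        writer.writerows(items) # one row per item dict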
3. XPath on a local file
from lxml import etree

# build the element tree from a local file
# (passing etree.HTMLParser() as the second argument helps if the file is not well-formed XML)
tree = etree.parse('xpath.html')
print(tree)

# the second <p> inside the div with class="song" (XPath indices start at 1)
ret = tree.xpath('//div[@class="song"]/p[2]')
print(ret)

# text of the <a> in the second <li> under the div with class="tang"
ret = tree.xpath('//div[@class="tang"]/ul/li[2]/a/text()')[0]
print(ret)
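The same expressions can be tried without a local file by parsing a string with etree.HTML; the markup below is a made-up stand-in for xpath.html:

from lxml import etree

# Hypothetical markup shaped like the file the expressions above expect
html = '''
<div class="song"><p>first p</p><p>second p</p></div>
<div class="tang">
  <ul>
    <li><a href="#">first poem</a></li>
    <li><a href="#">second poem</a></li>
  </ul>
</div>
'''
tree = etree.HTML(html)
print(tree.xpath('//div[@class="song"]/p[2]/text()'))           # ['second p']
print(tree.xpath('//div[@class="tang"]/ul/li[2]/a/text()')[0])  # second poem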
4. XPath example
import urllib.request
import urllib.parse
from lxml import etree
import time
import os

'''Scrape architecture pictures and parse the pages with XPath.
The site lazy-loads its images, so the real URL sits in the src2 attribute.'''


def create_request(url, page):
    # build the url: page 1 has no suffix, later pages use _<page>
    if page == 1:
        req_url = url.format('')
    else:
        req_url = url.format('_' + str(page))
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                            'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    # build the request object
    req = urllib.request.Request(url=req_url, headers=header)
    return req


def download_img(img_src):
    # create the output folder
    dirname = 'jianzhu'
    if not os.path.exists(dirname):
        os.mkdir(dirname)
    # file name taken from the image url
    img_name = os.path.basename(img_src)
    # full path for the image
    filepath = dirname + '/' + img_name
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                            'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    req = urllib.request.Request(url=img_src, headers=header)
    rep = urllib.request.urlopen(req)
    with open(filepath, 'wb') as fp:
        fp.write(rep.read())


def parse_content(content):
    tree = etree.HTML(content)
    # lazy loading: the real image URL is stored in src2, not src
    img_list = tree.xpath('//div[@id="container"]/div/div/a/img/@src2')
    # print(img_list)
    # print(len(img_list))
    for img_src in img_list:
        download_img(img_src)


def main():
    url = 'http://sc.chinaz.com/tupian/tesejianzhutupian{}.html'
    start_page = int(input('Start page: '))
    end_page = int(input('End page: '))
    for page in range(start_page, end_page + 1):
        print('Page %s: starting download...' % page)
        # build the request
        req = create_request(url, page)
        # send the request and read the content
        rep = urllib.request.urlopen(req).read().decode()
        # parse the content and download the images
        parse_content(rep)
        print('Page %s: download finished.' % page)
        time.sleep(2)


if __name__ == '__main__':
    main()
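Whether the real URL sits in src2 or in src depends on how the page was served; a small sketch that prefers the lazy-load attribute and falls back to src (the helper name extract_img_urls is made up, and the container layout is the one assumed above):

from lxml import etree

# Minimal sketch: collect image URLs, preferring the lazy-load attribute src2
def extract_img_urls(html_text):
    tree = etree.HTML(html_text)
    urls = []
    for img in tree.xpath('//div[@id="container"]//img'):
        src = img.get('src2') or img.get('src')   # fall back to src when src2 is absent
        if src:
            urls.append(src)
    return urls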
5. Functions used with json
import json

lt = [{'name': '王宝强', 'age': '30'},
      {'name': '王保墙', 'age': '32'},
      {'name': '王饱蔷', 'age': '35'},
      {'name': '王煲樯', 'age': '33'},
      ]

'''dumps(): Python object -> JSON string'''
json_str = json.dumps(lt)
print(json_str)
print(type(json_str))

'''loads(): JSON string -> Python object'''
r = json.loads(json_str)
print(r)
print(type(r))

'''dump(): serialize a Python object directly into a file'''
json.dump(lt, open('book1.txt', 'w', encoding='utf8'))

'''load(): read a Python object directly from a JSON file'''
obj = json.load(open('xx.txt', 'r', encoding='utf8'))
print(type(obj))
print(obj)
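By default dumps() escapes non-ASCII characters, which is why the spider in section 2 passed ensure_ascii=False; indent pretty-prints the output. A quick illustration:

import json

person = {'name': '王宝强', 'age': '30'}
print(json.dumps(person))                                # non-ASCII escaped: {"name": "\u738b\u5b9d\u5f3a", ...}
print(json.dumps(person, ensure_ascii=False, indent=2))  # keeps the Chinese text and pretty-prints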
6. jsonpath rules
import jsonpath
import json

'''read the JSON file back into a Python object'''
obj = json.load(open('book1.txt', 'r', encoding='utf8'))
# print(type(obj))

'''author of the third book'''
# ret = jsonpath.jsonpath(obj, '$.store.book[2].author')
# print(ret)

'''all authors'''
# ret = jsonpath.jsonpath(obj, '$..author')
# print(ret)

'''all nodes under store'''
# ret = jsonpath.jsonpath(obj, '$.store.*')
# print(ret)

'''the third book under store'''
# ret = jsonpath.jsonpath(obj, '$..book[2]')
# print(ret)

'''the last book under store'''
# ret = jsonpath.jsonpath(obj, '$..book[(@.length-1)]')
# print(ret)

'''the first two books'''
# ret = jsonpath.jsonpath(obj, '$..book[:2]')
# print(ret)

'''books that have a status key'''
ret = jsonpath.jsonpath(obj, '$..book[?(@.status)]')
print(ret)
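These expressions assume book1.txt holds the classic JSONPath bookstore document (a store with a book array), not the name/age list from section 5. A self-contained sketch with made-up data of that shape, so the expressions can be tried without a file:

import jsonpath

store = {
    'store': {
        'book': [
            {'author': 'author-1', 'title': 'book-1', 'price': 10},
            {'author': 'author-2', 'title': 'book-2', 'price': 12},
            {'author': 'author-3', 'title': 'book-3', 'price': 9, 'status': 'in stock'},
        ],
        'bicycle': {'color': 'red', 'price': 19.95},
    }
}
print(jsonpath.jsonpath(store, '$..author'))             # all three authors
print(jsonpath.jsonpath(store, '$..book[2].title'))      # ['book-3']
print(jsonpath.jsonpath(store, '$..book[?(@.status)]'))  # only the book that has a status key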