ceph 010 clustermap ceph调优
cluster map
[ceph: root@clienta /]# ceph mon dump
epoch 4
fsid 2ae6d05a-229a-11ec-925e-52540000fa0c
last_changed 2021-10-01T09:33:53.880442+0000
created 2021-10-01T09:30:30.146231+0000
min_mon_release 16 (pacific)
election_strategy: 1
0: [v2:172.25.250.12:3300/0,v1:172.25.250.12:6789/0] mon.serverc.lab.example.com
1: [v2:172.25.250.10:3300/0,v1:172.25.250.10:6789/0] mon.clienta
2: [v2:172.25.250.13:3300/0,v1:172.25.250.13:6789/0] mon.serverd
3: [v2:172.25.250.14:3300/0,v1:172.25.250.14:6789/0] mon.servere
dumped monmap epoch 4 #数字方便同步
[ceph: root@clienta /]#
[ceph: root@clienta /]# ceph osd dump
epoch 401
fsid 2ae6d05a-229a-11ec-925e-52540000fa0c
created 2021-10-01T09:30:32.028240+0000
modified 2022-08-20T14:56:19.230208+0000
flags sortbitwise,recovery_deletes,purged_snapdirs,pglog_hardlimit
crush_version 77
full_ratio 0.95
backfillfull_ratio 0.9
nearfull_ratio 0.85
require_min_compat_client luminous
min_compat_client jewel
require_osd_release pacific
stretch_mode_enabled false
pool 1 'device_health_metrics' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 1 pgp_num 1 autoscale_mode on last_change 374 flags hashpspool stripe_width 0 pg_num_min 1 application mgr_devicehealth
pool 2 '.rgw.root' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode on last_change 48 flags hashpspool stripe_width 0 application rgw
pool 3 'default.rgw.log' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode on last_change 50 flags hashpspool stripe_width 0 application rgw
pool 4 'default.rgw.control' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode on last_change 52 flags hashpspool stripe_width 0 application rgw
pool 5 'default.rgw.meta' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 8 pgp_num 8 autoscale_mode on last_change 184 lfor 0/184/182 flags hashpspool stripe_width 0 pg_autoscale_bias 4 pg_num_min 8 application rgw
pool 10 'pool1' replicated size 3 min_size 2 crush_rule 1 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode on last_change 266 flags hashpspool stripe_width 0
pool 11 'ssdpool' replicated size 3 min_size 2 crush_rule 2 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode on last_change 338 flags hashpspool stripe_width 0
pool 12 'myecpool' erasure profile myprofile1 size 4 min_size 3 crush_rule 3 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode on last_change 345 flags hashpspool stripe_width 8192
pool 13 'myecpool2' erasure profile myprofile2 size 4 min_size 3 crush_rule 4 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode on last_change 350 flags hashpspool stripe_width 8192
max_osd 9
osd.0 up in weight 1 up_from 360 up_thru 398 down_at 354 last_clean_interval [243,350) [v2:172.25.250.12:6800/2528022353,v1:172.25.250.12:6801/2528022353] [v2:172.25.249.12:6802/2528022353,v1:172.25.249.12:6803/2528022353] exists,up 5be66be9-8262-4c4b-9654-ed549f6280f7
osd.1 up in weight 1 up_from 359 up_thru 397 down_at 354 last_clean_interval [244,350) [v2:172.25.250.12:6808/3093181835,v1:172.25.250.12:6809/3093181835] [v2:172.25.249.12:6810/3093181835,v1:172.25.249.12:6811/3093181835] exists,up 3f751363-a03c-4b76-af92-8114e38bfa09
osd.2 up in weight 1 up_from 363 up_thru 378 down_at 354 last_clean_interval [242,350) [v2:172.25.250.12:6816/1645468882,v1:172.25.250.12:6817/1645468882] [v2:172.25.249.12:6818/1645468882,v1:172.25.249.12:6819/1645468882] exists,up 68d72b66-4c99-4d54-a7e4-f1cb8f8e5054
osd.3 up in weight 1 up_from 363 up_thru 390 down_at 354 last_clean_interval [236,350) [v2:172.25.250.13:6816/2535000344,v1:172.25.250.13:6817/2535000344] [v2:172.25.249.13:6818/2535000344,v1:172.25.249.13:6819/2535000344] exists,up 21a9ebe9-908d-4026-8a57-8fbee935033e
osd.4 up in weight 1 up_from 354 up_thru 400 down_at 353 last_clean_interval [237,350) [v2:172.25.250.14:6800/408153468,v1:172.25.250.14:6801/408153468] [v2:172.25.249.14:6802/408153468,v1:172.25.249.14:6803/408153468] exists,up 85202210-9298-4443-9140-027792ddc891
osd.5 up in weight 1 up_from 363 up_thru 399 down_at 354 last_clean_interval [235,350) [v2:172.25.250.13:6802/1745131990,v1:172.25.250.13:6803/1745131990] [v2:172.25.249.13:6804/1745131990,v1:172.25.249.13:6805/1745131990] exists,up 252d1668-c4c2-42ca-85fe-87c7419557d6
osd.6 up in weight 1 up_from 353 up_thru 381 down_at 352 last_clean_interval [237,350) [v2:172.25.250.14:6804/1927667266,v1:172.25.250.14:6806/1927667266] [v2:172.25.249.14:6807/1927667266,v1:172.25.249.14:6811/1927667266] exists,up 2d753bfc-32f6-4663-9411-16067f366977
osd.7 up in weight 1 up_from 363 up_thru 378 down_at 354 last_clean_interval [236,350) [v2:172.25.250.13:6800/4217605284,v1:172.25.250.13:6801/4217605284] [v2:172.25.249.13:6806/4217605284,v1:172.25.249.13:6808/4217605284] exists,up fccc62ed-9b04-456a-95c3-5c3cb27e56d4
osd.8 up in weight 1 up_from 357 up_thru 399 down_at 356 last_clean_interval [237,350) [v2:172.25.250.14:6816/3368063169,v1:172.25.250.14:6817/3368063169] [v2:172.25.249.14:6818/3368063169,v1:172.25.249.14:6819/3368063169] exists,up 8b0789f2-f40e-4d63-ac52-343b8e11f24c
blocklist 172.25.250.14:6825/1595923670 expires 2022-08-21T14:55:16.971863+0000
blocklist 172.25.250.14:6824/1595923670 expires 2022-08-21T14:55:16.971863+0000
blocklist 172.25.250.14:0/3491691321 expires 2022-08-21T14:55:16.971863+0000
blocklist 172.25.250.14:0/2738777763 expires 2022-08-21T14:55:16.971863+0000
blocklist 172.25.250.12:0/1239900377 expires 2022-08-20T16:19:27.333673+0000
blocklist 172.25.250.12:6825/3912612299 expires 2022-08-20T16:19:27.333673+0000
blocklist 172.25.250.12:6824/3912612299 expires 2022-08-20T16:19:27.333673+0000
blocklist 172.25.250.12:0/2171541544 expires 2022-08-20T16:19:27.333673+0000
blocklist 172.25.250.12:0/1139201862 expires 2022-08-20T16:19:27.333673+0000
blocklist 172.25.250.12:0/2525786376 expires 2022-08-21T08:52:54.506446+0000
blocklist 172.25.250.14:0/3949782568 expires 2022-08-21T14:55:16.971863+0000
blocklist 172.25.250.12:0/1486113939 expires 2022-08-21T08:52:54.506446+0000
blocklist 172.25.250.12:6825/2537331399 expires 2022-08-21T08:52:54.506446+0000
blocklist 172.25.250.12:0/2290094124 expires 2022-08-21T08:52:54.506446+0000
blocklist 172.25.250.12:6824/2537331399 expires 2022-08-21T08:52:54.506446+0000
[ceph: root@clienta /]#
[ceph: root@clienta /]# ceph pg dump
#忽略输出,太多了
每个osd最多承载100-200个pg(建议值)
[ceph: root@clienta /]# ceph mgr dump | grep "dashboard"
"config_dashboard": {
"name": "config_dashboard",
"default_value": "registry.redhat.io/rhceph/rhceph-5-dashboard-rhel8:latest",
"name": "dashboard",
"default_value": "osd,host,dashboard,pool,block,nfs,ceph,monitors,gateway,logs,crush,maps",
"config_dashboard": {
"name": "config_dashboard",
"default_value": "registry.redhat.io/rhceph/rhceph-5-dashboard-rhel8:latest",
"name": "dashboard",
"default_value": "osd,host,dashboard,pool,block,nfs,ceph,monitors,gateway,logs,crush,maps",
"config_dashboard": {
"name": "config_dashboard",
"default_value": "registry.redhat.io/rhceph/rhceph-5-dashboard-rhel8:latest",
"name": "dashboard",
"default_value": "osd,host,dashboard,pool,block,nfs,ceph,monitors,gateway,logs,crush,maps",
"dashboard",
"config_dashboard": {
"name": "config_dashboard",
"default_value": "registry.redhat.io/rhceph/rhceph-5-dashboard-rhel8:latest",
"name": "dashboard",
"default_value": "osd,host,dashboard,pool,block,nfs,ceph,monitors,gateway,logs,crush,maps",
"dashboard": "https://172.25.250.14:8443/",
[ceph: root@clienta /]#
Cluster Map基本查询
ceph mon dump
ceph osd dump
ceph osd crush dump
ceph pg dump all
ceph fs dump
ceph mgr dump
ceph service dump
mon小集群,三节点部署 存放所有map
[root@serverc 2ae6d05a-229a-11ec-925e-52540000fa0c]# pwd
/var/lib/ceph/2ae6d05a-229a-11ec-925e-52540000fa0c
[root@serverc 2ae6d05a-229a-11ec-925e-52540000fa0c]# ll
total 292
drwx------. 3 root root 149 Oct 1 2021 alertmanager.serverc
-rw-r--r--. 1 root root 295991 Oct 1 2021 cephadm.d7a73386d1e46cffff151775b8e1d098069c88b89aea56cab15b079c1a1f555f
drwx------. 3 167 167 20 Oct 1 2021 crash
drwx------. 2 167 167 167 Oct 1 2021 crash.serverc
drwx------. 4 472 472 161 Oct 1 2021 grafana.serverc
drwx------. 2 167 167 167 Oct 1 2021 mgr.serverc.lab.example.com.aiqepd
drwx------. 3 167 167 224 Oct 1 2021 mon.serverc.lab.example.com
drwx------. 2 nobody nobody 138 Oct 1 2021 node-exporter.serverc
drwx------. 2 167 167 275 Aug 20 10:54 osd.0
drwx------. 2 167 167 275 Aug 20 10:54 osd.1
drwx------. 2 167 167 275 Aug 20 10:54 osd.2
drwx------. 4 root root 161 Oct 1 2021 prometheus.serverc
drwx------. 2 167 167 167 Oct 29 2021 rgw.realm.zone.serverc.bqwjcv
drwxr-xr-x. 2 root root 6 Oct 1 2021 selinux
[root@serverc 2ae6d05a-229a-11ec-925e-52540000fa0c]#
角色相关信息
奇数部署好一些
osd之间会互相发送消息,确认心跳。某个osd无心跳时,其他osd会向mon汇报
数据恢复:副本丢失情况下,恢复副本的过程
数据回填:当有新的osd加入时 (重平衡)
osd是看使用比率
osd 最大70%左右 再大的话就不好恢复
[ceph: root@clienta /]# ceph osd set noout
noout is set
[ceph: root@clienta /]# ceph osd unset noout
noout is unset
[ceph: root@clienta /]#
nearfull_ratio 0.85 当osd使用比达到85%,提醒集群容量快满了 health warn(需要扩容)
backfillfull_ratio 0.9 当osd使用比达到90%,数据禁止回填,但是可以恢复,正常对外提供读写
full_ratio 0.95 当osd使用比达到95%,数据禁止写入,可以读,可以恢复
[ceph: root@clienta /]# ceph osd set-full-ratio 0.95
osd set-full-ratio 0.95
[ceph: root@clienta /]# ceph osd set-nearfull-ratio 0.85
osd set-nearfull-ratio 0.85
[ceph: root@clienta /]# ceph osd dump
epoch 426
fsid 2ae6d05a-229a-11ec-925e-52540000fa0c
created 2021-10-01T09:30:32.028240+0000
modified 2022-08-20T17:56:35.571847+0000
flags sortbitwise,recovery_deletes,purged_snapdirs,pglog_hardlimit
crush_version 82
full_ratio 0.95
backfillfull_ratio 0.9
nearfull_ratio 0.85
https://docs.ceph.com/en/quincy/?rtd_search=mon_osd_down_out_interval+
可以寻找这些参数
设置osd的主亲和性(primary-affinity,取值0~1,影响osd被选为主osd的概率)
0就是尽量不让这个osd作为主osd(数据副本仍会分配在上面),移除时,先改为0
[ceph: root@clienta /]# ceph osd primary-affinity osd.0 0
降低主亲和性
[ceph: root@clienta /]# ceph pg dump pgs_brief
PG_STAT STATE UP UP_PRIMARY ACTING ACTING_PRIMARY
4.8 active+clean [4,3,0] 4 [4,3,0] 4
3.f active+clean [7,4,0] 7 [7,4,0] 7
2.e active+clean [2,4,3] 2 [2,4,3] 2
4.b active+clean [7,0,4] 7 [7,0,4] 7
3.c active+clean [5,0,6] 5 [5,0,6] 5
2.d active+clean [4,3,2] 4 [4,3,2] 4
4.a active+clean [5,1,4] 5 [5,1,4] 5
3.d active+clean [7,6,2] 7 [7,6,2] 7
2.c active+clean [6,0,5] 6 [6,0,5] 6
3.a active+clean [3,1,8] 3 [3,1,8] 3
osd.0不再作为主osd了
[ceph: root@clienta /]# ceph pg dump pgs_brief | grep "\[6"
dumped pgs_brief
2.c active+clean [6,0,5] 6 [6,0,5] 6
2.a active+clean [6,1,3] 6 [6,1,3] 6
4.3 active+clean [6,7,1] 6 [6,7,1] 6
过滤带特殊符号的内容时(例如 [ ),需要用反斜杠转义
参数
上面的默认值可以去官网查,可能有变化
ceph 调优
ceph对吞吐量要求较高,需要大内存,因此numa架构就不太适合
如果你的程序不占用大内存,要求更快的程序运行时间,你应该选择限制只访问本numa node的方式来进行处理
Ceph部署最佳实践
MON的性能对集群总体性能至关重要,应部署于专用节点,为确保正确仲裁,数量应为奇数个
在OSD节点上,操作系统、OSD数据、OSD日志应当位于独立的磁盘上,以确保满意的吞吐量
在集群安装后,需要监控集群、排除故障并维护,尽管 Ceph具有自愈功能。如果发生性能问题,首先在磁盘、网络和硬件层面上排查。然后逐步转向RADOS块设备和Ceph对象网关
RBD建议
块设备上的工作负载通常是I/O密集型负载,例如在OpenStack中虚拟机上运行数据库。
对于RBD,OSD日志应当位于SSD或者NVMe设备上
对后端存储,可以使用不同的存储设备以提供不同级别的服务
OSD建议硬件
将一个raid1磁盘用于操作系统
每个OSD一块硬盘,将SSD或者NVMe用于日志
使用多个10Gb网卡,每个网络一个双链路绑定
每个OSD预留1个CPU,每个逻辑核心1GHz
分配16GB内存,外加每个OSD 2G内存
现在ceph可以自动计算
cephpgc(Ceph PG计算器):可以去看一下,这是红帽官网的计算器,还挺有意思
Ceph网络
尽可能使用10Gb网络带宽
尽可能使用不同的cluster网络和public网络
网络监控
OSD建议硬件
将一个raid1磁盘用于操作系统
每个OSD一块硬盘,将SSD或者NVMe用于日志
使用多个10Gb网卡,每个网络一个双链路绑定
每个OSD预留1个CPU,每个逻辑核心1GHz
分配16GB内存,外加每个OSD 2G内存
其他性能测试工具
dd
echo 3 > /proc/sys/vm/drop_caches
dd if=/dev/zero of=/var/lib/ceph/osd/ceph-0/test.img bs=4M count=1024 oflag=direct
dd if=/var/lib/ceph/osd/ceph-0/test.img of=/dev/null bs=4M count=1024 iflag=direct
fio
https://help.aliyun.com/document_detail/95501.html?spm=a2c4g.11174283.6.640.6e904da23dhdcG
[ceph: root@clienta /]# ceph osd pool create pool1
pool 'pool1' created
[ceph: root@clienta /]# rados bench -p pool1 10 write --no-cleanup
hints = 1
Maintaining 16 concurrent writes of 4194304 bytes to objects of size 4194304 for up to 10 seconds or 0 objects
Object prefix: benchmark_data_clienta.lab.example.com_565
sec Cur ops started finished avg MB/s cur MB/s last lat(s) avg lat(s)
0 0 0 0 0 0 - 0
1 16 16 0 0 0 - 0
2 16 17 1 1.99911 2 1.77034 1.77034
3 16 19 3 3.99849 8 2.89986 2.41562
4 16 20 4 3.99855 4 3.87552 2.78059
5 16 26 10 7.99736 24 4.87003 3.66784
6 16 29 13 8.65267 12 5.8558 3.89705
7 16 36 20 11.4123 28 2.25577 4.20837
8 16 39 23 11.4849 12 3.18817 4.17326
9 16 49 33 14.6481 40 1.93119 3.67961
10 16 54 38 15.1205 20 4.61332 3.71135
11 15 54 39 14.1054 4 4.50752 3.73177
12 14 54 40 13.262 4 3.58412 3.72808
13 11 54 43 13.1608 12 3.9051 3.71667
Total time run: 13.7161
Total writes made: 54
Write size: 4194304
Object size: 4194304
Bandwidth (MB/sec): 15.7479
Stddev Bandwidth: 11.8495
Max bandwidth (MB/sec): 40
Min bandwidth (MB/sec): 0
Average IOPS: 3
Stddev IOPS: 3.00427
Max IOPS: 10
Min IOPS: 0
Average Latency(s): 3.86659
Stddev Latency(s): 1.48435
Max latency(s): 7.45216
Min latency(s): 1.17718
[ceph: root@clienta /]#
[ceph: root@clienta /]# rados bench -p pool1 10 seq
hints = 1
sec Cur ops started finished avg MB/s cur MB/s last lat(s) avg lat(s)
0 0 0 0 0 0 - 0
1 16 24 8 31.8681 32 0.539518 0.460968
2 16 45 29 57.6669 84 0.657187 0.773738
3 5 54 49 65.0267 80 0.595555 0.685997
4 2 54 52 51.6497 12 2.35873 0.808986
Total time run: 4.26827
Total reads made: 54
Read size: 4194304
Object size: 4194304
Bandwidth (MB/sec): 50.606
Average IOPS: 12
Stddev IOPS: 8.90693
Max IOPS: 21
Min IOPS: 3
Average Latency(s): 0.856345
Max latency(s): 3.07995
Min latency(s): 0.0897737
[ceph: root@clienta /]#
[ceph: root@clienta /]# rados bench -p pool1 10 rand
hints = 1
sec Cur ops started finished avg MB/s cur MB/s last lat(s) avg lat(s)
0 0 0 0 0 0 - 0
1 16 26 10 39.8594 40 0.450443 0.523675
2 16 45 29 57.894 76 1.81343 0.569421
3 16 54 38 50.5224 36 2.38602 0.792168
4 16 79 63 62.8348 100 0.0543633 0.813247
5 16 94 78 62.2342 60 2.35538 0.832442
6 16 127 111 73.8291 132 0.141455 0.779658
7 16 158 142 80.881 124 1.5348 0.742651
8 16 188 172 85.4177 120 0.431023 0.71256
9 16 208 192 84.786 80 0.657024 0.690867
10 16 213 197 78.2818 20 0.30201 0.702446
11 11 213 202 72.9987 20 2.83034 0.737541
Total time run: 11.4804
Total reads made: 213
Read size: 4194304
Object size: 4194304
Bandwidth (MB/sec): 74.2134
Average IOPS: 18
Stddev IOPS: 10.4045
Max IOPS: 33
Min IOPS: 5
Average Latency(s): 0.829176
Max latency(s): 3.0047
Min latency(s): 0.0343662
测之前还是得清除缓存和对象
[ceph: root@clienta /]# rados -p pool1 cleanup
Removed 54 objects
[root@clienta ~]# sysctl vm.drop_caches=3
我这是虚拟机部署,不是物理机,与物理机比性能高下立判。物理机Bandwidth (MB/sec): 1000 虚拟机Bandwidth (MB/sec): 74.2134
[ceph: root@clienta /]# rbd pool init pool1
[ceph: root@clienta /]# rbd create --size 1G pool1/image1
[ceph: root@clienta /]# rbd info pool1/image1
rbd image 'image1':
size 1 GiB in 256 objects
order 22 (4 MiB objects)
snapshot_count: 0
id: 197ad26b4bdeb
block_name_prefix: rbd_data.197ad26b4bdeb
format: 2
features: layering, exclusive-lock, object-map, fast-diff, deep-flatten
op_features:
flags:
create_timestamp: Sun Aug 21 10:29:30 2022
access_timestamp: Sun Aug 21 10:29:30 2022
modify_timestamp: Sun Aug 21 10:29:30 2022
[ceph: root@clienta /]# rbd bench --io-type write image1 --pool=pool1
bench type write io_size 4096 io_threads 16 bytes 1073741824 pattern sequential
SEC OPS OPS/SEC BYTES/SEC
1 6288 6174.26 24 MiB/s
2 6800 3117.98 12 MiB/s
3 7232 2402.35 9.4 MiB/s
4 7856 1891.83 7.4 MiB/s
5 8336 1666.05 6.5 MiB/s
6 9040 552.049 2.2 MiB/s
7 14160 1514.69 5.9 MiB/s
8 17472 2018.1 7.9 MiB/s
9 23056 3039.35 12 MiB/s
10 26000 3539.12 14 MiB/s
11 28416 3876.7 15 MiB/s
读
[ceph: root@clienta /]# rbd bench --io-type read image1 --pool=pool1
bench type read io_size 4096 io_threads 16 bytes 1073741824 pattern sequential
SEC OPS OPS/SEC BYTES/SEC
1 400 452.168 1.8 MiB/s
2 816 430.636 1.7 MiB/s
3 1248 431.099 1.7 MiB/s
4 1712 441.599 1.7 MiB/s
5 2144 438.929 1.7 MiB/s
6 2560 429.247 1.7 MiB/s
7 2896 414.255 1.6 MiB/s