Ceph Installation and Deployment
Pre-deployment preparation
Set the hostname
hostnamectl set-hostname ceph01
hostnamectl set-hostname ceph02
hostnamectl set-hostname ceph03
Configure the hosts file
cat > /etc/hosts <<EOF
127.0.0.1 localhost localhost.localdomain localhost4 localhost4.localdomain4
::1 localhost localhost.localdomain localhost6 localhost6.localdomain6
10.0.0.2 ceph01
10.0.0.3 ceph02
10.0.0.4 ceph03
EOF
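A quick, optional sanity check that every node can resolve and reach the names above (hostnames/IPs assumed to match the table):
for h in ceph01 ceph02 ceph03; do ping -c 1 $h; done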
Disable SELinux and firewalld
setenforce 0
sed -i "s/SELINUX=enforcing/SELINUX=permissive/g" /etc/selinux/config
systemctl stop firewalld
systemctl disable firewalld
systemctl stop iptables
systemctl disable iptables
Configure the yum repositories: use Aliyun's Ceph mirror and select the luminous release:
yum install -y yum-utils && sudo yum install --nogpgcheck -y epel-release
cat >/etc/yum.repos.d/ceph.repo<<EOF
[ceph]
name=ceph
baseurl=http://mirrors.aliyun.com/ceph/rpm-luminous/el7/x86_64/
gpgcheck=0
priority=1
[ceph-noarch]
name=cephnoarch
baseurl=http://mirrors.aliyun.com/ceph/rpm-luminous/el7/noarch/
gpgcheck=0
priority=1
[ceph-source]
name=Ceph source packages
baseurl=http://mirrors.aliyun.com/ceph/rpm-luminous/el7/SRPMS
enabled=0
gpgcheck=1
type=rpm-md
gpgkey=http://mirrors.aliyun.com/ceph/keys/release.asc
priority=1
EOF
Configure time synchronization
yum -y install chrony
systemctl enable chronyd
systemctl restart chronyd
systemctl status chronyd
vim /etc/chrony.conf
...
# Commonly used NTP servers in mainland China
server cn.pool.ntp.org iburst
server tw.pool.ntp.org iburst
# Or point to a specific server (e.g. the ceph01 node)
server 10.0.0.2 iburst
...
systemctl restart chronyd
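To confirm that time synchronization is actually working, chrony's sources and tracking state can be checked, for example:
chronyc sources -v
chronyc tracking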
Deploying a Ceph cluster with ceph-deploy
Set up passwordless SSH from the admin node ceph01 to every other node:
ssh-keygen
ssh-copy-id ceph02
ssh-copy-id ceph03
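A quick check that passwordless login from ceph01 works, for example:
for h in ceph02 ceph03; do ssh $h hostname; done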
On the ceph01 node, install ceph-deploy:
yum -y install python2-pip ceph ceph-deploy
Create a working directory for the cluster configuration. During deployment ceph-deploy generates the initial cluster configuration files and keyrings in the current directory, and they are also needed later for expansion, so first create a dedicated directory on ceph01 and run all subsequent commands from it; /opt/ceph-cluster is used as the example here.
Create the directory on ceph01:
mkdir -p /opt/ceph-cluster && cd /opt/ceph-cluster
Create a new cluster, i.e. declare the initial mon nodes (ceph01, ceph02 and ceph03 here); at this point the cluster network (internal replication traffic) and public network (client access to the cluster) can also be specified.
ceph-deploy new ceph01 ceph02 ceph03
The output of new shows that initializing the cluster generates an ssh key, the ceph.conf configuration file and the ceph.mon.keyring monitor keyring, and sets up the cluster network and public network.
# ceph.conf
[global]
public network = 192.168.50.0/24
cluster network = 10.0.2.0/24
Use ceph-deploy to install the Ceph binary packages on all nodes. On the admin node (ceph01) run:
ceph-deploy install --release luminous --no-adjust-repos ceph02 ceph03
On the admin node (ceph01), create the initial monitors and gather the keys by running the following command:
ceph-deploy --overwrite-conf mon create-initial
[root@ceph01 ~]# tree /opt/ceph-cluster/
/opt/ceph-cluster/
├── ceph.bootstrap-mds.keyring
├── ceph.bootstrap-mgr.keyring
├── ceph.bootstrap-osd.keyring
├── ceph.bootstrap-rgw.keyring
├── ceph.client.admin.keyring
├── ceph.conf
├── ceph-deploy-ceph.log
└── ceph.mon.keyring
List the disks on each node:
ceph-deploy disk list ceph01 ceph02 ceph03
Zap the disks: this wipes all data and the partition table, so choose the disks that will become OSDs carefully.
ceph-deploy disk zap ceph01 /dev/sdb
ceph-deploy disk zap ceph02 /dev/sdb
ceph-deploy disk zap ceph03 /dev/sdb
Distribute the admin credentials and config (ceph-deploy admin) so the Ceph CLI can be used for management commands on each node
# ceph-deploy admin ceph01 ceph02 ceph03
Configure mgr daemons, which are used to manage the cluster
# ceph-deploy mgr create ceph01 ceph02 ceph03
Deploy the RGW (object gateway)
# yum install -y ceph-radosgw
# ceph-deploy rgw create ceph01
Deploy the MDS (for CephFS)
# ceph-deploy mds create ceph01 ceph02 ceph03
Add OSDs (the disks added must be raw, untouched devices)
ceph-deploy --overwrite-conf osd create ceph01 --data /dev/sdb
ceph-deploy --overwrite-conf osd create ceph02 --data /dev/sdb
ceph-deploy --overwrite-conf osd create ceph03 --data /dev/sdb
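After the OSDs are created, it is worth confirming that all three are up and in; a quick check:
ceph -s
ceph osd tree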
Create a storage pool
ceph osd pool create testpool 128 128 [replicated|erasure] [rule]
Here 128 is the number of placement groups (pg_num/pgp_num); the official recommendation is 128 when the cluster has fewer than 5 OSDs. See the link below for details:
http://docs.ceph.org.cn/rados/operations/placement-groups/
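A minimal sketch of creating a replicated pool, using the hypothetical pool name testpool; since Luminous a pool should also be tagged with the application that will use it:
ceph osd pool create testpool 128 128 replicated
# tag the pool (rbd here is only an example application)
ceph osd pool application enable testpool rbd
# verify the PG count
ceph osd pool get testpool pg_num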
ceph.conf parameters in detail
[global] # global settings
fsid = xxxxxxxxxxxxxxx #cluster ID
mon host = 10.0.1.1,10.0.1.2,10.0.1.3 #monitor IP addresses
auth cluster required = cephx #cluster authentication
auth service required = cephx #service authentication
auth client required = cephx #client authentication
osd pool default size = 3 #default number of replicas per pool (default 3)
osd pool default min size = 1 #minimum number of replicas a PG needs to accept IO; a degraded PG above this value can still serve IO
public network = 10.0.1.0/24 #public network (the monitor IP range)
cluster network = 10.0.2.0/24 #cluster (replication) network
max open files = 131072 #default 0 #if set, Ceph raises the system max open fds to this value
mon initial members = ceph01, ceph02, ceph03 #initial monitors (as defined when the monitors were created)
##############################################################
[mon]
mon data = /var/lib/ceph/mon/ceph-$id
mon clock drift allowed = 1 #default 0.05 #clock drift allowed between monitors
mon osd min down reporters = 13 #default 1 #minimum number of OSDs that must report a peer down before the monitor marks it down
mon osd down out interval = 600 #default 300 #seconds Ceph waits after an OSD goes down before marking it out
##############################################################
[osd]
osd data = /var/lib/ceph/osd/ceph-$id
osd mkfs type = xfs #filesystem type used when formatting OSD disks
osd max write size = 512 #default 90 #maximum size of a single write an OSD accepts (MB)
osd client message size cap = 2147483648 #default 100 #maximum amount of client data allowed in memory (bytes)
osd deep scrub stride = 131072 #default 524288 #number of bytes read at a time during deep scrub (bytes)
osd op threads = 16 #default 2 #number of threads for concurrent filesystem operations
osd disk threads = 4 #default 1 #threads for disk-intensive operations such as recovery and scrubbing
osd map cache size = 1024 #default 500 #size of the OSD map cache (MB)
osd map cache bl size = 128 #default 50 #OSD map cache kept in the OSD process memory (MB)
osd mount options xfs = "rw,noexec,nodev,noatime,nodiratime,nobarrier" #default rw,noatime,inode64 #mount options for xfs OSDs
osd recovery op priority = 2 #default 10 #priority of recovery operations, 1-63; higher values consume more resources
osd recovery max active = 10 #default 15 #number of recovery requests active at the same time
osd max backfills = 4 #default 10 #maximum number of concurrent backfills per OSD
osd min pg log entries = 30000 #default 3000 #minimum number of PG log entries kept when trimming the PG log
osd max pg log entries = 100000 #default 10000 #maximum number of PG log entries kept when trimming the PG log
osd mon heartbeat interval = 40 #default 30 #interval (seconds) at which an OSD pings a monitor
ms dispatch throttle bytes = 1048576000 #default 104857600 #maximum size (bytes) of messages waiting to be dispatched
objecter inflight ops = 819200 #default 1024 #client-side throttling: maximum number of in-flight IO requests; exceeding it blocks application IO, 0 means unlimited
osd op log threshold = 50 #default 5 #how many operations to show in the log at once
osd crush chooseleaf type = 0 #default 1 #bucket type used when CRUSH rules use chooseleaf
##############################################################
[client]
rbd cache = true #default true #enable the RBD cache
rbd cache size = 335544320 #default 33554432 #RBD cache size (bytes)
rbd cache max dirty = 134217728 #default 25165824 #maximum dirty bytes allowed in write-back mode (bytes); 0 means write-through
rbd cache max dirty age = 30 #default 1 #how long (seconds) dirty data may stay in the cache before being flushed to disk
rbd cache writethrough until flush = false #default true #compatibility option for virtio drivers before linux-2.6.32 that never send flush requests and would therefore never trigger write-back
#when enabled, librbd performs IO in write-through mode until the first flush request is received, then switches to write-back
rbd cache max dirty object = 2 #default 0 #maximum number of cached objects; 0 means it is derived from rbd cache size; librbd logically splits an image into 4MB chunks
#each chunk is abstracted as an Object, and the cache is managed per Object, so increasing this value can improve performance
rbd cache target dirty = 235544320 #default 16777216 #amount of dirty data at which write-back starts; must not exceed rbd_cache_max_dirty
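Most of these values can also be inspected and changed at runtime instead of editing ceph.conf; a sketch, using osd.0 and osd_max_backfills purely as examples:
# on the host running osd.0, read the current value through the admin socket
ceph daemon osd.0 config get osd_max_backfills
# change it on all OSDs at runtime (injectargs is not persisted across restarts)
ceph tell osd.* injectargs '--osd-max-backfills 4'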
Other important Ceph operations
CRUSH rule partitioning of OSDs (creating pools on specific OSDs)
[root@ceph0001 ~]# ceph osd tree
ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
-15 1.24698 root harbor
-9 0.74799 host ceph0004
3 hdd 0.49899 osd.3 up 1.00000 1.00000
7 hdd 0.24899 osd.7 up 0.70000 1.00000
-11 0.49899 host ceph0005
4 hdd 0.49899 osd.4 up 0.50000 1.00000
-14 1.24698 root prometheus
-9 0.74799 host ceph0004
3 hdd 0.49899 osd.3 up 1.00000 1.00000
7 hdd 0.24899 osd.7 up 0.70000 1.00000
-11 0.49899 host ceph0005
4 hdd 0.49899 osd.4 up 0.50000 1.00000
-13 1.24496 root enos
-3 0.24899 host ceph0001
0 hdd 0.24899 osd.0 up 1.00000 1.00000
-5 0.49799 host ceph0002
1 hdd 0.24899 osd.1 up 1.00000 1.00000
5 hdd 0.24899 osd.5 up 1.00000 1.00000
-7 0.49799 host ceph0003
2 hdd 0.24899 osd.2 up 1.00000 1.00000
6 hdd 0.24899 osd.6 up 1.00000 1.00000
-1 2.49194 root default
-3 0.24899 host ceph0001
0 hdd 0.24899 osd.0 up 1.00000 1.00000
-5 0.49799 host ceph0002
1 hdd 0.24899 osd.1 up 1.00000 1.00000
5 hdd 0.24899 osd.5 up 1.00000 1.00000
-7 0.49799 host ceph0003
2 hdd 0.24899 osd.2 up 1.00000 1.00000
6 hdd 0.24899 osd.6 up 1.00000 1.00000
-9 0.74799 host ceph0004
3 hdd 0.49899 osd.3 up 1.00000 1.00000
7 hdd 0.24899 osd.7 up 0.70000 1.00000
-11 0.49899 host ceph0005
4 hdd 0.49899 osd.4 up 0.50000 1.00000
====================================================================
Get the CRUSH map
ceph osd getcrushmap -o crushmap
Decompile the CRUSH map
crushtool -d crushmap -o crush.map
Example
#vim crush.map
Copy the prometheushdd rule and change "step take prometheus class hdd" to "step take harbor class hdd", creating a harborhdd rule:
rule prometheushdd {
id 2
type replicated
min_size 1
max_size 10
step take prometheus class hdd
step chooseleaf firstn 0 type host
step emit
}
rule harborhdd {
id 3
type replicated
min_size 1
max_size 10
step take harbor class hdd
step chooseleaf firstn 0 type host
step emit
}
After editing crush.map, recompile it and inject the new CRUSH map
cp crush.map crush.map-new
crushtool -c crush.map-new -o crushmap-new
ceph osd setcrushmap -i crushmap-new
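With the new CRUSH map in place, a pool can be bound to the harborhdd rule so its data only lands on the harbor OSDs; a sketch with an assumed pool name harborpool:
# create a new pool directly on the rule
ceph osd pool create harborpool 128 128 replicated harborhdd
# or move an existing pool onto the rule
ceph osd pool set harborpool crush_rule harborhdd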
Decommissioning an OSD
Take osd.8 out of the cluster
ceph osd out 8
Log in to the host where the OSD runs and stop the daemon
systemctl stop ceph-osd@8
systemctl disable ceph-osd@8
Purge the OSD metadata
ceph osd purge 8 --yes-i-really-mean-it
At this point the status is HEALTH_OK; PG rebalancing is triggered automatically and some PGs enter "active+remapped+backfilling"; wait for the backfill to finish.
Remove the host ceph1014 from the CRUSH map (adjust this step to the actual hostname)
ceph osd crush remove ceph1014
Check the cluster status
# ceph -s
HEALTH_OK
# ceph osd tree
# the decommissioned OSD no longer appears in the tree
Expanding Ceph
OSD expansion
Prerequisites for expansion
Maximum number of kernel PIDs/threads
/etc/sysctl.conf
kernel.pid_max = 4194303
Disable swapping
vm.swappiness=0
# sysctl -p
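Verify that the kernel parameters took effect, for example:
sysctl kernel.pid_max vm.swappiness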
Time synchronization
The operations team configures a unified NTP server
User
Create the ceph user
sudo useradd -d /home/ceph -m ceph
sudo passwd ceph
Configure privileges for the ceph user
vim /etc/pam.d/su
auth sufficient pam_wheel.so trust use_uid
Configure sudo privileges
# usermod -G wheel ceph
# echo "ceph ALL = (root) NOPASSWD:ALL" | sudo tee /etc/sudoers.d/ceph
# chmod 0440 /etc/sudoers.d/ceph
Configure SSH
# su - ceph
# ssh-keygen
# edit ~/.ssh/authorized_keys
# paste in the public key of the ceph user on ceph0001
# chmod 600 authorized_keys
Set up a local yum repository
cd /data/apaas/lightning/apaas-yum && nohup python -m SimpleHTTPServer 8888 &
Install Ceph
[ceph@ceph0001 root]$ cd /home/ceph/ceph-deploy
ceph-deploy install --release nautilus ceph8006 --repo-url http://10.27.22.6/ceph/rpm-nautilus/el7 --nogpgcheck
===========================
HDD
/dev/sdc (adjust to the actual device)
ceph-deploy disk zap ceph8006 /dev/sdc
ceph-deploy osd create --data /dev/sdc ceph8006
ceph osd crush reweight osd.6 0
ceph osd in osd.6
# adjust the weight according to the actual disk size
ceph osd crush reweight osd.6 0.3
SSD
/dev/sdd (adjust to the actual device)
ceph osd set noin
ceph-deploy disk zap ceph8006 /dev/sdd
ceph-deploy osd create --data /dev/sdd ceph8006
ceph osd crush reweight osd.7 0
ceph osd crush rm-device-class osd.7
ceph osd crush set-device-class ssd osd.7
ceph osd unset noin
ceph osd in osd.7
# adjust the weight according to the actual disk size
ceph osd crush reweight osd.7 0.3
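To limit the backfill load, the weight can also be raised in several small steps instead of one jump; a rough sketch, assuming osd.7 and a final weight of 0.3:
for w in 0.1 0.2 0.3; do
    ceph osd crush reweight osd.7 $w
    # wait until backfill from the previous step has settled
    while ceph -s | grep -q backfill; do sleep 60; done
done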
Adjust node placement in the CRUSH tree
# Two new storage nodes with their OSDs have been added, assumed to be ceph0007 and ceph0008
# Link the new hosts under the harbor root
ceph osd crush link ceph0007 root=harbor
ceph osd crush link ceph0008 root=harbor
Shutting down Ceph
Before shutdown
Set the cluster flags (only needs to be done once)
ceph osd set nobackfill
ceph osd set norecover
ceph osd set nodown
ceph osd set noout
Stop the services
Stop the Ceph monitors one by one
systemctl stop ceph-mon@cephxxx.service
Stop the Ceph mgr daemons one by one
systemctl stop ceph-mgr@cephxxx.service
Stop the Ceph MDS daemons one by one
systemctl stop ceph-mds@cephxxx.service
Stop the Ceph RGW daemons one by one
systemctl stop ceph-radosgw@rgw.cephxxx.service
Stop the Ceph OSDs one by one
systemctl stop ceph-osd@x.service
After power-on, clear the flags
ceph osd unset nobackfill
ceph osd unset norecover
ceph osd unset nodown
ceph osd unset noout
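After clearing the flags, confirm that none of them are still set, for example:
ceph osd dump | grep flags
ceph -s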
Ceph large omap object warning
[root@ceph0001 ~]# ceph health detail
HEALTH_WARN 1 large omap objects
LARGE_OMAP_OBJECTS 1 large omap objects
1 large objects found in pool 'cephfs_metadata'
Search the cluster log for 'Large omap object found' for more details.
Ceph caps how large the omap of a single object may grow: the threshold is 200,000 keys, and objects over this limit are flagged during deep scrub and are normally split into shards automatically.
[root@ceph0001 log]# ceph config show-with-defaults osd.14 | grep osd_deep_scrub_large_omap_object_key_threshold
osd_deep_scrub_large_omap_object_key_threshold 200000 default
[root@ceph0001 log]# ceph config show-with-defaults osd.17 | grep osd_deep_scrub_large_omap_object_key_threshold
osd_deep_scrub_large_omap_object_key_threshold 200000 default
[root@ceph0001 log]# ceph config show-with-defaults osd.9 | grep osd_deep_scrub_large_omap_object_key_threshold
osd_deep_scrub_large_omap_object_key_threshold 200000 default
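If the warning threshold itself needs to be raised (rather than splitting the object), it can be changed at runtime; a sketch, assuming a release with the centralized config store (Mimic or later):
# raise the per-object omap key threshold from 200000 to 300000
ceph config set osd osd_deep_scrub_large_omap_object_key_threshold 300000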
To resolve this, follow the steps below. First, find the placement group that contains the large object. Use the following command to identify the PG:
for i in `ceph pg ls-by-pool [POOL NAME] | tail -n +2 | head -n -2 | awk '{print $1}'`; do echo -n "$i: "; ceph pg $i query | grep num_large_omap_objects | head -1 | awk '{print $2}'; done | grep ": 1"
This command prints the placement group IDs affected by large omap objects, for example: 2.26
Next, determine which OSDs the PG lives on, using the following command:
# ceph pg map [PG #]
The output of this command should look similar to the following:
# ceph pg map 2.26
osdmap e8768 pg 2.26 (2.26) -> up [14,17,9] acting [14,17,9]
Now that we have identified which OSDs host the large omap object, deep-scrub them:
ceph osd deep-scrub osd.14
ceph osd deep-scrub osd.17
ceph osd deep-scrub osd.9
Find the primary OSD among the hosts below, one by one, and check the corresponding ceph-osd.14.log:
[root@ceph0001 log]# ceph osd find 14
{
"osd": 14,
"addrs": {
"addrvec": [
{
"type": "v2",
"addr": "172.16.2.100:6800",
"nonce": 23656
},
{
"type": "v1",
"addr": "172.16.2.100:6801",
"nonce": 23656
}
]
},
"osd_fsid": "bc0deba4-757a-4f26-97e2-625955217612",
"host": "ceph0013",
"crush_location": {
"host": "ceph0013",
"root": "default"
}
}
[root@ceph0001 log]# ceph osd find 17
{
"osd": 17,
"addrs": {
"addrvec": [
{
"type": "v2",
"addr": "172.16.2.103:6800",
"nonce": 2075334
},
{
"type": "v1",
"addr": "172.16.2.103:6801",
"nonce": 2075334
}
]
},
"osd_fsid": "c74ef526-f3bc-4b30-9397-983160723e49",
"host": "ceph0016",
"crush_location": {
"host": "ceph0016",
"root": "default"
}
}
[root@ceph0001 log]# ceph osd find 9
{
"osd": 9,
"addrs": {
"addrvec": [
{
"type": "v2",
"addr": "172.16.2.94:6800",
"nonce": 23795
},
{
"type": "v1",
"addr": "172.16.2.94:6801",
"nonce": 23795
}
]
},
"osd_fsid": "caa5ec30-d803-4393-ad34-b3f86c31feb2",
"host": "ceph0008",
"crush_location": {
"host": "ceph0008",
"root": "default"
}
}
[root@ceph0013 ceph]# grep 'Large' ceph-osd.14.log
2023-07-11 23:03:56.951 7f9e892b4700 0 log_channel(cluster) log [WRN] : Large omap object found. Object: 1:654134d2:::mds0_openfiles.0:head Key count: 400209 Size (bytes): 26632669
2023-07-12 00:17:01.132 7f9e892b4700 0 log_channel(cluster) log [WRN] : Large omap object found. Object: 1:654134d2:::mds0_openfiles.0:head Key count: 400352 Size (bytes): 26647712
2023-07-12 00:34:14.053 7f9e892b4700 0 log_channel(cluster) log [WRN] : Large omap object found. Object: 1:654134d2:::mds0_openfiles.0:head Key count: 400411 Size (bytes): 26650287
2023-07-12 00:38:45.543 7f9e892b4700 0 log_channel(cluster) log [WRN] : Large omap object found. Object: 1:654134d2:::mds0_openfiles.0:head Key count: 400487 Size (bytes): 26653453
2023-07-12 00:49:08.310 7f9e892b4700 0 log_channel(cluster) log [WRN] : Large omap object found. Object: 1:654134d2:::mds0_openfiles.0:head Key count: 400430 Size (bytes): 26651110
Run a deep scrub on the PG so that Ceph detects the problem and re-shards the object automatically:
ceph pg deep-scrub 2.26
The progress of the scrub can be followed with:
tail -f ceph.log | grep "2\.26"
The first pass normally fixes the issue; if it does not, check the log from the previous step and deep-scrub that PG again specifically.
Fundamentally, the missing re-sharding here looks like a Ceph bug, fixed by upgrading to 14.2.8.
Restart the MDS so a failover triggers the re-check (use journalctl -u ceph-mds@<mds hostname> -f to watch it loading the large file; this can take a while):
ceph pg deep-scrub 2.26
Ceph RGW log pool filling up
Because the number of objects may be very large, use a script that first lists the objects and then deletes them:
clean_rgw_log.sh
#!/bin/bash
# author:zhiyong.lin
# Dump every object name in the RGW log pool to a file, then delete them one by one.
rados -p default.rgw.log ls > rgw.log.item.tobedeleted
while read line
do
    echo "----------delete object $line----------"
    rados -p default.rgw.log rm "$line"
done < rgw.log.item.tobedeleted
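A usage sketch: check the object count before and after running the script so the effect is visible:
rados -p default.rgw.log ls | wc -l
bash clean_rgw_log.sh
rados -p default.rgw.log ls | wc -l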
First, the official Ceph documentation and related sites were checked.
Ceph docs: http://docs.ceph.org.cn/radosgw/config-ref/#id7
The docs say very little about the log pool, which suggests the data in it is not important.
Handling Ceph slow requests
Procedure
1. Use ceph -s to find the MDS hosts in the cluster. (script: searchSlowRequest.sh)
2. Run the script on the MDS host to determine which clients the slow requests originate from.
3. From the script output, obtain the corresponding mount points, then locate the pods or data that need to be handled.
For example, if the result shows 3 pods with slow requests, delete those three pods one by one, then re-run the script to confirm.
4. If the cluster has not recovered after the pods are deleted, or the script still reports those pods, query the MDS session list to find the host that holds the mount, for example:
ceph daemon mds.ceph0002 session ls|grep k8sdata/kubelet/pods/336bbdfe-33ea-4ef3-9660-0924f3cad362/volumes/kubernetes.io~cephfs/pvc-3638de51-e84d-4744-a7f6-1ca68de03e7e -C 20
5. Once the host is identified, log in to it, use mount and ps to find the current mount point and the corresponding processes, kill the processes and unmount.
6. After the processes are gone and the mount is removed, wait a few minutes and the cluster should return to normal; verify with the script above.
Run the following on the MDS host; it shows where every slow request comes from. "Slow" here means a request that has been in flight for more than 60 seconds (adjustable).
#!/bin/bash
# Run on the active MDS host: list the client mount points behind slow requests
# ("slow" = any op in flight for more than $base seconds, adjustable).
base=60
clientStr="client"
bigFlag=0
hostname="`hostname`"
# Print the mount_point of the MDS session matching the given client instance.
printSlowRequestPod(){
    ceph daemon mds.$hostname session ls | grep $2 -C 10 | grep mount_point | sed 's/^[ \t]*//g'
}
# begin
# Dump the ops in flight and keep only the duration and client instance lines.
ceph daemon /var/run/ceph/ceph-mds.$hostname.asok dump_ops_in_flight | grep -e duration -e "\"client\": \"client" > temp_1
# Strip the keys, keeping only the values (duration numbers and client instances).
cat temp_1 | awk -F': ' '{print $2}' | awk '{sub(/.$/,"")}1' > temp_2
# temp_2 alternates between the duration of an op and the client instance of that op:
# when a duration exceeds $base, the following client line is resolved to its mount point.
while read line
do
    ifInclude=$(echo $line | grep "${clientStr}")
    if [[ "$ifInclude" != "" ]]
    then
        if [ "$bigFlag" = "1" ]
        then
            inst=$(echo $line | sed 's/\"//g')
            printSlowRequestPod $hostname $inst
        fi
        bigFlag=0
        continue
    fi
    bigFlag=$(echo "$line > $base" | bc)
done < temp_2
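A usage sketch, assuming the script is saved as searchSlowRequest.sh on the active MDS host; each output line is the mount_point of a client whose request has been in flight for more than 60 seconds:
bash searchSlowRequest.sh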