采集数据

# 在待分析的服务器上执行命令

# 确定采集时间
# date
# 2023年 01月 17日 星期二 14:30:19 CST

# 通过 ps 命令的输出格式参数,按 RSS 降序将进程名和 RSS(单位:KiB)输出为 CSV 格式
# ps -e -o %c, -o rss --sort=-rss > mems.csv
# 原始数据示例(前 5 行):
! cat mems.csv | head -n 5
COMMAND        ,  RSS
qemu-system-x86,33823984
qemu-system-x86,16924120
qemu-system-x86,16866464
qemu-system-x86,8627980

导入数据

import pandas as pd
# Load the process list dumped by ps (two columns: COMMAND and RSS).
df = pd.read_csv("mems.csv", delimiter=",")
# Start the index at 1 so the row label doubles as a rank.
df.index = df.index + 1
# The header row shows that ps pads the COMMAND column to a fixed width,
# so remove the surrounding spaces from the header names ...
df.columns = df.columns.str.strip()
# ... and from the COMMAND values as well. Shorter process names presumably
# carry the same trailing padding (confirm against the raw file); left in
# place, exact-match lookups such as df[df.COMMAND == "mysqld"] would
# silently match nothing.
df["COMMAND"] = df["COMMAND"].str.strip()
# Preview: 1026 processes in total
df
COMMAND RSS
1 qemu-system-x86 33823984
2 qemu-system-x86 16924120
3 qemu-system-x86 16866464
4 qemu-system-x86 8627980
5 mysqld 7683188
... ... ...
1022 kworker/50:1-mm 0
1023 kworker/55:1-mm 0
1024 kworker/0:0-eve 0
1025 kworker/47:0-i4 0
1026 kworker/65:1-mm 0

1026 rows × 2 columns

处理数据

# Keep only the processes whose RSS is non-zero; 324 processes remain.
has_resident_memory = df["RSS"] > 0
df = df.loc[has_resident_memory]
# Total memory held by those 324 processes: about 115 GB.
# RSS is in KB here, so dividing by 1024 twice yields GB.
total_rss = df["RSS"].sum()
total_rss / 1024 / 1024
115.3391342163086
# Display the filtered frame (notebook cell output).
df
COMMAND RSS
1 qemu-system-x86 33823984
2 qemu-system-x86 16924120
3 qemu-system-x86 16866464
4 qemu-system-x86 8627980
5 mysqld 7683188
... ... ...
320 kolla_start 4
321 kolla_start 4
322 kolla_start 4
323 kolla_start 4
324 kolla_start 4

324 rows × 2 columns

# Take an independent copy before adding columns; df was produced by a
# boolean filter, and assigning into it directly can trigger pandas'
# SettingWithCopyWarning.
df = df.copy()
# Add a column with RSS converted to MB. Plain Series arithmetic is
# vectorized and yields the same values as mapping a lambda over each row.
df["RSSMB"] = df["RSS"] / 1024
# Add a GB column.
df["RSSGB"] = df["RSS"] / 1024 / 1024
df.describe()
RSS RSSMB RSSGB
count 3.240000e+02 324.000000 324.000000
mean 3.732773e+05 364.528622 0.355985
std 2.402061e+06 2345.762337 2.290784
min 4.000000e+00 0.003906 0.000004
25% 3.825000e+03 3.735352 0.003648
50% 1.116600e+04 10.904297 0.010649
75% 1.061370e+05 103.649414 0.101220
max 3.382398e+07 33031.234375 32.257065
# Add a column with the running total of memory use (GB). Assuming the rows
# are still in the descending-RSS order written by `ps --sort=-rss`, row k
# holds the combined usage of the k largest processes.
running_total_gb = df["RSSGB"].cumsum()
df["RSS_GB_CUM"] = running_total_gb
df.describe()
RSS RSSMB RSSGB RSS_GB_CUM
count 3.240000e+02 324.000000 324.000000 324.000000
mean 3.732773e+05 364.528622 0.355985 111.379911
std 2.402061e+06 2345.762337 2.290784 8.440401
min 4.000000e+00 0.003906 0.000004 32.257065
25% 3.825000e+03 3.735352 0.003648 110.188729
50% 1.116600e+04 10.904297 0.010649 114.734324
75% 1.061370e+05 103.649414 0.101220 115.253636
max 3.382398e+07 33031.234375 32.257065 115.339134

统计分析

# Rows are ordered by memory use, largest first: the top 15 processes account
# for roughly 100 GB, and the remaining ~15 GB is spread across the other
# few hundred processes.
df.iloc[:20]
COMMAND RSS RSSMB RSSGB RSS_GB_CUM
1 qemu-system-x86 33823984 33031.234375 32.257065 32.257065
2 qemu-system-x86 16924120 16527.460938 16.140099 48.397163
3 qemu-system-x86 16866464 16471.156250 16.085114 64.482277
4 qemu-system-x86 8627980 8425.761719 8.228283 72.710560
5 mysqld 7683188 7503.113281 7.327259 80.037819
6 qemu-system-x86 4412644 4309.222656 4.208225 84.246044
7 ceph-osd 3034632 2963.507812 2.894051 87.140095
8 ceph-osd 2644688 2582.703125 2.522171 89.662266
9 ceph-osd 2379332 2323.566406 2.269108 91.931374
10 ceph-osd 2281156 2227.691406 2.175480 94.106853
11 ceph-osd 1928288 1883.093750 1.838959 95.945812
12 qemu-system-x86 1889148 1844.871094 1.801632 97.747444
13 ceph-osd 1744884 1703.988281 1.664051 99.411495
14 ceph-mon 953092 930.753906 0.908939 100.320435
15 ovs-vswitchd 731060 713.925781 0.697193 101.017628
16 openstack-expor 557808 544.734375 0.531967 101.549595
17 beam.smp 431564 421.449219 0.411572 101.961166
18 ceph-mgr 228668 223.308594 0.218075 102.179241
19 systemd-journal 207168 202.312500 0.197571 102.376812
20 nova-api 167436 163.511719 0.159679 102.536491
# Count how many processes share each command name, most frequent first,
# and show the 20 most common.
process_counts = df["COMMAND"].value_counts()
process_counts.iloc[:20]
httpd              67
containerd-shim    42
kolla_start        36
neutron-server     13
nova-api           11
heat-api-cfn        6
heat-api            6
neutron-metadat     6
glance-registry     6
heat-engine         6
nova-conductor      6
glance-api          6
qemu-system-x86     6
ceph-osd            6
bash                4
dnsmasq             4
sleep               4
sshd                3
haproxy             3
keepalived          3
Name: COMMAND, dtype: int64
# Lift the row limit so the full grouped table is printed without truncation.
pd.set_option("display.max_rows", None)
# Total memory per process name (MB and GB), largest consumers first.
memory_by_command = df.groupby("COMMAND")[["RSSMB", "RSSGB"]].sum()
memory_by_command.sort_values(by="RSSMB", ascending=False)
RSSMB RSSGB
COMMAND
qemu-system-x86 80609.707031 78.720417
ceph-osd 13684.550781 13.363819
mysqld 7503.113281 7.327259
httpd 3424.902344 3.344631
neutron-server 1675.730469 1.636456
nova-api 1672.753906 1.633549
ceph-mon 930.753906 0.908939
ovs-vswitchd 713.925781 0.697193
glance-api 682.386719 0.666393
nova-conductor 674.988281 0.659168
heat-engine 596.457031 0.582478
openstack-expor 586.960938 0.573204
glance-registry 523.085938 0.510826
neutron-metadat 504.976562 0.493141
heat-api 479.804688 0.468559
heat-api-cfn 421.453125 0.411575
beam.smp 421.449219 0.411572
cinder-volume 381.066406 0.372135
ceph-mgr 223.308594 0.218075
systemd-journal 202.312500 0.197571
containerd-shim 184.539062 0.180214
nova-compute 147.371094 0.143917
neutron-openvsw 117.742188 0.114983
nova-scheduler 111.031250 0.108429
cinder-schedule 108.925781 0.106373
cinder-backup 104.390625 0.101944
nova-consoleaut 103.605469 0.101177
neutron-l3-agen 94.960938 0.092735
neutron-dhcp-ag 94.824219 0.092602
neutron-sriov-n 88.132812 0.086067
nova-novncproxy 87.554688 0.085503
dockerd 85.347656 0.083347
memcached 77.582031 0.075764
hs_alarm 74.902344 0.073147
fluentd 61.457031 0.060017
skydive 57.851562 0.056496
containerd 47.425781 0.046314
libvirtd 43.031250 0.042023
hserver 40.523438 0.039574
privsep-helper 35.242188 0.034416
vm_exporter.py 29.085938 0.028404
samd 27.835938 0.027184
syslog-ng 27.675781 0.027027
tuned 26.246094 0.025631
systemd 22.609375 0.022079
ha_monitor 20.277344 0.019802
NetworkManager 16.488281 0.016102
sshd 16.351562 0.015968
polkitd 15.285156 0.014927
sssd_nss 14.851562 0.014503
node_exporter 13.394531 0.013081
keepalived 12.402344 0.012112
haproxy 11.812500 0.011536
ceph-crash 11.773438 0.011497
sssd_be 11.445312 0.011177
sssd 10.937500 0.010681
systemd-udevd 10.050781 0.009815
ovsdb-client 8.867188 0.008659
lldpd 8.386719 0.008190
systemd-logind 8.046875 0.007858
ovsdb-server 7.210938 0.007042
systemd-machine 7.058594 0.006893
registry 6.867188 0.006706
bash 6.175781 0.006031
rngd 5.820312 0.005684
smartd 5.695312 0.005562
dnsmasq 5.585938 0.005455
dbus-daemon 5.546875 0.005417
crond 5.527344 0.005398
monitor 5.132812 0.005013
rpcbind 5.093750 0.004974
ps 4.968750 0.004852
auditd 4.738281 0.004627
irqbalance 4.546875 0.004440
(sd-pam) 4.425781 0.004322
gssproxy 2.980469 0.002911
docker-proxy 2.750000 0.002686
chronyd 2.425781 0.002369
haproxy-systemd 2.003906 0.001957
lsmd 1.875000 0.001831
prom_exporter 1.597656 0.001560
agetty 1.402344 0.001369
inet_gethost 1.117188 0.001091
epmd 0.902344 0.000881
mysqld_safe 0.683594 0.000668
rabbitmq-server 0.574219 0.000561
sleep 0.289062 0.000282
start-ovsdb-ser 0.183594 0.000179
kolla_start 0.140625 0.000137

可疑进程排查

接下来就可以结合进程数量和进程消耗内存总量,逐个排查可疑进程。

文章目录