采集数据
# Run the commands below on the server that is being analysed.
# Record when the sample was taken:
# date
# 2023年 01月 17日 星期二 14:30:19 CST
# Use ps and its command-line options to dump the data we need as CSV
# (one row per process: command name and RSS, ordered by --sort=-rss):
# ps -e -o %c, -o rss --sort=-rss > mems.csv
# Peek at the first five lines of the raw file:
! cat mems.csv | head -n 5
COMMAND , RSS
qemu-system-x86,33823984
qemu-system-x86,16924120
qemu-system-x86,16866464
qemu-system-x86,8627980
导入数据
import pandas as pd
# Load the CSV produced by ps.
df = pd.read_csv("mems.csv", sep=",")
# Strip the whitespace around the header names (the raw header is "COMMAND , RSS").
df.columns = [name.strip() for name in df.columns]
# Make the row index start at 1 instead of 0.
df.index += 1
# Preview: 1026 processes in total.
df
| COMMAND | RSS |
1 | qemu-system-x86 | 33823984 |
2 | qemu-system-x86 | 16924120 |
3 | qemu-system-x86 | 16866464 |
4 | qemu-system-x86 | 8627980 |
5 | mysqld | 7683188 |
... | ... | ... |
1022 | kworker/50:1-mm | 0 |
1023 | kworker/55:1-mm | 0 |
1024 | kworker/0:0-eve | 0 |
1025 | kworker/47:0-i4 | 0 |
1026 | kworker/65:1-mm | 0 |
1026 rows × 2 columns
处理数据
# Drop the rows whose RSS is 0; 324 processes remain.
df = df.loc[df["RSS"] > 0]
# Total RSS of those 324 processes, raw value divided by 1024 twice: about 115 GB.
# NOTE(review): RSS presumably counts shared pages once per process, so this
# sum may overstate real memory usage - confirm before quoting the figure.
df["RSS"].sum() / 1024 / 1024
115.3391342163086
# Display the frame after the zero-RSS rows have been removed.
df
| COMMAND | RSS |
1 | qemu-system-x86 | 33823984 |
2 | qemu-system-x86 | 16924120 |
3 | qemu-system-x86 | 16866464 |
4 | qemu-system-x86 | 8627980 |
5 | mysqld | 7683188 |
... | ... | ... |
320 | kolla_start | 4 |
321 | kolla_start | 4 |
322 | kolla_start | 4 |
323 | kolla_start | 4 |
324 | kolla_start | 4 |
324 rows × 2 columns
# Continue on an independent copy of the filtered frame
# (presumably to avoid pandas' SettingWithCopyWarning when adding columns).
df = df.copy()
# RSS expressed in MB (raw value / 1024). Plain column arithmetic is
# vectorised and replaces the former per-element lambda.
df["RSSMB"] = df["RSS"] / 1024
# RSS expressed in GB (raw value / 1024 / 1024).
df["RSSGB"] = df["RSS"] / 1024 / 1024
# Summary statistics for the numeric columns.
df.describe()
| RSS | RSSMB | RSSGB |
count | 3.240000e+02 | 324.000000 | 324.000000 |
mean | 3.732773e+05 | 364.528622 | 0.355985 |
std | 2.402061e+06 | 2345.762337 | 2.290784 |
min | 4.000000e+00 | 0.003906 | 0.000004 |
25% | 3.825000e+03 | 3.735352 | 0.003648 |
50% | 1.116600e+04 | 10.904297 | 0.010649 |
75% | 1.061370e+05 | 103.649414 | 0.101220 |
max | 3.382398e+07 | 33031.234375 | 32.257065 |
# Add a running total of memory use in GB. The total accumulates in the
# frame's current row order, so it relies on the rows already being ranked.
cumulative_gb = df["RSSGB"].cumsum()
df["RSS_GB_CUM"] = cumulative_gb
df.describe()
| RSS | RSSMB | RSSGB | RSS_GB_CUM |
count | 3.240000e+02 | 324.000000 | 324.000000 | 324.000000 |
mean | 3.732773e+05 | 364.528622 | 0.355985 | 111.379911 |
std | 2.402061e+06 | 2345.762337 | 2.290784 | 8.440401 |
min | 4.000000e+00 | 0.003906 | 0.000004 | 32.257065 |
25% | 3.825000e+03 | 3.735352 | 0.003648 | 110.188729 |
50% | 1.116600e+04 | 10.904297 | 0.010649 | 114.734324 |
75% | 1.061370e+05 | 103.649414 | 0.101220 | 115.253636 |
max | 3.382398e+07 | 33031.234375 | 32.257065 | 115.339134 |
统计分析
# Rows are ordered by memory use, largest first: the first 15 processes hold
# about 100 GB, and the remaining ~15 GB is spread over a few hundred others.
df.iloc[:20]
| COMMAND | RSS | RSSMB | RSSGB | RSS_GB_CUM |
1 | qemu-system-x86 | 33823984 | 33031.234375 | 32.257065 | 32.257065 |
2 | qemu-system-x86 | 16924120 | 16527.460938 | 16.140099 | 48.397163 |
3 | qemu-system-x86 | 16866464 | 16471.156250 | 16.085114 | 64.482277 |
4 | qemu-system-x86 | 8627980 | 8425.761719 | 8.228283 | 72.710560 |
5 | mysqld | 7683188 | 7503.113281 | 7.327259 | 80.037819 |
6 | qemu-system-x86 | 4412644 | 4309.222656 | 4.208225 | 84.246044 |
7 | ceph-osd | 3034632 | 2963.507812 | 2.894051 | 87.140095 |
8 | ceph-osd | 2644688 | 2582.703125 | 2.522171 | 89.662266 |
9 | ceph-osd | 2379332 | 2323.566406 | 2.269108 | 91.931374 |
10 | ceph-osd | 2281156 | 2227.691406 | 2.175480 | 94.106853 |
11 | ceph-osd | 1928288 | 1883.093750 | 1.838959 | 95.945812 |
12 | qemu-system-x86 | 1889148 | 1844.871094 | 1.801632 | 97.747444 |
13 | ceph-osd | 1744884 | 1703.988281 | 1.664051 | 99.411495 |
14 | ceph-mon | 953092 | 930.753906 | 0.908939 | 100.320435 |
15 | ovs-vswitchd | 731060 | 713.925781 | 0.697193 | 101.017628 |
16 | openstack-expor | 557808 | 544.734375 | 0.531967 | 101.549595 |
17 | beam.smp | 431564 | 421.449219 | 0.411572 | 101.961166 |
18 | ceph-mgr | 228668 | 223.308594 | 0.218075 | 102.179241 |
19 | systemd-journal | 207168 | 202.312500 | 0.197571 | 102.376812 |
20 | nova-api | 167436 | 163.511719 | 0.159679 | 102.536491 |
# Number of processes per command name, most frequent first (top 20).
df["COMMAND"].value_counts().iloc[:20]
httpd 67
containerd-shim 42
kolla_start 36
neutron-server 13
nova-api 11
heat-api-cfn 6
heat-api 6
neutron-metadat 6
glance-registry 6
heat-engine 6
nova-conductor 6
glance-api 6
qemu-system-x86 6
ceph-osd 6
bash 4
dnsmasq 4
sleep 4
sshd 3
haproxy 3
keepalived 3
Name: COMMAND, dtype: int64
# Remove the row-display limit so the full table is shown.
pd.options.display.max_rows = None
# Total memory per command name, largest consumers first.
memory_by_command = df.groupby("COMMAND")[["RSSMB", "RSSGB"]].sum()
memory_by_command.sort_values(by="RSSMB", ascending=False)
| RSSMB | RSSGB |
COMMAND | | |
qemu-system-x86 | 80609.707031 | 78.720417 |
ceph-osd | 13684.550781 | 13.363819 |
mysqld | 7503.113281 | 7.327259 |
httpd | 3424.902344 | 3.344631 |
neutron-server | 1675.730469 | 1.636456 |
nova-api | 1672.753906 | 1.633549 |
ceph-mon | 930.753906 | 0.908939 |
ovs-vswitchd | 713.925781 | 0.697193 |
glance-api | 682.386719 | 0.666393 |
nova-conductor | 674.988281 | 0.659168 |
heat-engine | 596.457031 | 0.582478 |
openstack-expor | 586.960938 | 0.573204 |
glance-registry | 523.085938 | 0.510826 |
neutron-metadat | 504.976562 | 0.493141 |
heat-api | 479.804688 | 0.468559 |
heat-api-cfn | 421.453125 | 0.411575 |
beam.smp | 421.449219 | 0.411572 |
cinder-volume | 381.066406 | 0.372135 |
ceph-mgr | 223.308594 | 0.218075 |
systemd-journal | 202.312500 | 0.197571 |
containerd-shim | 184.539062 | 0.180214 |
nova-compute | 147.371094 | 0.143917 |
neutron-openvsw | 117.742188 | 0.114983 |
nova-scheduler | 111.031250 | 0.108429 |
cinder-schedule | 108.925781 | 0.106373 |
cinder-backup | 104.390625 | 0.101944 |
nova-consoleaut | 103.605469 | 0.101177 |
neutron-l3-agen | 94.960938 | 0.092735 |
neutron-dhcp-ag | 94.824219 | 0.092602 |
neutron-sriov-n | 88.132812 | 0.086067 |
nova-novncproxy | 87.554688 | 0.085503 |
dockerd | 85.347656 | 0.083347 |
memcached | 77.582031 | 0.075764 |
hs_alarm | 74.902344 | 0.073147 |
fluentd | 61.457031 | 0.060017 |
skydive | 57.851562 | 0.056496 |
containerd | 47.425781 | 0.046314 |
libvirtd | 43.031250 | 0.042023 |
hserver | 40.523438 | 0.039574 |
privsep-helper | 35.242188 | 0.034416 |
vm_exporter.py | 29.085938 | 0.028404 |
samd | 27.835938 | 0.027184 |
syslog-ng | 27.675781 | 0.027027 |
tuned | 26.246094 | 0.025631 |
systemd | 22.609375 | 0.022079 |
ha_monitor | 20.277344 | 0.019802 |
NetworkManager | 16.488281 | 0.016102 |
sshd | 16.351562 | 0.015968 |
polkitd | 15.285156 | 0.014927 |
sssd_nss | 14.851562 | 0.014503 |
node_exporter | 13.394531 | 0.013081 |
keepalived | 12.402344 | 0.012112 |
haproxy | 11.812500 | 0.011536 |
ceph-crash | 11.773438 | 0.011497 |
sssd_be | 11.445312 | 0.011177 |
sssd | 10.937500 | 0.010681 |
systemd-udevd | 10.050781 | 0.009815 |
ovsdb-client | 8.867188 | 0.008659 |
lldpd | 8.386719 | 0.008190 |
systemd-logind | 8.046875 | 0.007858 |
ovsdb-server | 7.210938 | 0.007042 |
systemd-machine | 7.058594 | 0.006893 |
registry | 6.867188 | 0.006706 |
bash | 6.175781 | 0.006031 |
rngd | 5.820312 | 0.005684 |
smartd | 5.695312 | 0.005562 |
dnsmasq | 5.585938 | 0.005455 |
dbus-daemon | 5.546875 | 0.005417 |
crond | 5.527344 | 0.005398 |
monitor | 5.132812 | 0.005013 |
rpcbind | 5.093750 | 0.004974 |
ps | 4.968750 | 0.004852 |
auditd | 4.738281 | 0.004627 |
irqbalance | 4.546875 | 0.004440 |
(sd-pam) | 4.425781 | 0.004322 |
gssproxy | 2.980469 | 0.002911 |
docker-proxy | 2.750000 | 0.002686 |
chronyd | 2.425781 | 0.002369 |
haproxy-systemd | 2.003906 | 0.001957 |
lsmd | 1.875000 | 0.001831 |
prom_exporter | 1.597656 | 0.001560 |
agetty | 1.402344 | 0.001369 |
inet_gethost | 1.117188 | 0.001091 |
epmd | 0.902344 | 0.000881 |
mysqld_safe | 0.683594 | 0.000668 |
rabbitmq-server | 0.574219 | 0.000561 |
sleep | 0.289062 | 0.000282 |
start-ovsdb-ser | 0.183594 | 0.000179 |
kolla_start | 0.140625 | 0.000137 |
可疑进程排查
接下来就可以结合进程数量和进程消耗内存总量,逐个排查可疑进程