采集数据
# 在待分析的服务器上执行命令
# 确定采集时间
# date
# 2023年 01月 17日 星期二 14:30:19 CST
# 通过ps命令及其命令行参数,将需要的数据输出为csv格式
# ps -e -o %c, -o rss --sort=-rss > mems.csv
# 原数据be like:
! cat mems.csv | head -n 5
COMMAND , RSS
qemu-system-x86,33823984
qemu-system-x86,16924120
qemu-system-x86,16866464
qemu-system-x86,8627980
导入数据
import pandas as pd
# Load the ps dump; the file is plain comma-separated with one header line.
df = pd.read_csv("mems.csv", sep=",")
# Make the index start at 1 instead of 0.
df.index += 1
# The header is written as "COMMAND , RSS", so strip the padding from the
# column labels to get plain "COMMAND" and "RSS".
df.columns = df.columns.str.strip()
# NOTE(review): only the header labels are stripped here; ps pads its columns,
# so COMMAND values may also carry trailing spaces -- confirm before matching
# on exact command names.
# Preview: 1026 processes in total.
df
| COMMAND | RSS |
| 1 | qemu-system-x86 | 33823984 |
| 2 | qemu-system-x86 | 16924120 |
| 3 | qemu-system-x86 | 16866464 |
| 4 | qemu-system-x86 | 8627980 |
| 5 | mysqld | 7683188 |
| ... | ... | ... |
| 1022 | kworker/50:1-mm | 0 |
| 1023 | kworker/55:1-mm | 0 |
| 1024 | kworker/0:0-eve | 0 |
| 1025 | kworker/47:0-i4 | 0 |
| 1026 | kworker/65:1-mm | 0 |
1026 rows × 2 columns
处理数据
# Keep only processes with a non-zero RSS; 324 processes remain.
df = df.loc[df["RSS"] > 0]
# Total RSS of those 324 processes, scaled down by 1024 twice (about 115 GB).
df["RSS"].sum() / 1024 ** 2
115.3391342163086
# Display the frame after the zero-RSS rows have been filtered out.
df
| COMMAND | RSS |
| 1 | qemu-system-x86 | 33823984 |
| 2 | qemu-system-x86 | 16924120 |
| 3 | qemu-system-x86 | 16866464 |
| 4 | qemu-system-x86 | 8627980 |
| 5 | mysqld | 7683188 |
| ... | ... | ... |
| 320 | kolla_start | 4 |
| 321 | kolla_start | 4 |
| 322 | kolla_start | 4 |
| 323 | kolla_start | 4 |
| 324 | kolla_start | 4 |
324 rows × 2 columns
# df was produced by a boolean filter of the loaded frame; take an explicit
# copy so the column assignments below act on an independent frame
# (this avoids pandas' SettingWithCopyWarning).
df = df.copy()
# RSS expressed in MB: the raw value divided by 1024.
# Vectorized Series arithmetic replaces the per-element lambda; dividing by a
# power of two is exact, so the resulting values are unchanged.
df["RSSMB"] = df["RSS"] / 1024
# RSS expressed in GB: the raw value divided by 1024 twice.
df["RSSGB"] = df["RSS"] / 1024 ** 2
# Summary statistics for all numeric columns.
df.describe()
| RSS | RSSMB | RSSGB |
| count | 3.240000e+02 | 324.000000 | 324.000000 |
| mean | 3.732773e+05 | 364.528622 | 0.355985 |
| std | 2.402061e+06 | 2345.762337 | 2.290784 |
| min | 4.000000e+00 | 0.003906 | 0.000004 |
| 25% | 3.825000e+03 | 3.735352 | 0.003648 |
| 50% | 1.116600e+04 | 10.904297 | 0.010649 |
| 75% | 1.061370e+05 | 103.649414 | 0.101220 |
| max | 3.382398e+07 | 33031.234375 | 32.257065 |
# Running total of memory use in GB, accumulated in the current row order
# (the CSV was written by ps with --sort=-rss, i.e. largest RSS first).
df["RSS_GB_CUM"] = df.RSSGB.cumsum()
# Summary statistics, now including the cumulative column.
df.describe()
| RSS | RSSMB | RSSGB | RSS_GB_CUM |
| count | 3.240000e+02 | 324.000000 | 324.000000 | 324.000000 |
| mean | 3.732773e+05 | 364.528622 | 0.355985 | 111.379911 |
| std | 2.402061e+06 | 2345.762337 | 2.290784 | 8.440401 |
| min | 4.000000e+00 | 0.003906 | 0.000004 | 32.257065 |
| 25% | 3.825000e+03 | 3.735352 | 0.003648 | 110.188729 |
| 50% | 1.116600e+04 | 10.904297 | 0.010649 | 114.734324 |
| 75% | 1.061370e+05 | 103.649414 | 0.101220 | 115.253636 |
| max | 3.382398e+07 | 33031.234375 | 32.257065 | 115.339134 |
统计分析
# Rows are ordered by memory use, largest first: the top 15 processes account
# for about 100 GB, and the remaining ~15 GB is spread over the other few
# hundred processes. Show the first 20 rows.
df.iloc[:20]
| COMMAND | RSS | RSSMB | RSSGB | RSS_GB_CUM |
| 1 | qemu-system-x86 | 33823984 | 33031.234375 | 32.257065 | 32.257065 |
| 2 | qemu-system-x86 | 16924120 | 16527.460938 | 16.140099 | 48.397163 |
| 3 | qemu-system-x86 | 16866464 | 16471.156250 | 16.085114 | 64.482277 |
| 4 | qemu-system-x86 | 8627980 | 8425.761719 | 8.228283 | 72.710560 |
| 5 | mysqld | 7683188 | 7503.113281 | 7.327259 | 80.037819 |
| 6 | qemu-system-x86 | 4412644 | 4309.222656 | 4.208225 | 84.246044 |
| 7 | ceph-osd | 3034632 | 2963.507812 | 2.894051 | 87.140095 |
| 8 | ceph-osd | 2644688 | 2582.703125 | 2.522171 | 89.662266 |
| 9 | ceph-osd | 2379332 | 2323.566406 | 2.269108 | 91.931374 |
| 10 | ceph-osd | 2281156 | 2227.691406 | 2.175480 | 94.106853 |
| 11 | ceph-osd | 1928288 | 1883.093750 | 1.838959 | 95.945812 |
| 12 | qemu-system-x86 | 1889148 | 1844.871094 | 1.801632 | 97.747444 |
| 13 | ceph-osd | 1744884 | 1703.988281 | 1.664051 | 99.411495 |
| 14 | ceph-mon | 953092 | 930.753906 | 0.908939 | 100.320435 |
| 15 | ovs-vswitchd | 731060 | 713.925781 | 0.697193 | 101.017628 |
| 16 | openstack-expor | 557808 | 544.734375 | 0.531967 | 101.549595 |
| 17 | beam.smp | 431564 | 421.449219 | 0.411572 | 101.961166 |
| 18 | ceph-mgr | 228668 | 223.308594 | 0.218075 | 102.179241 |
| 19 | systemd-journal | 207168 | 202.312500 | 0.197571 | 102.376812 |
| 20 | nova-api | 167436 | 163.511719 | 0.159679 | 102.536491 |
# Number of processes per command name, most frequent first (top 20 shown).
df["COMMAND"].value_counts().iloc[:20]
httpd 67
containerd-shim 42
kolla_start 36
neutron-server 13
nova-api 11
heat-api-cfn 6
heat-api 6
neutron-metadat 6
glance-registry 6
heat-engine 6
nova-conductor 6
glance-api 6
qemu-system-x86 6
ceph-osd 6
bash 4
dnsmasq 4
sleep 4
sshd 3
haproxy 3
keepalived 3
Name: COMMAND, dtype: int64
# Lift the row-display limit so the whole grouped table is printed.
pd.set_option("display.max_rows", None)
# Total memory per command name (MB and GB), largest consumers first.
(
    df.groupby("COMMAND")[["RSSMB", "RSSGB"]]
    .sum()
    .sort_values(by="RSSMB", ascending=False)
)
| RSSMB | RSSGB |
| COMMAND | | |
| qemu-system-x86 | 80609.707031 | 78.720417 |
| ceph-osd | 13684.550781 | 13.363819 |
| mysqld | 7503.113281 | 7.327259 |
| httpd | 3424.902344 | 3.344631 |
| neutron-server | 1675.730469 | 1.636456 |
| nova-api | 1672.753906 | 1.633549 |
| ceph-mon | 930.753906 | 0.908939 |
| ovs-vswitchd | 713.925781 | 0.697193 |
| glance-api | 682.386719 | 0.666393 |
| nova-conductor | 674.988281 | 0.659168 |
| heat-engine | 596.457031 | 0.582478 |
| openstack-expor | 586.960938 | 0.573204 |
| glance-registry | 523.085938 | 0.510826 |
| neutron-metadat | 504.976562 | 0.493141 |
| heat-api | 479.804688 | 0.468559 |
| heat-api-cfn | 421.453125 | 0.411575 |
| beam.smp | 421.449219 | 0.411572 |
| cinder-volume | 381.066406 | 0.372135 |
| ceph-mgr | 223.308594 | 0.218075 |
| systemd-journal | 202.312500 | 0.197571 |
| containerd-shim | 184.539062 | 0.180214 |
| nova-compute | 147.371094 | 0.143917 |
| neutron-openvsw | 117.742188 | 0.114983 |
| nova-scheduler | 111.031250 | 0.108429 |
| cinder-schedule | 108.925781 | 0.106373 |
| cinder-backup | 104.390625 | 0.101944 |
| nova-consoleaut | 103.605469 | 0.101177 |
| neutron-l3-agen | 94.960938 | 0.092735 |
| neutron-dhcp-ag | 94.824219 | 0.092602 |
| neutron-sriov-n | 88.132812 | 0.086067 |
| nova-novncproxy | 87.554688 | 0.085503 |
| dockerd | 85.347656 | 0.083347 |
| memcached | 77.582031 | 0.075764 |
| hs_alarm | 74.902344 | 0.073147 |
| fluentd | 61.457031 | 0.060017 |
| skydive | 57.851562 | 0.056496 |
| containerd | 47.425781 | 0.046314 |
| libvirtd | 43.031250 | 0.042023 |
| hserver | 40.523438 | 0.039574 |
| privsep-helper | 35.242188 | 0.034416 |
| vm_exporter.py | 29.085938 | 0.028404 |
| samd | 27.835938 | 0.027184 |
| syslog-ng | 27.675781 | 0.027027 |
| tuned | 26.246094 | 0.025631 |
| systemd | 22.609375 | 0.022079 |
| ha_monitor | 20.277344 | 0.019802 |
| NetworkManager | 16.488281 | 0.016102 |
| sshd | 16.351562 | 0.015968 |
| polkitd | 15.285156 | 0.014927 |
| sssd_nss | 14.851562 | 0.014503 |
| node_exporter | 13.394531 | 0.013081 |
| keepalived | 12.402344 | 0.012112 |
| haproxy | 11.812500 | 0.011536 |
| ceph-crash | 11.773438 | 0.011497 |
| sssd_be | 11.445312 | 0.011177 |
| sssd | 10.937500 | 0.010681 |
| systemd-udevd | 10.050781 | 0.009815 |
| ovsdb-client | 8.867188 | 0.008659 |
| lldpd | 8.386719 | 0.008190 |
| systemd-logind | 8.046875 | 0.007858 |
| ovsdb-server | 7.210938 | 0.007042 |
| systemd-machine | 7.058594 | 0.006893 |
| registry | 6.867188 | 0.006706 |
| bash | 6.175781 | 0.006031 |
| rngd | 5.820312 | 0.005684 |
| smartd | 5.695312 | 0.005562 |
| dnsmasq | 5.585938 | 0.005455 |
| dbus-daemon | 5.546875 | 0.005417 |
| crond | 5.527344 | 0.005398 |
| monitor | 5.132812 | 0.005013 |
| rpcbind | 5.093750 | 0.004974 |
| ps | 4.968750 | 0.004852 |
| auditd | 4.738281 | 0.004627 |
| irqbalance | 4.546875 | 0.004440 |
| (sd-pam) | 4.425781 | 0.004322 |
| gssproxy | 2.980469 | 0.002911 |
| docker-proxy | 2.750000 | 0.002686 |
| chronyd | 2.425781 | 0.002369 |
| haproxy-systemd | 2.003906 | 0.001957 |
| lsmd | 1.875000 | 0.001831 |
| prom_exporter | 1.597656 | 0.001560 |
| agetty | 1.402344 | 0.001369 |
| inet_gethost | 1.117188 | 0.001091 |
| epmd | 0.902344 | 0.000881 |
| mysqld_safe | 0.683594 | 0.000668 |
| rabbitmq-server | 0.574219 | 0.000561 |
| sleep | 0.289062 | 0.000282 |
| start-ovsdb-ser | 0.183594 | 0.000179 |
| kolla_start | 0.140625 | 0.000137 |
可疑进程排查
接下来就可以结合进程数量和进程消耗内存总量,逐个排查可疑进程