#!/usr/bin/env bash
# Help text. English rendering of the description line inside the string:
# "System resource monitor: records cpu/memory/load/gpu/disk, stores the
# readings in influxdb, and emails the administrator when a configured limit
# is exceeded."
# NOTE(review): $usage is not printed anywhere in the visible part of the
# script, and "\n" inside double quotes stays a literal backslash-n.
usage="usage: $0 \n
系统资源监控: 记录cpu\memory\load\gpu\disk, 入库influxdb, 当超过规定数值时发电邮通知管理员
recommend: run by crontab
"
# Host name; CustomSendMail puts it in the mail subject.
HOST=$(hostname)
# Log directory and per-metric log files.
# NOTE(review): ROOT, the *_LOG paths and the INFLUXDB_* settings are not
# referenced by the visible code — confirm whether they are still needed.
ROOT='/var/log/monitor'
INFLUXDB_HOST='127.0.0.1'
INFLUXDB_PORT=8086
CPU_LOG="${ROOT}/cpu.log"
MEM_LOG="${ROOT}/mem.log"
LOAD_LOG="${ROOT}/load.log"
GPU_LOG="${ROOT}/gpu.log"
RENDERER_LOG="${ROOT}/renderer.log"
# Address handed to CustomSendMail by every Get* alert.
NOTICE_EMAIL='admin@admin.com'
# "Remark" files: each holds the epoch timestamp written when an alert was
# sent. GetRemark reads them so an alert is not repeated while the remark is
# still fresh.
CPU_REMARK='/tmp/servermonitor_cpu.remark'
MEM_REMARK='/tmp/servermonitor_mem.remark'
LOAD_REMARK='/tmp/servermonitor_loadaverage.remark'
GPU_REMARK='/tmp/servermonitor_gpu.remark'
# Maximum age of a remark, in seconds (compared against NOW minus the stored
# epoch timestamp in GetRemark).
REMARK_EXPIRE=3600
# Epoch seconds captured once at script start; used for remark expiry.
NOW=$(date +%s)
# Alert thresholds.
#   LIMIT_CPU  - compared with the used-CPU percentage computed by GetCpu
#   LIMIT_MEM  - GetMem threshold; the alert text presents it as a percentage
#   LIMIT_LOAD - compared with the 1-minute load average in GetLoad
#   LIMIT_GPU  - compared with the average GPU utilisation in GetGpu
LIMIT_CPU=8
LIMIT_MEM=80
LIMIT_LOAD=35
LIMIT_GPU=80
# Comma-separated list of patterns that GetStorage greps for in `df -h` output.
FDISK='/dev/sda5'
#######################################
# Report CPU utilisation and mail an alert when it exceeds LIMIT_CPU.
# Runs `vmstat 1 5`, averages the idle column (field 15) over the sample rows
# and prints 100 minus that average.
# Globals:   CPU_REMARK, LIMIT_CPU, NOTICE_EMAIL (read)
# Arguments: none
# Outputs:   used CPU percentage (integer) on stdout; diagnostics on stderr
# Returns:   1 when vmstat yields no usable sample rows, 0 otherwise
#######################################
function GetCpu() {
  local cpufree cpuused remark

  # Rows 1-2 of vmstat are headers. Average over the rows actually received
  # and print nothing when there are none: otherwise a missing or failing
  # vmstat would be read as "0% idle" (100% used) and raise a false alert.
  cpufree=$(vmstat 1 5 \
    | awk 'NR >= 3 && $15 ~ /^[0-9]+$/ { sum += $15; n++ }
           END { if (n > 0) print int(sum / n) }')
  if [[ ! "$cpufree" =~ ^[0-9]+$ ]]; then
    echo "GetCpu: could not read CPU idle time from vmstat" >&2
    return 1
  fi

  cpuused=$((100 - cpufree))
  echo "$cpuused"

  # Only alert when no unexpired remark exists, then record the alert time.
  remark=$(GetRemark "${CPU_REMARK}")
  if [ -z "$remark" ] && [ "$cpuused" -gt "${LIMIT_CPU}" ]; then
    echo "CPU uses more than ${LIMIT_CPU}%" | CustomSendMail "${NOTICE_EMAIL}"
    date +%s > "$CPU_REMARK"
  fi
}
#######################################
# Report memory usage in MiB and mail an alert when the used share exceeds
# LIMIT_MEM percent.
# Parses `free -m`. When the legacy "-/+ buffers/cache" row is present its
# used/free columns are reported (as before). Newer procps-ng output has no
# such row, so the "Mem:" row is used instead and "free" is reported as
# total minus used.
# Globals:   MEM_REMARK, LIMIT_MEM, NOTICE_EMAIL (read)
# Arguments: none
# Outputs:   "<total> <used> <free>" on stdout; diagnostics on stderr
# Returns:   1 when the output of free cannot be parsed, 0 otherwise
#######################################
function GetMem() {
  local stats mem_used mem_free total percent remark

  # LC_ALL=C keeps the row labels ("Mem:") untranslated.
  stats=$(LC_ALL=C free -m | awk '
    $1 == "Mem:" { total = $2; used = $3; have_mem = 1 }
    $1 == "-/+"  { legacy_used = $3; legacy_free = $4; have_legacy = 1 }
    END {
      if (have_legacy)   print legacy_used, legacy_free
      else if (have_mem) print used, total - used
    }')
  read -r mem_used mem_free <<< "$stats"
  if [[ ! "$mem_used" =~ ^[0-9]+$ || ! "$mem_free" =~ ^[0-9]+$ ]]; then
    echo "GetMem: could not parse the output of free" >&2
    return 1
  fi

  total=$((mem_used + mem_free))
  echo "${total} ${mem_used} ${mem_free}"

  # LIMIT_MEM is a percentage, so compare the used share, not raw megabytes.
  percent=0
  if (( total > 0 )); then
    percent=$((mem_used * 100 / total))
  fi

  # Only alert when no unexpired remark exists, then record the alert time.
  remark=$(GetRemark "${MEM_REMARK}")
  if [ -z "$remark" ] && [ "$percent" -gt "${LIMIT_MEM}" ]; then
    echo "Memory uses more than ${LIMIT_MEM}%" | CustomSendMail "${NOTICE_EMAIL}"
    date +%s > "$MEM_REMARK"
  fi
}
#######################################
# Report the 1/5/15-minute load averages and mail an alert when the 1-minute
# value exceeds LIMIT_LOAD.
# Globals:   LOAD_REMARK, LIMIT_LOAD, NOTICE_EMAIL (read)
# Arguments: none
# Outputs:   "<1min> <5min> <15min>" on stdout; diagnostics on stderr
# Returns:   1 when the load averages cannot be parsed from uptime, 0 otherwise
#######################################
function GetLoad() {
  local load m1 m5 m15 remark

  # LC_ALL=C keeps "." as the decimal mark and ", " as the separator.
  load=$(LC_ALL=C uptime | awk -F 'load average: ' '{print $2}')
  IFS=', ' read -r m1 m5 m15 <<< "$load"
  if [[ ! "$m1" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
    echo "GetLoad: could not parse load averages from uptime" >&2
    return 1
  fi
  echo "${m1} ${m5} ${m15}"

  # Load averages are decimals; `[ -gt ]` only accepts integers and would
  # error out on every run, so the comparison is delegated to awk.
  remark=$(GetRemark "${LOAD_REMARK}")
  if [ -z "$remark" ] \
    && awk -v load="$m1" -v limit="${LIMIT_LOAD}" \
      'BEGIN { exit !(load + 0 > limit + 0) }'; then
    echo "Load Average more than ${LIMIT_LOAD}" | CustomSendMail "${NOTICE_EMAIL}"
    date +%s > "$LOAD_REMARK"
  fi
}
#######################################
# Report per-GPU utilisation and mail an alert when the average exceeds
# LIMIT_GPU.
# Globals:   GPU_REMARK, LIMIT_GPU, NOTICE_EMAIL (read)
#            gpu_array, params (written)
# Arguments: none
# Outputs:   "<sum>/<count>" on the first line, then the raw per-GPU values
#            followed by the integer average; diagnostics on stderr
# Returns:   1 when no numeric utilisation value was obtained, 0 otherwise
#######################################
function GetGpu() {
  local used value no total_used total_num gpu_average remark

  # Drop the CSV header row and keep the number in front of the " %" unit.
  used=$(nvidia-smi --query-gpu=utilization.gpu --format=csv \
    | grep -v 'utilization' \
    | awk '{print $1}')

  no=0
  total_used=0
  total_num=0
  # NOTE(review): gpu_array and params are filled in but never read in this
  # function; they are left global in case other code consumes them — confirm.
  params=""
  while read -r value; do
    # Skip blanks and non-numeric placeholders so the arithmetic cannot fail.
    [[ "$value" =~ ^[0-9]+$ ]] || continue
    gpu_array[no]=${value}
    params="${params},used${no}=${value}"
    no=$((no + 1))
    total_used=$((total_used + value))
    total_num=$((total_num + 1))
  done <<< "$used"

  echo "$total_used/$total_num"
  # Without this guard the average below would be a division by zero.
  if (( total_num == 0 )); then
    echo "GetGpu: nvidia-smi returned no GPU utilisation values" >&2
    return 1
  fi
  gpu_average=$((total_used / total_num))
  echo "${used} $gpu_average"

  # Only alert when no unexpired remark exists, then record the alert time in
  # the file named by GPU_REMARK (not in a literal file called "GPU_REMARK").
  remark=$(GetRemark "${GPU_REMARK}")
  if [ -z "$remark" ] && [ "${gpu_average}" -gt "${LIMIT_GPU}" ]; then
    echo "GPU more than ${LIMIT_GPU}%" | CustomSendMail "${NOTICE_EMAIL}"
    date +%s > "${GPU_REMARK}"
  fi
}
#######################################
# Print one "total=..,per=..,used=.." line per entry of the comma-separated
# FDISK list, taken from the `df -h` rows that match the entry.
# Globals:   FDISK (read), params (written), IFS (changed and restored)
# Arguments: none
# Outputs:   one line per FDISK entry on stdout
#######################################
function GetStorage() {
  local saved_ifs device
  local -a devices

  # Split FDISK on "," by switching IFS just for the array assignment.
  saved_ifs="$IFS"
  IFS=","
  devices=($FDISK)
  IFS="$saved_ifs"

  # The expansions below are left unquoted on purpose so that word splitting
  # behaves exactly as before; params is deliberately not declared local.
  for device in ${devices[@]}; do
    params=$(
      df -h \
        | grep "$device" \
        | awk '{print "total="$2",per="$5",used="$3}' \
        | sed -e 's/%//' -e 's/G//g'
    )
    echo $params
  done
}
#######################################
# Read the alert timestamp stored in a remark file.
# A remark older than REMARK_EXPIRE seconds (relative to NOW), or one whose
# content is not a plain non-negative integer, is deleted and treated as absent.
# Globals:   NOW, REMARK_EXPIRE (read)
# Arguments: $1 - path of the remark file
# Outputs:   the stored epoch timestamp, or an empty line when there is no
#            valid, unexpired remark
#######################################
function GetRemark() {
  local remark=""

  if [ -f "$1" ] && [ -s "$1" ]; then
    remark=$(< "$1")
    # The file sits at a predictable /tmp path, so its content must never
    # reach arithmetic evaluation unvalidated: text such as 'x[$(cmd)]' would
    # be executed there. 10# forces base 10 so a leading zero is not octal.
    if [[ ! "$remark" =~ ^[0-9]+$ ]] \
      || (( NOW - 10#$remark > REMARK_EXPIRE )); then
      rm -f -- "$1"
      remark=""
    fi
  fi

  echo "$remark"
}
#######################################
# Mail the text read from stdin to a recipient through sendmail.
# Globals:   HOST (read)
# Arguments: $1 - recipient address
# Inputs:    message body on stdin
# Returns:   1 when no recipient is given, otherwise the status of sendmail
#######################################
function CustomSendMail() {
  local recipient=$1

  if [ -z "$recipient" ]; then
    echo "CustomSendMail: missing recipient address" >&2
    return 1
  fi

  # sendmail takes recipients as arguments and reads headers, a blank line and
  # the body from stdin; the subject therefore belongs in a header, not on the
  # command line. The subject text (host, first argument, timestamp) is kept
  # as it was composed before.
  {
    printf 'To: %s\n' "$recipient"
    printf 'Subject: %s %s %s\n' \
      "${HOST}" "$recipient" "$(date '+%Y-%m-%d %H:%M:%S')"
    printf '\n'
    cat
  } | sendmail "$recipient"
}
# Gather every metric first (each collector may also send an alert mail),
# then print one timestamped line per metric.
cpuinfo=$(GetCpu)
meminfo=$(GetMem)
loadinfo=$(GetLoad)
gpuinfo=$(GetGpu)
storageinfo=$(GetStorage)

# printf reuses the format for each (timestamp, label, value) triple.
printf '[%s] %s: %s\n' \
  "$(date)" cpu "${cpuinfo}" \
  "$(date)" mem "${meminfo}" \
  "$(date)" load "${loadinfo}" \
  "$(date)" gpu "${gpuinfo}" \
  "$(date)" storage "${storageinfo}"