- 因 SQL 日志频繁写入导致磁盘空间不足,后续资源无法写入,进而引发错误
- 线上环境要关掉 debug,防止错误信息泄露
- 增加资源监控脚本
#!/bin/bash
##############################################
# 资源监控
##############################################
# Log file location. A LOG_FILE value already present in the environment is
# honored; otherwise the default path below is used.
LOG_FILE="${LOG_FILE:-/var/log/supervisory_control.log}"
# Size in bytes above which the log file is rotated (10 MiB).
MAX_LOG_SIZE=10485760
# Security: restrict PATH to the standard system directories.
export PATH="/usr/bin:/bin:/usr/sbin:/sbin"
#######################################
# Emit one timestamped log line on stdout and append it to the log file.
# Globals:   LOG_FILE (read), MAX_LOG_SIZE (read)
# Arguments: $1 - level tag, $2 - message text
# Outputs:   "[timestamp] [level] message" on stdout and appended to LOG_FILE
# Returns:   1 when the log directory cannot be created; otherwise the
#            status of the final echo | tee pipeline
#######################################
log_message() {
  local severity=$1
  local text=$2
  local now
  local target_dir
  local current_size

  now=$(date '+%Y-%m-%d %H:%M:%S')
  target_dir=$(dirname "$LOG_FILE")

  # Create the log directory on demand; give up on this line if that fails.
  if [ ! -d "$target_dir" ] && ! mkdir -p "$target_dir" 2>/dev/null; then
    echo "无法创建日志目录: $target_dir" >&2
    return 1
  fi

  # Rotate: once the file is larger than MAX_LOG_SIZE it is moved to
  # "<file>.old" (replacing any previous one). A failing stat counts as size 0.
  if [ -f "$LOG_FILE" ]; then
    current_size=$(stat -c%s "$LOG_FILE" 2>/dev/null || echo 0)
    if [ "$current_size" -gt "$MAX_LOG_SIZE" ]; then
      mv "$LOG_FILE" "${LOG_FILE}.old" 2>/dev/null
    fi
  fi

  # tee keeps the line visible on stdout even if the file cannot be written.
  echo "[$now] [$severity] $text" | tee -a "$LOG_FILE" 2>/dev/null
}
#######################################
# Verify that every external command in the required list is available.
# Arguments: none
# Outputs:   one log line via log_message (ERROR for the first missing
#            command, INFO when all are present)
# Returns:   does not return on failure - exits the script with status 1
#######################################
check_commands() {
  local required_tool
  for required_tool in top df free awk date curl; do
    # Present: move on to the next tool.
    command -v "$required_tool" >/dev/null 2>&1 && continue
    log_message "ERROR" "未找到命令: $required_tool"
    exit 1
  done
  log_message "INFO" "所有必需命令检查通过"
}
#######################################
# Collect the current CPU, root-filesystem, memory, time and IP readings.
# Globals written: cpu_usage, disk_usage, mem_total, mem_used, mem_usage,
#                  current_time, ip_addr
# Arguments: none
# Outputs:   DEBUG/WARN lines via log_message
# Notes:     cpu_usage, disk_usage and mem_usage are always left as
#            non-negative integers (0 when a value cannot be parsed) so that
#            later integer comparisons never see an empty or malformed value.
#######################################
get_system_info() {
  local numeric_re='^[0-9]+$'
  local mem_fields

  # CPU: the first figure on top's "Cpu(s)" summary line, i.e. the user-space
  # share (same field the script has always reported). Dropping the label up
  # to ':' and turning '%' and ',' into blanks handles "Cpu(s):  1.2%us,",
  # "%Cpu(s):  1.2 us," and the unspaced "%Cpu(s):100.0 us," alike.
  cpu_usage=$(top -bn1 \
    | awk 'index($0, "Cpu(s)") { sub(/^[^:]*:/, ""); gsub(/[%,]/, " "); print $1; exit }')
  cpu_usage=${cpu_usage%.*} # keep the integer part only
  if ! [[ "$cpu_usage" =~ $numeric_re ]]; then
    log_message "WARN" "无法解析CPU使用率,按0处理"
    cpu_usage=0
  fi
  log_message "DEBUG" "CPU使用率: ${cpu_usage}%"

  # Root filesystem usage. -P forces the POSIX one-line-per-filesystem
  # layout, so a long device name cannot push the figures onto a third line.
  disk_usage=$(df -P / | awk 'NR==2 { sub(/%/, "", $5); print $5 }')
  if ! [[ "$disk_usage" =~ $numeric_re ]]; then
    log_message "WARN" "无法解析磁盘使用率,按0处理"
    disk_usage=0
  fi
  log_message "DEBUG" "磁盘使用率: ${disk_usage}%"

  # Memory: total and used (MB) from the second line of a single free call.
  mem_fields=$(free -m | awk 'NR==2 { print $2, $3 }')
  mem_total=${mem_fields%% *}
  mem_used=${mem_fields##* }
  if ! [[ "$mem_total" =~ $numeric_re && "$mem_used" =~ $numeric_re ]]; then
    log_message "WARN" "无法解析内存信息,按0处理"
    mem_total=0
    mem_used=0
  fi
  # Integer percentage; a zero total would divide by zero, so report 0.
  if (( mem_total > 0 )); then
    mem_usage=$(( mem_used * 100 / mem_total ))
  else
    mem_usage=0
  fi
  log_message "DEBUG" "内存使用率: ${mem_usage}% (总计: ${mem_total}MB, 已用: ${mem_used}MB)"

  current_time=$(date '+%F %T')

  # Source address of the outbound route. The address is the token that
  # follows "src"; its position varies (routes without "via" are shorter),
  # so it is located by keyword rather than by field number.
  ip_addr=$(ip route get 1 2>/dev/null \
    | awk '{ for (i = 1; i < NF; i++) if ($i == "src") { print $(i + 1); exit } }')
  if [[ -z "$ip_addr" ]]; then
    ip_addr="无法获取"
    log_message "WARN" "无法获取IP地址"
  fi
  log_message "DEBUG" "IP地址: ${ip_addr}"
}
#######################################
# Post a text message to a DingTalk custom-robot webhook.
# Globals:   DINGDING_TOKEN (read) - robot access token, required
#            USER_PHONE (read)     - optional mobile number to @-mention
# Arguments: $1 - "true" to @ everyone; any other value is sent as false
#            $2 - message text; may contain literal \n sequences, which are
#                 passed through so they act as JSON newline escapes
# Outputs:   INFO/ERROR line via log_message
# Returns:   0 when the robot accepted the message, 1 otherwise
#######################################
send_dingding_message() {
  local is_at_all=$1
  local content=$2
  local dingding_token=${DINGDING_TOKEN:-}
  local user_phone=${USER_PHONE:-}

  # Without a token the request cannot succeed; fail early and say why.
  if [[ -z "$dingding_token" ]]; then
    log_message "ERROR" "钉钉消息发送失败,未配置DINGDING_TOKEN"
    return 1
  fi

  # The flag is spliced into JSON unquoted, so allow only the two literals.
  [[ "$is_at_all" == "true" ]] || is_at_all="false"

  local dingding_url="https://oapi.dingtalk.com/robot/send?access_token=${dingding_token}"

  # Minimal JSON escaping: double quotes and raw newlines. Backslashes are
  # deliberately left alone because callers embed \n escapes in the text.
  content=${content//\"/\\\"}
  content=${content//$'\n'/\\n}

  # Build a single "at" object. (Emitting the key twice lets the later,
  # phone-less object win in last-key-wins JSON parsers.)
  local at_mobiles=""
  if [[ -n "$user_phone" ]]; then
    at_mobiles="\"atMobiles\": [\"${user_phone}\"], "
  fi
  local payload
  payload=$(printf '{"msgtype": "text", "text": {"content": "%s"}, "at": {%s"isAtAll": %s}}' \
    "$content" "$at_mobiles" "$is_at_all")

  # -w appends the 3-digit HTTP status to the response body; the timeouts
  # keep an unreachable endpoint from hanging the monitoring run.
  local response
  response=$(curl -s --connect-timeout 5 --max-time 15 -w "%{http_code}" "${dingding_url}" \
    -H 'Content-Type: application/json' \
    -d "$payload" 2>/dev/null)

  local status_code=${response: -3}
  local body=${response%"$status_code"}

  # DingTalk signals rejected messages (bad token, keyword/sign mismatch,
  # rate limit) through a non-zero errcode in the body, so the HTTP status
  # alone is not enough to call the send a success.
  local ok_re='"errcode"[[:space:]]*:[[:space:]]*0([^0-9]|$)'
  if [[ "$status_code" == "200" && "$body" =~ $ok_re ]]; then
    log_message "INFO" "钉钉消息发送成功"
    return 0
  fi

  log_message "ERROR" "钉钉消息发送失败,状态码: ${status_code},响应: ${body}"
  return 1
}
#######################################
# Run one monitoring pass: verify tools, gather readings, classify them and
# send a DingTalk alert when a threshold is exceeded.
# Globals:   cpu_usage, disk_usage, mem_usage, current_time, ip_addr
#            (read after get_system_info has run)
# Arguments: none
# Outputs:   log lines via log_message; an alert via send_dingding_message
#            when any reading is above 70
#######################################
main_check() {
  log_message "INFO" "开始系统资源检查"
  check_commands
  get_system_info

  # Alert body; the \n sequences stay literal here and are interpreted by
  # the receiver of the JSON payload.
  local alert_text="exd资源监控\n资源警告!\n巡查时间:${current_time}\nIP地址:${ip_addr}\n资源状况如下:\n【CPU使用率:${cpu_usage}%】\n【磁盘使用率:${disk_usage}%】\n【内存使用率:${mem_usage}%】"
  local usage_summary="CPU: ${cpu_usage}%, 磁盘: ${disk_usage}%, 内存: ${mem_usage}%"

  # Classify: any reading above 80 is critical; otherwise any reading above
  # 70 is elevated; otherwise normal.
  local tier="normal"
  local reading
  for reading in "$cpu_usage" "$disk_usage" "$mem_usage"; do
    if [ "$reading" -gt 80 ]; then
      tier="critical"
      break
    fi
    if [ "$reading" -gt 70 ]; then
      tier="elevated"
    fi
  done

  case "$tier" in
    critical)
      # High usage: @ everyone.
      log_message "WARN" "资源占用过高 (${usage_summary}),发送严重告警"
      send_dingding_message "true" "$alert_text"
      ;;
    elevated)
      # Moderate usage: notify without @ everyone.
      log_message "WARN" "资源占用较高 (${usage_summary}),发送普通警告"
      send_dingding_message "false" "$alert_text"
      ;;
    *)
      log_message "INFO" "资源占用正常 (${usage_summary})"
      ;;
  esac

  log_message "INFO" "系统资源检查完成"
}
#######################################
# Print the help text: invocation, supported environment variables, alert
# thresholds and the log file currently in effect.
# Globals:   LOG_FILE (read)
# Arguments: none
# Outputs:   help text on stdout
#######################################
usage() {
  printf '%s\n' \
    "使用方法: $0" \
    "环境变量配置:" \
    " export DINGDING_TOKEN='你的钉钉token'" \
    " export USER_PHONE='你的手机号'" \
    " export LOG_FILE='自定义日志路径' (默认: /var/log/supervisory_control.log)" \
    "" \
    "阈值配置:" \
    " >80%: 严重警告,@所有人" \
    " 70-80%: 普通警告,不@所有人" \
    " <70%: 正常,不发送消息" \
    "" \
    "日志文件: $LOG_FILE"
}
#######################################
# Write the startup banner to the log: a separator, the start notice, the
# log file path and the rotation limit in MB.
# Globals:   LOG_FILE (read), MAX_LOG_SIZE (read)
# Arguments: none
# Outputs:   four INFO lines via log_message
#######################################
init_log() {
  local banner
  for banner in \
    "==============================================" \
    "超级监控脚本启动" \
    "日志文件: $LOG_FILE"; do
    log_message "INFO" "$banner"
  done

  # Bytes -> whole megabytes (integer division).
  local limit_mb=$(($MAX_LOG_SIZE/1024/1024))
  log_message "INFO" "最大日志大小: ${limit_mb}MB"
}
# Entry point: show the help text for -h/--help, otherwise log the startup
# banner and run a single monitoring pass.
case "${1:-}" in
  -h | --help)
    usage
    exit 0
    ;;
esac

# Nothing to do for LOG_FILE at this point: the former
# `LOG_FILE="$LOG_FILE"` block was a self-assignment with no effect, so it
# has been dropped. The value in use is whatever the configuration
# section at the top of the script assigned.
init_log
main_check
admin
No Leanote account? Sign up now.