# Files
# ServerGuard/config/config.yaml
# 2026-03-02 14:14:40 +08:00
#
# 122 lines
# 2.7 KiB
# YAML

# ServerGuard configuration file
# Application settings
app:
  name: "ServerGuard"
  version: "1.0.0"
  description: "服务器硬件健康诊断系统"
# Logging settings
logging:
  level: INFO  # DEBUG, INFO, WARNING, ERROR
  file: "/var/log/serverguard.log"
  max_size_mb: 100
  backup_count: 5
  console_output: true
# Report settings
report:
  default_format: "text"  # text, json, csv, html
  output_directory: "./reports"
  include_timestamp: true
  max_report_size_mb: 10
# Detection module settings
modules:
  # CPU check settings
  cpu:
    enabled: true
    temperature_warning: 85  # temperature warning threshold (degrees Celsius)
    temperature_critical: 95  # temperature critical threshold (degrees Celsius)
    stress_test:
      duration_seconds: 300  # stress test duration
    # NOTE(review): check_mce is assumed to be a sibling of stress_test
    # (a cpu-level option), not a stress_test option — confirm.
    check_mce: true  # check for MCE errors

  # Memory check settings
  memory:
    enabled: true
    memtester:
      enabled: true
      memory_percent: 70  # percentage of available memory to use for the test
    stress_test:
      duration_seconds: 300
    # NOTE(review): check_ecc is assumed to be a sibling of stress_test
    # (a memory-level option), not a stress_test option — confirm.
    check_ecc: true  # check for ECC errors

  # Storage check settings
  storage:
    enabled: true
    smart_check: true
    check_reallocated_sectors: true
    reallocated_threshold: 1  # reallocated-sector warning threshold
    temperature_warning: 60  # disk temperature warning threshold
    temperature_critical: 70  # disk temperature critical threshold
    run_io_test: false  # whether to run the I/O performance test (time-consuming)
    io_test_size_mb: 100
    check_raid: true  # check RAID status

  # Sensor check settings
  sensors:
    enabled: true
    lm_sensors: true
    ipmi: true
    check_fans: true
    fan_min_rpm: 500  # minimum fan speed warning threshold
    voltage_tolerance: 0.1  # voltage deviation tolerance (as a ratio)

  # GPU check settings
  gpu:
    enabled: true
    check_nvidia: true
    check_amd: true
    check_intel: true
    temperature_warning: 85

  # Log analysis settings
  log_analyzer:
    enabled: true
    check_dmesg: true
    check_journalctl: true
    max_lines: 5000
    lookback_days: 7  # analyze logs from the last N days
# Alert settings
alerts:
  enabled: false
  smtp:
    host: ""
    port: 587
    username: ""
    # Keep real credentials out of version control.
    password: ""
    use_tls: true
    # NOTE(review): from_address and to_addresses are assumed to belong to
    # the smtp section — confirm against the config loader.
    from_address: "serverguard@example.com"
    to_addresses: []
  webhook:
    enabled: false
    url: ""
    headers: {}
  # Alert thresholds
  # NOTE(review): thresholds is assumed to be nested under alerts rather
  # than being a top-level key — confirm against the config loader.
  thresholds:
    cpu_temperature: 85
    memory_usage_percent: 90
    disk_usage_percent: 90
    hardware_error_count: 1
# Stress test settings (full diagnostic mode)
stress_test:
  cpu:
    enabled: true
    workers: 0  # 0 means use all cores
    timeout_seconds: 300
  memory:
    enabled: true
    workers: 4
    timeout_seconds: 300
  io:
    enabled: false  # I/O stress testing can be dangerous; disabled by default
    workers: 4
    timeout_seconds: 300