---
# ServerGuard configuration file

# Application settings
app:
  name: "ServerGuard"
  version: "1.0.0"
  description: "服务器硬件健康诊断系统"

# Logging settings
logging:
  level: INFO  # DEBUG, INFO, WARNING, ERROR
  file: "/var/log/serverguard.log"
  max_size_mb: 100
  backup_count: 5
  console_output: true

# Report settings
report:
  default_format: "text"  # text, json, csv, html
  output_directory: "./reports"
  include_timestamp: true
  max_report_size_mb: 10

# Detection module settings
modules:
  # CPU check settings
  cpu:
    enabled: true
    temperature_warning: 85  # temperature warning threshold (degrees Celsius)
    temperature_critical: 95  # temperature critical threshold (degrees Celsius)
    stress_test:
      duration_seconds: 300  # stress test duration
    # NOTE(review): nesting was lost in the flattened source; check_mce is
    # placed at module level - confirm it does not belong under stress_test.
    check_mce: true  # check for MCE errors

  # Memory check settings
  memory:
    enabled: true
    memtester:
      enabled: true
      memory_percent: 70  # percentage of available memory used for the test
    stress_test:
      duration_seconds: 300
    # NOTE(review): nesting was lost in the flattened source; check_ecc is
    # placed at module level - confirm it does not belong under stress_test.
    check_ecc: true  # check for ECC errors

  # Storage check settings
  storage:
    enabled: true
    smart_check: true
    check_reallocated_sectors: true
    reallocated_threshold: 1  # reallocated-sector warning threshold
    temperature_warning: 60  # disk temperature warning threshold
    temperature_critical: 70  # disk temperature critical threshold
    run_io_test: false  # whether to run the I/O performance test (slow)
    io_test_size_mb: 100
    check_raid: true  # check RAID status

  # Sensor check settings
  sensors:
    enabled: true
    lm_sensors: true
    ipmi: true
    check_fans: true
    fan_min_rpm: 500  # minimum fan speed warning threshold
    voltage_tolerance: 0.1  # voltage deviation tolerance (ratio)

  # GPU check settings
  gpu:
    enabled: true
    check_nvidia: true
    check_amd: true
    check_intel: true
    temperature_warning: 85

  # Log analysis settings
  log_analyzer:
    enabled: true
    check_dmesg: true
    check_journalctl: true
    max_lines: 5000
    lookback_days: 7  # how many recent days of logs to analyze

# Alert settings
alerts:
  enabled: false
  smtp:
    host: ""
    port: 587
    username: ""
    # NOTE(review): avoid committing a real password here; prefer supplying
    # credentials from the environment or a secret store.
    password: ""
    use_tls: true
    from_address: "serverguard@example.com"
    to_addresses: []

  webhook:
    enabled: false
    url: ""
    headers: {}

  # Alert thresholds
  # NOTE(review): nesting was lost in the flattened source; thresholds is
  # placed under alerts - confirm it is not meant to be a top-level key.
  thresholds:
    cpu_temperature: 85
    memory_usage_percent: 90
    disk_usage_percent: 90
    hardware_error_count: 1

# Stress test settings (full diagnostic mode)
stress_test:
  cpu:
    enabled: true
    workers: 0  # 0 means use all cores
    timeout_seconds: 300

  memory:
    enabled: true
    workers: 4
    timeout_seconds: 300

  io:
    enabled: false  # I/O stress testing can be dangerous; disabled by default
    workers: 4
    timeout_seconds: 300