---
# ServerGuard configuration file
#
# NOTE(review): the nesting below was reconstructed from a flattened copy of
# this file; confirm section placement against the config loader.

# Application settings
app:
  name: 'ServerGuard'
  version: '1.0.0'
  description: '服务器硬件健康诊断系统'

# Logging settings
logging:
  # One of: DEBUG, INFO, WARNING, ERROR
  level: INFO
  file: '/var/log/serverguard.log'
  max_size_mb: 100
  backup_count: 5
  console_output: true

# Report settings
report:
  # One of: text, json, csv, html
  default_format: 'text'
  output_directory: './reports'
  include_timestamp: true
  max_report_size_mb: 10

# Detection module settings
modules:
  # CPU checks
  cpu:
    enabled: true
    # Temperature warning threshold (degrees Celsius)
    temperature_warning: 85
    # Temperature critical threshold (degrees Celsius)
    temperature_critical: 95
    stress_test:
      # Stress test duration
      duration_seconds: 300
    # Check for MCE errors
    check_mce: true

  # Memory checks
  memory:
    enabled: true
    memtester:
      enabled: true
      # Percentage of available memory to use for the test
      memory_percent: 70
    stress_test:
      duration_seconds: 300
    # Check for ECC errors
    check_ecc: true

  # Storage checks
  storage:
    enabled: true
    smart_check: true
    check_reallocated_sectors: true
    # Reallocated-sector warning threshold
    reallocated_threshold: 1
    # Disk temperature warning threshold
    temperature_warning: 60
    # Disk temperature critical threshold
    temperature_critical: 70
    # Whether to run the I/O performance test (time-consuming)
    run_io_test: false
    io_test_size_mb: 100
    # Check RAID status
    check_raid: true

  # Sensor checks
  sensors:
    enabled: true
    lm_sensors: true
    ipmi: true
    check_fans: true
    # Minimum fan speed warning threshold
    fan_min_rpm: 500
    # Voltage deviation tolerance (as a ratio)
    voltage_tolerance: 0.1

  # GPU checks
  gpu:
    enabled: true
    check_nvidia: true
    check_amd: true
    check_intel: true
    temperature_warning: 85

  # Log analysis
  # NOTE(review): placed under `modules` by inference — confirm.
  log_analyzer:
    enabled: true
    check_dmesg: true
    check_journalctl: true
    max_lines: 5000
    # Analyze logs from the most recent N days
    lookback_days: 7

# Alert settings
alerts:
  enabled: false

  smtp:
    host: ''
    port: 587
    username: ''
    password: ''
    use_tls: true
    from_address: 'serverguard@example.com'
    to_addresses: []

  webhook:
    enabled: false
    url: ''
    headers: {}

  # Alert thresholds
  # NOTE(review): placed under `alerts` by inference — confirm.
  thresholds:
    cpu_temperature: 85
    memory_usage_percent: 90
    disk_usage_percent: 90
    hardware_error_count: 1

# Stress test settings (full diagnostic mode)
stress_test:
  cpu:
    enabled: true
    # 0 means use all cores
    workers: 0
    timeout_seconds: 300

  memory:
    enabled: true
    workers: 4
    timeout_seconds: 300

  io:
    # I/O stress testing can be dangerous; disabled by default
    enabled: false
    workers: 4
    timeout_seconds: 300