first commit

This commit is contained in:
zj
2026-03-02 14:14:40 +08:00
commit c4f4fefa0a
20 changed files with 6037 additions and 0 deletions

54
.gitignore vendored Normal file
View File

@@ -0,0 +1,54 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# 虚拟环境
venv/
env/
ENV/
.venv/
# IDE
.vscode/
.idea/
*.swp
*.swo
*~
# 日志和报告
*.log
reports/
*.json
*.csv
*.html
# 配置文件(可能包含敏感信息)
config/local_config.yaml
config/secrets.yaml
# 操作系统
.DS_Store
Thumbs.db
# 临时文件
tmp/
temp/
*.tmp

111
README.md Normal file
View File

@@ -0,0 +1,111 @@
# ServerGuard - 服务器硬件健康诊断系统
ServerGuard 是一款基于 Python 的 Linux 命令行工具,用于诊断服务器硬件(CPU、内存、存储、电源、显卡等)的潜在故障。
## 功能特性
- **硬件信息概览**:收集 CPU、内存、主板、存储、显卡等详细信息
- **CPU 检测**:温度监控、MCE 错误检查、压力测试
- **内存检测**:DIMM 信息、ECC 状态检查、内存压力测试
- **存储检测**:SMART 数据分析、I/O 性能测试、RAID 状态检查
- **传感器监控**:电压、风扇转速、温度监控(支持 IPMI)
- **显卡检测**:GPU 信息、温度、驱动状态检查
- **日志分析**:自动扫描系统日志中的硬件错误
- **报告生成**:支持 JSON、CSV、纯文本、HTML 格式
## 安装
### 系统要求
- Python 3.8+
- Linux 操作系统
- root 权限(大多数硬件诊断功能需要)
### 安装系统依赖
**Debian/Ubuntu:**
```bash
sudo apt update
sudo apt install -y lshw dmidecode smartmontools lm-sensors stress-ng memtester ipmitool edac-utils fio mdadm pciutils usbutils
```
**CentOS/RHEL:**
```bash
sudo yum install -y lshw dmidecode smartmontools lm_sensors stress-ng memtester OpenIPMI edac-utils fio mdadm pciutils usbutils
```
### 安装 Python 依赖
```bash
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
```
## 使用方法
### 快速检测(非侵入性)
```bash
sudo python3 main.py --quick
```
### 全面诊断(包含压力测试)
```bash
sudo python3 main.py --full
```
### 运行特定模块
```bash
sudo python3 main.py --module cpu
sudo python3 main.py --module memory
sudo python3 main.py --module storage
```
### 生成不同格式的报告
```bash
sudo python3 main.py --full --format json --output report.json
sudo python3 main.py --full --format html --output report.html
```
### 查看帮助
```bash
python3 main.py --help
```
## 项目结构
```
ServerGuard/
├── main.py # 程序入口和核心调度器
├── utils.py # 通用工具库
├── reporter.py # 报告生成模块
├── requirements.txt # Python 依赖
├── README.md # 项目说明
├── config/
│ └── config.yaml # 配置文件
├── modules/
│ ├── __init__.py
│ ├── system_info.py # 系统信息概览
│ ├── cpu.py # CPU 检测
│ ├── memory.py # 内存检测
│ ├── storage.py # 存储检测
│ ├── sensors.py # 传感器监控
│ ├── gpu.py # 显卡检测
│ └── log_analyzer.py # 日志分析
└── tests/ # 测试文件
```
## 注意事项
1. **权限要求**:大多数硬件诊断功能需要 root 权限运行
2. **压力测试**:全面诊断中的压力测试会占用大量系统资源,建议在维护窗口期进行
3. **数据安全**:存储设备坏块扫描可能破坏数据,请谨慎使用
## 许可证
MIT License

121
config/config.yaml Normal file
View File

@@ -0,0 +1,121 @@
# ServerGuard 配置文件
# 应用设置
app:
name: "ServerGuard"
version: "1.0.0"
description: "服务器硬件健康诊断系统"
# 日志设置
logging:
level: INFO # DEBUG, INFO, WARNING, ERROR
file: "/var/log/serverguard.log"
max_size_mb: 100
backup_count: 5
console_output: true
# 报告设置
report:
default_format: "text" # text, json, csv, html
output_directory: "./reports"
include_timestamp: true
max_report_size_mb: 10
# 检测模块设置
modules:
# CPU 检测设置
cpu:
enabled: true
temperature_warning: 85 # 温度警告阈值(摄氏度)
temperature_critical: 95 # 温度危险阈值(摄氏度)
stress_test:
duration_seconds: 300 # 压力测试持续时间
check_mce: true # 检查 MCE 错误
# 内存检测设置
memory:
enabled: true
memtester:
enabled: true
memory_percent: 70 # 使用可用内存的百分比进行测试
stress_test:
duration_seconds: 300
check_ecc: true # 检查 ECC 错误
# 存储检测设置
storage:
enabled: true
smart_check: true
check_reallocated_sectors: true
reallocated_threshold: 1 # 重映射扇区警告阈值
temperature_warning: 60 # 硬盘温度警告阈值
temperature_critical: 70 # 硬盘温度危险阈值
run_io_test: false # 是否运行 I/O 性能测试(耗时)
io_test_size_mb: 100
check_raid: true # 检查 RAID 状态
# 传感器检测设置
sensors:
enabled: true
lm_sensors: true
ipmi: true
check_fans: true
fan_min_rpm: 500 # 风扇最低转速警告阈值
voltage_tolerance: 0.1 # 电压偏差容忍度(比例)
# GPU 检测设置
gpu:
enabled: true
check_nvidia: true
check_amd: true
check_intel: true
temperature_warning: 85
# 日志分析设置
log_analyzer:
enabled: true
check_dmesg: true
check_journalctl: true
max_lines: 5000
lookback_days: 7 # 分析最近几天的日志
# 告警设置
alerts:
enabled: false
smtp:
host: ""
port: 587
username: ""
password: ""
use_tls: true
from_address: "serverguard@example.com"
to_addresses: []
webhook:
enabled: false
url: ""
headers: {}
# 告警阈值
thresholds:
cpu_temperature: 85
memory_usage_percent: 90
disk_usage_percent: 90
hardware_error_count: 1
# 压力测试设置(全面诊断模式)
stress_test:
cpu:
enabled: true
workers: 0 # 0 表示使用所有核心
timeout_seconds: 300
memory:
enabled: true
workers: 4
timeout_seconds: 300
io:
enabled: false # I/O 压力测试可能很危险,默认关闭
workers: 4
timeout_seconds: 300

280
install.sh Executable file
View File

@@ -0,0 +1,280 @@
#!/bin/bash
# ServerGuard installer.
# Supports Debian/Ubuntu and CentOS/RHEL.

# Banner.
printf '%s\n' \
  "========================================" \
  "ServerGuard 安装脚本" \
  "========================================" \
  ""

# The installer must run as root; stop early otherwise.
if (( EUID != 0 )); then
  printf '%s\n' "错误: 请以 root 权限运行此脚本" " sudo ./install.sh"
  exit 1
fi

# Identify the distribution from /etc/os-release; without it we cannot
# choose between the Debian and Red Hat code paths below.
if [[ ! -f /etc/os-release ]]; then
  echo "无法检测操作系统类型"
  exit 1
fi
. /etc/os-release
OS=$NAME
VER=$VERSION_ID
printf '%s\n' "检测到操作系统: $OS $VER" ""

# Space-separated list of packages that failed to install.
FAILED_PACKAGES=""
#######################################
# Install a single package with the requested package manager.
# Globals:   FAILED_PACKAGES (appended to when the install fails)
# Arguments: $1 - package name
#            $2 - "apt" selects apt-get; any other value selects yum
# Returns:   0 when the package manager succeeds, 1 otherwise
#######################################
install_package() {
  local name=$1
  local manager=$2

  case "$manager" in
    apt)
      if apt-get install -y "$name" 2>/dev/null; then
        return 0
      fi
      ;;
    *)
      if yum install -y "$name" 2>/dev/null; then
        return 0
      fi
      ;;
  esac

  # Remember the failure so the caller can print a summary at the end.
  FAILED_PACKAGES="$FAILED_PACKAGES $name"
  return 1
}
#######################################
# Install the system packages ServerGuard uses on Debian/Ubuntu.
# Core packages are reported with a warning when they fail, optional ones
# with a notice; neither aborts the installer.
# Globals:   none written here (install_package records failures)
# Arguments: none
# Outputs:   progress and warning messages on stdout
#######################################
install_debian_deps() {
  # Function-local arrays: the package lists no longer leak into the
  # global scope as CORE_PKGS / OPTIONAL_PKGS.
  local -a core_pkgs=(
    lshw dmidecode smartmontools lm-sensors ipmitool mdadm pciutils
    usbutils util-linux coreutils grep gawk sed
  )
  local -a optional_pkgs=(stress-ng memtester edac-utils fio nvme-cli)
  local pkg

  echo "正在安装 Debian/Ubuntu 依赖..."

  # A stale index is not fatal, but tell the user instead of failing silently.
  if ! apt-get update; then
    echo "警告: apt-get update 失败,将使用现有的软件包索引继续安装"
  fi

  echo "安装核心依赖..."
  for pkg in "${core_pkgs[@]}"; do
    install_package "$pkg" "apt" || echo "警告: $pkg 安装失败"
  done

  echo "安装可选依赖..."
  for pkg in "${optional_pkgs[@]}"; do
    install_package "$pkg" "apt" || echo "注意: $pkg 安装失败(可选)"
  done
}
#######################################
# Install the system packages ServerGuard uses on RHEL-family systems.
# Tries to enable EPEL (and PowerTools/CRB on release 8), installs core and
# optional packages, then falls back from stress-ng to stress.
# Globals:   VER (read), FAILED_PACKAGES (appended to when both stress
#            tools fail; install_package appends for other packages)
# Arguments: none
# Outputs:   progress and warning messages on stdout
#######################################
install_redhat_deps() {
  # Function-local arrays: the package lists no longer leak into the
  # global scope as CORE_PKGS / OPTIONAL_PKGS.
  local -a core_pkgs=(
    lshw dmidecode smartmontools lm_sensors ipmitool mdadm pciutils
    usbutils util-linux coreutils grep gawk sed
  )
  local -a optional_pkgs=(memtester edac-utils fio nvme-cli)
  local pkg

  echo "正在安装 RHEL/CentOS 依赖..."

  # Try to enable EPEL. `rpm -q` asks for the exact package instead of
  # grepping the full package list for a substring.
  if ! rpm -q epel-release > /dev/null 2>&1; then
    echo "启用 EPEL 仓库..."
    yum install -y epel-release 2>/dev/null || true
  fi

  # On CentOS 8 / RHEL 8, enable the PowerTools / CRB repositories.
  # The glob 8* already covers the bare value "8".
  if [[ "$VER" == 8* ]]; then
    echo "启用 PowerTools 仓库..."
    # The repository id is spelled differently across 8.x releases.
    yum config-manager --set-enabled powertools 2>/dev/null \
      || yum config-manager --set-enabled PowerTools 2>/dev/null \
      || true
    # Try to enable CRB (CodeReady Builder) for RHEL 8; best effort.
    subscription-manager repos \
      --enable codeready-builder-for-rhel-8-x86_64-rpms 2>/dev/null || true
  fi

  echo "安装核心依赖..."
  for pkg in "${core_pkgs[@]}"; do
    install_package "$pkg" "yum" || echo "警告: $pkg 安装失败"
  done

  # Also try OpenIPMI; failure is only reported as a notice.
  install_package "OpenIPMI" "yum" || echo "注意: OpenIPMI 安装失败(可选)"

  echo "安装可选依赖..."
  for pkg in "${optional_pkgs[@]}"; do
    install_package "$pkg" "yum" || echo "注意: $pkg 安装失败(可选)"
  done

  # stress-ng gets special handling: fall back to plain `stress`.
  echo "尝试安装 stress-ng..."
  if ! yum install -y stress-ng 2>/dev/null; then
    echo "注意: stress-ng 从默认仓库安装失败"
    echo "尝试安装 stress 作为备选..."
    if yum install -y stress 2>/dev/null; then
      echo "stress 安装成功,可作为压力测试备选工具"
    else
      echo "警告: stress 和 stress-ng 都安装失败"
      echo " 压力测试功能将不可用"
      FAILED_PACKAGES="$FAILED_PACKAGES stress-ng"
    fi
  fi

  # On release 8, print manual build instructions when stress-ng is missing.
  if [[ "$VER" == 8* && "$FAILED_PACKAGES" == *"stress-ng"* ]]; then
    echo ""
    echo "============================================"
    echo "注意: CentOS 8 中 stress-ng 需要从源码编译安装"
    echo "============================================"
    echo "手动安装步骤:"
    echo " 1. 安装编译依赖:"
    echo " yum install -y gcc make libaio-devel libattr-devel libbsd-devel libcap-devel libgcrypt-devel"
    echo " 2. 下载并编译 stress-ng:"
    echo " cd /tmp"
    echo " git clone https://github.com/ColinIanKing/stress-ng.git"
    echo " cd stress-ng"
    echo " make"
    echo " make install"
    echo "============================================"
    echo ""
  fi
}
# Pick the dependency installer that matches the detected distribution.
if [[ "$OS" == *Debian* || "$OS" == *Ubuntu* ]]; then
  install_debian_deps
elif [[ "$OS" == *CentOS* || "$OS" == *Red*Hat* || "$OS" == *Fedora* \
  || "$OS" == *Alma* || "$OS" == *Rocky* ]]; then
  install_redhat_deps
else
  # Unknown distribution: list the tools to install by hand and stop.
  printf '%s\n' \
    "不支持的操作系统: $OS" \
    "请手动安装以下工具:" \
    " lshw, dmidecode, smartmontools, lm-sensors, stress-ng, memtester" \
    " ipmitool, edac-utils, fio, mdadm, pciutils, usbutils"
  exit 1
fi

printf '%s\n' "" "系统依赖安装完成"

# Summarise the packages that could not be installed, if any.
if [[ -n "$FAILED_PACKAGES" ]]; then
  printf '%s\n' \
    "" \
    "以下包安装失败: $FAILED_PACKAGES" \
    "某些功能可能受限ServerGuard 仍可运行基本检测"
fi
echo ""
# Locate a Python interpreter and require version 3.6 or newer.
echo "检查 Python 版本..."
PYTHON_BIN=""
if command -v python3 &> /dev/null; then
  PYTHON_BIN="python3"
elif command -v python &> /dev/null; then
  PYTHON_BIN="python"
fi

if [[ -z "$PYTHON_BIN" ]]; then
  echo "错误: 未找到 Python"
  echo "请安装 Python 3.6 或更高版本"
  exit 1
fi

# `--version` prints "Python X.Y.Z"; 2>&1 captures it whichever stream is used.
PYTHON_VERSION=$("$PYTHON_BIN" --version 2>&1 | awk '{print $2}')
echo "找到 Python $PYTHON_VERSION"

# Split MAJOR.MINOR with parameter expansion (no echo | cut subprocesses).
PYTHON_MAJOR=${PYTHON_VERSION%%.*}
_py_rest=${PYTHON_VERSION#*.}
PYTHON_MINOR=${_py_rest%%.*}

# Refuse to continue on an unparseable version instead of letting the
# numeric comparison below error out and fall through as "OK".
if ! [[ "$PYTHON_MAJOR" =~ ^[0-9]+$ && "$PYTHON_MINOR" =~ ^[0-9]+$ ]]; then
  echo "错误: 无法解析 Python 版本号 ($PYTHON_VERSION)"
  exit 1
fi

# 10# forces base 10 so a component with a leading zero is not read as octal.
if (( 10#$PYTHON_MAJOR < 3 || (10#$PYTHON_MAJOR == 3 && 10#$PYTHON_MINOR < 6) )); then
  echo "错误: Python 版本过低 ($PYTHON_VERSION)"
  echo "需要 Python 3.6 或更高版本"
  exit 1
fi
echo "Python 版本符合要求"
echo ""
# Install the Python dependencies listed in requirements.txt.
echo "安装 Python 依赖..."
PIP_CMD="pip3"
if ! command -v pip3 &> /dev/null; then
  PIP_CMD="pip"
fi

# Resolve requirements.txt next to this script so the installer also works
# when it is started from a different working directory.
REQUIREMENTS_FILE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/requirements.txt"

"$PIP_CMD" install -r "$REQUIREMENTS_FILE" || {
  # A system-wide install can be refused; retry into the user site-packages.
  echo "警告: pip 安装失败,尝试使用 --user 选项"
  "$PIP_CMD" install --user -r "$REQUIREMENTS_FILE"
}
echo ""
# Offer to configure lm-sensors when sensors-detect is available.
if command -v sensors-detect &> /dev/null; then
  printf '%s\n' \
    "" \
    "检测到 lm-sensors 需要配置" \
    "是否要运行 sensors-detect 配置传感器? (y/N)"
  read -r response
  # Accept "y" or "yes" in any letter case; anything else skips the step.
  case "$response" in
    [yY] | [yY][eE][sS])
      echo "正在运行 sensors-detect..."
      sensors-detect --auto || true
      ;;
  esac
fi
echo ""
echo "========================================"
echo "安装完成!"
echo "========================================"
echo ""

# Report which diagnostic tools are available. Each line carries the same
# ✓ / ✗ marker as the stress-tool summary below, so an installed tool is no
# longer printed as a bare name.
echo "依赖检查:"
echo "------------"
for cmd in lshw dmidecode smartctl sensors ipmitool; do
  if command -v "$cmd" &> /dev/null; then
    echo " ✓ $cmd"
  else
    echo " ✗ $cmd (未安装)"
  fi
done
echo ""

# Report which stress-test tool will be used (stress-ng preferred).
echo "压力测试工具:"
if command -v stress-ng &> /dev/null; then
  echo " ✓ stress-ng (推荐)"
elif command -v stress &> /dev/null; then
  echo " ✓ stress (备选)"
else
  echo " ✗ stress/stress-ng (未安装,压力测试不可用)"
fi
echo ""

# Usage cheat sheet.
echo "使用方法:"
echo " 快速检测: sudo python3 main.py --quick"
echo " 全面诊断: sudo python3 main.py --full"
echo " 特定模块: sudo python3 main.py --module cpu"
echo " 生成报告: sudo python3 main.py --quick --format json --output report.json"
echo ""
echo "查看帮助: python3 main.py --help"
echo ""
# Optionally create a launcher script at /usr/local/bin/serverguard.
echo "是否要创建 /usr/local/bin/serverguard 快捷方式? (y/N)"
read -r response
if [[ "$response" =~ ^([yY][eE][sS]|[yY])$ ]]; then
  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  # %q shell-quotes the path so directories containing spaces, quotes or `$`
  # survive being embedded in the generated script.
  printf -v QUOTED_SCRIPT_DIR '%q' "$SCRIPT_DIR"
  # The launcher aborts if the project directory has disappeared instead of
  # running main.py from whatever the current directory happens to be.
  if cat > /usr/local/bin/serverguard << EOF &&
#!/bin/bash
cd -- $QUOTED_SCRIPT_DIR || exit 1
python3 main.py "\$@"
EOF
    chmod +x /usr/local/bin/serverguard; then
    echo "快捷方式已创建: serverguard"
    echo "现在可以直接使用: sudo serverguard --quick"
  else
    echo "错误: 无法创建快捷方式 /usr/local/bin/serverguard"
  fi
fi
echo ""
echo "安装完成!"

419
main.py Executable file
View File

@@ -0,0 +1,419 @@
#!/usr/bin/env python3
"""
ServerGuard - 服务器硬件健康诊断系统
主程序入口,负责命令行参数解析、模块调度和报告生成。
使用方法:
sudo python3 main.py --quick # 快速检测
sudo python3 main.py --full # 全面诊断(含压力测试)
sudo python3 main.py --module cpu # 仅检测 CPU
sudo python3 main.py --full --format json --output report.json
"""
import argparse
import sys
import os
from typing import Optional, Dict, Any
from utils import setup_logging, check_root_privileges, get_file_timestamp
from reporter import ReportGenerator
def parse_arguments() -> argparse.Namespace:
    """
    Build the ServerGuard command-line parser and parse ``sys.argv``.

    Exactly one operating mode (--quick, --full, --module, --list-modules)
    is required; the remaining options control the report, the log file and
    the stress-test duration.

    Returns:
        argparse.Namespace: the parsed arguments
    """
    usage_examples = """
示例:
%(prog)s --quick # 快速硬件检测
%(prog)s --full # 全面诊断(含压力测试)
%(prog)s --module cpu # 仅检测 CPU
%(prog)s --module memory,storage # 检测内存和存储
%(prog)s --full --format json # 生成 JSON 格式报告
%(prog)s --list-modules # 列出所有可用模块
注意: 大多数诊断功能需要 root 权限,请使用 sudo 运行。
"""
    cli = argparse.ArgumentParser(
        prog='ServerGuard',
        description='服务器硬件健康诊断系统 - 用于诊断 Linux 服务器硬件故障',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples
    )

    # Operating modes (mutually exclusive, one is mandatory).
    modes = cli.add_mutually_exclusive_group(required=True)
    modes.add_argument('--quick', '-q', action='store_true',
                       help='快速检测模式(非侵入性,仅收集信息)')
    modes.add_argument('--full', '-f', action='store_true',
                       help='全面诊断模式(包含压力测试,耗时较长)')
    modes.add_argument('--module', '-m', type=str, metavar='MODULE',
                       help='运行指定模块,多个模块用逗号分隔 (cpu,memory,storage,sensors,gpu,logs)')
    modes.add_argument('--list-modules', '-l', action='store_true',
                       help='列出所有可用的检测模块')

    # Report options.
    cli.add_argument('--format', type=str,
                     choices=['text', 'json', 'csv', 'html'],
                     default='text',
                     help='报告格式 (默认: text)')
    cli.add_argument('--output', '-o', type=str, metavar='FILE',
                     help='输出文件路径(不指定则输出到控制台)')
    cli.add_argument('--log', type=str, metavar='FILE',
                     default='/var/log/serverguard.log',
                     help='日志文件路径 (默认: /var/log/serverguard.log)')

    # Test parameters.
    cli.add_argument('--stress-duration', type=int, default=300,
                     metavar='SECONDS',
                     help='压力测试持续时间,单位秒 (默认: 300)')
    cli.add_argument('--verbose', '-v', action='store_true',
                     help='显示详细输出')
    cli.add_argument('--yes', '-y', action='store_true',
                     help='自动确认所有警告提示(如压力测试警告)')

    return cli.parse_args()
def list_available_modules():
    """Print the name and a short description of every detection module,
    followed by two usage examples."""
    catalog = (
        ('system', '系统信息概览'),
        ('cpu', 'CPU 检测与压力测试'),
        ('memory', '内存检测与压力测试'),
        ('storage', '存储设备检测'),
        ('sensors', '电源与传感器监控'),
        ('gpu', '显卡检测'),
        ('logs', '日志分析'),
    )
    rule = "-" * 40

    print("可用的检测模块:")
    print(rule)
    for key, summary in catalog:
        # Module names are left-aligned in a 12-character column.
        print(f" {key:12} - {summary}")
    print(rule)
    print("\n使用示例:")
    print(" sudo python3 main.py --module cpu")
    print(" sudo python3 main.py --module cpu,memory,storage")
def confirm_stress_test(duration: int, auto_confirm: bool = False) -> bool:
    """
    Ask the operator to confirm a stress test.

    Args:
        duration: stress-test duration in seconds (shown in the warning)
        auto_confirm: when True, skip the prompt and confirm immediately

    Returns:
        bool: True to proceed; False when the operator declines, presses
        Ctrl-C, or no answer can be read (stdin closed or not interactive)
    """
    if auto_confirm:
        return True

    print("\n" + "=" * 60)
    print("警告:即将执行压力测试")
    print("=" * 60)
    print(f"测试持续时间: {duration} 秒 ({duration // 60} 分钟)")
    print("此测试将占用大量系统资源,可能导致:")
    print(" - CPU 和内存使用率接近 100%")
    print(" - 系统响应变慢")
    print(" - 温度升高")
    print("建议在维护窗口期进行,并确保服务器可接受高负载。")
    print("=" * 60)

    try:
        response = input("\n是否继续? [y/N]: ").strip().lower()
        return response in ('y', 'yes')
    except (KeyboardInterrupt, EOFError):
        # EOFError is raised when stdin is closed or exhausted (cron, pipes);
        # treat it like an explicit cancel instead of crashing.
        print("\n操作已取消")
        return False
def run_module(module_name: str, stress_test: bool = False, stress_duration: int = 300) -> Dict[str, Any]:
    """
    Import one detection module and invoke its entry function.

    Args:
        module_name: short module name (system, cpu, memory, storage,
            sensors, gpu, logs)
        stress_test: forwarded to the cpu and memory checks
        stress_duration: forwarded to the cpu and memory checks

    Returns:
        Dict[str, Any]: the module's result, or a dict with status "error"
        when the name is unknown or the module raises
    """
    import logging
    log = logging.getLogger(__name__)

    import_paths = {
        'system': 'modules.system_info',
        'cpu': 'modules.cpu',
        'memory': 'modules.memory',
        'storage': 'modules.storage',
        'sensors': 'modules.sensors',
        'gpu': 'modules.gpu',
        'logs': 'modules.log_analyzer'
    }

    import_path = import_paths.get(module_name)
    if import_path is None:
        log.error(f"未知模块: {module_name}")
        return {"status": "error", "error": f"未知模块: {module_name}"}

    try:
        # A non-empty fromlist makes __import__ return the submodule itself
        # rather than the top-level "modules" package.
        mod = __import__(import_path, fromlist=[''])
        # One zero-argument entry point per module; attributes are looked up
        # only when the selected entry is actually called.
        entry_points = {
            'system': lambda: mod.get_system_info(),
            'cpu': lambda: mod.run_cpu_check(stress_test, stress_duration),
            'memory': lambda: mod.run_memory_check(stress_test, stress_duration),
            'storage': lambda: mod.run_storage_check(),
            'sensors': lambda: mod.run_sensors_check(),
            'gpu': lambda: mod.run_gpu_check(),
            'logs': lambda: mod.analyze_logs(),
        }
        return entry_points[module_name]()
    except Exception as exc:
        log.error(f"运行模块 {module_name} 时出错: {exc}")
        return {"status": "error", "error": str(exc)}
def run_quick_check() -> Dict[str, Any]:
    """
    Run every detection module once without stress testing.

    Returns:
        Dict[str, Any]: scan type, timestamp and the per-module results
    """
    import logging
    log = logging.getLogger(__name__)

    print("正在执行快速硬件检测...")
    print("-" * 60)

    report = {
        "scan_type": "quick",
        "timestamp": get_file_timestamp(),
        "modules": {}
    }

    # Console label for each known status; anything else is echoed verbatim.
    status_labels = (
        ("success", "[完成]"),
        ("warning", "[警告]"),
        ("error", "[错误]"),
    )

    for name in ('system', 'cpu', 'memory', 'storage', 'sensors', 'gpu', 'logs'):
        print(f"正在检测: {name}...", end=' ', flush=True)
        try:
            outcome = run_module(name, stress_test=False)
            report["modules"][name] = outcome
            status = outcome.get("status", "unknown")
            label = next(
                (text for known, text in status_labels if status == known),
                f"[{status}]"
            )
            print(label)
        except Exception as exc:
            log.error(f"模块 {name} 执行失败: {exc}")
            report["modules"][name] = {"status": "error", "error": str(exc)}
            print("[失败]")

    print("-" * 60)
    return report
def run_full_diagnostic(stress_duration: int, auto_confirm: bool = False) -> Dict[str, Any]:
    """
    Run every detection module, with stress tests for CPU and memory.

    Exits the process with status 0 when the operator does not confirm.

    Args:
        stress_duration: stress-test duration in seconds
        auto_confirm: skip the confirmation prompt when True

    Returns:
        Dict[str, Any]: scan type, timestamp, stress duration and the
        per-module results
    """
    import logging
    log = logging.getLogger(__name__)

    if not confirm_stress_test(stress_duration, auto_confirm):
        print("诊断已取消")
        sys.exit(0)

    print("\n正在执行全面硬件诊断...")
    print("=" * 60)

    report = {
        "scan_type": "full",
        "timestamp": get_file_timestamp(),
        "stress_duration": stress_duration,
        "modules": {}
    }

    # Only these modules are asked to run a stress test.
    stress_capable = ('cpu', 'memory')

    for name in ('system', 'cpu', 'memory', 'storage', 'sensors', 'gpu', 'logs'):
        print(f"\n正在检测: {name}...")
        try:
            outcome = run_module(
                name,
                stress_test=name in stress_capable,
                stress_duration=stress_duration
            )
            report["modules"][name] = outcome
            print(f" 状态: {outcome.get('status', 'unknown')}")
        except Exception as exc:
            log.error(f"模块 {name} 执行失败: {exc}")
            report["modules"][name] = {"status": "error", "error": str(exc)}
            print(f" 状态: 失败 - {exc}")

    print("\n" + "=" * 60)
    return report
def run_specific_modules(module_list: str, stress_duration: int) -> Dict[str, Any]:
    """
    Run a user-selected list of modules, without stress testing.

    Args:
        module_list: comma-separated module names; surrounding whitespace,
            empty entries (e.g. from a trailing comma) and duplicates are
            ignored
        stress_duration: stress-test duration in seconds; forwarded to
            run_module, which is called with stress_test=False

    Returns:
        Dict[str, Any]: scan type, timestamp and the per-module results
    """
    # dict.fromkeys de-duplicates while keeping the order the user typed.
    modules = list(dict.fromkeys(
        name for name in (part.strip() for part in module_list.split(',')) if name
    ))

    results = {
        "scan_type": "custom",
        "timestamp": get_file_timestamp(),
        "modules": {}
    }

    print(f"正在执行自定义模块检测: {', '.join(modules)}")
    print("-" * 60)

    for module_name in modules:
        print(f"正在检测: {module_name}...", end=' ', flush=True)
        try:
            result = run_module(
                module_name,
                stress_test=False,
                stress_duration=stress_duration
            )
            results["modules"][module_name] = result
            status = result.get("status", "unknown")
            print(f"[{status}]")
        except Exception as e:
            results["modules"][module_name] = {"status": "error", "error": str(e)}
            print(f"[失败: {e}]")

    print("-" * 60)
    return results
def main():
    """
    Program entry point.

    Parses the command line, configures logging, runs the selected
    diagnostic mode and prints or saves the report. Always terminates via
    sys.exit: 0 on success, 1 when any module reported an error or an
    unexpected exception occurred, 130 on Ctrl-C.
    """
    # Imported here so main() also works when this module is imported and
    # main() is called directly; at module level `logging` is only imported
    # under the `if __name__ == '__main__'` guard.
    import logging

    args = parse_arguments()

    # Configure logging; the log file is only used when running as root.
    log_level = logging.DEBUG if args.verbose else logging.INFO
    setup_logging(
        log_file=args.log if check_root_privileges() else None,
        level=log_level,
        console_output=True
    )
    logger = logging.getLogger(__name__)

    # List modules and exit.
    if args.list_modules:
        list_available_modules()
        sys.exit(0)

    # Warn when not running as root.
    if not check_root_privileges():
        logger.warning("未以 root 权限运行,部分功能可能受限")
        print("警告: 未检测到 root 权限,部分硬件信息可能无法获取")
        print("建议: 使用 sudo 运行以获得完整的诊断信息\n")

    # Run the diagnostics.
    try:
        if args.quick:
            results = run_quick_check()
        elif args.full:
            results = run_full_diagnostic(args.stress_duration, args.yes)
        elif args.module:
            results = run_specific_modules(args.module, args.stress_duration)
        else:
            print("请指定操作模式: --quick, --full, --module 或 --list-modules")
            sys.exit(1)

        # Generate the report.
        generator = ReportGenerator()
        if args.output:
            generator.save_report(results, args.format, args.output)
            print(f"\n报告已保存至: {args.output}")
        else:
            report = generator.generate_report(results, args.format)
            print("\n" + "=" * 60)
            print("诊断报告")
            print("=" * 60)
            print(report)

        # Exit code: 1 when any module reported an error.
        has_errors = any(
            m.get("status") == "error"
            for m in results.get("modules", {}).values()
        )
        sys.exit(1 if has_errors else 0)
    except KeyboardInterrupt:
        print("\n\n操作已被用户中断")
        sys.exit(130)
    except Exception as e:
        logger.exception("程序执行过程中发生错误")
        print(f"\n错误: {e}")
        sys.exit(1)
if __name__ == '__main__':
import logging
main()

15
modules/__init__.py Normal file
View File

@@ -0,0 +1,15 @@
"""
ServerGuard 硬件检测模块
包含以下子模块:
- system_info: 系统信息概览
- cpu: CPU 检测与压力测试
- memory: 内存检测与压力测试
- storage: 存储设备检测
- sensors: 电源与传感器监控
- gpu: 显卡检测
- log_analyzer: 日志分析
"""
__version__ = "1.0.0"
__author__ = "ServerGuard Team"

518
modules/cpu.py Normal file
View File

@@ -0,0 +1,518 @@
"""
ServerGuard - CPU 检测与压力测试模块
检查 CPU 状态、温度、错误日志,并执行压力测试。
"""
import os
import re
import time
from typing import Dict, Any, List, Optional
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils import (
execute_command, check_command_exists, parse_key_value_output,
safe_int, safe_float, require_root
)
def run_cpu_check(stress_test: bool = False, stress_duration: int = 300) -> Dict[str, Any]:
    """
    Run the CPU checks and collect their results in one report.

    The overall status becomes "warning" when the temperature check reports
    a warning or MCE errors were counted, and "error" when the stress test
    does not pass or any step raises.

    Args:
        stress_test: also run the CPU stress test when True
        stress_duration: stress-test duration in seconds

    Returns:
        Dict[str, Any]: the combined CPU report
    """
    report = {
        "status": "success",
        "cpu_info": {},
        "temperature": {},
        "mce_errors": {},
        "load_average": {},
        "stress_test": {}
    }

    try:
        report["cpu_info"] = get_cpu_details()

        thermal = get_cpu_temperature()
        report["temperature"] = thermal
        if thermal.get("status") == "warning":
            report["status"] = "warning"

        report["load_average"] = get_load_average()

        mce = check_mce_errors()
        report["mce_errors"] = mce
        if mce.get("count", 0) > 0:
            report["status"] = "warning"

        if stress_test:
            outcome = run_cpu_stress_test(stress_duration)
            report["stress_test"] = outcome
            if not outcome.get("passed", False):
                report["status"] = "error"
    except Exception as exc:
        report["status"] = "error"
        report["error"] = str(exc)

    return report
def get_cpu_details() -> Dict[str, Any]:
    """
    Collect CPU details from /proc/cpuinfo.

    Returns:
        Dict[str, Any]: model, vendor, architecture (machine type such as
        "x86_64"), cpu_family, sockets, cores (physical cores across all
        sockets), threads (logical CPUs), current_frequency_mhz, bogomips,
        flags, and frequency_info when cpufreq data is available. On failure
        the defaults are kept and "error" holds the exception text.
    """
    import platform  # function-scope: this module does not import platform

    info = {
        "model": "Unknown",
        "architecture": "Unknown",
        "cores": 0,
        "threads": 0,
        "current_frequency_mhz": 0,
        "bogomips": 0,
        "flags": []
    }

    try:
        # Read the file once; every value below comes from this snapshot.
        with open('/proc/cpuinfo', 'r') as f:
            content = f.read()

        # Each blank-line separated section becomes one key/value dict.
        sections = []
        for chunk in content.split('\n\n'):
            fields = {}
            for line in chunk.split('\n'):
                if ':' in line:
                    key, value = line.split(':', 1)
                    fields[key.strip()] = value.strip()
            if fields:
                sections.append(fields)

        first = sections[0] if sections else {}
        info["model"] = first.get('model name', 'Unknown')
        info["vendor"] = first.get('vendor_id', 'Unknown')
        # "cpu family" is a vendor-specific number, not an architecture;
        # report the machine type and keep the family under its own key.
        info["architecture"] = platform.machine() or 'Unknown'
        info["cpu_family"] = first.get('cpu family', 'Unknown')
        info["bogomips"] = safe_float(first.get('bogomips', 0))
        if 'flags' in first:
            info["flags"] = first['flags'].split()

        # Logical CPUs: one section per "processor" entry.
        logical = [s for s in sections if 'processor' in s]
        info["threads"] = len(logical)

        # Distinct "physical id" values are sockets, not cores.
        info["sockets"] = len({s['physical id'] for s in logical if 'physical id' in s})

        # A physical core is a unique (socket, core id) pair; hyper-threads
        # of the same core share both values.
        info["cores"] = len({
            (s.get('physical id', '0'), s['core id'])
            for s in logical if 'core id' in s
        })
        if info["cores"] == 0:
            # No topology fields were found: fall back to the logical count.
            info["cores"] = info["threads"]

        # Current frequency: the first "cpu MHz" value in the file.
        for section in sections:
            if 'cpu MHz' in section:
                info["current_frequency_mhz"] = safe_float(section['cpu MHz'])
                break

        # Scaling (cpufreq) information, when available.
        freq_info = get_cpu_frequency_info()
        if freq_info:
            info["frequency_info"] = freq_info
    except Exception as e:
        info["error"] = str(e)

    return info
def get_cpu_frequency_info() -> Dict[str, Any]:
    """
    Read cpufreq scaling information for cpu0 from sysfs.

    Returns:
        Dict[str, Any]: any of min_mhz, max_mhz, current_mhz (the sysfs
        value divided by 1000, rounded to 2 decimals), governor and driver;
        empty when the cpufreq directory does not exist. Files that cannot
        be read are skipped individually.
    """
    info = {}
    cpu0_path = '/sys/devices/system/cpu/cpu0/cpufreq'
    if not os.path.exists(cpu0_path):
        return info

    files = {
        "min_mhz": "scaling_min_freq",
        "max_mhz": "scaling_max_freq",
        "current_mhz": "scaling_cur_freq",
        "governor": "scaling_governor",
        "driver": "scaling_driver"
    }

    for key, filename in files.items():
        filepath = os.path.join(cpu0_path, filename)
        if not os.path.exists(filepath):
            continue
        try:
            with open(filepath, 'r') as f:
                value = f.read().strip()
            if 'freq' in filename:
                # Frequency values are typically stored in kHz.
                info[key] = round(safe_int(value) / 1000, 2)
            else:
                info[key] = value
        except Exception:
            # Best effort: one unreadable file must not discard the others.
            # Unlike a bare `except`, this lets KeyboardInterrupt and
            # SystemExit propagate.
            continue

    return info
def _parse_sensors_u(stdout: str, sensors: Dict[str, Any], temperatures: List[float]) -> None:
    """Parse `sensors -u` output into `sensors` and append every temperature
    reading to `temperatures`."""
    chip = None
    for raw_line in stdout.split('\n'):
        line = raw_line.strip()

        # A non-empty line without a colon starts a new chip section.
        if line and not line.startswith('Adapter:') and ':' not in line and not line.startswith('temp'):
            chip = line.rstrip(':')
            sensors[chip] = {}
            continue

        if line.startswith('Adapter:'):
            if chip:
                sensors[chip]["adapter"] = line.split(':', 1)[1].strip()
            continue

        # tempN_input / tempN_max / tempN_crit; names such as
        # tempN_crit_alarm do not match because the colon must follow directly.
        match = re.match(r'(temp\d+)_(input|max|crit):\s*([\d.]+)', line)
        if not match:
            continue
        name, kind = match.group(1), match.group(2)
        value = safe_float(match.group(3))

        if kind == 'input':
            if chip:
                sensors[chip].setdefault(name, {})["current"] = value
            temperatures.append(value)
        elif chip and name in sensors[chip]:
            # Thresholds are only attached to sensors that already have a reading.
            sensors[chip][name]["high" if kind == 'max' else "critical"] = value


def _read_thermal_zones(sensors: Dict[str, Any], temperatures: List[float]) -> None:
    """Read CPU-related zones from /sys/class/thermal (values are divided by
    1000 to obtain degrees Celsius)."""
    thermal_path = '/sys/class/thermal'
    try:
        zones = os.listdir(thermal_path)
    except OSError:
        return

    for zone in zones:
        if not zone.startswith('thermal_zone'):
            continue
        zone_path = os.path.join(thermal_path, zone)
        try:
            zone_type = 'unknown'
            type_file = os.path.join(zone_path, 'type')
            if os.path.exists(type_file):
                with open(type_file, 'r') as f:
                    zone_type = f.read().strip()

            temp_file = os.path.join(zone_path, 'temp')
            if not os.path.exists(temp_file):
                continue
            with open(temp_file, 'r') as f:
                temp_c = safe_int(f.read().strip()) / 1000.0
        except Exception:
            # One unreadable zone must not hide the remaining zones.
            continue

        if 'x86_pkg_temp' in zone_type or 'cpu' in zone_type.lower():
            sensors[zone] = {"type": zone_type, "current": temp_c}
            temperatures.append(temp_c)


def _read_hwmon(sensors: Dict[str, Any], temperatures: List[float]) -> None:
    """Read every tempN_input file under /sys/class/hwmon (values are divided
    by 1000 to obtain degrees Celsius)."""
    hwmon_path = '/sys/class/hwmon'
    try:
        devices = os.listdir(hwmon_path)
    except OSError:
        return

    for hwmon in devices:
        hwmon_dir = os.path.join(hwmon_path, hwmon)
        try:
            name = hwmon
            name_file = os.path.join(hwmon_dir, 'name')
            if os.path.exists(name_file):
                with open(name_file, 'r') as f:
                    name = f.read().strip()
            entries = os.listdir(hwmon_dir)
        except Exception:
            continue

        for entry in entries:
            if not (entry.startswith('temp') and entry.endswith('_input')):
                continue
            try:
                with open(os.path.join(hwmon_dir, entry), 'r') as f:
                    temp_c = safe_int(f.read().strip()) / 1000.0
            except Exception:
                # Some sensor files cannot be read; skip just this one.
                continue
            sensor_name = entry.replace('_input', '')
            sensors[f"{name}_{sensor_name}"] = {"current": temp_c}
            temperatures.append(temp_c)


def get_cpu_temperature(warning_threshold_c: float = 85) -> Dict[str, Any]:
    """
    Collect temperature readings and flag overheating.

    Sources are tried in order until one yields readings: the `sensors -u`
    command, /sys/class/thermal, then /sys/class/hwmon.

    Args:
        warning_threshold_c: the status becomes "warning" when the hottest
            reading exceeds this value (default 85)

    Returns:
        Dict[str, Any]: status ("success", "warning" or "unknown"), the
        per-sensor data, current_c (mean of all readings), max_c (hottest
        reading, only present when readings exist), and "warning" text when
        applicable. high_threshold_c and critical_threshold_c are always
        returned as None by this function.
    """
    result = {
        "status": "success",
        "sensors": {},
        "current_c": None,
        "high_threshold_c": None,
        "critical_threshold_c": None
    }
    temperatures = []

    # Method 1: the sensors command (lm-sensors).
    if check_command_exists('sensors'):
        try:
            _, stdout, _ = execute_command(
                ['sensors', '-u'],
                check_returncode=False, timeout=10
            )
            _parse_sensors_u(stdout, result["sensors"], temperatures)
        except Exception:
            # Best effort: fall through to the sysfs methods. Unlike a bare
            # `except`, Ctrl-C still interrupts the diagnostic.
            pass

    # Method 2: thermal zones.
    if not temperatures:
        _read_thermal_zones(result["sensors"], temperatures)

    # Method 3: hwmon.
    if not temperatures:
        _read_hwmon(result["sensors"], temperatures)

    if temperatures:
        result["current_c"] = round(sum(temperatures) / len(temperatures), 1)
        result["max_c"] = round(max(temperatures), 1)
        if result["max_c"] > warning_threshold_c:
            result["status"] = "warning"
            result["warning"] = f"CPU 温度过高: {result['max_c']}°C"
    else:
        result["status"] = "unknown"
        result["warning"] = "无法获取 CPU 温度信息"

    return result
def get_load_average() -> Dict[str, Any]:
    """
    Read the 1/5/15-minute load averages from /proc/loadavg.

    Returns:
        Dict[str, Any]: "1min", "5min", "15min", "cores" (os.cpu_count(),
        or 1 when unknown) and the load divided by the core count as
        "relative_*"; empty or partial when the file cannot be read or parsed
    """
    result = {}
    try:
        with open('/proc/loadavg', 'r') as f:
            fields = f.read().strip().split()

        if len(fields) >= 3:
            for label, raw in zip(("1min", "5min", "15min"), fields):
                result[label] = safe_float(raw)

            # Core count, used to express the load relative to capacity.
            num_cores = os.cpu_count() or 1
            result["cores"] = num_cores
            for label in ("1min", "5min", "15min"):
                result[f"relative_{label}"] = round(result[label] / num_cores, 2)
    except Exception:
        # Best effort: report whatever was collected. Unlike a bare `except`,
        # this lets KeyboardInterrupt and SystemExit propagate.
        pass

    return result
def check_mce_errors() -> Dict[str, Any]:
    """
    Look for Machine Check Exception (MCE) indications.

    Scans dmesg output for known keywords and, when the mcelog command is
    available, queries it as well.

    Returns:
        Dict[str, Any]: count (number of matches), errors (at most 10
        matching dmesg lines), status ("ok" or "warning"), plus
        mcelog_available / mcelog_device flags when applicable
    """
    result = {
        "count": 0,
        "errors": [],
        "status": "ok"
    }
    mce_keywords = ('Machine check events logged', 'Hardware Error', 'CMCI storm')

    # Method 1: dmesg.
    if check_command_exists('dmesg'):
        try:
            _, stdout, _ = execute_command(
                ['dmesg'],
                check_returncode=False, timeout=10
            )
            for line in stdout.split('\n'):
                # Count each line once even if it contains several keywords.
                if any(keyword in line for keyword in mce_keywords):
                    result["count"] += 1
                    if len(result["errors"]) < 10:  # cap the stored samples
                        result["errors"].append(line.strip())
                    result["status"] = "warning"
        except Exception:
            # Best effort; unlike a bare `except`, Ctrl-C still propagates.
            pass

    # Method 2: mcelog.
    if check_command_exists('mcelog'):
        try:
            _, stdout, _ = execute_command(
                ['mcelog', '--client'],
                check_returncode=False, timeout=5
            )
            if stdout.strip() and 'no machine check' not in stdout.lower():
                result["count"] += stdout.count('MCE')
                result["status"] = "warning"
            result["mcelog_available"] = True
        except Exception:
            pass

    # Method 3: note whether the /dev/mcelog device exists.
    if os.path.exists('/dev/mcelog'):
        result["mcelog_device"] = True

    return result
def _stress_output_reports_failure(output: str) -> bool:
    """Return True when tool output mentions an error or failure, ignoring
    zero-count summary lines such as "failed: 0"."""
    for line in output.lower().split('\n'):
        if re.search(r'\b(?:failed|errors?)\s*:\s*0\b', line):
            continue
        if 'error' in line or 'fail' in line:
            return True
    return False


def _nonzero_exit(returncode: Any) -> bool:
    """Return True when `returncode` is an integer exit status other than 0.
    Non-integer values (and bools) are treated as "unknown", not as failure."""
    return (
        isinstance(returncode, int)
        and not isinstance(returncode, bool)
        and returncode != 0
    )


@require_root
def run_cpu_stress_test(duration: int = 300) -> Dict[str, Any]:
    """
    Run a CPU stress test with stress-ng, or stress as a fallback.

    Args:
        duration: test duration in seconds

    Returns:
        Dict[str, Any]: passed flag, timing, the tool used, temperature
        snapshots taken before and after the run, and a list of errors
    """
    result = {
        "passed": False,
        "duration_seconds": duration,
        "cpu_cores": os.cpu_count() or 1,
        "start_time": None,
        "end_time": None,
        "max_temperature": None,
        "tool_used": None,
        "errors": []
    }

    # Preferred tool: stress-ng.
    if check_command_exists('stress-ng'):
        result["tool_used"] = "stress-ng"
        try:
            result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
            temp_before = get_cpu_temperature()

            # --cpu 0 uses all CPU cores, --timeout sets the run time,
            # --metrics-brief prints brief metrics.
            cmd = [
                'stress-ng',
                '--cpu', '0',
                '--timeout', str(duration),
                '--metrics-brief'
            ]
            # NOTE(review): assumes the first element returned by
            # execute_command is the process exit status — confirm in utils.
            returncode, stdout, stderr = execute_command(
                cmd,
                timeout=duration + 30,  # allow some extra time
                check_returncode=False
            )
            result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
            temp_after = get_cpu_temperature()

            output = stdout + stderr
            # A plain substring test for "fail" would also match a
            # zero-count summary line such as "failed: 0"; such lines are
            # ignored and the exit status is taken into account.
            if _nonzero_exit(returncode) or _stress_output_reports_failure(output):
                result["passed"] = False
                result["errors"].append("压力测试过程中发现错误")
            else:
                result["passed"] = True

            # Extract performance metrics when the output carries them.
            bogo_ops = re.search(r'stress-ng:\s+cpu:\s+(\d+)\s+bogo ops', output)
            if bogo_ops:
                result["bogo_ops"] = safe_int(bogo_ops.group(1))
            bogo_ops_per_sec = re.search(r'(\d+\.\d+)\s+bogo ops per second', output)
            if bogo_ops_per_sec:
                result["bogo_ops_per_second"] = safe_float(bogo_ops_per_sec.group(1))

            # Temperature analysis (reading taken right after the run).
            if temp_after.get("max_c"):
                result["max_temperature"] = temp_after["max_c"]
                if temp_after["max_c"] > 95:
                    result["warnings"] = [f"测试期间温度过高: {temp_after['max_c']}°C"]
            result["temperature_before"] = temp_before
            result["temperature_after"] = temp_after
        except Exception as e:
            result["passed"] = False
            result["errors"].append(str(e))

    # Fallback tool: stress.
    elif check_command_exists('stress'):
        result["tool_used"] = "stress"
        try:
            result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
            temp_before = get_cpu_temperature()
            num_cores = os.cpu_count() or 1
            returncode, stdout, stderr = execute_command(
                ['stress', '--cpu', str(num_cores), '--timeout', str(duration)],
                timeout=duration + 30,
                check_returncode=False
            )
            result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
            temp_after = get_cpu_temperature()

            # Do not report success when the tool exited with a failure status.
            if _nonzero_exit(returncode):
                result["passed"] = False
                result["errors"].append("压力测试过程中发现错误")
            else:
                result["passed"] = True

            result["temperature_before"] = temp_before
            result["temperature_after"] = temp_after
            if temp_after.get("max_c"):
                result["max_temperature"] = temp_after["max_c"]
        except Exception as e:
            result["passed"] = False
            result["errors"].append(str(e))
    else:
        result["passed"] = False
        result["errors"].append("未找到压力测试工具 (stress-ng 或 stress)")
        result["note"] = "请安装 stress-ng 或 stress: yum install stress / apt install stress-ng"

    return result
if __name__ == '__main__':
import json
print(json.dumps(run_cpu_check(stress_test=False), indent=2, ensure_ascii=False))

497
modules/gpu.py Normal file
View File

@@ -0,0 +1,497 @@
"""
ServerGuard - 显卡检测模块
检测 GPU 信息、温度、驱动状态等。
"""
import os
import re
from typing import Dict, Any, List, Optional
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils import (
execute_command, check_command_exists, parse_key_value_output,
safe_int, safe_float, format_bytes
)
def run_gpu_check() -> Dict[str, Any]:
    """
    Detect GPUs and collect GPU-related kernel log errors.

    Vendor-specific probes (NVIDIA, AMD, Intel) run first; the generic probe
    is used only when none of them finds a device.

    Returns:
        Dict[str, Any]: status, the list of GPUs, dmesg_errors, and a note
        when no GPU was found or an error text when a probe raised
    """
    report = {
        "status": "success",
        "gpus": [],
        "errors": []
    }

    try:
        for probe in (check_nvidia_gpus, check_amd_gpus, check_intel_gpus):
            detected = probe()
            if detected:
                report["gpus"].extend(detected)

        # Nothing found by the vendor probes: fall back to the generic probe.
        if not report["gpus"]:
            report["gpus"] = check_generic_gpus()

        kernel_errors = check_gpu_dmesg_errors()
        report["dmesg_errors"] = kernel_errors
        if kernel_errors:
            report["status"] = "warning"

        # "unknown" is assigned last, so it replaces a "warning" set above.
        if not report["gpus"]:
            report["status"] = "unknown"
            report["note"] = "未检测到 GPU 设备"
    except Exception as exc:
        report["status"] = "error"
        report["error"] = str(exc)

    return report
def check_nvidia_gpus() -> List[Dict[str, Any]]:
    """
    Enumerate NVIDIA GPUs through ``nvidia-smi``.

    Returns:
        List[Dict[str, Any]]: one entry per CSV row reported by nvidia-smi,
        merged with the output of ``get_nvidia_gpu_details``.  The list is
        empty when nvidia-smi is missing or anything goes wrong (best effort).
    """
    found = []
    if not check_command_exists('nvidia-smi'):
        return found

    query = 'gpu_name,gpu_bus_id,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current'
    try:
        # One CSV row per GPU, without a header line.
        _, stdout, _ = execute_command(
            ['nvidia-smi', '--query-gpu=' + query, '--format=csv,noheader'],
            check_returncode=False, timeout=10
        )
        for position, row in enumerate(stdout.strip().split('\n')):
            if not row.strip():
                continue
            fields = [field.strip() for field in row.split(',')]
            if len(fields) < 4:
                continue
            # The last three columns are optional; pad them with "unknown".
            pstate, pcie_max, pcie_current = (fields[4:7] + ["unknown"] * 3)[:3]
            entry = {
                "vendor": "NVIDIA",
                "index": position,
                "name": fields[0],
                "bus_id": fields[1],
                "pci_bus_id": fields[2],
                "driver_version": fields[3],
                "pstate": pstate,
                "pcie_max_gen": pcie_max,
                "pcie_current_gen": pcie_current
            }
            # Enrich with temperature, power, memory, ECC, ...
            entry.update(get_nvidia_gpu_details(position))
            found.append(entry)
    except Exception:
        pass
    return found
def get_nvidia_gpu_details(gpu_index: int) -> Dict[str, Any]:
    """
    Collect detailed metrics for a single NVIDIA GPU.

    Three nvidia-smi invocations are made: one for temperature / power /
    clocks / utilisation / memory / identity, one for ECC state, and one
    ``pmon`` sample for the processes using the GPU.

    Args:
        gpu_index: index passed to ``nvidia-smi -i``.

    Returns:
        Dict[str, Any]: whatever could be gathered.  Fields nvidia-smi marks
        as unavailable (``[Not Supported]`` or ``[N/A]``) are ``None``; ECC
        error counters default to 0.  ``processes`` is present only when at
        least one real process row was reported.  Errors are swallowed
        (best effort), so the dict may be partial or empty.
    """
    # nvidia-smi prints one of these instead of a value when a field is not
    # available on the device / driver.
    placeholders = ('[Not Supported]', '[N/A]', 'N/A')

    def field(raw, convert=None, missing=None):
        """Convert one CSV cell, mapping placeholders to ``missing``."""
        if raw in placeholders:
            return missing
        return convert(raw) if convert is not None else raw

    details = {}
    try:
        # Temperature, power, clocks, utilisation, memory and identity.
        _, stdout, _ = execute_command(
            ['nvidia-smi', '--query-gpu=temperature.gpu,power.draw,power.limit,clocks.gr,clocks.mem,utilization.gpu,utilization.memory,memory.total,memory.used,memory.free,serial,uuid,vbios_version',
             '--format=csv,noheader,nounits', '-i', str(gpu_index)],
            check_returncode=False, timeout=10
        )
        parts = [p.strip() for p in stdout.split(',')]
        if len(parts) >= 10:
            details["temperature_c"] = field(parts[0], safe_int)
            details["power_draw_w"] = field(parts[1], safe_float)
            details["power_limit_w"] = field(parts[2], safe_float)
            details["graphics_clock_mhz"] = field(parts[3], safe_int)
            details["memory_clock_mhz"] = field(parts[4], safe_int)
            details["gpu_utilization_percent"] = field(parts[5], safe_int)
            details["memory_utilization_percent"] = field(parts[6], safe_int)
            details["memory_total_mb"] = field(parts[7], safe_int)
            details["memory_used_mb"] = field(parts[8], safe_int)
            details["memory_free_mb"] = field(parts[9], safe_int)
            if len(parts) > 10:
                details["serial"] = field(parts[10])
            if len(parts) > 11:
                details["uuid"] = field(parts[11])
            if len(parts) > 12:
                details["vbios_version"] = field(parts[12])

        # ECC state and volatile error counters.
        _, ecc_output, _ = execute_command(
            ['nvidia-smi', '--query-gpu=ecc.mode.current,ecc.mode.pending,ecc.errors.corrected.volatile.total,ecc.errors.uncorrected.volatile.total',
             '--format=csv,noheader', '-i', str(gpu_index)],
            check_returncode=False, timeout=10
        )
        ecc_parts = [p.strip() for p in ecc_output.split(',')]
        if len(ecc_parts) >= 4:
            details["ecc_mode"] = field(ecc_parts[0])
            details["ecc_pending"] = field(ecc_parts[1])
            details["ecc_corrected_errors"] = field(ecc_parts[2], safe_int, missing=0)
            details["ecc_uncorrected_errors"] = field(ecc_parts[3], safe_int, missing=0)

        # One pmon sample listing the processes on this GPU.
        _, proc_output, _ = execute_command(
            ['nvidia-smi', 'pmon', '-s', 'um', '-c', '1', '-i', str(gpu_index)],
            check_returncode=False, timeout=5
        )
        processes = []
        for line in proc_output.split('\n')[2:]:  # skip the two header rows
            if not line.strip() or line.startswith('#'):
                continue
            proc_parts = line.split()
            if len(proc_parts) < 5:
                continue
            # An idle GPU is reported as a row of "-" placeholders; that is
            # not a process.
            if proc_parts[1] == '-':
                continue
            processes.append({
                "pid": proc_parts[1],
                "type": proc_parts[2],
                "sm_util": proc_parts[3],
                "mem_util": proc_parts[4]
            })
        if processes:
            details["processes"] = processes
    except Exception:
        # Best effort: return what was gathered so far.  Only ordinary
        # errors are swallowed; KeyboardInterrupt/SystemExit propagate.
        pass
    return details
def check_amd_gpus() -> List[Dict[str, Any]]:
    """
    Detect AMD GPUs.

    Two sources are consulted:

    1. ``radeontop`` (when installed): a single sample is dumped to a
       temporary file and ``key: value`` lines mentioning ``GPU`` are parsed.
       An entry is added only when at least one such line was found.
    2. sysfs: every ``/sys/class/drm/card*`` entry whose PCI vendor id is
       ``0x1002`` contributes an entry with device id, driver, temperature
       and core clock levels when those files exist.

    Returns:
        List[Dict[str, Any]]: detected GPUs (possibly empty).  Both sources
        are best effort; ordinary errors are swallowed.
    """
    gpus = []

    # --- Source 1: radeontop -------------------------------------------
    if check_command_exists('radeontop'):
        try:
            import tempfile
            # radeontop needs a file to dump into; create it, then clean up.
            with tempfile.NamedTemporaryFile(mode='r', suffix='.txt', delete=False) as f:
                dump_file = f.name
            try:
                execute_command(
                    ['radeontop', '-d', dump_file, '-l', '1'],
                    check_returncode=False, timeout=5
                )
                with open(dump_file, 'r') as f:
                    output = f.read()
                gpu_info = {"vendor": "AMD"}
                parsed_any = False
                for line in output.split('\n'):
                    if 'GPU' in line and ':' in line:
                        parts = line.split(':')
                        if len(parts) == 2:
                            key = parts[0].strip().lower().replace(' ', '_')
                            gpu_info[key] = parts[1].strip()
                            parsed_any = True
                # Only report a GPU when radeontop actually produced data;
                # an empty dump must not create a phantom AMD device.
                if parsed_any:
                    gpus.append(gpu_info)
            finally:
                if os.path.exists(dump_file):
                    os.unlink(dump_file)
        except Exception:
            pass

    # --- Source 2: sysfs -----------------------------------------------
    try:
        for card in os.listdir('/sys/class/drm'):
            if not card.startswith('card'):
                continue
            device_dir = f'/sys/class/drm/{card}/device'
            vendor_path = f'{device_dir}/vendor'
            # Entries without a vendor file are skipped.
            if not os.path.exists(vendor_path):
                continue
            with open(vendor_path, 'r') as f:
                vendor_id = f.read().strip()
            # The AMD PCI vendor id is 0x1002.
            if vendor_id != '0x1002':
                continue

            gpu_info = {
                "vendor": "AMD",
                "card": card
            }

            # PCI device id.
            device_path = f'{device_dir}/device'
            if os.path.exists(device_path):
                with open(device_path, 'r') as f:
                    gpu_info["device_id"] = f.read().strip()

            # Bound kernel driver (symlink target name).
            driver_path = f'{device_dir}/driver'
            if os.path.exists(driver_path):
                gpu_info["driver"] = os.path.basename(os.readlink(driver_path))

            # Temperature in millidegrees Celsius.  The hwmon instance number
            # is not fixed, so look through every hwmon* directory instead of
            # assuming hwmon0.
            hwmon_root = f'{device_dir}/hwmon'
            if os.path.isdir(hwmon_root):
                for hwmon in sorted(os.listdir(hwmon_root)):
                    temp_path = f'{hwmon_root}/{hwmon}/temp1_input'
                    if os.path.exists(temp_path):
                        with open(temp_path, 'r') as f:
                            temp_mc = safe_int(f.read().strip())
                        gpu_info["temperature_c"] = temp_mc / 1000.0
                        break

            # Core clock levels, kept as the raw sysfs text.
            freq_path = f'{device_dir}/pp_dpm_sclk'
            if os.path.exists(freq_path):
                with open(freq_path, 'r') as f:
                    gpu_info["core_clock_levels"] = f.read().strip()

            gpus.append(gpu_info)
    except Exception:
        pass
    return gpus
def check_intel_gpus() -> List[Dict[str, Any]]:
    """
    Detect Intel GPUs through sysfs.

    Every ``/sys/class/drm/card*`` entry whose PCI vendor id is ``0x8086``
    yields an entry with the card name, PCI device id and bound driver
    (when available).  All Intel devices are labelled ``integrated``.

    Returns:
        List[Dict[str, Any]]: detected GPUs; empty when sysfs cannot be read.
    """
    gpus = []
    try:
        for card in os.listdir('/sys/class/drm'):
            if not card.startswith('card'):
                continue
            vendor_path = f'/sys/class/drm/{card}/device/vendor'
            if not os.path.exists(vendor_path):
                continue
            with open(vendor_path, 'r') as f:
                vendor_id = f.read().strip()
            # The Intel PCI vendor id is 0x8086.
            if vendor_id != '0x8086':
                continue

            gpu_info = {
                "vendor": "Intel",
                "card": card
            }
            # PCI device id.
            device_path = f'/sys/class/drm/{card}/device/device'
            if os.path.exists(device_path):
                with open(device_path, 'r') as f:
                    gpu_info["device_id"] = f.read().strip()
            # Bound kernel driver (symlink target name).
            driver_path = f'/sys/class/drm/{card}/device/driver'
            if os.path.exists(driver_path):
                gpu_info["driver"] = os.path.basename(os.readlink(driver_path))
            # Intel GPUs are usually integrated, so they are labelled as such.
            gpu_info["type"] = "integrated"
            gpus.append(gpu_info)
    except Exception:
        # Best effort.  Only ordinary errors are swallowed, so Ctrl-C and
        # interpreter shutdown are no longer silently ignored.
        pass
    return gpus
def check_generic_gpus() -> List[Dict[str, Any]]:
    """
    Generic GPU detection based on ``lspci``.

    Lines of ``lspci -nn`` describing a VGA controller, 3D controller or
    display controller are turned into entries containing the bus id, the
    description, a guessed vendor, the device class and — from
    ``lspci -v -s <bus>`` — the kernel driver and modules.

    Returns:
        List[Dict[str, Any]]: detected display devices; empty when lspci is
        missing or fails.
    """
    gpus = []
    if not check_command_exists('lspci'):
        return gpus

    # Class marker in the lspci line -> value stored under "type".
    device_classes = (
        ('VGA', 'vga'),
        ('3D controller', '3d'),
        ('Display controller', 'display'),
    )
    try:
        _, stdout, _ = execute_command(
            ['lspci', '-nn'],
            check_returncode=False, timeout=10
        )
        for line in stdout.split('\n'):
            device_type = next(
                (label for marker, label in device_classes if marker in line),
                None
            )
            if device_type is None:
                continue
            parts = line.split(': ', 1)
            if len(parts) != 2:
                continue
            bus_id = parts[0].split()[0]
            description = parts[1]
            gpu_info = {
                "bus_id": bus_id,
                "description": description
            }

            # Guess the vendor.  "amd"/"ati" must match as whole words: a
            # plain substring test finds "ati" inside "Corporation" and
            # mislabels e.g. "Intel Corporation ..." as AMD.
            desc_lower = description.lower()
            if 'nvidia' in desc_lower:
                gpu_info["vendor"] = "NVIDIA"
            elif (re.search(r'\b(?:amd|ati)\b', desc_lower)
                  or 'advanced micro devices' in desc_lower):
                gpu_info["vendor"] = "AMD"
            elif 'intel' in desc_lower:
                gpu_info["vendor"] = "Intel"
            else:
                gpu_info["vendor"] = "Unknown"

            gpu_info["type"] = device_type

            # Driver / module details are optional extras.
            try:
                _, detail, _ = execute_command(
                    ['lspci', '-v', '-s', bus_id],
                    check_returncode=False, timeout=5
                )
                driver_match = re.search(r'Kernel driver in use:\s*(\S+)', detail)
                if driver_match:
                    gpu_info["driver"] = driver_match.group(1)
                modules_match = re.search(r'Kernel modules:\s*(.+)', detail)
                if modules_match:
                    gpu_info["modules"] = modules_match.group(1).strip()
            except Exception:
                pass
            gpus.append(gpu_info)
    except Exception:
        pass
    return gpus
def check_gpu_dmesg_errors() -> List[Dict[str, str]]:
    """
    Scan ``dmesg`` for GPU-related error messages.

    A line is reported when it matches one of the GPU patterns below
    (case-insensitive) *and* also contains ``error``, ``fail``, ``warn``
    (any case) or the literal ``Xid``.

    Returns:
        List[Dict[str, str]]: at most 20 distinct messages, in log order,
        each with ``timestamp`` (the bracketed kernel timestamp or
        ``"unknown"``) and ``message``.  Empty when dmesg is unavailable or
        an error occurs.
    """
    if not check_command_exists('dmesg'):
        return []

    # GPU-related error keywords.
    gpu_error_patterns = [
        r'GPU has fallen off the bus',
        r'NVRM: Xid',
        r'nvidia.*error',
        r'amdgpu.*error',
        r'i915.*error',
        r'GPU hang',
        r'ring.*timeout',
        r'Failed to load firmware',
        r'VRAM lost',
        r'gpu.*fault',
        r' thermal ',
    ]
    # Compile the alternatives once instead of re-evaluating every pattern
    # for every log line.
    gpu_error_re = re.compile(
        '|'.join(f'(?:{pattern})' for pattern in gpu_error_patterns),
        re.IGNORECASE
    )
    timestamp_re = re.compile(r'\[\s*([\d.]+)\]')
    max_errors = 20

    try:
        _, stdout, _ = execute_command(
            ['dmesg'],
            check_returncode=False, timeout=10
        )
        unique_errors = []
        seen = set()
        for line in stdout.split('\n'):
            if len(unique_errors) >= max_errors:
                break
            if not gpu_error_re.search(line):
                continue
            line_lower = line.lower()
            if not ('error' in line_lower or 'fail' in line_lower
                    or 'warn' in line_lower or 'Xid' in line):
                continue
            message = line.strip()
            if message in seen:
                continue
            seen.add(message)
            timestamp_match = timestamp_re.match(line)
            unique_errors.append({
                "timestamp": timestamp_match.group(1) if timestamp_match else "unknown",
                "message": message
            })
        return unique_errors
    except Exception:
        # Best effort; KeyboardInterrupt/SystemExit are not swallowed.
        return []
def get_gpu_processes() -> List[Dict[str, Any]]:
    """
    List the processes currently using a GPU (NVIDIA only).

    One ``nvidia-smi pmon -s um -c 1`` sample is parsed.  The first seven
    columns are gpu, pid, type, sm, mem, enc, dec; the process name is the
    last column.

    Returns:
        List[Dict[str, Any]]: one entry per process row with ``gpu_index``,
        ``pid``, ``type``, ``sm_util``, ``mem_util``, ``enc_util``,
        ``dec_util`` and ``command``.  Empty when nvidia-smi is missing,
        fails, or no process is running.
    """
    processes = []
    if not check_command_exists('nvidia-smi'):
        return processes
    try:
        _, stdout, _ = execute_command(
            ['nvidia-smi', 'pmon', '-s', 'um', '-c', '1'],
            check_returncode=False, timeout=5
        )
        # The first two lines are the column headers.
        for line in stdout.strip().split('\n')[2:]:
            if not line.strip() or line.startswith('#'):
                continue
            parts = line.split()
            if len(parts) < 8:
                continue
            # A GPU without processes is printed as a row of "-".
            if parts[1] == '-':
                continue
            processes.append({
                "gpu_index": safe_int(parts[0]),
                "pid": parts[1],
                "type": parts[2],
                "sm_util": parts[3],
                "mem_util": parts[4],
                "enc_util": parts[5],
                "dec_util": parts[6],
                # With "-s um" extra columns (frame-buffer memory, and more
                # on newer drivers) sit between "dec" and the name, so the
                # command is taken from the last column, not from index 7.
                "command": parts[-1]
            })
    except Exception:
        pass
    return processes
if __name__ == '__main__':
    # Manual smoke run: execute the GPU check and pretty-print the result as
    # JSON, keeping non-ASCII text readable.
    import json

    gpu_report = run_gpu_check()
    print(json.dumps(gpu_report, indent=2, ensure_ascii=False))

553
modules/log_analyzer.py Normal file
View File

@@ -0,0 +1,553 @@
"""
ServerGuard - 日志分析模块
自动分析系统日志,查找硬件相关错误关键词。
"""
import os
import re
import gzip
from typing import Dict, Any, List, Optional
from datetime import datetime, timedelta
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils import execute_command, check_command_exists, safe_int
# Hardware error keywords, grouped by category.
#
# Each key names a category that is counted separately; each value is a list
# of regular expressions.  analyze_dmesg() and analyze_journalctl() in this
# module apply them to every log line with re.search(..., re.IGNORECASE).
HARDWARE_ERROR_PATTERNS = {
# CPU: machine-check events, thermal and throttling messages.
"cpu_errors": [
r'Machine check events? logged',
r'Hardware Error',
r'CMCI storm',
r'machine check',
r'CPU\s*\d+.*temperature',
r'thermal.*cpu',
r'CPU.*throttl',
r'core.*temp',
r'CPU.*fault',
r'uncorrectable',
r'correctable.*error',
],
# Memory: EDAC/ECC reports, out-of-memory and allocation failures.
"memory_errors": [
r'Hardware error.*memory',
r'EDAC.*error',
r'memory.*error',
r'Memory.*parity',
r'ECC.*error',
r'ue\s+count',
r'ce\s+count',
r'Out of memory',
r'oom-kill',
r'page allocation failure',
],
# Storage: block-layer, ATA/SATA/NVMe and SMART messages.
"storage_errors": [
r'I/O error',
r'Buffer I/O error',
r'blk_update_request',
r'ata\d+.*error',
r'SATA.*error',
r'NVMe.*error',
r'critical.*warning',
r'disk error',
r'block.*error',
r'SMART.*failure',
r'medium error',
r'uncorrectable error',
],
# PCI / PCIe: AER reports and bus errors.
"pci_errors": [
r'PCIe.*error',
r'pcieport.*error',
r'PCI.*error',
r'AER:\s*',
r'Corrected error',
r'Uncorrected error',
r'Non-Fatal error',
r'Fatal error',
r'Unsupported Request',
],
# USB: controller and device errors.
"usb_errors": [
r'usb.*error',
r'USB.*over-current',
r'usb.*disconnect',
r'usb.*timeout',
r'ehci.*error',
r'xhci.*error',
],
# Power and thermal events.
"power_errors": [
r'thermal.*shutdown',
r'critical.*temperature',
r'overheat',
r'power.*fail',
r'under.*voltage',
r'over.*voltage',
r'brownout',
r'power.*button',
],
# Kernel panics, lockups and oops traces.
"kernel_panics": [
r'Kernel panic',
r'sysrq.*trigger',
r'watchdog.*bug',
r'softlockup',
r'hardlockup',
r'BUG:.*spinlock',
r'BUG:.*scheduling',
r'Oops:',
r'Call Trace:',
r'general protection fault',
r'double fault',
r'stack.*corruption',
]
}
def analyze_logs() -> Dict[str, Any]:
    """
    Analyse the system logs for hardware errors.

    Combines the dmesg and journalctl analyses, totals the per-category
    counts, extracts the critical events and builds a short summary.

    Returns:
        Dict[str, Any]: analysis result.  ``status`` is ``warning`` when at
        least one error was counted and ``error`` (with an ``error``
        message) when the analysis itself raised.
    """
    report = {
        "status": "success",
        "scan_time": datetime.now().isoformat(),
        "dmesg_analysis": {},
        "journal_analysis": {},
        "hardware_errors": {},
        "critical_events": [],
        "summary": {}
    }
    try:
        # Individual sources.
        report["dmesg_analysis"] = analyze_dmesg()
        report["journal_analysis"] = analyze_journalctl()

        # Aggregated statistics and the events needing attention.
        report["hardware_errors"] = summarize_errors(report)
        report["critical_events"] = identify_critical_events(report)

        error_total = sum(report["hardware_errors"].values())
        has_errors = error_total > 0
        report["summary"] = {
            "total_errors_found": error_total,
            "critical_events": len(report["critical_events"]),
            "recommend_action": has_errors
        }
        # Any counted error downgrades the overall status.
        if has_errors:
            report["status"] = "warning"
    except Exception as exc:
        report["status"] = "error"
        report["error"] = str(exc)
    return report
def analyze_dmesg() -> Dict[str, Any]:
    """
    Analyse the ``dmesg`` output.

    Every non-empty line is checked against each category of
    ``HARDWARE_ERROR_PATTERNS``; a line counts at most once per category
    (the first matching pattern wins).  Up to 50 distinct matches are kept
    in ``recent_errors``.

    Returns:
        Dict[str, Any]: ``available``, ``error_counts``, ``recent_errors``
        and ``boot_errors``; ``note`` when dmesg is missing, ``error`` when
        the analysis raised.
    """
    report = {
        "available": False,
        "error_counts": {key: 0 for key in HARDWARE_ERROR_PATTERNS.keys()},
        "recent_errors": [],
        "boot_errors": []
    }
    if not check_command_exists('dmesg'):
        report["note"] = "dmesg 不可用"
        return report

    try:
        # Preferred form with ISO timestamps.
        _, stdout, _ = execute_command(
            ['dmesg', '--time-format=iso'],
            check_returncode=False, timeout=15
        )
        report["available"] = True
        # No output: assume --time-format is unsupported and retry plainly.
        if not stdout.strip():
            _, stdout, _ = execute_command(
                ['dmesg'],
                check_returncode=False, timeout=15
            )

        lines = stdout.split('\n')
        counts = report["error_counts"]
        recent = report["recent_errors"]
        for line in lines:
            if not line.strip():
                continue
            for error_type, patterns in HARDWARE_ERROR_PATTERNS.items():
                # First pattern of this category that matches, if any.
                hit = next(
                    (p for p in patterns if re.search(p, line, re.IGNORECASE)),
                    None
                )
                if hit is None:
                    continue
                counts[error_type] += 1
                # Keep a bounded sample of distinct matches.
                if len(recent) < 50:
                    entry = {
                        "type": error_type,
                        "message": line.strip(),
                        "pattern": hit
                    }
                    if entry not in recent:
                        recent.append(entry)

        # Errors logged during the boot phase.
        report["boot_errors"] = extract_boot_errors(lines)
    except Exception as exc:
        report["error"] = str(exc)
    return report
def analyze_journalctl() -> Dict[str, Any]:
    """
    Analyse the systemd journal.

    The last 1000 entries of priority ``err`` are matched against
    ``HARDWARE_ERROR_PATTERNS``; a line counts at most once per category.
    Up to 50 distinct matches are kept in ``recent_errors``.

    Returns:
        Dict[str, Any]: ``available``, ``error_counts``, ``recent_errors``
        and ``boot_events``; ``note`` when journalctl or the journal files
        are missing, ``error`` when the analysis raised.
    """
    report = {
        "available": False,
        "error_counts": {key: 0 for key in HARDWARE_ERROR_PATTERNS.keys()},
        "recent_errors": [],
        "boot_events": []
    }
    if not check_command_exists('journalctl'):
        report["note"] = "journalctl 不可用"
        return report

    try:
        # Last 1000 journal entries of priority "err".
        _, stdout, stderr = execute_command(
            ['journalctl', '-n', '1000', '--no-pager', '-p', 'err'],
            check_returncode=False, timeout=15
        )
        if 'No journal files were found' in stderr:
            report["note"] = "无 journal 文件"
            return report
        report["available"] = True

        counts = report["error_counts"]
        recent = report["recent_errors"]
        for line in stdout.split('\n'):
            if not line.strip():
                continue
            for error_type, patterns in HARDWARE_ERROR_PATTERNS.items():
                matched = any(
                    re.search(p, line, re.IGNORECASE) for p in patterns
                )
                if not matched:
                    continue
                counts[error_type] += 1
                if len(recent) < 50:
                    entry = {
                        "type": error_type,
                        "message": line.strip()
                    }
                    if entry not in recent:
                        recent.append(entry)

        # Warnings and errors of the current boot.
        report["boot_events"] = get_journal_boot_events()
    except Exception as exc:
        report["error"] = str(exc)
    return report
def extract_boot_errors(lines: List[str]) -> List[Dict[str, str]]:
    """
    Extract error-like messages logged during the boot phase.

    The boot phase starts at a line containing ``Linux version`` or
    ``Command line:`` and ends after a line containing both ``systemd`` and
    ``startup``.  Within it, lines mentioning error/fail/warn are kept
    unless they also mention firmware, efi or acpi.

    Args:
        lines: kernel log lines in chronological order.

    Returns:
        List[Dict[str, str]]: at most 20 entries with ``stage`` and
        ``message``.
    """
    problem_markers = ('error', 'fail', 'warn')
    # Common non-critical sources that are ignored.
    noise_markers = ('firmware', 'efi', 'acpi')

    collected = []
    booting = False
    for raw in lines:
        # Start of the boot phase.
        if 'Linux version' in raw or 'Command line:' in raw:
            booting = True
        if not booting:
            continue

        lowered = raw.lower()
        looks_bad = any(marker in lowered for marker in problem_markers)
        is_noise = any(marker in lowered for marker in noise_markers)
        if looks_bad and not is_noise:
            collected.append({
                "stage": "boot",
                "message": raw.strip()
            })

        # Boot finished: stop collecting after this line.
        if 'systemd' in raw and 'startup' in raw:
            booting = False
    return collected[:20]  # cap the number of entries
def get_journal_boot_events() -> List[Dict[str, str]]:
    """
    Collect noteworthy journal entries of the current boot.

    Runs ``journalctl -b 0 -p warning`` and keeps the lines mentioning
    ``error``, ``fail`` or ``hardware`` (case-insensitive).

    Returns:
        List[Dict[str, str]]: at most 20 entries, each ``{"message": line}``;
        empty when journalctl fails.
    """
    keywords = ('error', 'fail', 'hardware')
    try:
        # Journal of the current boot, priority warning and above.
        _, stdout, _ = execute_command(
            ['journalctl', '-b', '0', '--no-pager', '-p', 'warning'],
            check_returncode=False, timeout=10
        )
        events = []
        for line in stdout.split('\n'):
            lowered = line.lower()
            if any(keyword in lowered for keyword in keywords):
                events.append({"message": line.strip()})
        return events[:20]
    except Exception:
        # Best effort; KeyboardInterrupt/SystemExit are not swallowed.
        return []
def summarize_errors(analysis_result: Dict[str, Any]) -> Dict[str, int]:
    """
    Total the per-category error counts of the dmesg and journal analyses.

    Args:
        analysis_result: dict that may contain ``dmesg_analysis`` and
            ``journal_analysis``, each with an ``error_counts`` mapping.

    Returns:
        Dict[str, int]: one entry per category of
        ``HARDWARE_ERROR_PATTERNS``; missing counts are treated as 0.
    """
    dmesg_counts = analysis_result.get("dmesg_analysis", {}).get("error_counts", {})
    journal_counts = analysis_result.get("journal_analysis", {}).get("error_counts", {})
    return {
        category: dmesg_counts.get(category, 0) + journal_counts.get(category, 0)
        for category in HARDWARE_ERROR_PATTERNS
    }
def identify_critical_events(analysis_result: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Pick out the events that need immediate attention.

    The ``recent_errors`` of the dmesg and journal analyses are matched
    (case-insensitive) against a fixed list of critical patterns.

    Args:
        analysis_result: dict that may contain ``dmesg_analysis`` and
            ``journal_analysis``, each with a ``recent_errors`` list.

    Returns:
        List[Dict[str, Any]]: distinct events with ``type``, ``description``,
        ``message`` (truncated to 200 characters) and ``source``
        (``dmesg`` or ``journal``).
    """
    # (regex, event type, description)
    critical_patterns = [
        (r'Kernel panic', 'kernel_panic', '内核崩溃'),
        (r'hardlockup', 'hard_lockup', 'CPU 硬死锁'),
        (r'softlockup', 'soft_lockup', 'CPU 软死锁'),
        (r'thermal.*shutdown', 'thermal_shutdown', '过热关机'),
        (r'Hardware Error', 'hardware_error', '硬件错误'),
        (r'Fatal.*PCIe', 'pcie_fatal', 'PCIe 致命错误'),
        (r'I/O error.*sector', 'disk_io_error', '磁盘 I/O 错误'),
        (r'Uncorrectable.*error', 'uncorrectable_error', '不可纠正错误'),
        (r'out of memory.*kill', 'oom_kill', 'OOM 进程杀死'),
        (r'GPU.*fallen.*bus', 'gpu_disconnect', 'GPU 断开连接'),
    ]

    dmesg_errors = analysis_result.get("dmesg_analysis", {}).get("recent_errors", [])
    journal_errors = analysis_result.get("journal_analysis", {}).get("recent_errors", [])

    events = []
    for entry in [*dmesg_errors, *journal_errors]:
        text = entry.get("message", "")
        # An entry equal to one of the dmesg entries is attributed to dmesg.
        origin = "dmesg" if entry in dmesg_errors else "journal"
        for pattern, event_type, description in critical_patterns:
            if not re.search(pattern, text, re.IGNORECASE):
                continue
            candidate = {
                "type": event_type,
                "description": description,
                "message": text[:200],  # cap the length
                "source": origin
            }
            # Avoid duplicates.
            if candidate not in events:
                events.append(candidate)
    return events
def get_kernel_panic_logs() -> List[Dict[str, str]]:
    """
    Look specifically for kernel panic messages.

    ``dmesg`` lines containing ``Kernel panic`` or ``sysrq`` (any case) and
    kernel journal lines (``journalctl -k -g panic``) containing ``panic``
    are collected.  Each source is best effort: a failing source is skipped.

    Returns:
        List[Dict[str, str]]: entries with ``source`` (``dmesg`` or
        ``journalctl``) and ``message``.
    """
    panics = []

    # dmesg
    if check_command_exists('dmesg'):
        try:
            _, stdout, _ = execute_command(['dmesg'], check_returncode=False, timeout=10)
            for line in stdout.split('\n'):
                if 'Kernel panic' in line or 'sysrq' in line.lower():
                    panics.append({
                        "source": "dmesg",
                        "message": line.strip()
                    })
        except Exception:
            # Ordinary failures only; Ctrl-C is no longer swallowed.
            pass

    # journalctl (kernel messages only)
    if check_command_exists('journalctl'):
        try:
            _, stdout, _ = execute_command(
                ['journalctl', '-k', '--no-pager', '-g', 'panic'],
                check_returncode=False, timeout=10
            )
            for line in stdout.split('\n'):
                if 'panic' in line.lower():
                    panics.append({
                        "source": "journalctl",
                        "message": line.strip()
                    })
        except Exception:
            pass
    return panics
def get_hardware_error_logs() -> Dict[str, List[str]]:
    """
    Collect ``dmesg`` lines for specific kinds of hardware errors.

    Returns:
        Dict[str, List[str]]: the keys ``mce_errors``, ``ecc_errors``,
        ``io_errors`` and ``thermal_errors``, each holding at most 20
        stripped log lines.  A line may appear under several keys.  All
        lists are empty when dmesg is missing or fails.
    """
    result = {
        "mce_errors": [],
        "ecc_errors": [],
        "io_errors": [],
        "thermal_errors": []
    }
    # Category -> compiled, case-insensitive pattern.
    matchers = {
        # Machine-check errors.
        "mce_errors": re.compile(r'Machine check|CMCI|hardware error', re.IGNORECASE),
        # ECC errors.  "ECC" must be a whole word, not part of another one.
        "ecc_errors": re.compile(r'\bECC\b|EDAC|memory error', re.IGNORECASE),
        # I/O errors.  The ATA pattern requires a port name such as "ata1";
        # a bare "ata.*error" also matches harmless lines like
        # "... ordered data mode. Opts: errors=remount-ro".
        "io_errors": re.compile(r'I/O error|\bata\d+.*error|blk_update', re.IGNORECASE),
        # Thermal errors.
        "thermal_errors": re.compile(r'thermal|overheat|critical temp', re.IGNORECASE),
    }
    if check_command_exists('dmesg'):
        try:
            _, stdout, _ = execute_command(['dmesg'], check_returncode=False, timeout=10)
            for line in stdout.split('\n'):
                for key, matcher in matchers.items():
                    if matcher.search(line):
                        result[key].append(line.strip())
        except Exception:
            # Best effort; KeyboardInterrupt/SystemExit are not swallowed.
            pass
    # Cap the number of lines per category.
    for key in result:
        result[key] = result[key][:20]
    return result
def search_logs_by_keyword(keyword: str, max_lines: int = 100) -> List[str]:
    """
    Search the logs for a keyword (case-insensitive substring match).

    ``dmesg`` is searched first, then the last ``max_lines * 2`` journal
    entries.  The search stops as soon as ``max_lines`` matches were found.

    Args:
        keyword: text to look for.
        max_lines: maximum number of lines to return.

    Returns:
        List[str]: matching lines, prefixed with ``[dmesg]`` or
        ``[journal]``.  A failing source is skipped.
    """
    results = []
    # Lower-case the keyword once instead of once per log line.
    needle = keyword.lower()

    # Search dmesg.
    if check_command_exists('dmesg'):
        try:
            _, stdout, _ = execute_command(
                ['dmesg'],
                check_returncode=False, timeout=10
            )
            for line in stdout.split('\n'):
                if needle in line.lower():
                    results.append(f"[dmesg] {line.strip()}")
                    if len(results) >= max_lines:
                        return results
        except Exception:
            # Ordinary failures only; Ctrl-C is no longer swallowed.
            pass

    # Search the journal.
    if check_command_exists('journalctl'):
        try:
            _, stdout, _ = execute_command(
                ['journalctl', '-n', str(max_lines * 2), '--no-pager'],
                check_returncode=False, timeout=10
            )
            for line in stdout.split('\n'):
                if needle in line.lower():
                    results.append(f"[journal] {line.strip()}")
                    if len(results) >= max_lines:
                        return results
        except Exception:
            pass
    return results
def get_system_logs(since: Optional[str] = None, until: Optional[str] = None) -> Dict[str, Any]:
    """
    Fetch raw system logs.

    Args:
        since: start time (format: '2024-01-01 00:00:00'); applied to
            journalctl only.
        until: end time; applied to journalctl only.

    Returns:
        Dict[str, Any]: ``dmesg`` (full output), ``journalctl`` (up to 5000
        entries) and ``kern_log`` (last 5000 lines of /var/log/kern.log).
        A source that is missing or fails is left as an empty string.
    """
    from collections import deque

    result = {
        "dmesg": "",
        "journalctl": "",
        "kern_log": ""
    }

    # dmesg
    if check_command_exists('dmesg'):
        try:
            _, stdout, _ = execute_command(['dmesg'], check_returncode=False, timeout=10)
            result["dmesg"] = stdout
        except Exception:
            # Ordinary failures only; Ctrl-C is no longer swallowed.
            pass

    # journalctl
    if check_command_exists('journalctl'):
        try:
            cmd = ['journalctl', '--no-pager', '-n', '5000']
            if since:
                cmd.extend(['--since', since])
            if until:
                cmd.extend(['--until', until])
            _, stdout, _ = execute_command(cmd, check_returncode=False, timeout=15)
            result["journalctl"] = stdout
        except Exception:
            pass

    # /var/log/kern.log
    kern_log_path = '/var/log/kern.log'
    if os.path.exists(kern_log_path):
        try:
            with open(kern_log_path, 'r', encoding='utf-8', errors='ignore') as f:
                # Stream the file through a bounded deque so only the last
                # 5000 lines are ever held in memory.
                result["kern_log"] = ''.join(deque(f, maxlen=5000))
        except Exception:
            pass
    return result
if __name__ == '__main__':
    # Manual smoke run: execute the log analysis and pretty-print the result
    # as JSON, keeping non-ASCII text readable.
    import json

    log_report = analyze_logs()
    print(json.dumps(log_report, indent=2, ensure_ascii=False))

577
modules/memory.py Normal file
View File

@@ -0,0 +1,577 @@
"""
ServerGuard - 内存检测与压力测试模块
深度检测内存的读写错误和稳定性。
"""
import os
import re
import time
from typing import Dict, Any, List, Optional
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils import (
execute_command, check_command_exists, safe_int, safe_float,
format_bytes, require_root
)
def run_memory_check(stress_test: bool = False, stress_duration: int = 300) -> Dict[str, Any]:
    """
    Run the memory check.

    Args:
        stress_test: whether to run a stress test as well.
        stress_duration: stress test duration in seconds.

    Returns:
        Dict[str, Any]: ``status``, ``summary``, ``dimm_info``,
        ``ecc_status``, ``edac_errors`` and ``stress_test``.  ``status`` is
        ``warning`` when EDAC reports errors and ``error`` when the stress
        test did not pass or the check itself raised (``error`` then holds
        the message).
    """
    report = {
        "status": "success",
        "summary": {},
        "dimm_info": [],
        "ecc_status": {},
        "edac_errors": {},
        "stress_test": {}
    }
    try:
        # Static information and error counters.
        report["summary"] = get_memory_summary()
        report["dimm_info"] = get_dimm_info()
        report["ecc_status"] = check_ecc_status()
        report["edac_errors"] = check_edac_errors()
        if report["edac_errors"].get("total_errors", 0) > 0:
            report["status"] = "warning"

        if stress_test:
            # Tools in order of preference: memtester, stress-ng, stress.
            candidates = (
                ('memtester', run_memtester),
                ('stress-ng', run_memory_stress_ng),
                ('stress', run_memory_stress),
            )
            for tool, runner in candidates:
                if check_command_exists(tool):
                    report["stress_test"] = runner(stress_duration)
                    break
            else:
                report["stress_test"] = {
                    "passed": False,
                    "error": "未找到内存压力测试工具 (memtester/stress-ng/stress)"
                }
            if not report["stress_test"].get("passed", False):
                report["status"] = "error"
    except Exception as exc:
        report["status"] = "error"
        report["error"] = str(exc)
    return report
def get_memory_summary() -> Dict[str, Any]:
    """
    Summarise memory usage from ``/proc/meminfo``.

    Returns:
        Dict[str, Any]: byte counts (``*_bytes``) together with their GiB
        equivalents (``*_gb``, rounded to two decimals) for total, free,
        available, buffers, cached, used and swap, plus ``usage_percent``
        when the total is known.  ``used`` is total - free - buffers -
        cached.  ``error`` holds the message when the file cannot be read.
    """
    result = {
        "total_bytes": 0,
        "total_gb": 0,
        "available_bytes": 0,
        "available_gb": 0,
        "used_bytes": 0,
        "used_gb": 0,
        "free_bytes": 0,
        "free_gb": 0,
        "buffers_bytes": 0,
        "cached_bytes": 0,
        "swap_total_bytes": 0,
        "swap_used_bytes": 0,
        "swap_free_bytes": 0
    }
    # Result key -> field name in /proc/meminfo (values are in kB).
    fields = {
        "total_bytes": 'MemTotal',
        "free_bytes": 'MemFree',
        "available_bytes": 'MemAvailable',
        "buffers_bytes": 'Buffers',
        "cached_bytes": 'Cached',
        "swap_total_bytes": 'SwapTotal',
        "swap_free_bytes": 'SwapFree'
    }
    gib = 1024 ** 3
    try:
        with open('/proc/meminfo', 'r') as f:
            meminfo = f.read()

        for key, field_name in fields.items():
            # Anchor at the start of a line: an unanchored "Cached:" would
            # also match inside "SwapCached:" and pick whichever comes first.
            match = re.search(rf'^{field_name}:\s+(\d+)', meminfo, re.MULTILINE)
            if match:
                bytes_val = safe_int(match.group(1)) * 1024
                result[key] = bytes_val
                # Matching GiB value under the "*_gb" key.
                result[key.replace('bytes', 'gb')] = round(bytes_val / gib, 2)

        # Used memory.
        result["used_bytes"] = (result["total_bytes"] - result["free_bytes"]
                                - result["buffers_bytes"] - result["cached_bytes"])
        result["used_gb"] = round(result["used_bytes"] / gib, 2)

        # Swap usage.
        result["swap_used_bytes"] = result["swap_total_bytes"] - result["swap_free_bytes"]
        result["swap_used_gb"] = round(result["swap_used_bytes"] / gib, 2)
        result["swap_free_gb"] = round(result["swap_free_bytes"] / gib, 2)

        # Usage percentage.
        if result["total_bytes"] > 0:
            result["usage_percent"] = round((result["used_bytes"] / result["total_bytes"]) * 100, 1)
    except Exception as e:
        result["error"] = str(e)
    return result
def get_dimm_info() -> List[Dict[str, Any]]:
    """
    Collect details about the installed DIMMs from ``dmidecode -t memory``.

    The output is split on ``Memory Device``; each section is parsed into a
    dict of the fields listed below.  Placeholder values (``Not Specified``,
    ``Unknown``, ...) are dropped and slots reporting ``No Module
    Installed`` are skipped.  ``size_mb`` / ``size_gb`` and ``speed_mts`` /
    ``speed_mhz`` are derived from the textual size and speed.

    Returns:
        List[Dict[str, Any]]: one dict per populated slot; empty when
        dmidecode is missing or fails.
    """
    # (result key, dmidecode label, capture group)
    fields = (
        ("array_handle", 'Array Handle', r'(\S+)'),
        ("error_handle", 'Error Information Handle', r'(\S+)'),
        ("total_width", 'Total Width', r'(\d+)'),
        ("data_width", 'Data Width', r'(\d+)'),
        ("size", 'Size', r'(.*)'),
        ("form_factor", 'Form Factor', r'(\S+)'),
        ("set", 'Set', r'(\S+)'),
        ("locator", 'Locator', r'(.+)'),
        ("bank_locator", 'Bank Locator', r'(.+)'),
        ("type", 'Type', r'(\S+)'),
        ("type_detail", 'Type Detail', r'(.+)'),
        ("speed", 'Speed', r'(.*)'),
        ("manufacturer", 'Manufacturer', r'(\S+)'),
        ("serial_number", 'Serial Number', r'(\S+)'),
        ("asset_tag", 'Asset Tag', r'(\S+)'),
        ("part_number", 'Part Number', r'(\S+)'),
        ("rank", 'Rank', r'(\d+)'),
        ("configured_speed", 'Configured Memory Speed', r'(.*)'),
        ("minimum_voltage", 'Minimum Voltage', r'(.+)'),
        ("maximum_voltage", 'Maximum Voltage', r'(.+)'),
        ("configured_voltage", 'Configured Voltage', r'(.+)'),
    )
    # Values that carry no information.
    invalid_values = ['Not Specified', 'To be filled by O.E.M.', 'None', 'No Module Installed', 'Unknown']
    flags = re.IGNORECASE | re.MULTILINE

    dimms = []
    if not check_command_exists('dmidecode'):
        return dimms
    try:
        _, stdout, _ = execute_command(
            ['dmidecode', '-t', 'memory'],
            check_returncode=False, timeout=15
        )
        # The text before the first "Memory Device" is the header; skip it.
        for device in stdout.split('Memory Device')[1:]:
            # Skip empty slots.  This must look at the raw size value: the
            # placeholder filter below removes "No Module Installed" before
            # any later check could see it.
            raw_size = re.search(r'^[ \t]*Size:[ \t]*(.*)$', device, flags)
            if raw_size and 'No Module' in raw_size.group(1):
                continue

            dimm = {}
            for key, label, capture in fields:
                # Anchored to the start of the line, and only blanks/tabs are
                # allowed after the colon, so an empty field cannot capture
                # text from the following line.
                match = re.search(rf'^[ \t]*{label}:[ \t]*{capture}', device, flags)
                if not match:
                    continue
                value = match.group(1).strip()
                if value not in invalid_values:
                    dimm[key] = value

            # Numeric size.
            size_str = dimm.get('size', '')
            if 'MB' in size_str:
                dimm["size_mb"] = safe_int(size_str.replace('MB', '').strip())
            elif 'GB' in size_str:
                dimm["size_gb"] = safe_float(size_str.replace('GB', '').strip())
                dimm["size_mb"] = int(dimm["size_gb"] * 1024)

            # Numeric speed.
            speed_str = dimm.get('speed', '')
            if 'MT/s' in speed_str:
                dimm["speed_mts"] = safe_int(speed_str.replace('MT/s', '').strip())
            elif 'MHz' in speed_str:
                dimm["speed_mhz"] = safe_int(speed_str.replace('MHz', '').strip())

            if dimm:
                dimms.append(dimm)
    except Exception:
        # Best effort: return what was parsed so far.
        pass
    return dimms
def check_ecc_status() -> Dict[str, Any]:
    """
    Check the ECC (error-correcting code) memory status.

    Three sources are consulted, each best effort:

    1. ``/proc/meminfo``: the ``HardwareCorrupted`` value (kB) is stored in
       ``errors``.  The presence of that field alone is not treated as
       evidence of ECC memory.
    2. ``dmidecode -t memory``: ``Error Correction Type`` gives ``mode``;
       ECC counts as supported and enabled unless the type is ``None``.
       When that line is absent but the output mentions ``ECC``, only
       ``supported`` is set.
    3. EDAC sysfs: correctable / uncorrectable counters are summed over all
       memory controllers.

    Returns:
        Dict[str, Any]: ``supported``, ``enabled``, ``mode``, ``errors`` and,
        when EDAC is present, ``edac_available``, ``correctable_errors`` and
        ``uncorrectable_errors``.
    """
    result = {
        "supported": False,
        "enabled": False,
        "mode": "unknown",
        "errors": 0
    }

    # Source 1: /proc/meminfo
    try:
        with open('/proc/meminfo', 'r') as f:
            content = f.read()
        match = re.search(r'HardwareCorrupted:\s+(\d+)\s+kB', content)
        if match:
            result["errors"] = safe_int(match.group(1))
    except Exception:
        pass

    # Source 2: dmidecode
    if check_command_exists('dmidecode'):
        try:
            _, stdout, _ = execute_command(
                ['dmidecode', '-t', 'memory'],
                check_returncode=False, timeout=10
            )
            match = re.search(r'Error Correction Type:\s*(.+)', stdout)
            if match:
                result["mode"] = match.group(1).strip()
                # "None" means the memory array provides no error
                # correction, so ECC is neither supported nor enabled.
                has_ecc = result["mode"] != 'None'
                result["enabled"] = has_ecc
                result["supported"] = has_ecc
            elif 'ECC' in stdout:
                result["supported"] = True
        except Exception:
            pass

    # Source 3: EDAC
    edac_path = '/sys/devices/system/edac/mc'
    if os.path.exists(edac_path):
        result["edac_available"] = True
        try:
            # Sum the counters of every memory controller.
            for mc in os.listdir(edac_path):
                if not mc.startswith('mc'):
                    continue
                mc_path = os.path.join(edac_path, mc)
                ce_file = os.path.join(mc_path, 'ce_count')  # correctable errors
                ue_file = os.path.join(mc_path, 'ue_count')  # uncorrectable errors
                if os.path.exists(ce_file):
                    with open(ce_file, 'r') as f:
                        ce_count = safe_int(f.read().strip())
                    result["correctable_errors"] = result.get("correctable_errors", 0) + ce_count
                if os.path.exists(ue_file):
                    with open(ue_file, 'r') as f:
                        ue_count = safe_int(f.read().strip())
                    result["uncorrectable_errors"] = result.get("uncorrectable_errors", 0) + ue_count
        except Exception:
            pass
    return result
def check_edac_errors() -> Dict[str, Any]:
    """
    Read the EDAC (Error Detection and Correction) counters from sysfs.

    Every ``mc*`` directory below ``/sys/devices/system/edac/mc`` is one
    memory controller; its ``ce_count`` / ``ue_count`` files and a few
    descriptive files are read when present.

    Returns:
        Dict[str, Any]: ``total_errors``, ``correctable_errors``,
        ``uncorrectable_errors`` and ``memory_controllers``; ``note`` when
        EDAC is not available, ``error`` when reading failed.
    """
    summary = {
        "total_errors": 0,
        "correctable_errors": 0,
        "uncorrectable_errors": 0,
        "memory_controllers": []
    }
    edac_root = '/sys/devices/system/edac/mc'
    if not os.path.exists(edac_root):
        summary["note"] = "EDAC 不可用"
        return summary

    # (sysfs file, result key): correctable first, then uncorrectable.
    counters = (
        ('ce_count', "correctable_errors"),
        ('ue_count', "uncorrectable_errors"),
    )
    # Descriptive files copied verbatim into the controller entry.
    descriptive = ('mc_name', 'size_mb', 'mem_type', 'edac_mc_mode')

    try:
        for mc_name in os.listdir(edac_root):
            if not mc_name.startswith('mc'):
                continue
            mc_dir = os.path.join(edac_root, mc_name)
            controller = {"name": mc_name}

            for filename, key in counters:
                counter_path = os.path.join(mc_dir, filename)
                if os.path.exists(counter_path):
                    with open(counter_path, 'r') as f:
                        count = safe_int(f.read().strip())
                    controller[key] = count
                    summary[key] += count

            for filename in descriptive:
                info_path = os.path.join(mc_dir, filename)
                if os.path.exists(info_path):
                    with open(info_path, 'r') as f:
                        controller[filename] = f.read().strip()

            summary["memory_controllers"].append(controller)

        summary["total_errors"] = summary["correctable_errors"] + summary["uncorrectable_errors"]
    except Exception as exc:
        summary["error"] = str(exc)
    return summary
@require_root
def run_memtester(duration: int = 300) -> Dict[str, Any]:
    """
    Run a memory stress test with ``memtester``.

    70% of ``MemAvailable`` (at least 64 MB; 256 MB when the value cannot be
    read) is tested for one iteration.

    Args:
        duration: test duration in seconds.  memtester is driven by size,
            not by time, so this value is currently not used.

    Returns:
        Dict[str, Any]: ``passed``, ``size_mb``, ``iterations``,
        ``start_time``, ``end_time``, ``duration_seconds``, ``errors``,
        ``tests_run`` and ``raw_output`` (first 2000 characters).
    """
    result = {
        "passed": False,
        "size_mb": 0,
        "iterations": 1,
        "start_time": None,
        "end_time": None,
        "duration_seconds": 0,
        "errors": [],
        "tests_run": []
    }
    if not check_command_exists('memtester'):
        result["errors"].append("memtester 未安装")
        return result
    try:
        # Size of the region to test: leave some memory for the system.
        with open('/proc/meminfo', 'r') as f:
            content = f.read()
        match = re.search(r'MemAvailable:\s+(\d+)', content)
        if match:
            available_mb = safe_int(match.group(1)) // 1024
            # Use 70% of the available memory.
            test_size_mb = max(64, int(available_mb * 0.7))
        else:
            test_size_mb = 256
        result["size_mb"] = test_size_mb

        result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
        start_ts = time.time()

        # Run memtester for a single iteration.
        cmd = ['memtester', f'{test_size_mb}M', '1']
        _, stdout, stderr = execute_command(
            cmd,
            timeout=max(300, test_size_mb),  # scale the timeout with the size
            check_returncode=False
        )
        result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
        result["duration_seconds"] = round(time.time() - start_ts, 2)

        output = stdout + stderr
        result["raw_output"] = output[:2000]  # keep part of the raw output

        # Evaluate the outcome.
        if 'FAILURE' in output.upper():
            result["passed"] = False
            for line in output.split('\n'):
                if 'FAILURE' in line.upper() or 'error' in line.lower():
                    result["errors"].append(line.strip())
        elif ('Done' in output or 'SUCCESS' in output.upper()
              or 'finished' in output.lower()):
            # Only a completion marker counts as success.  The per-test
            # "ok" lines are printed as the run progresses, so accepting a
            # bare "ok" would report an interrupted run as passed.
            result["passed"] = True
        else:
            result["passed"] = False
            result["errors"].append("测试可能未完成")

        # Record which tests appear in the output.
        test_names = [
            'Stuck Address', 'Random Value', 'Compare XOR',
            'Compare SUB', 'Compare MUL', 'Compare DIV',
            'Compare OR', 'Compare AND', 'Sequential Increment',
            'Solid Bits', 'Block Sequential', 'Checkerboard',
            'Bit Spread', 'Bit Flip', 'Walking Ones', 'Walking Zeroes'
        ]
        for test in test_names:
            if test in output:
                result["tests_run"].append(test)
    except Exception as e:
        result["passed"] = False
        result["errors"].append(str(e))
    return result
@require_root
def run_memory_stress_ng(duration: int = 300) -> Dict[str, Any]:
    """
    Run a memory stress test with ``stress-ng``.

    Args:
        duration: test duration in seconds.

    Returns:
        Dict[str, Any]: ``passed``, ``tool``, ``duration_seconds``,
        ``start_time``, ``end_time`` and ``errors`` (the output lines that
        caused a failure); ``bogo_ops`` when the vm metrics line could be
        parsed.
    """
    result = {
        "passed": False,
        "tool": "stress-ng",
        "duration_seconds": duration,
        "start_time": None,
        "end_time": None,
        "errors": []
    }
    if not check_command_exists('stress-ng'):
        result["errors"].append("stress-ng 未安装")
        return result
    try:
        result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
        # stress-ng memory test
        cmd = [
            'stress-ng',
            '--vm', '4',              # 4 vm workers
            '--vm-bytes', '80%',      # 80% of the available memory
            '--vm-method', 'all',     # cycle through all test methods
            '--timeout', str(duration),
            '--metrics-brief'
        ]
        _, stdout, stderr = execute_command(
            cmd,
            timeout=duration + 30,
            check_returncode=False
        )
        result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
        output = stdout + stderr

        # Any mention of "error" or "fail" marks the run as failed, except
        # the run summary, which reports "failed: 0" on a clean run and
        # would otherwise make every run fail.
        problem_lines = []
        for line in output.split('\n'):
            scrubbed = re.sub(r'\bfailed:\s*0\b', '', line.lower())
            if 'error' in scrubbed or 'fail' in scrubbed:
                problem_lines.append(line.strip())
        result["passed"] = not problem_lines
        result["errors"].extend(problem_lines)

        # Extract the bogo-ops metric.  The first pattern is the legacy
        # form; the second matches metrics lines such as
        # "stress-ng: info:  [1234] vm   5678  60.00 ...".
        bogo_ops = (
            re.search(r'stress-ng:\s+vm:\s+(\d+)\s+bogo ops', output)
            or re.search(r'stress-ng:\s+\w+:\s+\[\d+\]\s+vm\s+(\d+)', output)
        )
        if bogo_ops:
            result["bogo_ops"] = safe_int(bogo_ops.group(1))
    except Exception as e:
        result["passed"] = False
        result["errors"].append(str(e))
    return result
@require_root
def run_memory_stress(duration: int = 300) -> Dict[str, Any]:
    """
    Run a memory stress test with the classic ``stress`` tool (fallback).

    Args:
        duration: Test duration in seconds (passed to ``--timeout``).

    Returns:
        Dict[str, Any]: ``passed`` flag, tool name, requested duration,
        start/end timestamps, worker count, the per-worker allocation in
        ``vm_bytes_per_worker`` and the offending output lines in ``errors``.
    """
    result = {
        "passed": False,
        "tool": "stress",
        "duration_seconds": duration,
        "start_time": None,
        "end_time": None,
        "workers": 4,
        "errors": []
    }
    if not check_command_exists('stress'):
        result["errors"].append("stress 未安装")
        return result
    try:
        workers = result["workers"]

        # ``stress`` only understands absolute sizes (B/K/M/G suffixes), not
        # percentages, and --vm-bytes is allocated by *each* worker.  Derive a
        # per-worker size so that all workers together use about 80% of the
        # currently available memory.
        vm_bytes = '256M'  # fallback when /proc/meminfo cannot be evaluated
        try:
            with open('/proc/meminfo', 'r') as f:
                match = re.search(r'MemAvailable:\s+(\d+)', f.read())
            if match:
                available_mb = safe_int(match.group(1)) // 1024
                vm_bytes = f"{max(16, int(available_mb * 0.8 / workers))}M"
        except OSError:
            pass  # keep the fallback size
        result["vm_bytes_per_worker"] = vm_bytes

        result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
        cmd = [
            'stress',
            '--vm', str(workers),     # number of allocating workers
            '--vm-bytes', vm_bytes,   # memory allocated per worker
            '--vm-keep',              # re-dirty memory instead of freeing it
            '--timeout', str(duration)
        ]
        _, stdout, stderr = execute_command(
            cmd,
            timeout=duration + 30,
            check_returncode=False
        )
        result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
        output = stdout + stderr

        # Any output line mentioning an error or failure fails the run; the
        # lines are kept so the report shows why.
        problem_lines = [
            line.strip() for line in output.split('\n')
            if 'error' in line.lower() or 'fail' in line.lower()
        ]
        result["passed"] = not problem_lines
        result["errors"].extend(problem_lines)
    except Exception as e:
        result["passed"] = False
        result["errors"].append(str(e))
    return result
if __name__ == '__main__':
    # Manual smoke run: non-intrusive memory check, printed as JSON.
    import json
    memory_report = run_memory_check(stress_test=False)
    print(json.dumps(memory_report, indent=2, ensure_ascii=False))

545
modules/sensors.py Normal file
View File

@@ -0,0 +1,545 @@
"""
ServerGuard - 电源与主板传感器监控模块
监控电源、主板传感器数据,包括温度、电压、风扇转速等。
"""
import os
import re
from typing import Dict, Any, List, Optional
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils import (
execute_command, check_command_exists, parse_key_value_output,
safe_int, safe_float, require_root
)
def run_sensors_check() -> Dict[str, Any]:
    """
    Run every sensor collector and evaluate the combined data for warnings.

    Returns:
        Dict[str, Any]: One entry per data source plus ``status``
        ("success", "warning" or "error"); ``warnings`` is present when any
        warning was raised, ``error`` when a collector raised an exception.
    """
    report: Dict[str, Any] = {"status": "success"}
    for section in ("lm_sensors", "ipmi_sensors", "thermal_zones",
                    "power_supplies", "ipmi_sel"):
        report[section] = {}

    try:
        collectors = (
            ("lm_sensors", get_lm_sensors_data),        # lm-sensors readings
            ("ipmi_sensors", get_ipmi_sensors_data),    # BMC sensor table
            ("thermal_zones", get_thermal_zones),       # kernel thermal zones
            ("power_supplies", get_power_supply_info),  # power supply class
            ("ipmi_sel", get_ipmi_sel_logs),            # BMC event log
        )
        for section, collect in collectors:
            report[section] = collect()

        alerts = check_sensor_warnings(report)
        if alerts:
            report["warnings"] = alerts
            report["status"] = "warning"
    except Exception as exc:
        report["status"] = "error"
        report["error"] = str(exc)
    return report
def get_lm_sensors_data() -> Dict[str, Any]:
    """
    Collect lm-sensors readings by parsing the output of ``sensors -u``.

    Expected raw layout per chip::

        coretemp-isa-0000          <- chip name: unindented, no colon
        Adapter: ISA adapter
        Package id 0:              <- feature label: unindented, ends with ':'
          temp1_input: 45.000      <- sub-feature: indented "key: value"
          temp1_crit_alarm: 0.000

    Returns:
        Dict[str, Any]: ``available`` flag, ``chips`` (chip name ->
        ``adapter`` and ``features``), ``summary`` built by
        ``extract_sensor_summary`` and ``error`` when something failed.
        Each feature dict holds ``value``/``max``/``min``/``critical``/
        ``alarm`` where reported, ``sensor`` (the sub-feature prefix such as
        "temp1") and any other sub-feature under its raw key.
    """
    result = {
        "available": False,
        "chips": {}
    }
    if not check_command_exists('sensors'):
        result["error"] = "lm-sensors 未安装"
        return result
    try:
        _, stdout, _ = execute_command(
            ['sensors', '-u'],
            check_returncode=False, timeout=15
        )
        if not stdout.strip():
            result["error"] = "无传感器数据,可能需要运行 sensors-detect"
            return result
        result["available"] = True

        chips = result["chips"]
        current_chip = None
        current_feature = None
        for raw_line in stdout.split('\n'):
            line = raw_line.rstrip()
            if not line:
                continue
            stripped = line.strip()

            if not line[0].isspace():
                # Unindented lines: adapter, feature label or chip name.
                if stripped.startswith('Adapter:'):
                    if current_chip:
                        chips[current_chip]["adapter"] = stripped.split(':', 1)[1].strip()
                elif stripped.endswith(':'):
                    if current_chip:
                        current_feature = stripped[:-1]
                        chips[current_chip]["features"].setdefault(current_feature, {})
                else:
                    current_chip = stripped
                    chips[current_chip] = {"features": {}}
                    current_feature = None
                continue

            # Indented line: "<subfeature>: <value>" of the current feature.
            if current_chip is None or current_feature is None or ':' not in stripped:
                continue
            key, _, value_str = stripped.partition(':')
            key = key.strip()
            # Keep the sign: negative rails and sub-zero temperatures exist.
            value_match = re.search(r'-?\d+(?:\.\d+)?', value_str)
            if not value_match:
                continue
            value = safe_float(value_match.group(0))
            feature_data = chips[current_chip]["features"][current_feature]

            # Remember the sub-feature prefix (e.g. "temp1") so the sensor
            # type stays known even when the label is free text ("Core 0").
            if '_' in key:
                feature_data.setdefault("sensor", key.split('_', 1)[0])

            # Match on the suffix, alarms first: "temp1_crit_alarm" must not
            # be stored as the critical threshold, nor "temp1_max_hyst" as max.
            if key.endswith('_alarm'):
                feature_data["alarm"] = bool(feature_data.get("alarm")) or value > 0
            elif key.endswith('_input'):
                feature_data["value"] = value
            elif key.endswith('_max'):
                feature_data["max"] = value
            elif key.endswith('_min'):
                feature_data["min"] = value
            elif key.endswith('_crit'):
                feature_data["critical"] = value
            else:
                feature_data[key] = value

        # Condensed view of the most commonly used sensor classes.
        result["summary"] = extract_sensor_summary(result["chips"])
    except Exception as e:
        result["error"] = str(e)
    return result
def extract_sensor_summary(chips: Dict[str, Any]) -> Dict[str, Any]:
    """
    Condense per-chip feature data into per-class summaries.

    Args:
        chips: Mapping ``chip name -> {"features": {feature name -> data}}``.
            Features without a ``value`` entry are skipped.  When a feature
            dict carries a ``sensor`` entry (for example "temp1") that text is
            used to classify the feature; otherwise the feature name is used.

    Returns:
        Dict[str, Any]: ``temperatures``, ``voltages``, ``fans``, ``powers``
        and ``currents``, each mapping a ``"<chip>_<id>"`` key to the
        reading and its limits.
    """
    summary = {
        "temperatures": {},
        "voltages": {},
        "fans": {},
        "powers": {},
        "currents": {}
    }
    # "in0", "vin", "vin1" ... but not words that merely contain "in"
    # (e.g. "Intake Fan"), which the plain substring test used to misfile.
    voltage_token = re.compile(r'(?<![a-z])v?in\d*(?![a-z])')

    for chip_name, chip_data in chips.items():
        for feature_name, feature_data in chip_data.get("features", {}).items():
            value = feature_data.get("value")
            if value is None:
                continue
            probe = str(feature_data.get("sensor") or feature_name).lower()
            fallback_key = f"{chip_name}_{feature_name}"

            if 'temp' in probe or 'thermal' in probe:
                # Prefer the numbered id; fall back to the label so that
                # e.g. "CPU Temperature" is no longer dropped.
                temp_match = re.search(r'temp(\d+)', probe)
                key = f"{chip_name}_temp{temp_match.group(1)}" if temp_match else fallback_key
                summary["temperatures"][key] = {
                    "value": value,
                    "max": feature_data.get("max"),
                    "critical": feature_data.get("critical"),
                    "alarm": feature_data.get("alarm", False)
                }
            elif voltage_token.search(probe) or 'voltage' in probe or 'vcc' in probe:
                summary["voltages"][fallback_key] = {
                    "value": value,
                    "min": feature_data.get("min"),
                    "max": feature_data.get("max"),
                    "alarm": feature_data.get("alarm", False)
                }
            elif 'fan' in probe:
                fan_match = re.search(r'fan(\d+)', probe)
                key = f"{chip_name}_fan{fan_match.group(1)}" if fan_match else fallback_key
                summary["fans"][key] = {
                    "rpm": value,
                    "min": feature_data.get("min"),
                    "alarm": feature_data.get("alarm", False)
                }
            elif 'power' in probe or 'watt' in probe:
                summary["powers"][fallback_key] = {
                    "value": value,
                    "max": feature_data.get("max")
                }
            elif 'curr' in probe or 'amp' in probe:
                summary["currents"][fallback_key] = {
                    "value": value,
                    "max": feature_data.get("max")
                }
    return summary
def get_ipmi_sensors_data() -> Dict[str, Any]:
    """
    Read the sensor table printed by ``ipmitool sensor``.

    Returns:
        Dict[str, Any]: ``available`` flag, ``sensors`` (name -> value, unit
        and status taken from the first four '|'-separated columns) and
        ``categories``; ``note`` explains why IPMI data is missing and
        ``error`` carries an exception message.
    """
    report = {
        "available": False,
        "sensors": {}
    }
    if not check_command_exists('ipmitool'):
        report["note"] = "ipmitool 未安装"
        return report

    try:
        _, listing, err_text = execute_command(
            ['ipmitool', 'sensor'],
            check_returncode=False, timeout=10
        )
        # These stderr messages mean no usable IPMI device is present.
        device_missing = any(
            marker in err_text
            for marker in ('Could not open device', 'Driver not found')
        )
        if device_missing:
            report["note"] = "IPMI 设备不可用"
            return report

        report["available"] = True
        for row in listing.split('\n'):
            if '|' not in row or not row.strip():
                continue
            columns = [column.strip() for column in row.split('|')]
            if len(columns) < 4:
                continue
            name, reading, unit, state = columns[:4]
            report["sensors"][name] = {
                "value": reading,
                "unit": unit,
                "status": state
            }

        # Group the flat sensor table by sensor class.
        report["categories"] = categorize_ipmi_sensors(report["sensors"])
    except Exception as exc:
        report["error"] = str(exc)
    return report
def categorize_ipmi_sensors(sensors: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
    """
    Group IPMI sensors by class using their name and unit.

    Args:
        sensors: Mapping ``sensor name -> reading dict``; the optional
            ``unit`` entry of each reading is consulted.

    Returns:
        Dict[str, Dict[str, Any]]: ``temperatures``, ``voltages``, ``fans``,
        ``power``, ``currents`` and ``other``; every sensor lands in exactly
        one bucket, the first whose rule matches in that order.
    """
    buckets: Dict[str, Dict[str, Any]] = {
        label: {}
        for label in ("temperatures", "voltages", "fans", "power", "currents", "other")
    }
    voltage_markers = ('volt', 'vcc', '3.3v', '5v', '12v')

    for sensor_name, reading in sensors.items():
        lowered = sensor_name.lower()
        unit_text = reading.get("unit", "").lower()

        if unit_text == 'degrees c' or 'temp' in lowered:
            target = "temperatures"
        elif unit_text == 'volts' or any(marker in lowered for marker in voltage_markers):
            target = "voltages"
        elif 'rpm' in unit_text or 'fan' in lowered:
            target = "fans"
        elif 'watt' in unit_text or 'power' in lowered:
            target = "power"
        elif 'amp' in unit_text or 'current' in lowered:
            target = "currents"
        else:
            target = "other"
        buckets[target][sensor_name] = reading
    return buckets
def get_thermal_zones() -> Dict[str, Any]:
    """
    Read thermal zones and cooling devices from ``/sys/class/thermal``.

    Returns:
        Dict[str, Any]: ``zones`` (zone name -> ``type``, ``temperature_c``,
        ``policy`` and ``critical_temp_c`` where readable) and ``policies``
        (cooling device name -> ``type``, ``current_state``, ``max_state``);
        ``error`` is set when the directory itself cannot be listed.
    """
    result = {
        "zones": {},
        "policies": {}
    }
    thermal_path = '/sys/class/thermal'
    if not os.path.exists(thermal_path):
        return result

    def read_text(path: str) -> Optional[str]:
        """Return the stripped file content, or None if it cannot be read."""
        try:
            with open(path, 'r') as f:
                return f.read().strip()
        except OSError:
            # A single unreadable attribute must not abort the whole scan.
            return None

    def critical_trip_temp(zone_path: str) -> Optional[float]:
        """Return the critical trip point of a zone in degrees C, if known."""
        try:
            names = os.listdir(zone_path)
        except OSError:
            return None
        saw_trip_type = False
        for name in sorted(names):
            match = re.fullmatch(r'trip_point_(\d+)_type', name)
            if not match:
                continue
            saw_trip_type = True
            if read_text(os.path.join(zone_path, name)) == 'critical':
                raw = read_text(os.path.join(zone_path, f'trip_point_{match.group(1)}_temp'))
                if raw is not None:
                    return safe_int(raw) / 1000.0
        if not saw_trip_type:
            # No trip type information at all: keep the previous behaviour
            # of treating trip point 0 as the critical one.
            raw = read_text(os.path.join(zone_path, 'trip_point_0_temp'))
            if raw is not None:
                return safe_int(raw) / 1000.0
        return None

    try:
        for entry in os.listdir(thermal_path):
            entry_path = os.path.join(thermal_path, entry)

            if entry.startswith('thermal_zone'):
                zone_info = {}
                zone_type = read_text(os.path.join(entry_path, 'type'))
                if zone_type is not None:
                    zone_info["type"] = zone_type
                # sysfs reports millidegrees Celsius.
                temp_raw = read_text(os.path.join(entry_path, 'temp'))
                if temp_raw is not None:
                    zone_info["temperature_c"] = safe_int(temp_raw) / 1000.0
                policy = read_text(os.path.join(entry_path, 'policy'))
                if policy is not None:
                    zone_info["policy"] = policy
                critical = critical_trip_temp(entry_path)
                if critical is not None:
                    zone_info["critical_temp_c"] = critical
                result["zones"][entry] = zone_info

            elif entry.startswith('cooling_device'):
                policy_info = {}
                device_type = read_text(os.path.join(entry_path, 'type'))
                if device_type is not None:
                    policy_info["type"] = device_type
                cur_state = read_text(os.path.join(entry_path, 'cur_state'))
                if cur_state is not None:
                    policy_info["current_state"] = safe_int(cur_state)
                max_state = read_text(os.path.join(entry_path, 'max_state'))
                if max_state is not None:
                    policy_info["max_state"] = safe_int(max_state)
                result["policies"][entry] = policy_info
    except Exception as e:
        result["error"] = str(e)
    return result
def get_power_supply_info(power_supply_path: str = '/sys/class/power_supply') -> Dict[str, Any]:
    """
    Read every attribute of every device in the power-supply sysfs class.

    Args:
        power_supply_path: Directory to scan; defaults to the kernel's
            power-supply class directory.

    Returns:
        Dict[str, Any]: ``supplies`` - one dict per device with ``name`` and
        one entry per readable attribute file.  Integer text becomes ``int``,
        decimal text becomes ``float`` and everything else (for example
        "Discharging") stays a string.  ``error`` is set when the directory
        cannot be listed.
    """
    result = {
        "supplies": []
    }
    if not os.path.exists(power_supply_path):
        return result

    def convert(text: str) -> Any:
        """Turn purely numeric text into a number; leave other text alone."""
        if re.fullmatch(r'-?\d+', text):
            return int(text)
        if re.fullmatch(r'-?\d+\.\d+', text):
            return float(text)
        return text

    try:
        for supply_name in os.listdir(power_supply_path):
            supply_path = os.path.join(power_supply_path, supply_name)
            supply_info = {"name": supply_name}
            try:
                attributes = os.listdir(supply_path)
            except OSError as e:
                # Not a directory or not accessible: report it, keep going.
                supply_info["error"] = str(e)
                attributes = []
            for attr in attributes:
                attr_path = os.path.join(supply_path, attr)
                if not os.path.isfile(attr_path):
                    continue
                try:
                    with open(attr_path, 'r') as f:
                        value = f.read().strip()
                except (OSError, ValueError):
                    # Unreadable or undecodable attribute: best effort, skip.
                    continue
                supply_info[attr] = convert(value)
            result["supplies"].append(supply_info)
    except Exception as e:
        result["error"] = str(e)
    return result
def get_ipmi_sel_logs() -> Dict[str, Any]:
    """
    Read the IPMI System Event Log via ``ipmitool sel elist``.

    A typical line has six '|'-separated columns::

        1 | 04/27/2015 | 15:21:55 | Power Supply #0x65 | Failure detected | Asserted

    Returns:
        Dict[str, Any]: ``available`` flag, all parsed ``entries`` (``id``,
        ``datetime``, ``source``, ``event``), the subsets
        ``critical_events`` and ``hardware_errors`` selected by keyword,
        their counts, and ``note``/``error`` when data is unavailable.
    """
    result = {
        "available": False,
        "entries": [],
        "hardware_errors": [],
        "critical_events": []
    }
    if not check_command_exists('ipmitool'):
        result["note"] = "ipmitool 未安装"
        return result
    try:
        _, stdout, stderr = execute_command(
            ['ipmitool', 'sel', 'elist'],
            check_returncode=False, timeout=15
        )
        if 'Could not open device' in stderr or 'Driver not found' in stderr:
            result["note"] = "IPMI 设备不可用"
            return result
        result["available"] = True

        critical_keywords = ['critical', 'failure', 'error', 'thermal', 'voltage', 'power']
        hardware_keywords = ['memory', 'processor', 'hard drive', 'fan', 'power supply', 'temperature']
        for line in stdout.split('\n'):
            if not line.strip():
                continue
            parts = [p.strip() for p in line.split('|')]
            if len(parts) >= 5:
                # Date and time are separate columns; the sensor is the
                # source, and the rest is the event text plus its direction.
                entry = {
                    "id": parts[0],
                    "datetime": f"{parts[1]} {parts[2]}".strip(),
                    "source": parts[3],
                    "event": ' | '.join(parts[4:])
                }
            elif len(parts) == 4:
                # Short form: keep the straightforward column mapping.
                entry = {
                    "id": parts[0],
                    "datetime": parts[1],
                    "source": parts[2],
                    "event": parts[3]
                }
            else:
                continue
            result["entries"].append(entry)

            # Match keywords against sensor and event text alike: the
            # hardware keywords usually appear in the sensor name, the
            # severity keywords in the event description.
            searchable = f"{entry['source']} {entry['event']}".lower()
            if any(kw in searchable for kw in critical_keywords):
                result["critical_events"].append(entry)
            if any(kw in searchable for kw in hardware_keywords):
                result["hardware_errors"].append(entry)

        result["total_entries"] = len(result["entries"])
        result["critical_count"] = len(result["critical_events"])
        result["hardware_error_count"] = len(result["hardware_errors"])
    except Exception as e:
        result["error"] = str(e)
    return result
def check_sensor_warnings(sensor_data: Dict[str, Any]) -> List[str]:
    """
    Derive human-readable warnings from collected sensor data.

    Args:
        sensor_data: Dict with the optional sections ``lm_sensors`` (its
            ``summary`` is inspected), ``ipmi_sensors``, ``ipmi_sel`` and
            ``thermal_zones``.

    Returns:
        List[str]: One message per detected condition; empty when nothing
        looks wrong.
    """
    warnings = []

    # --- lm-sensors summary ---
    summary = sensor_data.get("lm_sensors", {}).get("summary", {})

    for name, temp_data in summary.get("temperatures", {}).items():
        if temp_data.get("alarm"):
            warnings.append(f"温度传感器 {name} 告警: {temp_data.get('value')}°C")
        elif (temp_data.get("value") or 0) > 90:
            warnings.append(f"温度传感器 {name} 温度过高: {temp_data.get('value')}°C")

    for name, volt_data in summary.get("voltages", {}).items():
        if volt_data.get("alarm"):
            warnings.append(f"电压传感器 {name} 告警: {volt_data.get('value')}V")

    for name, fan_data in summary.get("fans", {}).items():
        # Limits may be stored as None when the chip does not report them;
        # treat a missing minimum as "unknown" instead of comparing None.
        rpm = fan_data.get("rpm") or 0
        min_rpm = fan_data.get("min") or 0
        if fan_data.get("alarm"):
            warnings.append(f"风扇 {name} 告警: {fan_data.get('rpm')} RPM")
        elif rpm == 0 and min_rpm > 0:
            warnings.append(f"风扇 {name} 可能已停止: {fan_data.get('rpm')} RPM")

    # --- IPMI sensor table ---
    # "ipmitool sensor" prints short status codes (nc = non-critical,
    # cr = critical, nr = non-recoverable); the long names are kept as well.
    abnormal_states = {'critical', 'non-recoverable', 'warning', 'cr', 'nr', 'nc'}
    ipmi_sensors = sensor_data.get("ipmi_sensors", {})
    for name, data in ipmi_sensors.get("sensors", {}).items():
        status = (data.get("status") or "").lower()
        if status in abnormal_states:
            warnings.append(f"IPMI 传感器 {name} 状态异常: {data.get('status')}")

    # --- IPMI SEL ---
    ipmi_sel = sensor_data.get("ipmi_sel", {})
    if ipmi_sel.get("critical_count", 0) > 0:
        warnings.append(f"IPMI SEL 中有 {ipmi_sel['critical_count']} 个关键事件")

    # --- kernel thermal zones ---
    thermal_zones = sensor_data.get("thermal_zones", {})
    for zone_name, zone_data in thermal_zones.get("zones", {}).items():
        temp = zone_data.get("temperature_c") or 0
        critical = zone_data.get("critical_temp_c")
        if not critical or critical <= 0:
            # A missing, zero or negative trip temperature is not a usable
            # limit; fall back to the default of 100 degrees C.
            critical = 100
        if temp > critical * 0.9:  # within 10% of the critical temperature
            warnings.append(f"Thermal zone {zone_name} 温度接近临界值: {temp}°C (临界: {critical}°C)")
    return warnings
if __name__ == '__main__':
    # Manual smoke run: collect all sensor data and print it as JSON.
    import json
    sensors_report = run_sensors_check()
    print(json.dumps(sensors_report, indent=2, ensure_ascii=False))

602
modules/storage.py Normal file
View File

@@ -0,0 +1,602 @@
"""
ServerGuard - 存储设备检测模块
检查硬盘/SSD 的健康状况、SMART 数据、RAID 状态。
"""
import os
import re
import json
from typing import Dict, Any, List, Optional
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils import (
execute_command, check_command_exists, parse_key_value_output,
safe_int, safe_float, format_bytes, require_root
)
def run_storage_check() -> Dict[str, Any]:
    """
    Check every storage device, the RAID state and the I/O counters.

    Returns:
        Dict[str, Any]: ``status`` ("success", "warning" or "error"),
        ``devices`` (one result per disk), ``raid_status`` and ``io_stats``;
        ``error`` carries the exception message when a step raised.
    """
    result = {
        "status": "success",
        "devices": [],
        "raid_status": {},
        "io_stats": {}
    }
    try:
        for device in get_storage_devices():
            device_info = check_device(device)
            result["devices"].append(device_info)
            # A failing or suspicious disk downgrades the overall status.
            if device_info.get("health") in ['FAILED', 'WARNING']:
                result["status"] = "warning"

        raid_status = check_raid_status()
        result["raid_status"] = raid_status
        # A degraded array is just as noteworthy as a suspicious disk, so it
        # must be reflected in the overall status too.
        raid_degraded = raid_status.get("status") == "warning" or any(
            array.get("degraded") for array in raid_status.get("arrays", [])
        )
        if raid_degraded:
            result["status"] = "warning"

        result["io_stats"] = get_io_statistics()
    except Exception as e:
        result["status"] = "error"
        result["error"] = str(e)
    return result
def get_storage_devices() -> List[Dict[str, str]]:
    """
    List whole-disk block devices.

    ``lsblk`` JSON output is tried first; if it yields nothing, ``/sys/block``
    is scanned for well-known disk name prefixes.

    Returns:
        List[Dict[str, str]]: One dict per disk with ``name``, ``path``
        (``/dev/<name>``) and ``type`` ("hdd", "ssd" or "unknown").
    """
    devices = []

    # Method 1: lsblk
    if check_command_exists('lsblk'):
        try:
            _, stdout, _ = execute_command(
                ['lsblk', '-d', '-n', '-o', 'NAME,TYPE,ROTA', '-J'],
                check_returncode=False, timeout=10
            )
            data = json.loads(stdout)
            for dev in data.get('blockdevices', []):
                if dev.get('type') != 'disk':
                    continue
                # Depending on the util-linux version ROTA is a JSON boolean
                # or a string ("0"/"1").  The string "0" is truthy, so the
                # value has to be normalised before it is interpreted.
                rota = dev.get('rota')
                if rota is None:
                    dev_type = "unknown"
                elif str(rota).strip().lower() in ('1', 'true'):
                    dev_type = "hdd"
                else:
                    dev_type = "ssd"
                devices.append({
                    "name": dev['name'],
                    "path": f"/dev/{dev['name']}",
                    "type": dev_type
                })
        except Exception:
            # Best effort: fall through to the /sys/block scan below.
            pass

    # Method 2: scan /sys/block
    if not devices:
        try:
            for name in os.listdir('/sys/block'):
                if not name.startswith(('sd', 'hd', 'nvme', 'vd', 'xvd', 'mmcblk')):
                    continue
                dev_type = "unknown"
                try:
                    with open(f'/sys/block/{name}/queue/rotational', 'r') as f:
                        dev_type = "hdd" if f.read().strip() == '1' else "ssd"
                except OSError:
                    pass  # rotational flag not available; type stays unknown
                devices.append({
                    "name": name,
                    "path": f"/dev/{name}",
                    "type": dev_type
                })
        except OSError:
            pass  # /sys/block not available; return what we have
    return devices
def check_device(device: Dict[str, str]) -> Dict[str, Any]:
    """
    Inspect a single storage device.

    Args:
        device: Dict with ``name``, ``path`` and optionally ``type``, as
            produced by ``get_storage_devices``.

    Returns:
        Dict[str, Any]: Identification data, SMART data under
        ``smart_status``, the derived ``health`` and selected SMART
        attributes; NVMe devices additionally get ``nvme_data``.
    """
    result = {
        "name": device["name"],
        "path": device["path"],
        "type": device.get("type", "unknown"),
        "model": "Unknown",
        "serial": "Unknown",
        "firmware": "Unknown",
        "size_bytes": 0,
        "size_human": "Unknown",
        "health": "UNKNOWN",
        "smart_status": {},
        "temperature_c": None,
        "power_on_hours": None,
        "start_stop_count": None,
        "reallocated_sectors": None,
        "pending_sectors": None,
        "test_result": None
    }
    # Basic identification (model, serial, size, ...).
    result.update(get_device_info(device["path"]))

    # SMART data and the health verdict derived from it.
    smart_data = get_smart_data(device["path"])
    result["smart_status"] = smart_data
    result["health"] = analyze_health(smart_data)

    if "attributes" in smart_data:
        attrs = smart_data["attributes"]
        # Temperature: the first token of the raw value is the reading.
        for temp_attr in ['194 Temperature_Celsius', '190 Airflow_Temperature_Cel', 'Temperature']:
            if temp_attr in attrs:
                raw_tokens = str(attrs[temp_attr].get('raw_value') or '').split()
                if raw_tokens:
                    result["temperature_c"] = safe_int(raw_tokens[0])
                    break
        if '9 Power_On_Hours' in attrs:
            result["power_on_hours"] = safe_int(attrs['9 Power_On_Hours'].get('raw_value', 0))
        if '4 Start_Stop_Count' in attrs:
            result["start_stop_count"] = safe_int(attrs['4 Start_Stop_Count'].get('raw_value', 0))
        if '5 Reallocated_Sector_Ct' in attrs:
            result["reallocated_sectors"] = safe_int(attrs['5 Reallocated_Sector_Ct'].get('raw_value', 0))
        if '197 Current_Pending_Sector' in attrs:
            result["pending_sectors"] = safe_int(attrs['197 Current_Pending_Sector'].get('raw_value', 0))

    # NVMe devices: merge in the data from the nvme tool.
    if device["name"].startswith('nvme'):
        nvme_data = get_nvme_data(device["path"])
        result["nvme_data"] = nvme_data
        if nvme_data.get("temperature"):
            result["temperature_c"] = nvme_data["temperature"]
        # Combine both verdicts by severity.  In particular an "UNKNOWN"
        # from the NVMe probe (e.g. tool not installed) must not wipe out
        # what SMART reported, and a worse SMART verdict is never improved.
        severity = {"UNKNOWN": 0, "PASSED": 1, "WARNING": 2, "FAILED": 3}
        nvme_health = nvme_data.get("health")
        if severity.get(nvme_health, 0) > severity.get(result["health"], 0):
            result["health"] = nvme_health
    return result
def get_device_info(device_path: str) -> Dict[str, Any]:
    """
    Collect identification data for a device.

    Args:
        device_path: Device node, e.g. ``/dev/sda``.

    Returns:
        Dict[str, Any]: Whichever of ``model``, ``serial``, ``firmware``,
        ``sector_size``, ``rotation_rate``, ``form_factor``, ``transport``,
        ``size_bytes``, ``size_human`` and ``is_ssd`` could be determined
        from ``smartctl -i``; the size falls back to ``/sys/block``.
    """
    info = {}

    if check_command_exists('smartctl'):
        try:
            _, stdout, _ = execute_command(
                ['smartctl', '-i', device_path],
                check_returncode=False, timeout=10
            )
            patterns = {
                # ATA prints "Device Model", NVMe "Model Number", SCSI "Product".
                "model": r'(?:Device Model|Model Number|Product):\s*(.+)',
                "serial": r'Serial [Nn]umber:\s*(\S+)',
                "firmware": r'Firmware Version:\s*(\S+)',
                "sector_size": r'Sector Size:\s*(.+)',
                "rotation_rate": r'Rotation Rate:\s*(.+)',
                "form_factor": r'Form Factor:\s*(.+)',
                "transport": r'Transport protocol:\s*(.+)'
            }
            for key, pattern in patterns.items():
                match = re.search(pattern, stdout)
                if match:
                    info[key] = match.group(1).strip()

            # Capacity line: "User Capacity: 1,000,204,886,016 bytes [1.00 TB]".
            # The byte count is the comma-grouped number; the brackets hold
            # the human-readable size, not a byte count.
            size_match = re.search(
                r'(?:User Capacity|Total NVM Capacity):\s*([\d,]+)\s*(?:bytes\s*)?(?:\[([^\]]+)\])?',
                stdout
            )
            if size_match:
                info["size_bytes"] = safe_int(size_match.group(1).replace(',', ''))
                if size_match.group(2):
                    info["size_human"] = size_match.group(2).strip()
                elif info["size_bytes"]:
                    info["size_human"] = format_bytes(info["size_bytes"])

            if 'Solid State Device' in stdout:
                info["is_ssd"] = True
            elif 'Rotation Rate' in stdout:
                info["is_ssd"] = False
        except Exception:
            # Best effort: fall back to sysfs below.
            pass

    # Fallback: size from /sys/block (always counted in 512-byte units).
    if "size_bytes" not in info or info["size_bytes"] == 0:
        try:
            dev_name = os.path.basename(device_path)
            with open(f'/sys/block/{dev_name}/size', 'r') as f:
                sectors = safe_int(f.read().strip())
            info["size_bytes"] = sectors * 512
            info["size_human"] = format_bytes(info["size_bytes"])
        except OSError:
            pass  # size stays unknown
    return info
def get_smart_data(device_path: str) -> Dict[str, Any]:
    """
    Collect SMART data for a device with ``smartctl``.

    Args:
        device_path: Device node, e.g. ``/dev/sda``.

    Returns:
        Dict[str, Any]: ``supported``/``enabled`` flags, ``overall``
        ("PASSED", "FAILED" or "UNKNOWN"), the ATA attribute table under
        ``attributes`` (keyed ``"<id> <name>"``), ``self_tests`` lines,
        ``error_count`` when an error log is present, and ``error`` on
        failure.
    """
    result = {
        "supported": False,
        "enabled": False,
        "overall": "UNKNOWN",
        "attributes": {},
        "self_tests": []
    }
    if not check_command_exists('smartctl'):
        result["error"] = "smartctl 未安装"
        return result
    try:
        # SMART capability flags.
        _, stdout, _ = execute_command(
            ['smartctl', '-i', device_path],
            check_returncode=False, timeout=10
        )
        if 'SMART support is: Available' in stdout:
            result["supported"] = True
        if 'SMART support is: Enabled' in stdout:
            result["enabled"] = True

        # Full SMART report.
        _, stdout, _ = execute_command(
            ['smartctl', '-a', device_path],
            check_returncode=False, timeout=15
        )

        # Overall health: read the verdict from the dedicated line only.
        # Searching the whole report for "OK"/"PASSED" would let unrelated
        # text hide a failing drive.
        health_match = re.search(
            r'(?:SMART overall-health self-assessment test result|SMART Health Status):\s*(.+)',
            stdout
        )
        if health_match:
            verdict = health_match.group(1).strip().upper()
            if verdict.startswith(('PASSED', 'OK')):
                result["overall"] = "PASSED"
            else:
                # Anything other than PASSED/OK on this line is treated as
                # a failure report.
                result["overall"] = "FAILED"

        # ATA attribute table.
        if 'ID#' in stdout and 'ATTRIBUTE_NAME' in stdout:
            in_attributes = False
            for line in stdout.split('\n'):
                if 'ID#' in line and 'ATTRIBUTE_NAME' in line:
                    in_attributes = True
                    continue
                if not in_attributes:
                    continue
                if not line.strip() or line.startswith('SMART'):
                    break  # end of table
                # Columns: ID# ATTRIBUTE_NAME FLAG VALUE WORST THRESH TYPE
                #          UPDATED WHEN_FAILED RAW_VALUE
                parts = line.split()
                if len(parts) >= 10:
                    attr_key = f"{parts[0]} {parts[1]}"
                    result["attributes"][attr_key] = {
                        "flag": parts[2],
                        "value": safe_int(parts[3]),
                        "worst": safe_int(parts[4]),
                        "thresh": safe_int(parts[5]),
                        "type": parts[6],
                        "updated": parts[7],
                        "when_failed": parts[8] if parts[8] != '-' else None,
                        "raw_value": ' '.join(parts[9:])
                    }

        # Self-test log: keep the numbered ("# 1 ...") entries.
        if 'SMART Self-test log' in stdout:
            self_test_section = False
            for line in stdout.split('\n'):
                if 'SMART Self-test log' in line:
                    self_test_section = True
                    continue
                if self_test_section and line.strip() and not line.startswith('SMART'):
                    if '#' in line:
                        result["self_tests"].append(line.strip())

        # Error log: prefer the explicit counter, fall back to the number of
        # the first listed error.
        if 'SMART Error Log' in stdout:
            error_match = (
                re.search(r'ATA Error Count:\s*(\d+)', stdout)
                or re.search(r'Error (\d+)\s+occurred at', stdout)
            )
            if error_match:
                result["error_count"] = safe_int(error_match.group(1))
    except Exception as e:
        result["error"] = str(e)
    return result
def get_nvme_data(device_path: str) -> Dict[str, Any]:
    """
    Read NVMe health data via ``nvme smart-log``.

    Args:
        device_path: Device node, e.g. ``/dev/nvme0n1``.

    Returns:
        Dict[str, Any]: ``health`` ("PASSED", "WARNING" or "UNKNOWN"),
        ``temperature`` in degrees C, ``available_spare`` and
        ``percentage_used`` in percent; the remaining counters are
        placeholders that stay ``None``.  ``error`` is set on failure.
    """
    result = {
        "health": "UNKNOWN",
        "temperature": None,
        "available_spare": None,
        "percentage_used": None,
        "data_units_read": None,
        "data_units_written": None,
        "host_reads": None,
        "host_writes": None
    }
    if not check_command_exists('nvme'):
        return result
    try:
        _, stdout, _ = execute_command(
            ['nvme', 'smart-log', device_path],
            check_returncode=False, timeout=10
        )

        # The text output prints the composite temperature in Celsius
        # ("temperature : 38 C").  Only convert when the value is explicitly
        # marked as Kelvin or is implausibly high for Celsius.
        temp_match = re.search(r'temperature\s*:\s*(\d+)\s*°?\s*([CK])?', stdout)
        if temp_match:
            reading = safe_int(temp_match.group(1))
            unit = temp_match.group(2)
            if unit == 'K' or (unit is None and reading > 200):
                reading -= 273
            result["temperature"] = reading

        # Field names are printed with underscores by nvme-cli; a space is
        # accepted as well.
        spare_match = re.search(r'available[_ ]spare\s*:\s*(\d+)%', stdout)
        if spare_match:
            result["available_spare"] = safe_int(spare_match.group(1))
        used_match = re.search(r'percentage[_ ]used\s*:\s*(\d+)%', stdout)
        if used_match:
            result["percentage_used"] = safe_int(used_match.group(1))

        # Health verdict: heavy wear or low spare capacity raise a warning.
        if result["percentage_used"] is not None:
            if result["percentage_used"] < 90:
                result["health"] = "PASSED"
            else:
                result["health"] = "WARNING"
        if result["available_spare"] is not None and result["available_spare"] < 10:
            result["health"] = "WARNING"
    except Exception as e:
        result["error"] = str(e)
    return result
def analyze_health(smart_data: Dict[str, Any]) -> str:
    """
    Derive a health verdict from SMART data.

    Args:
        smart_data: Dict as returned by ``get_smart_data``.

    Returns:
        str: "UNKNOWN" when SMART is not supported, "FAILED" when the
        overall assessment failed, "WARNING" when a sector-related counter
        is non-zero or the drive is hotter than 60 degrees C, otherwise
        "PASSED".
    """
    if not smart_data.get("supported"):
        return "UNKNOWN"
    if smart_data.get("overall") == "FAILED":
        return "FAILED"

    attrs = smart_data.get("attributes", {})

    def first_raw_number(attr_name: str) -> int:
        """Return the leading number of an attribute's raw value (0 if none)."""
        tokens = str(attrs[attr_name].get('raw_value') or '').split()
        return safe_int(tokens[0]) if tokens else 0

    # Any non-zero count for these attributes indicates media problems.
    critical_attrs = (
        '5 Reallocated_Sector_Ct',
        '197 Current_Pending_Sector',
        '198 Offline_Uncorrectable',
        '196 Reallocation_Event_Count',
    )
    for attr_name in critical_attrs:
        if attr_name in attrs and first_raw_number(attr_name) > 0:
            return "WARNING"

    # Temperature: use the raw value (degrees C).  The normalized VALUE
    # column is a vendor-specific score, not a temperature.
    for temp_attr in ['194 Temperature_Celsius', '190 Airflow_Temperature_Cel']:
        if temp_attr in attrs and first_raw_number(temp_attr) > 60:
            return "WARNING"
    return "PASSED"
def check_raid_status() -> Dict[str, Any]:
    """
    Check software (mdadm) and hardware (storcli / MegaCli) RAID.

    Returns:
        Dict[str, Any]: ``raid_available`` flag, ``controllers`` (left
        empty), ``arrays`` (one dict per md array with ``name``, ``status``
        and ``degraded`` when applicable), plus ``software_raid``,
        ``mdadm_config``, ``mdstat``, ``hardware_raid``, ``controller_type``
        and truncated tool output where available.  ``status`` is set to
        "warning" when an array is degraded.
    """
    result = {
        "raid_available": False,
        "controllers": [],
        "arrays": []
    }

    # Software RAID (mdadm).
    if check_command_exists('mdadm'):
        try:
            _, stdout, _ = execute_command(
                ['mdadm', '--detail', '--scan'],
                check_returncode=False, timeout=10
            )
            if stdout.strip():
                result["software_raid"] = True
                result["raid_available"] = True
                result["mdadm_config"] = stdout.strip()
                _, detail, _ = execute_command(
                    ['cat', '/proc/mdstat'],
                    check_returncode=False, timeout=5
                )
                result["mdstat"] = detail

                # /proc/mdstat describes each array as an "mdX : ..." line
                # followed by indented lines; the member state ("[UU]" /
                # "[U_]") and rebuild progress live on the indented lines.
                blocks = []
                collecting = None
                for line in detail.split('\n'):
                    if line.startswith('md'):
                        parts = line.split()
                        array_info = {
                            "name": parts[0],
                            # "inactive" contains "active": match the word.
                            "status": "active" if re.search(r'(?<![a-z])active', line) else "inactive"
                        }
                        result["arrays"].append(array_info)
                        collecting = [line]
                        blocks.append((array_info, collecting))
                    elif collecting is not None and line.strip() and line[:1].isspace():
                        collecting.append(line)
                    else:
                        collecting = None

                for array_info, lines in blocks:
                    text = '\n'.join(lines)
                    missing_member = re.search(r'\[[U_]*_[U_]*\]', text)
                    if missing_member or '(F)' in text or 'recovery' in text:
                        array_info["degraded"] = True
                        result["status"] = "warning"
        except Exception:
            # Best effort: RAID information is optional.
            pass

    # Hardware RAID (storcli preferred, MegaCli as fallback).
    if check_command_exists('storcli'):
        try:
            _, stdout, _ = execute_command(
                ['storcli', '/c0', 'show'],
                check_returncode=False, timeout=10
            )
            result["hardware_raid"] = True
            result["raid_available"] = True
            result["controller_type"] = "LSI/Broadcom"
            result["storcli_output"] = stdout[:500]  # keep the report small
        except Exception:
            pass
    elif check_command_exists('MegaCli'):
        try:
            _, stdout, _ = execute_command(
                ['MegaCli', '-AdpAllInfo', '-aALL'],
                check_returncode=False, timeout=10
            )
            result["hardware_raid"] = True
            result["raid_available"] = True
            result["controller_type"] = "LSI"
            result["megacli_output"] = stdout[:500]
        except Exception:
            pass
    return result
def get_io_statistics() -> Dict[str, Any]:
    """
    Read per-disk I/O counters from ``/proc/diskstats``.

    Returns:
        Dict[str, Any]: Whole-disk device name -> counters (completed and
        merged reads/writes, sectors, times in ms, I/Os in progress).
        Partitions are skipped.  Empty when the file cannot be read.
    """
    result = {}
    # Whole disks only.  NVMe namespaces ("nvme0n1") and MMC devices end in
    # a digit themselves, so "name does not end with a digit" cannot be used
    # to tell disks from partitions.
    whole_disk = re.compile(r'(?:sd|hd|vd|xvd)[a-z]+|nvme\d+n\d+|mmcblk\d+')
    try:
        with open('/proc/diskstats', 'r') as f:
            for line in f:
                parts = line.split()
                if len(parts) < 14:
                    continue
                device = parts[2]
                if not whole_disk.fullmatch(device):
                    continue
                result[device] = {
                    "reads_completed": safe_int(parts[3]),
                    "reads_merged": safe_int(parts[4]),
                    "sectors_read": safe_int(parts[5]),
                    "time_reading_ms": safe_int(parts[6]),
                    "writes_completed": safe_int(parts[7]),
                    "writes_merged": safe_int(parts[8]),
                    "sectors_written": safe_int(parts[9]),
                    "time_writing_ms": safe_int(parts[10]),
                    "ios_in_progress": safe_int(parts[11]),
                    "time_doing_ios_ms": safe_int(parts[12]),
                    "weighted_time_ios_ms": safe_int(parts[13])
                }
    except OSError:
        pass  # statistics are optional
    return result
@require_root
def run_io_test(device_path: str, test_size_mb: int = 100) -> Dict[str, Any]:
    """
    Run a short sequential read/write benchmark with fio.

    Args:
        device_path: Device path; only recorded in the result.
            NOTE(review): the benchmark file is created under /tmp, so the
            file system holding /tmp is measured, not necessarily this
            device - confirm whether that is intended.
        test_size_mb: Size of the test file in MB.

    Returns:
        Dict[str, Any]: ``passed`` flag, ``device``, ``test_size_mb``,
        ``read_speed_mbps`` / ``write_speed_mbps`` (``None`` when not
        measured) and ``errors``.
    """
    result = {
        "passed": False,
        "device": device_path,
        "test_size_mb": test_size_mb,
        "read_speed_mbps": None,
        "write_speed_mbps": None,
        "errors": []
    }
    if not check_command_exists('fio'):
        result["errors"].append("fio 未安装")
        return result

    import shutil
    import tempfile

    work_dir = None
    try:
        # This runs as root, so a fixed file name in the world-writable /tmp
        # would be open to symlink tricks.  Use a private directory instead.
        work_dir = tempfile.mkdtemp(prefix='serverguard_', dir='/tmp')
        job_file = os.path.join(work_dir, 'job.fio')
        fio_config = '\n'.join([
            '[global]',
            f'directory={work_dir}',
            'filename=serverguard_test',
            'direct=1',
            f'size={test_size_mb}M',
            'unlink=1',
            '[seq_read]',
            'stonewall',
            'rw=read',
            'bs=1M',
            '[seq_write]',
            'stonewall',
            'rw=write',
            'bs=1M',
            ''
        ])
        with open(job_file, 'w') as f:
            f.write(fio_config)

        _, stdout, stderr = execute_command(
            ['fio', job_file, '--output-format=json'],
            timeout=120,
            check_returncode=False
        )
        data = json.loads(stdout)
        jobs = data.get('jobs', [])
        for job in jobs:
            job_name = job.get('jobname', '')
            # fio reports bandwidth in KiB/s.
            read_bw = job.get('read', {}).get('bw', 0) / 1024
            write_bw = job.get('write', {}).get('bw', 0) / 1024
            if 'read' in job_name.lower() and read_bw > 0:
                result["read_speed_mbps"] = round(read_bw, 2)
            if 'write' in job_name.lower() and write_bw > 0:
                result["write_speed_mbps"] = round(write_bw, 2)
            # A non-zero per-job error code means that job did not complete.
            if job.get('error', 0):
                result["errors"].append(f"fio job {job_name} error: {job.get('error')}")
        result["passed"] = bool(jobs) and not result["errors"]
    except Exception as e:
        result["errors"].append(str(e))
    finally:
        if work_dir:
            shutil.rmtree(work_dir, ignore_errors=True)
    return result
if __name__ == '__main__':
    # Manual smoke run: check all storage devices and print the result.
    import json
    storage_report = run_storage_check()
    print(json.dumps(storage_report, indent=2, ensure_ascii=False))

476
modules/system_info.py Normal file
View File

@@ -0,0 +1,476 @@
"""
ServerGuard - 系统信息概览模块
收集服务器的硬件和操作系统基本信息。
"""
import os
import re
import platform
from typing import Dict, Any, List, Optional
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils import (
execute_command, parse_key_value_output, check_command_exists,
safe_int, safe_float, format_bytes
)
def get_system_info() -> Dict[str, Any]:
    """
    Gather an overview of the operating system and the hardware.

    Returns:
        Dict[str, Any]: ``status`` plus the sections ``os``, ``cpu``,
        ``memory``, ``motherboard``, ``storage``, ``network`` and ``gpu``.
        When a collector raises, ``status`` becomes "error", ``error``
        holds the message and the remaining sections keep their empty
        defaults.
    """
    overview: Dict[str, Any] = dict(
        status="success",
        os={},
        cpu={},
        memory={},
        motherboard={},
        storage=[],
        network=[],
        gpu=[],
    )
    try:
        # Collectors run in this fixed order; the first failure stops the run.
        overview["os"] = get_os_info()
        overview["cpu"] = get_cpu_info()
        overview["memory"] = get_memory_info()
        overview["motherboard"] = get_motherboard_info()
        overview["storage"] = get_storage_list()
        overview["network"] = get_network_info()
        overview["gpu"] = get_gpu_list()
    except Exception as exc:
        overview["status"] = "error"
        overview["error"] = str(exc)
    return overview
def get_os_info() -> Dict[str, str]:
    """
    Collect operating-system information.

    Returns:
        Dict[str, str]: ``platform``, ``release``, ``version``, ``machine``
        and ``processor`` from the ``platform`` module, ``distribution``
        (when ``/etc/os-release`` provides PRETTY_NAME), ``hostname`` and a
        formatted ``uptime`` ("unknown" when it cannot be determined).
    """
    info = {
        "platform": platform.system(),
        "release": platform.release(),
        "version": platform.version(),
        "machine": platform.machine(),
        "processor": platform.processor()
    }

    # Linux distribution name.
    if os.path.exists('/etc/os-release'):
        try:
            with open('/etc/os-release', 'r') as f:
                for line in f:
                    if line.startswith('PRETTY_NAME='):
                        info["distribution"] = line.split('=', 1)[1].strip().strip('"')
                        break
        except (OSError, ValueError):
            pass  # distribution is optional

    # Host name: the standard library gives this without a subprocess.
    info["hostname"] = platform.node() or "unknown"

    # Uptime, formatted as days / hours / minutes.  The day unit was missing,
    # which made 3 days and 5 hours read as "35" hours.
    try:
        with open('/proc/uptime', 'r') as f:
            uptime_seconds = float(f.readline().split()[0])
        days = int(uptime_seconds // 86400)
        hours = int((uptime_seconds % 86400) // 3600)
        minutes = int((uptime_seconds % 3600) // 60)
        info["uptime"] = f"{days}天 {hours}小时 {minutes}分钟"
    except (OSError, ValueError, IndexError):
        info["uptime"] = "unknown"
    return info
def get_cpu_info() -> Dict[str, Any]:
    """
    Collect CPU information from ``/proc/cpuinfo`` and, when available,
    ``lscpu`` (whose values take precedence).

    Returns:
        Dict[str, Any]: ``model``, ``vendor``, ``architecture``, ``cores``,
        ``threads``, ``frequency_mhz`` and ``cache_size_kb``; ``lscpu`` may
        add ``max_frequency_mhz``, ``min_frequency_mhz`` and
        ``virtualization``.
    """
    info = {
        "model": "Unknown",
        "vendor": "Unknown",
        "architecture": "Unknown",
        "cores": 0,
        "threads": 0,
        "frequency_mhz": 0,
        "cache_size_kb": {}
    }

    # /proc/cpuinfo: later processor entries overwrite earlier ones, so the
    # values below are those of the last listed logical CPU.
    try:
        cpu_data = {}
        with open('/proc/cpuinfo', 'r') as f:
            for line in f:
                if ':' in line:
                    key, value = line.split(':', 1)
                    cpu_data[key.strip()] = value.strip()
        info["model"] = cpu_data.get('model name', 'Unknown')
        info["vendor"] = cpu_data.get('vendor_id', 'Unknown')
        info["architecture"] = cpu_data.get('cpu architecture', platform.machine())
        info["cores"] = safe_int(cpu_data.get('cpu cores', 0))
        info["threads"] = safe_int(cpu_data.get('siblings', 0))
        # "cpu MHz" is a decimal such as "2394.454"; parse it as a float
        # first so the fraction does not defeat the integer conversion.
        info["frequency_mhz"] = int(safe_float(cpu_data.get('cpu MHz', 0)) or 0)
        if 'cache size' in cpu_data:
            info["cache_size_kb"] = {"general": cpu_data['cache size']}
    except Exception:
        # Best effort: lscpu below may still provide the data.
        pass

    # lscpu: more detailed and covers all sockets.
    if check_command_exists('lscpu'):
        try:
            _, stdout, _ = execute_command(['lscpu'], check_returncode=False, timeout=10)
            lscpu_data = parse_key_value_output(stdout)
            if 'Model name' in lscpu_data:
                info["model"] = lscpu_data['Model name']
            if 'Architecture' in lscpu_data:
                info["architecture"] = lscpu_data['Architecture']
            if 'CPU(s)' in lscpu_data:
                info["threads"] = safe_int(lscpu_data['CPU(s)'])
            if 'Core(s) per socket' in lscpu_data and 'Socket(s)' in lscpu_data:
                cores_per_socket = safe_int(lscpu_data['Core(s) per socket'])
                sockets = safe_int(lscpu_data['Socket(s)'])
                info["cores"] = cores_per_socket * sockets
            if 'CPU max MHz' in lscpu_data:
                info["max_frequency_mhz"] = safe_float(lscpu_data['CPU max MHz'])
            if 'CPU min MHz' in lscpu_data:
                info["min_frequency_mhz"] = safe_float(lscpu_data['CPU min MHz'])
            if 'Virtualization' in lscpu_data:
                info["virtualization"] = lscpu_data['Virtualization']
        except Exception:
            # Best effort: keep what /proc/cpuinfo provided.
            pass
    return info
def _parse_dmidecode_memory(stdout: str) -> Dict[str, Any]:
    """Extract populated DIMM slots, slot count and ECC mode from ``dmidecode -t memory`` text."""
    # Multipliers that convert a size in the given unit prefix to GB.
    to_gb = {'k': 1 / (1024 * 1024), 'M': 1 / 1024, 'G': 1, 'T': 1024}
    parsed = {"slots": [], "type": None, "speed_mhz": None,
              "slots_total": 0, "ecc_supported": None}

    # Everything before the first "Memory Device" heading is skipped.
    for device in stdout.split('Memory Device')[1:]:
        slot = {}
        # Anchored to the line start so "Volatile Size:" etc. do not match.
        # Accepts kB/MB/GB/TB instead of MB only.
        size_match = re.search(r'^[ \t]*Size:\s*(\d+)\s*([kMGT])B', device, re.MULTILINE)
        if size_match:
            slot["size_gb"] = round(
                safe_int(size_match.group(1)) * to_gb[size_match.group(2)], 2
            )
        type_match = re.search(r'^[ \t]*Type:\s*((?:LP)?DDR\d*)', device, re.MULTILINE)
        if type_match:
            slot["type"] = type_match.group(1)
            parsed["type"] = type_match.group(1)
        # Accepts both "MT/s" and "MHz" as the unit.
        speed_match = re.search(r'^[ \t]*Speed:\s*(\d+)\s*(?:MT/s|MHz)', device, re.MULTILINE)
        if speed_match:
            slot["speed_mhz"] = safe_int(speed_match.group(1))
            parsed["speed_mhz"] = slot["speed_mhz"]
        manufacturer_match = re.search(r'Manufacturer:\s*(\S+)', device)
        if manufacturer_match:
            slot["manufacturer"] = manufacturer_match.group(1)
        locator_match = re.search(r'Locator:\s*(.+)', device)
        if locator_match:
            slot["locator"] = locator_match.group(1).strip()
        # Only slots with a parsed, non-zero size count as populated.
        if slot and slot.get("size_gb", 0) > 0:
            parsed["slots"].append(slot)

    # Sum over every "Number Of Devices" line; fall back to the populated count.
    device_counts = re.findall(r'Number Of Devices:\s*(\d+)', stdout)
    if device_counts:
        parsed["slots_total"] = sum(safe_int(count) for count in device_counts)
    else:
        parsed["slots_total"] = len(parsed["slots"])

    ecc_match = re.search(r'Error Correction Type:\s*(.+)', stdout)
    if ecc_match:
        parsed["ecc_supported"] = 'ECC' in ecc_match.group(1)
    return parsed


def get_memory_info() -> Dict[str, Any]:
    """Collect memory totals, DIMM slot details and ECC capability.

    Sources, all best-effort: /proc/meminfo for totals, ``dmidecode -t memory``
    for per-slot data, ``free -m`` as a fallback for the total.

    Returns:
        Dict[str, Any]: ``total_gb``, ``available_gb``, ``slots_total``,
        ``slots_used``, ``slots`` (one dict per populated DIMM), ``type``,
        ``speed_mhz`` and ``ecc_supported``.
    """
    info = {
        "total_gb": 0,
        "available_gb": 0,
        "slots_total": 0,
        "slots_used": 0,
        "slots": [],
        "type": "Unknown",
        "speed_mhz": 0,
        "ecc_supported": False
    }

    # Totals from /proc/meminfo; the values are divided by 1024 twice to get GB.
    try:
        with open('/proc/meminfo', 'r') as f:
            for line in f:
                if line.startswith('MemTotal:'):
                    kb = safe_int(line.split()[1])
                    info["total_gb"] = round(kb / 1024 / 1024, 2)
                elif line.startswith('MemAvailable:'):
                    kb = safe_int(line.split()[1])
                    info["available_gb"] = round(kb / 1024 / 1024, 2)
    except Exception:
        pass

    # Per-slot details from dmidecode.
    ecc_from_dmi = None
    if check_command_exists('dmidecode'):
        try:
            _, stdout, _ = execute_command(
                ['dmidecode', '-t', 'memory'],
                check_returncode=False, timeout=15
            )
            parsed = _parse_dmidecode_memory(stdout)
            info["slots"] = parsed["slots"]
            info["slots_used"] = len(parsed["slots"])
            info["slots_total"] = parsed["slots_total"]
            if parsed["type"] is not None:
                info["type"] = parsed["type"]
            if parsed["speed_mhz"] is not None:
                info["speed_mhz"] = parsed["speed_mhz"]
            ecc_from_dmi = parsed["ecc_supported"]
        except Exception:
            pass

    # Fallback for the total when /proc/meminfo gave nothing.
    if info["total_gb"] == 0 and check_command_exists('free'):
        try:
            _, stdout, _ = execute_command(['free', '-m'], check_returncode=False)
            lines = stdout.strip().split('\n')
            if len(lines) > 1:
                parts = lines[1].split()
                if len(parts) >= 2:
                    info["total_gb"] = round(safe_int(parts[1]) / 1024, 2)
        except Exception:
            pass

    # ECC capability. The previous check looked for "HardwareCorrupted" in
    # /proc/meminfo, which is a memory-failure counter and says nothing about
    # ECC. Use the firmware-reported error-correction type instead.
    if ecc_from_dmi is not None:
        info["ecc_supported"] = ecc_from_dmi
    else:
        # NOTE(review): assumes a registered EDAC memory controller implies
        # ECC memory — confirm on the target platforms.
        try:
            info["ecc_supported"] = os.path.isdir('/sys/devices/system/edac/mc/mc0')
        except Exception:
            pass

    return info
def get_motherboard_info() -> Dict[str, str]:
    """Collect baseboard and BIOS identification strings via dmidecode.

    The baseboard and BIOS queries run independently, so a failure of one
    does not prevent the other. Fields that are missing, or that hold a
    firmware placeholder, stay ``"Unknown"``.

    Returns:
        Dict[str, str]: ``manufacturer``, ``product_name``, ``version``,
        ``serial_number``, ``bios_vendor``, ``bios_version``, ``bios_date``.
    """
    info = {
        "manufacturer": "Unknown",
        "product_name": "Unknown",
        "version": "Unknown",
        "serial_number": "Unknown",
        "bios_vendor": "Unknown",
        "bios_version": "Unknown",
        "bios_date": "Unknown"
    }
    # Values that firmware uses when a field was never filled in.
    placeholders = ('Not Specified', 'To be filled by O.E.M.', 'None', 'Default string')

    queries = (
        ('baseboard', {
            "manufacturer": r'Manufacturer:\s*(.+)',
            "product_name": r'Product Name:\s*(.+)',
            "version": r'Version:\s*(.+)',
            "serial_number": r'Serial Number:\s*(.+)'
        }),
        ('bios', {
            "bios_vendor": r'Vendor:\s*(.+)',
            "bios_version": r'Version:\s*(.+)',
            "bios_date": r'Release Date:\s*(.+)'
        }),
    )

    if check_command_exists('dmidecode'):
        for dmi_type, patterns in queries:
            try:
                _, stdout, _ = execute_command(
                    ['dmidecode', '-t', dmi_type],
                    check_returncode=False, timeout=10
                )
                for key, pattern in patterns.items():
                    match = re.search(pattern, stdout)
                    if match:
                        value = match.group(1).strip()
                        if value not in placeholders:
                            info[key] = value
            except Exception:
                # Best-effort: keep the defaults for this section.
                pass

    return info
def get_storage_list() -> List[Dict[str, Any]]:
    """List block storage devices, preferring lsblk and falling back to /sys/block.

    Returns:
        List[Dict[str, Any]]: one dict per device. From lsblk each has
        ``name``, ``path``, ``size``, ``type``, ``model``, ``vendor`` and
        ``is_rotational``; the /sys/block fallback provides ``name``,
        ``path``, ``size``, ``type`` and, when readable, ``is_rotational``.
    """
    devices = []

    def _text(value: Any) -> str:
        # lsblk emits JSON null for missing strings and pads some fields.
        cleaned = str(value).strip() if value is not None else ''
        return cleaned or 'unknown'

    def _rotational(value: Any) -> bool:
        # The flag may arrive as a JSON boolean or as the string "0"/"1".
        if value is None:
            return True  # same default the code used for a missing key
        if isinstance(value, str):
            return value.strip().lower() not in ('0', 'false', '')
        return bool(value)

    # Primary source: lsblk JSON output.
    if check_command_exists('lsblk'):
        try:
            _, stdout, _ = execute_command(
                ['lsblk', '-d', '-o', 'NAME,SIZE,TYPE,MODEL,VENDOR,ROTA', '-n', '-J'],
                check_returncode=False, timeout=10
            )
            import json
            data = json.loads(stdout)
            for device in data.get('blockdevices', []):
                name = _text(device.get('name'))
                devices.append({
                    "name": name,
                    "path": f"/dev/{name}",
                    "size": _text(device.get('size')),
                    "type": _text(device.get('type')),
                    "model": _text(device.get('model')),
                    "vendor": _text(device.get('vendor')),
                    "is_rotational": _rotational(device.get('rota', True))
                })
        except Exception:
            pass

    # Fallback: read /sys/block directly.
    if not devices:
        try:
            for name in os.listdir('/sys/block'):
                if name.startswith(('sd', 'hd', 'nvme', 'vd')):
                    dev_info = {"name": name, "path": f"/dev/{name}"}
                    # The size file is multiplied by 512 to get bytes.
                    try:
                        with open(f'/sys/block/{name}/size', 'r') as f:
                            sectors = safe_int(f.read().strip())
                        dev_info["size"] = format_bytes(sectors * 512)
                    except Exception:
                        dev_info["size"] = "unknown"
                    # queue/rotational == '1' is reported as hdd, otherwise ssd.
                    try:
                        with open(f'/sys/block/{name}/queue/rotational', 'r') as f:
                            dev_info["is_rotational"] = f.read().strip() == '1'
                        dev_info["type"] = 'hdd' if dev_info["is_rotational"] else 'ssd'
                    except Exception:
                        dev_info["type"] = 'unknown'
                    devices.append(dev_info)
        except Exception:
            pass

    return devices
def get_network_info() -> List[Dict[str, Any]]:
    """List network interfaces with state, MAC address and IPv4 addresses.

    Uses ``ip -j addr show``: its JSON carries the link fields read below
    plus ``addr_info``. The previous ``ip -j link show`` call has no
    ``addr_info`` key, so ``ip_addresses`` was never populated.

    Returns:
        List[Dict[str, Any]]: one dict per interface with ``name``,
        ``state``, ``mac_address``, ``type`` and, when at least one IPv4
        address is configured, ``ip_addresses`` (``"address/prefixlen"``).
        Empty when ``ip`` is unavailable or its output cannot be parsed.
    """
    interfaces = []

    if check_command_exists('ip'):
        try:
            _, stdout, _ = execute_command(
                ['ip', '-j', 'addr', 'show'],
                check_returncode=False, timeout=10
            )
            import json
            data = json.loads(stdout)
            for iface in data:
                iface_info = {
                    "name": iface.get('ifname', 'unknown'),
                    "state": iface.get('operstate', 'unknown'),
                    "mac_address": iface.get('address', 'unknown'),
                    "type": iface.get('link_type', 'unknown')
                }
                # Only IPv4 entries (family == 'inet') are reported.
                ips = [
                    f"{addr.get('local')}/{addr.get('prefixlen', '')}"
                    for addr in iface.get('addr_info') or []
                    if addr.get('family') == 'inet'
                ]
                if ips:
                    iface_info["ip_addresses"] = ips
                interfaces.append(iface_info)
        except Exception:
            # Best-effort: return whatever was collected so far.
            pass

    return interfaces
def get_gpu_list() -> List[Dict[str, Any]]:
    """List display adapters found by lspci.

    Lines of ``lspci -nn`` mentioning "VGA", "3D controller" or
    "Display controller" are treated as GPUs. For each, ``lspci -v -s <bus>``
    is queried for the kernel driver and modules.

    Returns:
        List[Dict[str, Any]]: one dict per adapter with ``bus_id``,
        ``description`` and ``type`` ("integrated" when the description
        contains "Intel", otherwise "discrete"), plus ``driver`` and
        ``modules`` when lspci reports them.
    """
    gpus = []
    gpu_markers = ('VGA', '3D controller', 'Display controller')

    if check_command_exists('lspci'):
        try:
            _, stdout, _ = execute_command(
                ['lspci', '-nn'],
                check_returncode=False, timeout=10
            )
            for line in stdout.split('\n'):
                if not any(marker in line for marker in gpu_markers):
                    continue
                # "<bus> <class> [id]: <description>" — split once on ": ".
                parts = line.split(': ', 1)
                if len(parts) != 2:
                    continue
                bus_id = parts[0].split()[0]
                description = parts[1]
                gpu_info = {
                    "bus_id": bus_id,
                    "description": description,
                    "type": "integrated" if "Intel" in description else "discrete"
                }
                # Driver details are optional; a failure here must not drop
                # the adapter from the list.
                try:
                    _, detail, _ = execute_command(
                        ['lspci', '-v', '-s', bus_id],
                        check_returncode=False, timeout=5
                    )
                    driver_match = re.search(r'Kernel driver in use:\s*(\S+)', detail)
                    if driver_match:
                        gpu_info["driver"] = driver_match.group(1)
                    modules_match = re.search(r'Kernel modules:\s*(.+)', detail)
                    if modules_match:
                        gpu_info["modules"] = modules_match.group(1).strip()
                except Exception:
                    pass
                gpus.append(gpu_info)
        except Exception:
            # Exception rather than a bare except, so KeyboardInterrupt and
            # SystemExit are not swallowed.
            pass

    return gpus
if __name__ == '__main__':
    # Manual smoke test: dump the collected system information as JSON.
    import json
    system_snapshot = get_system_info()
    print(json.dumps(system_snapshot, indent=2, ensure_ascii=False))

189
quick_test.py Executable file
View File

@@ -0,0 +1,189 @@
#!/usr/bin/env python3
"""
ServerGuard - quick test script

Quickly verifies that each module works; no stress tests are run.
"""
import sys
import os
# Set the log level to WARNING to reduce output
import logging
logging.basicConfig(level=logging.WARNING)
def test_imports():
    """Import every ServerGuard module by name and report the outcome.

    Returns:
        bool: True when every module imported, False if at least one failed.
    """
    print("测试模块导入...")
    candidates = (
        'utils',
        'reporter',
        'modules.system_info',
        'modules.cpu',
        'modules.memory',
        'modules.storage',
        'modules.sensors',
        'modules.gpu',
        'modules.log_analyzer',
    )
    broken = []
    for mod_name in candidates:
        try:
            __import__(mod_name)
            print(f"{mod_name}")
        except Exception as exc:
            print(f"{mod_name}: {exc}")
            broken.append(mod_name)

    if not broken:
        print("\n所有模块导入成功!")
        return True
    print(f"\n{len(broken)} 个模块导入失败")
    return False
def test_basic_functions():
    """Call one read-only probe per module and print whether it behaved.

    Probes expected to return a dict are reported as failed when their
    ``status`` is ``"error"``; probes expected to return a list are reported
    with the number of items found. Exceptions are printed, never raised.
    """
    print("\n测试基本功能...")
    from modules import system_info, cpu, memory, storage, sensors, gpu, log_analyzer

    # Probes that should return a dictionary.
    dict_checks = (
        ("系统信息", system_info.get_system_info),
        ("CPU 信息", cpu.get_cpu_details),
        ("内存信息", memory.get_memory_summary),
        ("传感器数据", sensors.get_lm_sensors_data),
        ("日志分析", log_analyzer.analyze_logs),
    )
    # Probes that should return a list.
    list_checks = (
        ("存储设备", storage.get_storage_devices),
        ("GPU 信息", gpu.check_generic_gpus),
    )

    for label, probe in dict_checks:
        try:
            outcome = probe()
            if not isinstance(outcome, dict):
                print(f"{label}: 正常 (返回 {type(outcome).__name__})")
            elif outcome.get("status", "unknown") == "error":
                print(f"{label}: 有错误 - {outcome.get('error', 'Unknown')}")
            else:
                print(f"{label}: 正常")
        except Exception as exc:
            print(f"{label}: 异常 - {exc}")

    for label, probe in list_checks:
        try:
            outcome = probe()
            if not isinstance(outcome, list):
                print(f"{label}: 返回类型异常 - {type(outcome).__name__}")
            else:
                print(f"{label}: 正常 (找到 {len(outcome)} 个项目)")
        except Exception as exc:
            print(f"{label}: 异常 - {exc}")

    print("\n基本功能测试完成")
def test_utils():
    """Spot-check safe_int, safe_float and format_bytes with fixed inputs.

    Raises:
        AssertionError: if any helper returns an unexpected value.
    """
    print("\n测试工具函数...")
    from utils import safe_int, safe_float, format_bytes

    # safe_int
    for raw, expected in (("123", 123), ("32 GB", 32)):
        assert safe_int(raw) == expected
    assert safe_int("invalid", -1) == -1
    print(" ✓ safe_int")

    # safe_float
    for raw, expected in (("123.5", 123.5), ("2.5GHz", 2.5)):
        assert safe_float(raw) == expected
    print(" ✓ safe_float")

    # format_bytes
    for size, expected in ((1024, "1.00 KB"), (1024**2, "1.00 MB")):
        assert format_bytes(size) == expected
    print(" ✓ format_bytes")

    print("\n工具函数测试通过")
def test_report_generation():
    """Render a small fixed result set as text, JSON and HTML.

    Prints the length of each rendered report, or the exception message if
    rendering a format fails.
    """
    print("\n测试报告生成...")
    from reporter import ReportGenerator

    cpu_section = {"status": "success", "temperature": {"current_c": 45}}
    memory_section = {"status": "success", "total_gb": 32}
    sample = {
        "scan_type": "test",
        "timestamp": "2024-01-01 00:00:00",
        "modules": {"cpu": cpu_section, "memory": memory_section},
    }

    renderer = ReportGenerator()
    for fmt in ('text', 'json', 'html'):
        try:
            rendered = renderer.generate_report(sample, fmt)
            print(f"{fmt.upper()} 格式: {len(rendered)} 字符")
        except Exception as exc:
            print(f"{fmt.upper()} 格式: {exc}")

    print("\n报告生成测试完成")
def main():
    """Run the quick checks in order and print a summary.

    Exits with status 1 if the import check fails; otherwise runs the
    utility, report-generation and basic-function checks.
    """
    banner = "=" * 60
    print(banner)
    print("ServerGuard 快速测试")
    print(banner)
    print()

    # Nothing else can work if the modules do not import.
    imports_ok = test_imports()
    if not imports_ok:
        print("\n模块导入测试失败,请检查依赖安装")
        sys.exit(1)

    # Remaining checks, in the original order.
    for check in (test_utils, test_report_generation, test_basic_functions):
        check()

    print()
    print(banner)
    print("测试完成!")
    print(banner)
    print()
    print("运行完整诊断命令:")
    print(" sudo python3 main.py --quick # 快速检测")
    print(" sudo python3 main.py --full # 全面诊断(含压力测试)")
    print()
# Run the quick checks only when executed as a script.
if __name__ == '__main__':
    main()

387
reporter.py Normal file
View File

@@ -0,0 +1,387 @@
"""
ServerGuard - report generation module

Formats diagnostic results into the supported output formats.
"""
import json
import csv
import os
from typing import Dict, Any, List
from datetime import datetime
from io import StringIO
# rich is optional: HAS_RICH records whether the import succeeded.
try:
    from rich.console import Console
    from rich.table import Table
    from rich.panel import Panel
    from rich import box
    HAS_RICH = True
except ImportError:
    HAS_RICH = False
class ReportGenerator:
    """Formats a diagnostic result dictionary as text, JSON, CSV or HTML.

    The input is expected to look like
    ``{"scan_type": ..., "timestamp": ..., "modules": {name: {...}}}`` where
    each module entry is a dict that may carry ``status`` and ``error`` keys.
    """

    def __init__(self):
        # A rich Console is created only when the optional import succeeded.
        self.console = Console() if HAS_RICH else None

    @staticmethod
    def _esc(value: Any) -> str:
        """Convert ``value`` to text and escape it for inclusion in HTML."""
        # Local import: the module header does not import html.
        from html import escape
        return escape(str(value))

    def generate_report(self, data: Dict[str, Any], format_type: str = 'text') -> str:
        """
        Render the report in the requested format.

        Args:
            data: diagnostic result data
            format_type: report format (text, json, csv, html); any other
                value falls back to text

        Returns:
            str: the formatted report
        """
        if format_type == 'json':
            return self._format_json_report(data)
        elif format_type == 'csv':
            return self._format_csv_report(data)
        elif format_type == 'html':
            return self._format_html_report(data)
        else:
            return self._format_text_report(data)

    def save_report(self, data: Dict[str, Any], format_type: str, filepath: str):
        """
        Render the report and write it to a file as UTF-8.

        Args:
            data: diagnostic result data
            format_type: report format
            filepath: output file path; missing parent directories are created
        """
        report = self.generate_report(data, format_type)
        # A bare file name has no directory part; use the current directory.
        os.makedirs(os.path.dirname(filepath) or '.', exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(report)

    def _format_json_report(self, data: Dict[str, Any]) -> str:
        """Render the data as indented JSON; non-serializable values go through str()."""
        return json.dumps(data, indent=2, ensure_ascii=False, default=str)

    def _format_csv_report(self, data: Dict[str, Any]) -> str:
        """Render the data as CSV: a short header, then one section per module."""
        output = StringIO()
        writer = csv.writer(output)
        # Report header rows.
        writer.writerow(['ServerGuard Diagnostic Report'])
        writer.writerow(['Scan Type', data.get('scan_type', 'unknown')])
        writer.writerow(['Timestamp', data.get('timestamp', '')])
        writer.writerow([])
        # One section per module, nested dicts flattened to dotted keys.
        for module_name, module_data in data.get('modules', {}).items():
            writer.writerow([f'Module: {module_name.upper()}'])
            writer.writerow(['Status', module_data.get('status', 'unknown')])
            self._write_dict_to_csv(writer, module_data, prefix='')
            writer.writerow([])
        return output.getvalue()

    def _write_dict_to_csv(self, writer, data: Dict[str, Any], prefix: str = ''):
        """Write a dict as key/value rows, recursing into nested dicts.

        ``status`` keys are skipped (the caller writes the status row);
        lists are joined with ", " into a single cell.
        """
        for key, value in data.items():
            if key == 'status':
                continue
            full_key = f"{prefix}.{key}" if prefix else key
            if isinstance(value, dict):
                self._write_dict_to_csv(writer, value, full_key)
            elif isinstance(value, list):
                writer.writerow([full_key, ', '.join(str(v) for v in value)])
            else:
                writer.writerow([full_key, value])

    def _format_text_report(self, data: Dict[str, Any]) -> str:
        """Render the data as a plain-text report."""
        lines = []
        # Report header. str() keeps a non-string scan type from crashing .upper().
        lines.append("=" * 70)
        lines.append("ServerGuard 硬件健康诊断报告")
        lines.append("=" * 70)
        lines.append(f"扫描类型: {str(data.get('scan_type', 'unknown')).upper()}")
        lines.append(f"生成时间: {data.get('timestamp', '')}")
        if 'stress_duration' in data:
            lines.append(f"压力测试时长: {data['stress_duration']}")
        lines.append("=" * 70)
        lines.append("")
        # Every status previously mapped to the same empty string, so the
        # marker carried no information; give each status its own marker.
        status_symbols = {'success': '✓', 'warning': '⚠'}
        for module_name, module_data in data.get('modules', {}).items():
            lines.append(f"\n[{module_name.upper()}]")
            lines.append("-" * 70)
            status = module_data.get('status', 'unknown')
            status_symbol = status_symbols.get(status, '✗')
            lines.append(f"状态: {status_symbol} {str(status).upper()}")
            if 'error' in module_data:
                lines.append(f"错误: {module_data['error']}")
            # Module-specific details.
            self._format_module_text(lines, module_name, module_data)
            lines.append("")
        # Report footer.
        lines.append("=" * 70)
        lines.append("报告结束")
        lines.append("=" * 70)
        return '\n'.join(lines)

    def _format_module_text(self, lines: List[str], module_name: str, data: Dict[str, Any]):
        """Append the module-specific text lines for one module to ``lines``."""
        if module_name == 'system':
            if 'cpu' in data:
                cpu = data['cpu']
                lines.append(f"CPU: {cpu.get('model', 'N/A')}")
                lines.append(f" 核心数: {cpu.get('cores', 'N/A')} 核 / {cpu.get('threads', 'N/A')} 线程")
            if 'memory' in data:
                mem = data['memory']
                lines.append(f"内存: 总计 {mem.get('total_gb', 'N/A')} GB, {mem.get('slots_used', 'N/A')} 个插槽")
            if 'storage' in data:
                lines.append(f"存储设备: {len(data['storage'])} 个设备")
        elif module_name == 'cpu':
            if 'temperature' in data:
                temp = data['temperature']
                lines.append(f"CPU 温度: {temp.get('current_c', 'N/A')}°C")
            if 'mce_errors' in data:
                mce = data['mce_errors']
                lines.append(f"MCE 错误: {mce.get('count', 0)}")
            if 'stress_test' in data:
                stress = data['stress_test']
                lines.append(f"压力测试: {'通过' if stress.get('passed') else '失败'}")
                lines.append(f" 运行时长: {stress.get('duration_seconds', 'N/A')}")
        elif module_name == 'memory':
            if 'ecc_status' in data:
                ecc = data['ecc_status']
                # Both branches used to be the same empty string.
                lines.append(f"ECC 支持: {'是' if ecc.get('supported') else '否'}")
                # "or 0" guards against an explicit None error count.
                if (ecc.get('errors') or 0) > 0:
                    lines.append(f"ECC 错误: {ecc['errors']}")
            if 'stress_test' in data:
                st = data['stress_test']
                lines.append(f"内存压力测试: {'通过' if st.get('passed') else '失败'}")
                if st.get('tool'):
                    lines.append(f" 使用工具: {st.get('tool')}")
                if st.get('size_mb'):
                    lines.append(f" 测试大小: {st.get('size_mb')} MB")
        elif module_name == 'storage':
            for device in data.get('devices', []):
                lines.append(f"设备 {device.get('name', 'N/A')}:")
                lines.append(f" 型号: {device.get('model', 'N/A')}")
                lines.append(f" 健康状态: {device.get('health', 'N/A')}")
                if 'smart_status' in device:
                    smart = device['smart_status']
                    lines.append(f" SMART: {smart.get('overall', 'N/A')}")
        elif module_name == 'sensors':
            if 'temperatures' in data:
                lines.append("温度传感器:")
                for name, value in data['temperatures'].items():
                    lines.append(f" {name}: {value}°C")
            if 'voltages' in data:
                lines.append("电压传感器:")
                for name, value in data['voltages'].items():
                    lines.append(f" {name}: {value}V")
        elif module_name == 'logs':
            if 'hardware_errors' in data:
                errors = data['hardware_errors']
                total = sum(errors.values())
                lines.append(f"硬件错误总计: {total}")
                for error_type, count in errors.items():
                    if count > 0:
                        lines.append(f" {error_type}: {count}")

    def _format_html_report(self, data: Dict[str, Any]) -> str:
        """Render the data as a standalone HTML page.

        Every value taken from ``data`` is HTML-escaped, so text such as
        command stderr containing ``<`` or ``&`` cannot break the markup.
        """
        html_parts = []
        # Static page head and stylesheet.
        html_parts.append("""<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>ServerGuard 诊断报告</title>
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
line-height: 1.6;
color: #333;
max-width: 1200px;
margin: 0 auto;
padding: 20px;
background: #f5f5f5;
}
.header {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 30px;
border-radius: 10px;
margin-bottom: 20px;
}
.header h1 {
margin: 0;
font-size: 2em;
}
.header .meta {
margin-top: 10px;
opacity: 0.9;
}
.module {
background: white;
border-radius: 8px;
padding: 20px;
margin-bottom: 20px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.module-header {
display: flex;
justify-content: space-between;
align-items: center;
border-bottom: 2px solid #eee;
padding-bottom: 10px;
margin-bottom: 15px;
}
.module-title {
font-size: 1.5em;
font-weight: bold;
color: #444;
}
.status {
padding: 5px 15px;
border-radius: 20px;
font-weight: bold;
font-size: 0.9em;
}
.status-success { background: #d4edda; color: #155724; }
.status-warning { background: #fff3cd; color: #856404; }
.status-error { background: #f8d7da; color: #721c24; }
.status-unknown { background: #e2e3e5; color: #383d41; }
.info-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
gap: 15px;
}
.info-item {
padding: 10px;
background: #f8f9fa;
border-radius: 5px;
}
.info-label {
font-weight: bold;
color: #666;
font-size: 0.9em;
}
.info-value {
margin-top: 5px;
font-size: 1.1em;
}
.footer {
text-align: center;
color: #666;
margin-top: 30px;
padding: 20px;
}
.error-box {
background: #f8d7da;
border: 1px solid #f5c6cb;
color: #721c24;
padding: 15px;
border-radius: 5px;
margin: 10px 0;
}
</style>
</head>
<body>""")
        # Report header.
        scan_type = self._esc(str(data.get('scan_type', 'unknown')).upper())
        timestamp = self._esc(data.get('timestamp', ''))
        html_parts.append(f"""
<div class="header">
<h1>🔧 ServerGuard 硬件健康诊断报告</h1>
<div class="meta">
扫描类型: {scan_type} |
生成时间: {timestamp}
</div>
</div>""")
        # Statuses that have a matching CSS class in the stylesheet above.
        styled_statuses = ('success', 'warning', 'error', 'unknown')
        for module_name, module_data in data.get('modules', {}).items():
            status = module_data.get('status', 'unknown')
            # Unstyled statuses use the neutral class; this also keeps
            # arbitrary text out of the class attribute.
            status_class = f'status-{status}' if status in styled_statuses else 'status-unknown'
            html_parts.append(f"""
<div class="module">
<div class="module-header">
<span class="module-title">{self._esc(module_name.upper())}</span>
<span class="status {status_class}">{self._esc(str(status).upper())}</span>
</div>""")
            if 'error' in module_data:
                html_parts.append(f"""
<div class="error-box">
<strong>错误:</strong> {self._esc(module_data['error'])}
</div>""")
            else:
                html_parts.append(' <div class="info-grid">')
                self._format_module_html(html_parts, module_name, module_data)
                html_parts.append(' </div>')
            html_parts.append(' </div>')
        # Report footer.
        html_parts.append("""
<div class="footer">
<p>由 ServerGuard 生成</p>
</div>
</body>
</html>""")
        return '\n'.join(html_parts)

    def _format_module_html(self, html_parts: List[str], module_name: str, data: Dict[str, Any]):
        """Append one info card per top-level key of a module's data.

        Dicts and lists are summarized by their length; scalar values are
        shown directly (HTML-escaped). The ``status`` key is skipped.
        """
        for key, value in data.items():
            if key == 'status':
                continue
            display_key = self._esc(str(key).replace('_', ' ').title())
            if isinstance(value, dict):
                shown = f"{len(value)} 项数据"
            elif isinstance(value, list):
                shown = f"{len(value)} 个项目"
            else:
                shown = self._esc(value)
            html_parts.append(f"""
<div class="info-item">
<div class="info-label">{display_key}</div>
<div class="info-value">{shown}</div>
</div>""")

2
requirements.txt Normal file
View File

@@ -0,0 +1,2 @@
rich>=13.0.0
psutil>=5.9.0

3
tests/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
"""
ServerGuard 测试模块
"""

175
tests/test_modules.py Normal file
View File

@@ -0,0 +1,175 @@
"""
Tests for the individual hardware detection modules
"""
import unittest
from unittest.mock import patch, MagicMock
import sys
import os
# Add the parent directory to the import path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from modules import system_info, cpu, memory, storage, sensors, gpu, log_analyzer
class TestSystemInfo(unittest.TestCase):
    """Tests for the system information module."""

    @patch('modules.system_info.execute_command')
    def test_get_os_info(self, mock_exec):
        mock_exec.return_value = (0, "test-hostname\n", "")
        os_info = system_info.get_os_info()
        for field in ("platform", "machine"):
            self.assertIn(field, os_info)

    def test_get_cpu_info(self):
        cpu_info = system_info.get_cpu_info()
        self.assertIn("model", cpu_info)
        # Most systems should yield at least some model string.
        self.assertIsInstance(cpu_info["model"], str)

    def test_get_memory_info(self):
        mem_info = system_info.get_memory_info()
        self.assertIn("total_gb", mem_info)
        self.assertIsInstance(mem_info["total_gb"], (int, float))

    def test_get_system_info(self):
        overview = system_info.get_system_info()
        for field in ("status", "cpu", "memory"):
            self.assertIn(field, overview)
class TestCPU(unittest.TestCase):
    """Tests for the CPU module."""

    def test_get_cpu_details(self):
        details = cpu.get_cpu_details()
        for field in ("model", "cores"):
            self.assertIn(field, details)
        self.assertIsInstance(details["cores"], int)

    def test_get_cpu_temperature(self):
        reading = cpu.get_cpu_temperature()
        for field in ("status", "sensors"):
            self.assertIn(field, reading)

    def test_get_load_average(self):
        load = cpu.get_load_average()
        for window in ("1min", "5min", "15min"):
            self.assertIn(window, load)

    def test_check_mce_errors(self):
        mce = cpu.check_mce_errors()
        for field in ("count", "status"):
            self.assertIn(field, mce)
class TestMemory(unittest.TestCase):
    """Tests for the memory module."""

    def test_get_memory_summary(self):
        summary = memory.get_memory_summary()
        for field in ("total_bytes", "total_gb"):
            self.assertIn(field, summary)
        self.assertIsInstance(summary["total_gb"], (int, float))

    def test_get_dimm_info(self):
        dimms = memory.get_dimm_info()
        self.assertIsInstance(dimms, list)

    def test_check_ecc_status(self):
        ecc = memory.check_ecc_status()
        self.assertIn("supported", ecc)
        self.assertIsInstance(ecc["supported"], bool)

    def test_check_edac_errors(self):
        edac = memory.check_edac_errors()
        self.assertIn("total_errors", edac)
        self.assertIsInstance(edac["total_errors"], int)
class TestStorage(unittest.TestCase):
    """Tests for the storage module."""

    def test_get_storage_devices(self):
        found = storage.get_storage_devices()
        self.assertIsInstance(found, list)

    def test_check_raid_status(self):
        raid = storage.check_raid_status()
        self.assertIn("arrays", raid)
        self.assertIsInstance(raid["arrays"], list)

    def test_get_io_statistics(self):
        io_stats = storage.get_io_statistics()
        self.assertIsInstance(io_stats, dict)
class TestSensors(unittest.TestCase):
    """Tests for the sensors module."""

    def test_get_lm_sensors_data(self):
        readings = sensors.get_lm_sensors_data()
        self.assertIn("available", readings)

    def test_get_thermal_zones(self):
        thermal = sensors.get_thermal_zones()
        self.assertIn("zones", thermal)
        self.assertIsInstance(thermal["zones"], dict)

    def test_get_power_supply_info(self):
        power = sensors.get_power_supply_info()
        self.assertIn("supplies", power)
        self.assertIsInstance(power["supplies"], list)
class TestGPU(unittest.TestCase):
    """Tests for the GPU module."""

    def test_check_generic_gpus(self):
        adapters = gpu.check_generic_gpus()
        self.assertIsInstance(adapters, list)

    def test_check_gpu_dmesg_errors(self):
        dmesg_errors = gpu.check_gpu_dmesg_errors()
        self.assertIsInstance(dmesg_errors, list)
class TestLogAnalyzer(unittest.TestCase):
    """Tests for the log analysis module."""

    def test_get_kernel_panic_logs(self):
        panics = log_analyzer.get_kernel_panic_logs()
        self.assertIsInstance(panics, list)

    def test_get_hardware_error_logs(self):
        hw_errors = log_analyzer.get_hardware_error_logs()
        for category in ("mce_errors", "ecc_errors", "io_errors"):
            self.assertIn(category, hw_errors)

    def test_summarize_errors(self):
        # Counts from both sources are expected to be added per category.
        dmesg_counts = {"cpu_errors": 5, "memory_errors": 3}
        journal_counts = {"cpu_errors": 2, "memory_errors": 1}
        analysis = {
            "dmesg_analysis": {"error_counts": dmesg_counts},
            "journal_analysis": {"error_counts": journal_counts},
        }
        summary = log_analyzer.summarize_errors(analysis)
        self.assertEqual(summary["cpu_errors"], 7)
        self.assertEqual(summary["memory_errors"], 4)
# Allow running this test file directly.
if __name__ == '__main__':
    unittest.main()

94
tests/test_utils.py Normal file
View File

@@ -0,0 +1,94 @@
"""
Tests for the utils module
"""
import unittest
import sys
import os
# Add the parent directory to the import path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils import (
    parse_key_value_output, parse_table_output, extract_with_regex,
    safe_int, safe_float, format_bytes, sanitize_filename,
    merge_dicts, check_command_exists
)
class TestParseFunctions(unittest.TestCase):
    """Tests for the parsing and conversion helpers in utils."""

    def test_parse_key_value_output(self):
        text = """
Key1: Value1
Key2: Value2
# Comment line
Key3: Value with: colon
"""
        result = parse_key_value_output(text)
        self.assertEqual(result["Key1"], "Value1")
        self.assertEqual(result["Key2"], "Value2")
        # Only the first delimiter splits; the rest stays in the value.
        self.assertEqual(result["Key3"], "Value with: colon")

    def test_parse_table_output(self):
        text = """
NAME SIZE TYPE MODEL
sda 1T disk Samsung SSD
sdb 2T disk WD HDD
"""
        result = parse_table_output(text, headers=["NAME", "SIZE", "TYPE", "MODEL"])
        self.assertEqual(len(result), 2)
        self.assertEqual(result[0]["NAME"], "sda")
        self.assertEqual(result[1]["TYPE"], "disk")

    def test_extract_with_regex(self):
        text = "Temperature: 45.5 degrees"
        result = extract_with_regex(text, r'Temperature:\s*([\d.]+)')
        self.assertEqual(result, "45.5")

    def test_safe_int(self):
        self.assertEqual(safe_int("123"), 123)
        self.assertEqual(safe_int("123.5"), 123)
        self.assertEqual(safe_int("1,234"), 1234)
        self.assertEqual(safe_int("32 GB"), 32)
        self.assertEqual(safe_int("invalid"), 0)
        self.assertEqual(safe_int("invalid", -1), -1)

    def test_safe_float(self):
        self.assertEqual(safe_float("123.5"), 123.5)
        self.assertEqual(safe_float("2.5GHz"), 2.5)
        self.assertEqual(safe_float("invalid"), 0.0)

    def test_format_bytes(self):
        self.assertEqual(format_bytes(0), "0 B")
        self.assertEqual(format_bytes(1024), "1.00 KB")
        self.assertEqual(format_bytes(1024**2), "1.00 MB")
        self.assertEqual(format_bytes(1024**3), "1.00 GB")

    def test_sanitize_filename(self):
        self.assertEqual(sanitize_filename("file<name>.txt"), "file_name_.txt")
        # sanitize_filename's character class includes "/", so path
        # separators are replaced too. The previous expectation
        # ("path/to/file" unchanged) contradicted the implementation.
        self.assertEqual(sanitize_filename("path/to/file"), "path_to_file")

    def test_merge_dicts(self):
        base = {"a": 1, "b": {"c": 2}}
        update = {"b": {"d": 3}, "e": 4}
        result = merge_dicts(base, update)
        self.assertEqual(result["a"], 1)
        self.assertEqual(result["b"]["c"], 2)
        self.assertEqual(result["b"]["d"], 3)
        self.assertEqual(result["e"], 4)
class TestCommandFunctions(unittest.TestCase):
    """Tests for the command-related helpers."""

    def test_check_command_exists(self):
        # "ls" is expected to be present; the second name is not.
        present = check_command_exists("ls")
        absent = check_command_exists("nonexistent_command_12345")
        self.assertTrue(present)
        self.assertFalse(absent)
# Allow running this test file directly.
if __name__ == '__main__':
    unittest.main()

419
utils.py Normal file
View File

@@ -0,0 +1,419 @@
"""
ServerGuard - common utility library

Provides shared helpers for command execution, logging setup and output parsing.
"""
import subprocess
import logging
import sys
import os
import re
import json
from typing import List, Dict, Any, Optional, Tuple, Union
from datetime import datetime
class ServerGuardError(Exception):
    """Base class for all ServerGuard exceptions."""
class CommandExecutionError(ServerGuardError):
    """Raised when an external command fails to run or returns an error."""
class PermissionError(ServerGuardError):
    """Raised when an operation lacks the required privileges."""
    # This name shadows the builtin PermissionError for the rest of the
    # module; code here that means the OS-level error must reference
    # builtins.PermissionError explicitly.
def execute_command(
    cmd_list: List[str],
    timeout: int = 60,
    check_returncode: bool = True,
    capture_output: bool = True,
    shell: bool = False,
    input_data: Optional[str] = None
) -> Tuple[int, str, str]:
    """
    Run an external command and return its exit code and output.

    Args:
        cmd_list: the command and its arguments as a list
        timeout: timeout in seconds
        check_returncode: raise on a non-zero exit code when True
        capture_output: capture stdout/stderr when True; otherwise both
            returned strings are empty
        shell: run through the shell (logged as a warning for list commands)
        input_data: text to feed to the command's stdin; an empty string is
            passed through as empty input

    Returns:
        Tuple[returncode, stdout, stderr]

    Raises:
        CommandExecutionError: the command failed, timed out or was not found
        PermissionError: the OS refused to execute the command
    """
    # This module defines its own PermissionError, which hides the builtin.
    # subprocess raises the builtin one, so it must be referenced explicitly;
    # the previous "except PermissionError" could never match it.
    import builtins

    logger = logging.getLogger(__name__)
    # Printable form for log and error messages; joining a plain string would
    # put a space between every character.
    if isinstance(cmd_list, str):
        cmd_display = cmd_list
    else:
        cmd_display = ' '.join(str(part) for part in cmd_list)

    # Security: shell=True with an unvalidated command is flagged, not blocked.
    if shell and isinstance(cmd_list, list):
        logger.warning(f"Using shell=True with command: {cmd_display}")

    try:
        logger.debug(f"Executing command: {cmd_display}")
        kwargs = {
            'timeout': timeout,
            'shell': shell,
            'universal_newlines': True  # Python 3.6 compatible version of text=True
        }
        if capture_output:
            kwargs['stdout'] = subprocess.PIPE
            kwargs['stderr'] = subprocess.PIPE
        if input_data is not None:
            kwargs['input'] = input_data
        result = subprocess.run(cmd_list, **kwargs)
        stdout = result.stdout if result.stdout else ""
        stderr = result.stderr if result.stderr else ""
        if check_returncode and result.returncode != 0:
            error_msg = f"Command failed with code {result.returncode}: {cmd_display}\nstderr: {stderr}"
            logger.error(error_msg)
            raise CommandExecutionError(error_msg)
        return result.returncode, stdout, stderr
    except subprocess.TimeoutExpired:
        error_msg = f"Command timed out after {timeout}s: {cmd_display}"
        logger.error(error_msg)
        raise CommandExecutionError(error_msg)
    except FileNotFoundError:
        program = cmd_list if isinstance(cmd_list, str) else cmd_list[0]
        error_msg = f"Command not found: {program}"
        logger.error(error_msg)
        raise CommandExecutionError(error_msg)
    except builtins.PermissionError as e:
        error_msg = f"Permission denied executing: {cmd_display}"
        logger.error(error_msg)
        # Re-raised as the module-level PermissionError.
        raise PermissionError(error_msg) from e
def check_root_privileges() -> bool:
    """
    Report whether the process is running as root.

    Returns:
        bool: True when the effective user id is 0
    """
    effective_uid = os.geteuid()
    return effective_uid == 0
def require_root(func):
    """
    Decorator: only call the wrapped function when running as root.

    Without root privileges the function is not called; a warning is logged
    and an error dict (``status`` = "error", ``error`` = explanation) is
    returned instead. The wrapper keeps the wrapped function's name and
    docstring.
    """
    # Local import: functools is not imported at module level.
    import functools

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        if not check_root_privileges():
            logging.warning(f"Function {func.__name__} requires root privileges")
            return {
                "status": "error",
                "error": "This function requires root privileges. Please run with sudo."
            }
        return func(*args, **kwargs)
    return wrapper
def setup_logging(
    log_file: Optional[str] = None,
    level: int = logging.INFO,
    console_output: bool = True
) -> logging.Logger:
    """
    Configure the root logger.

    Handlers already attached to the root logger are removed and closed
    first, so calling this repeatedly neither duplicates output nor leaks
    open log files.

    Args:
        log_file: log file path; None disables file logging. Missing parent
            directories are created.
        level: log level
        console_output: also log to stdout when True

    Returns:
        logging.Logger: the configured root logger
    """
    logger = logging.getLogger()
    logger.setLevel(level)

    # Replacing the handler list without close() left earlier FileHandlers
    # holding their files open.
    for old_handler in list(logger.handlers):
        logger.removeHandler(old_handler)
        old_handler.close()

    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )

    if console_output:
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    if log_file:
        os.makedirs(os.path.dirname(log_file) or '.', exist_ok=True)
        # Explicit UTF-8 so non-ASCII messages are written the same way
        # regardless of the locale.
        file_handler = logging.FileHandler(log_file, encoding='utf-8')
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    return logger
def parse_key_value_output(text: str, delimiter: str = ':') -> Dict[str, str]:
    """
    Parse ``key: value`` style text into a dictionary.

    Blank lines, lines starting with ``#`` and lines without the delimiter
    are ignored. Only the first delimiter on a line splits it, and a
    repeated key keeps its last value.

    Args:
        text: the text to parse
        delimiter: separator between key and value

    Returns:
        Dict[str, str]: the parsed pairs, keys and values stripped
    """
    parsed = {}
    for raw_line in text.strip().split('\n'):
        entry = raw_line.strip()
        if entry == '' or entry.startswith('#'):
            continue
        name, separator, remainder = entry.partition(delimiter)
        if separator:
            parsed[name.strip()] = remainder.strip()
    return parsed
def parse_table_output(text: str, headers: Optional[List[str]] = None) -> List[Dict[str, str]]:
    """
    Parse whitespace-separated tabular text into a list of row dictionaries.

    Args:
        text: the text to parse
        headers: column names; when None they are taken from the first
            non-empty line. When given and the first line consists of
            exactly these names, that line is treated as the header row and
            skipped instead of being returned as data.

    Returns:
        List[Dict[str, str]]: one dict per data row. Rows with fewer values
        than headers are dropped; surplus values beyond the last header are
        ignored.
    """
    lines = [line.strip() for line in text.strip().split('\n') if line.strip()]
    if not lines:
        return []

    if headers is None:
        # Auto-detect the header row.
        headers = lines[0].split()
        data_lines = lines[1:]
    elif lines[0].split() == list(headers):
        # Explicit headers plus a matching header row in the text.
        data_lines = lines[1:]
    else:
        data_lines = lines

    result = []
    for line in data_lines:
        values = line.split()
        if len(values) >= len(headers):
            # zip stops at the shorter sequence, i.e. at the last header.
            result.append(dict(zip(headers, values)))
    return result
def extract_with_regex(text: str, pattern: str, group: int = 1, default: Any = None) -> Any:
    """
    Return one capture group of the first regex match in ``text``.

    Args:
        text: the text to search
        pattern: regular expression
        group: index of the capture group to return
        default: value returned when nothing matches or the group index
            does not exist in the pattern

    Returns:
        The captured text, or ``default``
    """
    found = re.search(pattern, text)
    if found is None:
        return default
    try:
        return found.group(group)
    except IndexError:
        return default
def safe_int(value: Any, default: int = 0) -> int:
    """
    Convert a value to int without raising.

    Strings may carry whitespace, thousands separators and unit suffixes
    ("1,234", "32 GB"); fractional values are truncated toward zero.

    Args:
        value: the value to convert
        default: returned when the value cannot be converted (unparseable
            text, None, NaN, infinity, ...)

    Returns:
        int: the converted integer
    """
    try:
        if isinstance(value, str):
            value = re.sub(r'[\s,]', '', value.strip().lower())
            # Keep scientific notation intact; stripping the "e" would turn
            # "1e3" into 13.
            if not re.fullmatch(r'[+-]?(?:\d+\.?\d*|\.\d+)e[+-]?\d+', value):
                # Drop unit suffixes and any other non-numeric characters.
                value = re.sub(r'[^\d.-]', '', value)
            try:
                # Direct conversion keeps large integers exact; going through
                # float loses precision beyond 2**53.
                return int(value)
            except ValueError:
                pass
        return int(float(value))
    except (ValueError, TypeError, OverflowError):
        # OverflowError: int(float('inf')) and similar.
        return default
def safe_float(value: Any, default: float = 0.0) -> float:
    """
    Convert a value to float without raising.

    Strings may carry whitespace, thousands separators and unit suffixes
    ("2.5GHz"); scientific notation ("1.5e-3") is honoured.

    Args:
        value: the value to convert
        default: returned when the value cannot be converted

    Returns:
        float: the converted number
    """
    try:
        if isinstance(value, str):
            value = re.sub(r'[\s,]', '', value.strip().lower())
            if re.fullmatch(r'[+-]?(?:\d+\.?\d*|\.\d+)e[+-]?\d+', value):
                # Stripping the "e" would turn "1e3" into 13.0.
                parsed = float(value)
                # An exponent too large for a float is treated as unparseable.
                return default if parsed in (float('inf'), float('-inf')) else parsed
            # Drop unit suffixes and any other non-numeric characters.
            value = re.sub(r'[^\d.-]', '', value)
        return float(value)
    except (ValueError, TypeError, OverflowError):
        # OverflowError: an int too large to represent as a float.
        return default
def get_timestamp() -> str:
    """
    Return the current local time as ``YYYY-MM-DD HH:MM:SS``.

    Returns:
        str: the formatted timestamp
    """
    moment = datetime.now()
    return f"{moment:%Y-%m-%d %H:%M:%S}"
def get_file_timestamp() -> str:
    """
    Return the current local time as ``YYYYMMDD_HHMMSS``, for use in file names.

    Returns:
        str: the formatted timestamp
    """
    moment = datetime.now()
    return f"{moment:%Y%m%d_%H%M%S}"
def read_file_lines(filepath: str, max_lines: int = 1000) -> List[str]:
    """
    Read up to ``max_lines`` lines from a text file without raising.

    The file is decoded as UTF-8 with undecodable bytes ignored, and the
    trailing newline of each line is removed.

    Args:
        filepath: path of the file to read
        max_lines: maximum number of lines to return

    Returns:
        List[str]: the lines read; empty if the file cannot be read (a
        warning is logged in that case)
    """
    try:
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as handle:
            collected = []
            for text_line in handle:
                if len(collected) >= max_lines:
                    break
                collected.append(text_line.rstrip('\n'))
            return collected
    except (IOError, OSError) as exc:
        logging.getLogger(__name__).warning(f"Failed to read file {filepath}: {exc}")
        return []
def check_command_exists(command: str) -> bool:
    """
    Check whether an executable with the given name can be found on PATH.

    Uses ``shutil.which`` rather than spawning the external ``which``
    program, so the check works where ``which`` is not installed and costs
    no extra process.

    Args:
        command: command name

    Returns:
        bool: True when the command is found; False otherwise, including
        for an empty name
    """
    # Local import: shutil is not imported at module level.
    import shutil

    if not command:
        return False
    return shutil.which(command) is not None
def format_bytes(size_bytes: int) -> str:
    """
    Format a byte count as a human-readable string using 1024-based units.

    Args:
        size_bytes: number of bytes; negative values (e.g. a size
            difference) are scaled by magnitude and keep their sign

    Returns:
        str: ``"0 B"`` for zero, otherwise the value with two decimals and
        a unit from B to PB
    """
    if size_bytes == 0:
        return "0 B"
    units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB']
    unit_index = 0
    size = float(size_bytes)
    # Compare on magnitude: a negative size previously never satisfied
    # "size >= 1024" and was printed unscaled in bytes.
    while abs(size) >= 1024 and unit_index < len(units) - 1:
        size /= 1024
        unit_index += 1
    return f"{size:.2f} {units[unit_index]}"
def sanitize_filename(filename: str) -> str:
    """
    Make a string safe to use as a file name.

    Each of ``< > : " / \\ | ? *`` is replaced by an underscore, then leading
    and trailing dots and spaces are removed.

    Args:
        filename: the original name

    Returns:
        str: the cleaned name
    """
    replaced = re.sub(r'[<>:"/\\|?*]', '_', filename)
    return replaced.strip('. ')
def merge_dicts(base: Dict[str, Any], update: Dict[str, Any]) -> Dict[str, Any]:
    """
    Recursively merge ``update`` into a copy of ``base``.

    Where both sides hold a dict under the same key the two are merged
    recursively; in every other case the value from ``update`` wins.
    Neither input is modified.

    Args:
        base: the base dictionary
        update: the dictionary whose entries take precedence

    Returns:
        Dict[str, Any]: the merged dictionary
    """
    merged = base.copy()
    for key in update:
        incoming = update[key]
        both_are_dicts = (
            key in merged
            and isinstance(merged[key], dict)
            and isinstance(incoming, dict)
        )
        merged[key] = merge_dicts(merged[key], incoming) if both_are_dicts else incoming
    return merged