first commit

2026-03-02 14:14:40 +08:00
commit c4f4fefa0a
20 changed files with 6037 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,54 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# 虚拟环境
+venv/
+env/
+ENV/
+.venv/
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# 日志和报告
+*.log
+reports/
+*.json
+*.csv
+*.html
+
+# 配置文件（可能包含敏感信息）
+config/local_config.yaml
+config/secrets.yaml
+
+# 操作系统
+.DS_Store
+Thumbs.db
+
+# 临时文件
+tmp/
+temp/
+*.tmp
--- a/README.md
+++ b/README.md
@@ -0,0 +1,111 @@
+# ServerGuard - 服务器硬件健康诊断系统
+
+ServerGuard 是一款基于 Python 的 Linux 命令行工具，用于诊断服务器硬件（CPU、内存、存储、电源、显卡等）的潜在故障。
+
+## 功能特性
+
+- **硬件信息概览**：收集 CPU、内存、主板、存储、显卡等详细信息
+- **CPU 检测**：温度监控、MCE 错误检查、压力测试
+- **内存检测**：DIMM 信息、ECC 状态检查、内存压力测试
+- **存储检测**：SMART 数据分析、I/O 性能测试、RAID 状态检查
+- **传感器监控**：电压、风扇转速、温度监控（支持 IPMI）
+- **显卡检测**：GPU 信息、温度、驱动状态检查
+- **日志分析**：自动扫描系统日志中的硬件错误
+- **报告生成**：支持 JSON、CSV、纯文本、HTML 格式
+
+## 安装
+
+### 系统要求
+
+- Python 3.8+
+- Linux 操作系统
+- root 权限（大多数硬件诊断功能需要）
+
+### 安装系统依赖
+
+**Debian/Ubuntu:**
+```bash
+sudo apt update
+sudo apt install -y lshw dmidecode smartmontools lm-sensors stress-ng memtester ipmitool edac-utils fio mdadm pciutils usbutils
+```
+
+**CentOS/RHEL:**
+```bash
+sudo yum install -y lshw dmidecode smartmontools lm_sensors stress-ng memtester ipmitool OpenIPMI edac-utils fio mdadm pciutils usbutils
+```
+
+### 安装 Python 依赖
+
+```bash
+python3 -m venv venv
+source venv/bin/activate
+pip install -r requirements.txt
+```
+
+## 使用方法
+
+### 快速检测（非侵入性）
+
+```bash
+sudo python3 main.py --quick
+```
+
+### 全面诊断（包含压力测试）
+
+```bash
+sudo python3 main.py --full
+```
+
+### 运行特定模块
+
+```bash
+sudo python3 main.py --module cpu
+sudo python3 main.py --module memory
+sudo python3 main.py --module storage
+```
+
+### 生成不同格式的报告
+
+```bash
+sudo python3 main.py --full --format json --output report.json
+sudo python3 main.py --full --format html --output report.html
+```
+
+### 查看帮助
+
+```bash
+python3 main.py --help
+```
+
+## 项目结构
+
+```
+ServerGuard/
+├── main.py              # 程序入口和核心调度器
+├── utils.py             # 通用工具库
+├── reporter.py          # 报告生成模块
+├── requirements.txt     # Python 依赖
+├── README.md           # 项目说明
+├── config/
+│   └── config.yaml     # 配置文件
+├── modules/
+│   ├── __init__.py
+│   ├── system_info.py  # 系统信息概览
+│   ├── cpu.py          # CPU 检测
+│   ├── memory.py       # 内存检测
+│   ├── storage.py      # 存储检测
+│   ├── sensors.py      # 传感器监控
+│   ├── gpu.py          # 显卡检测
+│   └── log_analyzer.py # 日志分析
+└── tests/              # 测试文件
+```
+
+## 注意事项
+
+1. **权限要求**：大多数硬件诊断功能需要 root 权限运行
+2. **压力测试**：全面诊断中的压力测试会占用大量系统资源，建议在维护窗口期进行
+3. **数据安全**：存储设备坏块扫描可能破坏数据，请谨慎使用
+
+## 许可证
+
+MIT License
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -0,0 +1,121 @@
+# ServerGuard 配置文件
+
+# 应用设置
+app:
+  name: "ServerGuard"
+  version: "1.0.0"
+  description: "服务器硬件健康诊断系统"
+
+# 日志设置
+logging:
+  level: INFO  # DEBUG, INFO, WARNING, ERROR
+  file: "/var/log/serverguard.log"
+  max_size_mb: 100
+  backup_count: 5
+  console_output: true
+
+# 报告设置
+report:
+  default_format: "text"  # text, json, csv, html
+  output_directory: "./reports"
+  include_timestamp: true
+  max_report_size_mb: 10
+
+# 检测模块设置
+modules:
+  # CPU 检测设置
+  cpu:
+    enabled: true
+    temperature_warning: 85  # 温度警告阈值（摄氏度）
+    temperature_critical: 95  # 温度危险阈值（摄氏度）
+    stress_test:
+      duration_seconds: 300  # 压力测试持续时间
+      check_mce: true  # 检查 MCE 错误
+    
+  # 内存检测设置
+  memory:
+    enabled: true
+    memtester:
+      enabled: true
+      memory_percent: 70  # 使用可用内存的百分比进行测试
+    stress_test:
+      duration_seconds: 300
+    check_ecc: true  # 检查 ECC 错误
+    
+  # 存储检测设置
+  storage:
+    enabled: true
+    smart_check: true
+    check_reallocated_sectors: true
+    reallocated_threshold: 1  # 重映射扇区警告阈值
+    temperature_warning: 60  # 硬盘温度警告阈值
+    temperature_critical: 70  # 硬盘温度危险阈值
+    run_io_test: false  # 是否运行 I/O 性能测试（耗时）
+    io_test_size_mb: 100
+    check_raid: true  # 检查 RAID 状态
+    
+  # 传感器检测设置
+  sensors:
+    enabled: true
+    lm_sensors: true
+    ipmi: true
+    check_fans: true
+    fan_min_rpm: 500  # 风扇最低转速警告阈值
+    voltage_tolerance: 0.1  # 电压偏差容忍度（比例）
+    
+  # GPU 检测设置
+  gpu:
+    enabled: true
+    check_nvidia: true
+    check_amd: true
+    check_intel: true
+    temperature_warning: 85
+    
+  # 日志分析设置
+  log_analyzer:
+    enabled: true
+    check_dmesg: true
+    check_journalctl: true
+    max_lines: 5000
+    lookback_days: 7  # 分析最近几天的日志
+
+# 告警设置
+alerts:
+  enabled: false
+  smtp:
+    host: ""
+    port: 587
+    username: ""
+    password: ""
+    use_tls: true
+    from_address: "serverguard@example.com"
+    to_addresses: []
+  
+  webhook:
+    enabled: false
+    url: ""
+    headers: {}
+  
+  # 告警阈值
+  thresholds:
+    cpu_temperature: 85
+    memory_usage_percent: 90
+    disk_usage_percent: 90
+    hardware_error_count: 1
+
+# 压力测试设置（全面诊断模式）
+stress_test:
+  cpu:
+    enabled: true
+    workers: 0  # 0 表示使用所有核心
+    timeout_seconds: 300
+    
+  memory:
+    enabled: true
+    workers: 4
+    timeout_seconds: 300
+    
+  io:
+    enabled: false  # I/O 压力测试可能很危险，默认关闭
+    workers: 4
+    timeout_seconds: 300
--- a/install.sh
+++ b/install.sh
@@ -0,0 +1,280 @@
+#!/bin/bash
+# ServerGuard 安装脚本
+# 支持 Debian/Ubuntu 和 CentOS/RHEL
+
+echo "========================================"
+echo "ServerGuard 安装脚本"
+echo "========================================"
+echo ""
+
+# Abort unless running as root (effective UID 0).
+if [ "$EUID" -ne 0 ]; then 
+    echo "错误: 请以 root 权限运行此脚本"
+    echo "  sudo ./install.sh"
+    exit 1
+fi
+
+# Detect the Linux distribution by sourcing /etc/os-release, which is
+# expected to define NAME and VERSION_ID.
+if [ -f /etc/os-release ]; then
+    . /etc/os-release
+    OS=$NAME
+    VER=$VERSION_ID
+else
+    echo "无法检测操作系统类型"
+    exit 1
+fi
+
+echo "检测到操作系统: $OS $VER"
+echo ""
+
+# Space-separated list of packages that failed to install; appended to by
+# install_package and reported once dependency installation has finished.
+FAILED_PACKAGES=""
+
+# Install a single package.
+#   $1 - package name
+#   $2 - package manager: "apt" selects apt-get, anything else selects yum
+# Returns 0 on success. On failure the package name is appended to
+# FAILED_PACKAGES and 1 is returned. Installer stderr is discarded.
+install_package() {
+    local name=$1
+    local manager=$2
+    local installer="yum"
+
+    if [ "$manager" = "apt" ]; then
+        installer="apt-get"
+    fi
+
+    if "$installer" install -y "$name" 2>/dev/null; then
+        return 0
+    fi
+
+    FAILED_PACKAGES="$FAILED_PACKAGES $name"
+    return 1
+}
+
+# Install the Debian/Ubuntu dependencies through apt.
+# Core packages and optional packages are installed one at a time so that a
+# single failure does not abort the rest; failures are only reported.
+install_debian_deps() {
+    echo "正在安装 Debian/Ubuntu 依赖..."
+    apt-get update
+
+    # Core dependencies (required)
+    CORE_PKGS="lshw dmidecode smartmontools lm-sensors ipmitool mdadm pciutils usbutils util-linux coreutils grep gawk sed"
+
+    # Optional dependencies
+    OPTIONAL_PKGS="stress-ng memtester edac-utils fio nvme-cli"
+
+    echo "安装核心依赖..."
+    for pkg in $CORE_PKGS; do
+        if ! install_package "$pkg" "apt"; then
+            echo "警告: $pkg 安装失败"
+        fi
+    done
+
+    echo "安装可选依赖..."
+    for pkg in $OPTIONAL_PKGS; do
+        if ! install_package "$pkg" "apt"; then
+            echo "注意: $pkg 安装失败（可选）"
+        fi
+    done
+}
+
+# Install the RHEL/CentOS dependencies through yum.
+# Tries to enable extra repositories first (EPEL, and PowerTools/CRB when
+# VER starts with 8), then installs core and optional packages one by one.
+# stress-ng is handled separately, with `stress` as a fallback.
+install_redhat_deps() {
+    echo "正在安装 RHEL/CentOS 依赖..."
+    
+    # Try to enable EPEL if no installed package name contains "epel-release"
+    if ! rpm -qa | grep -q epel-release; then
+        echo "启用 EPEL 仓库..."
+        yum install -y epel-release 2>/dev/null || true
+    fi
+    
+    # On CentOS 8 / RHEL 8, enable the PowerTools/CRB repositories
+    if [[ "$VER" == 8* ]] || [[ "$VER" == "8" ]]; then
+        echo "启用 PowerTools 仓库..."
+        # The repo id is tried in both spellings; failures are ignored.
+        yum config-manager --set-enabled powertools 2>/dev/null || \
+        yum config-manager --set-enabled PowerTools 2>/dev/null || true
+        
+        # Try to enable CRB (CodeReady Builder) for RHEL 8
+        subscription-manager repos --enable codeready-builder-for-rhel-8-x86_64-rpms 2>/dev/null || true
+    fi
+    
+    # Core dependencies (required)
+    CORE_PKGS="lshw dmidecode smartmontools lm_sensors ipmitool mdadm pciutils usbutils util-linux coreutils grep gawk sed"
+    
+    echo "安装核心依赖..."
+    for pkg in $CORE_PKGS; do
+        install_package "$pkg" "yum" || echo "警告: $pkg 安装失败"
+    done
+    
+    # Try to install OpenIPMI as well; a failure here is reported as optional
+    install_package "OpenIPMI" "yum" || echo "注意: OpenIPMI 安装失败（可选）"
+    
+    # Optional dependencies
+    OPTIONAL_PKGS="memtester edac-utils fio nvme-cli"
+    
+    echo "安装可选依赖..."
+    for pkg in $OPTIONAL_PKGS; do
+        install_package "$pkg" "yum" || echo "注意: $pkg 安装失败（可选）"
+    done
+    
+    # stress-ng gets special handling: it bypasses install_package so that
+    # FAILED_PACKAGES is only updated when the `stress` fallback fails too
+    echo "尝试安装 stress-ng..."
+    if ! yum install -y stress-ng 2>/dev/null; then
+        echo "注意: stress-ng 从默认仓库安装失败"
+        
+        # Fall back to installing `stress`
+        echo "尝试安装 stress 作为备选..."
+        if yum install -y stress 2>/dev/null; then
+            echo "stress 安装成功，可作为压力测试备选工具"
+        else
+            echo "警告: stress 和 stress-ng 都安装失败"
+            echo "      压力测试功能将不可用"
+            FAILED_PACKAGES="$FAILED_PACKAGES stress-ng"
+        fi
+    fi
+    
+    # On version 8, print manual build instructions when stress-ng was
+    # recorded as failed
+    if [[ "$VER" == 8* ]] && [[ "$FAILED_PACKAGES" == *"stress-ng"* ]]; then
+        echo ""
+        echo "============================================"
+        echo "注意: CentOS 8 中 stress-ng 需要从源码编译安装"
+        echo "============================================"
+        echo "手动安装步骤:"
+        echo "  1. 安装编译依赖:"
+        echo "     yum install -y gcc make libaio-devel libattr-devel libbsd-devel libcap-devel libgcrypt-devel"
+        echo "  2. 下载并编译 stress-ng:"
+        echo "     cd /tmp"
+        echo "     git clone https://github.com/ColinIanKing/stress-ng.git"
+        echo "     cd stress-ng"
+        echo "     make"
+        echo "     make install"
+        echo "============================================"
+        echo ""
+    fi
+}
+
+# 根据发行版安装
+# Pick the installer by matching the distribution name taken from
+# /etc/os-release; unsupported systems get a manual package list and exit 1.
+case "$OS" in
+    *Debian*|*Ubuntu*)
+        install_debian_deps
+        ;;
+    *CentOS*|*Red*Hat*|*Fedora*|*Alma*|*Rocky*)
+        install_redhat_deps
+        ;;
+    *)
+        echo "不支持的操作系统: $OS"
+        echo "请手动安装以下工具:"
+        echo "  lshw, dmidecode, smartmontools, lm-sensors, stress-ng, memtester"
+        echo "  ipmitool, edac-utils, fio, mdadm, pciutils, usbutils"
+        exit 1
+        ;;
+esac
+
+echo ""
+echo "系统依赖安装完成"
+
+# Report the packages that failed to install; this is informational only
+# and the script continues.
+if [ -n "$FAILED_PACKAGES" ]; then
+    echo ""
+    echo "以下包安装失败: $FAILED_PACKAGES"
+    echo "某些功能可能受限，ServerGuard 仍可运行基本检测"
+fi
+
+echo ""
+
+# Locate a Python interpreter (python3 preferred, then python) and read its
+# version from the second field of `--version` output.
+echo "检查 Python 版本..."
+if command -v python3 &> /dev/null; then
+    PYTHON_VERSION=$(python3 --version 2>&1 | awk '{print $2}')
+    echo "找到 Python $PYTHON_VERSION"
+elif command -v python &> /dev/null; then
+    PYTHON_VERSION=$(python --version 2>&1 | awk '{print $2}')
+    echo "找到 Python $PYTHON_VERSION"
+else
+    echo "错误: 未找到 Python"
+    echo "请安装 Python 3.6 或更高版本"
+    exit 1
+fi
+
+# Split the version into major/minor and require at least 3.6.
+# NOTE(review): the README lists Python 3.8+ as the requirement — confirm
+# which minimum is intended.
+PYTHON_MAJOR=$(echo $PYTHON_VERSION | cut -d. -f1)
+PYTHON_MINOR=$(echo $PYTHON_VERSION | cut -d. -f2)
+
+if [ "$PYTHON_MAJOR" -lt 3 ] || ([ "$PYTHON_MAJOR" -eq 3 ] && [ "$PYTHON_MINOR" -lt 6 ]); then
+    echo "错误: Python 版本过低 ($PYTHON_VERSION)"
+    echo "需要 Python 3.6 或更高版本"
+    exit 1
+fi
+
+echo "Python 版本符合要求"
+echo ""
+
+# Install the Python dependencies, preferring pip3 and falling back to pip.
+echo "安装 Python 依赖..."
+PIP_CMD="pip3"
+if ! command -v pip3 &> /dev/null; then
+    PIP_CMD="pip"
+fi
+
+# Resolve requirements.txt next to this script rather than in the caller's
+# working directory, so the installer also works when invoked from elsewhere
+# (e.g. `sudo /opt/serverguard/install.sh`).
+REQUIREMENTS_FILE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/requirements.txt"
+
+# If the system-wide install fails, retry as a per-user install.
+$PIP_CMD install -r "$REQUIREMENTS_FILE" || {
+    echo "警告: pip 安装失败，尝试使用 --user 选项"
+    $PIP_CMD install --user -r "$REQUIREMENTS_FILE"
+}
+
+echo ""
+
+# Offer to configure lm-sensors when sensors-detect is available.
+# Only "y"/"yes" (case-insensitive) confirms; any other answer skips it.
+if command -v sensors-detect &> /dev/null; then
+    echo ""
+    echo "检测到 lm-sensors 需要配置"
+    echo "是否要运行 sensors-detect 配置传感器? (y/N)"
+    read -r response
+    if [[ "$response" =~ ^([yY][eE][sS]|[yY])$ ]]; then
+        echo "正在运行 sensors-detect..."
+        # A failing sensors-detect is ignored so the installer keeps going.
+        sensors-detect --auto || true
+    fi
+fi
+
+echo ""
+echo "========================================"
+echo "安装完成!"
+echo "========================================"
+echo ""
+
+# Dependency status: report which of the main diagnostic commands are on PATH.
+echo "依赖检查:"
+echo "------------"
+for cmd in lshw dmidecode smartctl sensors ipmitool; do
+    if command -v "$cmd" &> /dev/null; then
+        echo "  ✓ $cmd"
+    else
+        echo "  ✗ $cmd (未安装)"
+    fi
+done
+
+# Stress-test tool status: stress-ng is reported first, then stress.
+echo ""
+echo "压力测试工具:"
+if command -v stress-ng &> /dev/null; then
+    echo "  ✓ stress-ng (推荐)"
+elif command -v stress &> /dev/null; then
+    echo "  ✓ stress (备选)"
+else
+    echo "  ✗ stress/stress-ng (未安装，压力测试不可用)"
+fi
+
+# Print usage hints.
+echo ""
+echo "使用方法:"
+echo "  快速检测:    sudo python3 main.py --quick"
+echo "  全面诊断:    sudo python3 main.py --full"
+echo "  特定模块:    sudo python3 main.py --module cpu"
+echo "  生成报告:    sudo python3 main.py --quick --format json --output report.json"
+echo ""
+echo "查看帮助: python3 main.py --help"
+echo ""
+
+# Optionally create a /usr/local/bin/serverguard launcher.
+# The heredoc delimiter is unquoted, so $SCRIPT_DIR is expanded now (the
+# absolute directory of this script is baked into the launcher), while the
+# escaped \$@ is written literally and forwards the launcher's arguments.
+# NOTE(review): the launcher changes into the install directory before
+# running main.py, so relative paths given on its command line would resolve
+# there rather than in the caller's directory — confirm this is intended.
+echo "是否要创建 /usr/local/bin/serverguard 快捷方式? (y/N)"
+read -r response
+if [[ "$response" =~ ^([yY][eE][sS]|[yY])$ ]]; then
+    SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+    cat > /usr/local/bin/serverguard << EOF
+#!/bin/bash
+cd "$SCRIPT_DIR"
+python3 main.py "\$@"
+EOF
+    chmod +x /usr/local/bin/serverguard
+    echo "快捷方式已创建: serverguard"
+    echo "现在可以直接使用: sudo serverguard --quick"
+fi
+
+echo ""
+echo "安装完成!"
--- a/main.py
+++ b/main.py
@@ -0,0 +1,419 @@
+#!/usr/bin/env python3
+"""
+ServerGuard - 服务器硬件健康诊断系统
+
+主程序入口，负责命令行参数解析、模块调度和报告生成。
+
+使用方法:
+    sudo python3 main.py --quick          # 快速检测
+    sudo python3 main.py --full           # 全面诊断（含压力测试）
+    sudo python3 main.py --module cpu     # 仅检测 CPU
+    sudo python3 main.py --full --format json --output report.json
+"""
+
+import argparse
+import sys
+import os
+from typing import Optional, Dict, Any
+
+from utils import setup_logging, check_root_privileges, get_file_timestamp
+from reporter import ReportGenerator
+
+
+def parse_arguments() -> argparse.Namespace:
+    """
+    Parse the ServerGuard command-line arguments from ``sys.argv``.
+
+    Exactly one operating mode (--quick, --full, --module or --list-modules)
+    must be given; argparse reports an error and exits otherwise.
+
+    Returns:
+        argparse.Namespace: The parsed arguments.
+    """
+    parser = argparse.ArgumentParser(
+        prog='ServerGuard',
+        description='服务器硬件健康诊断系统 - 用于诊断 Linux 服务器硬件故障',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+示例:
+  %(prog)s --quick                    # 快速硬件检测
+  %(prog)s --full                     # 全面诊断（含压力测试）
+  %(prog)s --module cpu               # 仅检测 CPU
+  %(prog)s --module memory,storage    # 检测内存和存储
+  %(prog)s --full --format json       # 生成 JSON 格式报告
+  %(prog)s --list-modules             # 列出所有可用模块
+
+注意: 大多数诊断功能需要 root 权限，请使用 sudo 运行。
+        """
+    )
+
+    # Operating modes (mutually exclusive, one is required)
+    mode_group = parser.add_mutually_exclusive_group(required=True)
+    mode_group.add_argument(
+        '--quick', '-q',
+        action='store_true',
+        help='快速检测模式（非侵入性，仅收集信息）'
+    )
+    mode_group.add_argument(
+        '--full', '-f',
+        action='store_true',
+        help='全面诊断模式（包含压力测试，耗时较长）'
+    )
+    mode_group.add_argument(
+        '--module', '-m',
+        type=str,
+        metavar='MODULE',
+        # The list includes "system" so the help matches the names accepted
+        # by run_module() and printed by --list-modules.
+        help='运行指定模块，多个模块用逗号分隔 (system,cpu,memory,storage,sensors,gpu,logs)'
+    )
+    mode_group.add_argument(
+        '--list-modules', '-l',
+        action='store_true',
+        help='列出所有可用的检测模块'
+    )
+
+    # Report options
+    parser.add_argument(
+        '--format',
+        type=str,
+        choices=['text', 'json', 'csv', 'html'],
+        default='text',
+        help='报告格式 (默认: text)'
+    )
+    parser.add_argument(
+        '--output', '-o',
+        type=str,
+        metavar='FILE',
+        help='输出文件路径（不指定则输出到控制台）'
+    )
+    parser.add_argument(
+        '--log',
+        type=str,
+        metavar='FILE',
+        default='/var/log/serverguard.log',
+        help='日志文件路径 (默认: /var/log/serverguard.log)'
+    )
+
+    # Test parameters
+    parser.add_argument(
+        '--stress-duration',
+        type=int,
+        default=300,
+        metavar='SECONDS',
+        help='压力测试持续时间，单位秒 (默认: 300)'
+    )
+    parser.add_argument(
+        '--verbose', '-v',
+        action='store_true',
+        help='显示详细输出'
+    )
+    parser.add_argument(
+        '--yes', '-y',
+        action='store_true',
+        help='自动确认所有警告提示（如压力测试警告）'
+    )
+
+    return parser.parse_args()
+
+
+def list_available_modules():
+    """Print the catalogue of detection modules followed by usage examples."""
+    catalogue = (
+        ('system', '系统信息概览'),
+        ('cpu', 'CPU 检测与压力测试'),
+        ('memory', '内存检测与压力测试'),
+        ('storage', '存储设备检测'),
+        ('sensors', '电源与传感器监控'),
+        ('gpu', '显卡检测'),
+        ('logs', '日志分析'),
+    )
+    rule = "-" * 40
+
+    # Assemble the whole listing first, then emit it in a single write.
+    lines = ["可用的检测模块:", rule]
+    lines.extend(f"  {name:12} - {summary}" for name, summary in catalogue)
+    lines.append(rule)
+    lines.append("\n使用示例:")
+    lines.append("  sudo python3 main.py --module cpu")
+    lines.append("  sudo python3 main.py --module cpu,memory,storage")
+    print("\n".join(lines))
+
+
+def confirm_stress_test(duration: int, auto_confirm: bool = False) -> bool:
+    """
+    Ask the operator to confirm running the stress test.
+
+    Prints a warning banner describing the impact and reads a y/N answer from
+    standard input. Only "y" or "yes" (case-insensitive, surrounding
+    whitespace ignored) counts as confirmation.
+
+    Args:
+        duration: Stress-test duration in seconds (shown in the banner).
+        auto_confirm: When True, skip the prompt and confirm immediately.
+
+    Returns:
+        bool: True to proceed, False if the operator declined, pressed
+        Ctrl-C, or no answer could be read.
+    """
+    if auto_confirm:
+        return True
+
+    print("\n" + "=" * 60)
+    print("警告：即将执行压力测试")
+    print("=" * 60)
+    print(f"测试持续时间: {duration} 秒 ({duration // 60} 分钟)")
+    print("此测试将占用大量系统资源，可能导致:")
+    print("  - CPU 和内存使用率接近 100%")
+    print("  - 系统响应变慢")
+    print("  - 温度升高")
+    print("建议在维护窗口期进行，并确保服务器可接受高负载。")
+    print("=" * 60)
+
+    try:
+        response = input("\n是否继续? [y/N]: ").strip().lower()
+    except (KeyboardInterrupt, EOFError):
+        # EOFError is raised when stdin is closed or exhausted (cron, pipes,
+        # redirected input); treat it like a cancellation instead of crashing.
+        print("\n操作已取消")
+        return False
+    return response in ('y', 'yes')
+
+
+def run_module(module_name: str, stress_test: bool = False, stress_duration: int = 300) -> Dict[str, Any]:
+    """
+    Import one detection module by its CLI name and run its entry point.
+
+    Args:
+        module_name: CLI name of the module (e.g. "cpu", "logs").
+        stress_test: Whether to run the stress test (forwarded to the cpu and
+            memory modules only).
+        stress_duration: Stress-test duration, forwarded together with
+            ``stress_test``.
+
+    Returns:
+        Dict[str, Any]: The module's result, or
+        ``{"status": "error", "error": ...}`` when the name is unknown or the
+        import/entry point raises.
+    """
+    import logging
+    log = logging.getLogger(__name__)
+
+    # CLI name -> (importable module path, callable invoking its entry point)
+    dispatch = {
+        'system': ('modules.system_info', lambda mod: mod.get_system_info()),
+        'cpu': ('modules.cpu', lambda mod: mod.run_cpu_check(stress_test, stress_duration)),
+        'memory': ('modules.memory', lambda mod: mod.run_memory_check(stress_test, stress_duration)),
+        'storage': ('modules.storage', lambda mod: mod.run_storage_check()),
+        'sensors': ('modules.sensors', lambda mod: mod.run_sensors_check()),
+        'gpu': ('modules.gpu', lambda mod: mod.run_gpu_check()),
+        'logs': ('modules.log_analyzer', lambda mod: mod.analyze_logs()),
+    }
+
+    entry = dispatch.get(module_name)
+    if entry is None:
+        log.error(f"未知模块: {module_name}")
+        return {"status": "error", "error": f"未知模块: {module_name}"}
+
+    import_path, invoke = entry
+    try:
+        # A non-empty fromlist makes __import__ return the submodule itself.
+        return invoke(__import__(import_path, fromlist=['']))
+    except Exception as exc:
+        log.error(f"运行模块 {module_name} 时出错: {exc}")
+        return {"status": "error", "error": str(exc)}
+
+
+def run_quick_check() -> Dict[str, Any]:
+    """
+    Run every detection module without stress tests and show progress.
+
+    Returns:
+        Dict[str, Any]: ``scan_type`` ("quick"), ``timestamp`` and a
+        ``modules`` mapping of module name to its result dict.
+    """
+    import logging
+    log = logging.getLogger(__name__)
+
+    print("正在执行快速硬件检测...")
+    print("-" * 60)
+
+    report = {
+        "scan_type": "quick",
+        "timestamp": get_file_timestamp(),
+        "modules": {}
+    }
+
+    # Known statuses get a localized tag; anything else is echoed in brackets.
+    status_tags = {"success": "[完成]", "warning": "[警告]", "error": "[错误]"}
+
+    for name in ('system', 'cpu', 'memory', 'storage', 'sensors', 'gpu', 'logs'):
+        print(f"正在检测: {name}...", end=' ', flush=True)
+        try:
+            outcome = run_module(name, stress_test=False)
+            report["modules"][name] = outcome
+            state = outcome.get("status", "unknown")
+            if isinstance(state, str) and state in status_tags:
+                print(status_tags[state])
+            else:
+                print(f"[{state}]")
+        except Exception as exc:
+            log.error(f"模块 {name} 执行失败: {exc}")
+            report["modules"][name] = {"status": "error", "error": str(exc)}
+            print("[失败]")
+
+    print("-" * 60)
+    return report
+
+
+def run_full_diagnostic(stress_duration: int, auto_confirm: bool = False) -> Dict[str, Any]:
+    """
+    Run every detection module, with stress tests for CPU and memory.
+
+    Asks for confirmation first; if it is refused the process exits with
+    status 0 and nothing is run.
+
+    Args:
+        stress_duration: Stress-test duration passed to each module run.
+        auto_confirm: Skip the confirmation prompt when True.
+
+    Returns:
+        Dict[str, Any]: ``scan_type`` ("full"), ``timestamp``,
+        ``stress_duration`` and a ``modules`` mapping of name to result.
+    """
+    import logging
+    log = logging.getLogger(__name__)
+
+    confirmed = confirm_stress_test(stress_duration, auto_confirm)
+    if not confirmed:
+        print("诊断已取消")
+        sys.exit(0)
+
+    print("\n正在执行全面硬件诊断...")
+    print("=" * 60)
+
+    report = {
+        "scan_type": "full",
+        "timestamp": get_file_timestamp(),
+        "stress_duration": stress_duration,
+        "modules": {}
+    }
+
+    # Only these modules are run with the stress test enabled.
+    stressed = ('cpu', 'memory')
+
+    for name in ('system', 'cpu', 'memory', 'storage', 'sensors', 'gpu', 'logs'):
+        print(f"\n正在检测: {name}...")
+        try:
+            outcome = run_module(
+                name,
+                stress_test=name in stressed,
+                stress_duration=stress_duration,
+            )
+            report["modules"][name] = outcome
+            print(f"  状态: {outcome.get('status', 'unknown')}")
+        except Exception as exc:
+            log.error(f"模块 {name} 执行失败: {exc}")
+            report["modules"][name] = {"status": "error", "error": str(exc)}
+            print(f"  状态: 失败 - {exc}")
+
+    print("\n" + "=" * 60)
+    return report
+
+
+def run_specific_modules(module_list: str, stress_duration: int) -> Dict[str, Any]:
+    """
+    Run the modules named in a comma-separated list.
+
+    Names are stripped of surrounding whitespace, and empty entries (e.g. from
+    a trailing comma in "cpu,") are ignored rather than reported as an unknown
+    module. Modules are always run with stress testing disabled.
+
+    Args:
+        module_list: Comma-separated module names.
+        stress_duration: Stress-test duration; accepted for interface
+            compatibility but unused, because no stress test is run here.
+
+    Returns:
+        Dict[str, Any]: ``scan_type`` ("custom"), ``timestamp`` and a
+        ``modules`` mapping of module name to its result dict.
+    """
+    # Drop blank entries so stray commas do not produce a bogus "" module.
+    modules = [name.strip() for name in module_list.split(',') if name.strip()]
+
+    results = {
+        "scan_type": "custom",
+        "timestamp": get_file_timestamp(),
+        "modules": {}
+    }
+
+    print(f"正在执行自定义模块检测: {', '.join(modules)}")
+    print("-" * 60)
+
+    for module_name in modules:
+        print(f"正在检测: {module_name}...", end=' ', flush=True)
+        try:
+            result = run_module(module_name, stress_test=False)
+            results["modules"][module_name] = result
+            status = result.get("status", "unknown")
+            print(f"[{status}]")
+        except Exception as e:
+            results["modules"][module_name] = {"status": "error", "error": str(e)}
+            print(f"[失败: {e}]")
+
+    print("-" * 60)
+    return results
+
+
+def main():
+    """
+    Program entry point: parse arguments, run the diagnostics, emit the report.
+
+    Always terminates via ``sys.exit``: 0 when no module reported an error
+    (also after --list-modules), 1 when at least one module has status
+    "error" or an unexpected exception occurred, 130 when interrupted with
+    Ctrl-C.
+    """
+    # Imported locally (as the other functions in this file do) so main()
+    # does not depend on `logging` having been bound by the __main__ guard
+    # and can be called from an importing module.
+    import logging
+
+    args = parse_arguments()
+
+    # Configure logging; the log file is only used when running as root.
+    log_level = logging.DEBUG if args.verbose else logging.INFO
+    setup_logging(
+        log_file=args.log if check_root_privileges() else None,
+        level=log_level,
+        console_output=True
+    )
+    logger = logging.getLogger(__name__)
+
+    # List modules and stop
+    if args.list_modules:
+        list_available_modules()
+        sys.exit(0)
+
+    # Warn (but continue) when not running as root
+    if not check_root_privileges():
+        logger.warning("未以 root 权限运行，部分功能可能受限")
+        print("警告: 未检测到 root 权限，部分硬件信息可能无法获取")
+        print("建议: 使用 sudo 运行以获得完整的诊断信息\n")
+
+    # Run the diagnostics
+    try:
+        if args.quick:
+            results = run_quick_check()
+        elif args.full:
+            results = run_full_diagnostic(args.stress_duration, args.yes)
+        elif args.module:
+            results = run_specific_modules(args.module, args.stress_duration)
+        else:
+            print("请指定操作模式: --quick, --full, --module 或 --list-modules")
+            sys.exit(1)
+
+        # Generate the report: to a file when --output is given, else stdout
+        generator = ReportGenerator()
+
+        if args.output:
+            generator.save_report(results, args.format, args.output)
+            print(f"\n报告已保存至: {args.output}")
+        else:
+            report = generator.generate_report(results, args.format)
+            print("\n" + "=" * 60)
+            print("诊断报告")
+            print("=" * 60)
+            print(report)
+
+        # Exit code: 1 if any module reported status "error". SystemExit is
+        # not an Exception subclass, so it passes through the handlers below.
+        has_errors = any(
+            m.get("status") == "error"
+            for m in results.get("modules", {}).values()
+        )
+        sys.exit(1 if has_errors else 0)
+
+    except KeyboardInterrupt:
+        print("\n\n操作已被用户中断")
+        sys.exit(130)
+    except Exception as e:
+        logger.exception("程序执行过程中发生错误")
+        print(f"\n错误: {e}")
+        sys.exit(1)
+
+
+# Script entry point: bind `logging` at module level, then run main().
+if __name__ == '__main__':
+    import logging
+    main()
--- a/modules/__init__.py
+++ b/modules/__init__.py
@@ -0,0 +1,15 @@
+"""
+ServerGuard hardware detection modules.
+
+Contains the following submodules:
+- system_info: system information overview
+- cpu: CPU checks and stress test
+- memory: memory checks and stress test
+- storage: storage device checks
+- sensors: power and sensor monitoring
+- gpu: GPU checks
+- log_analyzer: log analysis
+"""
+
+# Package metadata.
+__version__ = "1.0.0"
+__author__ = "ServerGuard Team"
--- a/modules/cpu.py
+++ b/modules/cpu.py
@@ -0,0 +1,518 @@
+"""
+ServerGuard - CPU 检测与压力测试模块
+
+检查 CPU 状态、温度、错误日志，并执行压力测试。
+"""
+
+import os
+import re
+import time
+from typing import Dict, Any, List, Optional
+
+import sys
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from utils import (
+    execute_command, check_command_exists, parse_key_value_output,
+    safe_int, safe_float, require_root
+)
+
+
+def run_cpu_check(stress_test: bool = False, stress_duration: int = 300) -> Dict[str, Any]:
+    """
+    Run the CPU checks and aggregate them into one result dict.
+
+    The overall ``status`` starts as "success", becomes "warning" when the
+    temperature check reports a warning or any MCE errors are counted, and
+    "error" when the stress test does not pass or any step raises.
+
+    Args:
+        stress_test: Whether to run the CPU stress test.
+        stress_duration: Stress-test duration in seconds.
+
+    Returns:
+        Dict[str, Any]: ``status`` plus the ``cpu_info``, ``temperature``,
+        ``mce_errors``, ``load_average`` and ``stress_test`` sections, and
+        ``error`` when an exception was caught.
+    """
+    report = {
+        "status": "success",
+        "cpu_info": {},
+        "temperature": {},
+        "mce_errors": {},
+        "load_average": {},
+        "stress_test": {}
+    }
+
+    try:
+        # Static CPU details
+        report["cpu_info"] = get_cpu_details()
+
+        # Temperature readings
+        temperature = get_cpu_temperature()
+        report["temperature"] = temperature
+        if temperature.get("status") == "warning":
+            report["status"] = "warning"
+
+        # System load
+        report["load_average"] = get_load_average()
+
+        # Machine-check errors
+        mce = check_mce_errors()
+        report["mce_errors"] = mce
+        if mce.get("count", 0) > 0:
+            report["status"] = "warning"
+
+        # Optional stress test
+        if stress_test:
+            stress = run_cpu_stress_test(stress_duration)
+            report["stress_test"] = stress
+            if not stress.get("passed", False):
+                report["status"] = "error"
+
+    except Exception as exc:
+        report["status"] = "error"
+        report["error"] = str(exc)
+
+    return report
+
+
+def get_cpu_details() -> Dict[str, Any]:
+    """
+    Collect CPU details from /proc/cpuinfo.
+
+    Model, vendor, family, bogomips and flags come from the first processor
+    entry. ``threads`` is the number of logical processors, ``sockets`` the
+    number of distinct ``physical id`` values, and ``cores`` the number of
+    distinct (physical id, core id) pairs. When the file exposes no such
+    pairs, ``cores`` falls back to the thread count.
+
+    Returns:
+        Dict[str, Any]: The CPU details. An ``error`` key holds the exception
+        message if reading or parsing failed; fields gathered before the
+        failure are kept.
+    """
+    info = {
+        "model": "Unknown",
+        "architecture": "Unknown",
+        "cores": 0,
+        "threads": 0,
+        "current_frequency_mhz": 0,
+        "bogomips": 0,
+        "flags": []
+    }
+
+    try:
+        with open('/proc/cpuinfo', 'r') as f:
+            content = f.read()
+
+        # Processor entries are separated by blank lines.
+        sections = content.split('\n\n')
+
+        # Parse the first processor entry into a key/value mapping.
+        first = {}
+        for line in sections[0].split('\n'):
+            if ':' in line:
+                key, value = line.split(':', 1)
+                first[key.strip()] = value.strip()
+
+        info["model"] = first.get('model name', 'Unknown')
+        info["vendor"] = first.get('vendor_id', 'Unknown')
+        # "architecture" is populated from the `cpu family` field.
+        info["architecture"] = first.get('cpu family', 'Unknown')
+        info["bogomips"] = safe_float(first.get('bogomips', 0))
+
+        if 'flags' in first:
+            info["flags"] = first['flags'].split()
+
+        info["threads"] = content.count('processor\t:')
+
+        # A physical core is identified by its (socket, core) pair; counting
+        # distinct `physical id` values alone yields the socket count.
+        sockets = set()
+        physical_cores = set()
+        for section in sections:
+            socket_match = re.search(r'^physical id\s*:\s*(\d+)', section, re.MULTILINE)
+            if not socket_match:
+                continue
+            sockets.add(socket_match.group(1))
+            core_match = re.search(r'^core id\s*:\s*(\d+)', section, re.MULTILINE)
+            if core_match:
+                physical_cores.add((socket_match.group(1), core_match.group(1)))
+
+        info["sockets"] = len(sockets)
+        info["cores"] = len(physical_cores) or info["threads"]
+
+        # Current frequency: first `cpu MHz` line in the already-read content.
+        for line in content.split('\n'):
+            if 'cpu MHz' in line:
+                info["current_frequency_mhz"] = safe_float(line.split(':')[1].strip())
+                break
+
+        # Scaling frequency information
+        freq_info = get_cpu_frequency_info()
+        if freq_info:
+            info["frequency_info"] = freq_info
+
+    except Exception as e:
+        info["error"] = str(e)
+
+    return info
+
+
+def get_cpu_frequency_info() -> Dict[str, Any]:
+    """
+    Read cpufreq scaling information for cpu0 from sysfs.
+
+    Each attribute is read independently, so one that is missing or
+    unreadable is skipped without discarding the others.
+
+    Returns:
+        Dict[str, Any]: Any of ``min_mhz``, ``max_mhz``, ``current_mhz``
+        (file value divided by 1000, rounded to two decimals), ``governor``
+        and ``driver``. Empty when the cpufreq directory does not exist.
+    """
+    info = {}
+
+    cpu0_path = '/sys/devices/system/cpu/cpu0/cpufreq'
+    if not os.path.exists(cpu0_path):
+        return info
+
+    files = {
+        "min_mhz": "scaling_min_freq",
+        "max_mhz": "scaling_max_freq",
+        "current_mhz": "scaling_cur_freq",
+        "governor": "scaling_governor",
+        "driver": "scaling_driver"
+    }
+
+    for key, filename in files.items():
+        filepath = os.path.join(cpu0_path, filename)
+        try:
+            with open(filepath, 'r') as f:
+                value = f.read().strip()
+            if 'freq' in filename:
+                # Frequency values are usually stored in kHz
+                info[key] = round(safe_int(value) / 1000, 2)
+            else:
+                info[key] = value
+        except (OSError, ValueError, TypeError):
+            # Best effort: skip an attribute that is absent, unreadable or
+            # not numeric, and keep going with the remaining ones.
+            continue
+
+    return info
+
+
+def get_cpu_temperature() -> Dict[str, Any]:
+    """获取 CPU 温度信息。"""
+    result = {
+        "status": "success",
+        "sensors": {},
+        "current_c": None,
+        "high_threshold_c": None,
+        "critical_threshold_c": None
+    }
+    
+    temperatures = []
+    
+    # 方法 1: 使用 sensors 命令 (lm-sensors)
+    if check_command_exists('sensors'):
+        try:
+            _, stdout, _ = execute_command(
+                ['sensors', '-u'],
+                check_returncode=False, timeout=10
+            )
+            
+            # 解析 sensors -u 输出
+            current_chip = None
+            current_adapter = None
+            
+            for line in stdout.split('\n'):
+                line = line.strip()
+                
+                # 检测芯片名称
+                if line and not line.startswith('Adapter:') and not ':' in line and not line.startswith('temp'):
+                    current_chip = line.rstrip(':')
+                    result["sensors"][current_chip] = {}
+                    continue
+                
+                if line.startswith('Adapter:'):
+                    current_adapter = line.split(':', 1)[1].strip()
+                    if current_chip:
+                        result["sensors"][current_chip]["adapter"] = current_adapter
+                    continue
+                
+                # 解析温度输入值
+                if 'temp' in line and '_input' in line:
+                    match = re.match(r'(temp\d+)_input:\s*([\d.]+)', line)
+                    if match:
+                        temp_name = match.group(1)
+                        temp_value = safe_float(match.group(2))
+                        
+                        if current_chip:
+                            if temp_name not in result["sensors"][current_chip]:
+                                result["sensors"][current_chip][temp_name] = {}
+                            result["sensors"][current_chip][temp_name]["current"] = temp_value
+                            temperatures.append(temp_value)
+                
+                # 解析高温阈值
+                if 'temp' in line and '_max' in line:
+                    match = re.match(r'(temp\d+)_max:\s*([\d.]+)', line)
+                    if match:
+                        temp_name = match.group(1)
+                        temp_value = safe_float(match.group(2))
+                        if current_chip and temp_name in result["sensors"][current_chip]:
+                            result["sensors"][current_chip][temp_name]["high"] = temp_value
+                
+                # 解析临界温度
+                if 'temp' in line and '_crit' in line:
+                    match = re.match(r'(temp\d+)_crit:\s*([\d.]+)', line)
+                    if match:
+                        temp_name = match.group(1)
+                        temp_value = safe_float(match.group(2))
+                        if current_chip and temp_name in result["sensors"][current_chip]:
+                            result["sensors"][current_chip][temp_name]["critical"] = temp_value
+                            
+        except:
+            pass
+    
+    # 方法 2: 直接读取 thermal zone
+    if not temperatures:
+        try:
+            thermal_path = '/sys/class/thermal'
+            if os.path.exists(thermal_path):
+                for zone in os.listdir(thermal_path):
+                    if zone.startswith('thermal_zone'):
+                        zone_path = os.path.join(thermal_path, zone)
+                        
+                        # 读取类型
+                        type_file = os.path.join(zone_path, 'type')
+                        zone_type = 'unknown'
+                        if os.path.exists(type_file):
+                            with open(type_file, 'r') as f:
+                                zone_type = f.read().strip()
+                        
+                        # 读取温度 (单位是毫摄氏度)
+                        temp_file = os.path.join(zone_path, 'temp')
+                        if os.path.exists(temp_file):
+                            with open(temp_file, 'r') as f:
+                                temp_mc = safe_int(f.read().strip())
+                                temp_c = temp_mc / 1000.0
+                                
+                                if 'x86_pkg_temp' in zone_type or 'cpu' in zone_type.lower():
+                                    result["sensors"][zone] = {
+                                        "type": zone_type,
+                                        "current": temp_c
+                                    }
+                                    temperatures.append(temp_c)
+        except:
+            pass
+    
+    # 方法 3: 尝试从 hwmon 读取
+    if not temperatures:
+        try:
+            hwmon_path = '/sys/class/hwmon'
+            if os.path.exists(hwmon_path):
+                for hwmon in os.listdir(hwmon_path):
+                    hwmon_dir = os.path.join(hwmon_path, hwmon)
+                    
+                    # 读取名称
+                    name_file = os.path.join(hwmon_dir, 'name')
+                    if os.path.exists(name_file):
+                        with open(name_file, 'r') as f:
+                            name = f.read().strip()
+                    else:
+                        name = hwmon
+                    
+                    # 查找温度输入
+                    for file in os.listdir(hwmon_dir):
+                        if file.startswith('temp') and file.endswith('_input'):
+                            temp_file = os.path.join(hwmon_dir, file)
+                            with open(temp_file, 'r') as f:
+                                temp_mc = safe_int(f.read().strip())
+                                temp_c = temp_mc / 1000.0
+                                
+                                sensor_name = file.replace('_input', '')
+                                result["sensors"][f"{name}_{sensor_name}"] = {
+                                    "current": temp_c
+                                }
+                                temperatures.append(temp_c)
+        except:
+            pass
+    
+    # 计算平均温度
+    if temperatures:
+        result["current_c"] = round(sum(temperatures) / len(temperatures), 1)
+        result["max_c"] = round(max(temperatures), 1)
+        
+        # 检查温度警告
+        if result["max_c"] > 85:
+            result["status"] = "warning"
+            result["warning"] = f"CPU 温度过高: {result['max_c']}°C"
+    else:
+        result["status"] = "unknown"
+        result["warning"] = "无法获取 CPU 温度信息"
+    
+    return result
+
+
+def get_load_average() -> Dict[str, Any]:
+    """获取系统负载信息。"""
+    result = {}
+    
+    try:
+        with open('/proc/loadavg', 'r') as f:
+            load_data = f.read().strip().split()
+            
+        if len(load_data) >= 3:
+            result["1min"] = safe_float(load_data[0])
+            result["5min"] = safe_float(load_data[1])
+            result["15min"] = safe_float(load_data[2])
+            
+            # 获取 CPU 核心数以计算相对负载
+            num_cores = os.cpu_count() or 1
+            result["cores"] = num_cores
+            result["relative_1min"] = round(result["1min"] / num_cores, 2)
+            result["relative_5min"] = round(result["5min"] / num_cores, 2)
+            result["relative_15min"] = round(result["15min"] / num_cores, 2)
+            
+    except:
+        pass
+    
+    return result
+
+
+def check_mce_errors() -> Dict[str, Any]:
+    """检查 Machine Check Exception (MCE) 错误。"""
+    result = {
+        "count": 0,
+        "errors": [],
+        "status": "ok"
+    }
+    
+    # 方法 1: 检查 dmesg
+    if check_command_exists('dmesg'):
+        try:
+            _, stdout, _ = execute_command(
+                ['dmesg'],
+                check_returncode=False, timeout=10
+            )
+            
+            mce_keywords = ['Machine check events logged', 'Hardware Error', 'CMCI storm']
+            
+            for line in stdout.split('\n'):
+                for keyword in mce_keywords:
+                    if keyword in line:
+                        result["count"] += 1
+                        if len(result["errors"]) < 10:  # 限制错误数量
+                            result["errors"].append(line.strip())
+                        result["status"] = "warning"
+                        break
+                        
+        except:
+            pass
+    
+    # 方法 2: 检查 mcelog
+    if check_command_exists('mcelog'):
+        try:
+            # 尝试读取 mcelog 输出
+            _, stdout, _ = execute_command(
+                ['mcelog', '--client'],
+                check_returncode=False, timeout=5
+            )
+            
+            if stdout.strip() and 'no machine check' not in stdout.lower():
+                result["count"] += stdout.count('MCE')
+                result["status"] = "warning"
+                result["mcelog_available"] = True
+        except:
+            pass
+    
+    # 方法 3: 检查 /dev/mcelog
+    if os.path.exists('/dev/mcelog'):
+        result["mcelog_device"] = True
+    
+    return result
+
+
+@require_root
+def run_cpu_stress_test(duration: int = 300) -> Dict[str, Any]:
+    """
+    运行 CPU 压力测试。
+
+    Args:
+        duration: 测试持续时间（秒）
+
+    Returns:
+        Dict[str, Any]: 测试结果
+    """
+    result = {
+        "passed": False,
+        "duration_seconds": duration,
+        "cpu_cores": os.cpu_count() or 1,
+        "start_time": None,
+        "end_time": None,
+        "max_temperature": None,
+        "tool_used": None,
+        "errors": []
+    }
+    
+    # 使用 stress-ng 进行压力测试（首选）
+    if check_command_exists('stress-ng'):
+        result["tool_used"] = "stress-ng"
+        try:
+            result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
+            
+            # 获取测试前温度
+            temp_before = get_cpu_temperature()
+            
+            # 运行 stress-ng
+            # --cpu 0 使用所有 CPU 核心
+            # --timeout 指定超时时间
+            # --metrics-brief 输出简要指标
+            cmd = [
+                'stress-ng',
+                '--cpu', '0',
+                '--timeout', str(duration),
+                '--metrics-brief'
+            ]
+            
+            _, stdout, stderr = execute_command(
+                cmd,
+                timeout=duration + 30,  # 给一些额外时间
+                check_returncode=False
+            )
+            
+            result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
+            
+            # 获取测试后温度
+            temp_after = get_cpu_temperature()
+            
+            # 分析输出
+            output = stdout + stderr
+            
+            # 检查是否有错误
+            if 'error' in output.lower() or 'fail' in output.lower():
+                result["passed"] = False
+                result["errors"].append("压力测试过程中发现错误")
+            else:
+                result["passed"] = True
+            
+            # 提取性能指标
+            bogo_ops = re.search(r'stress-ng:\s+cpu:\s+(\d+)\s+bogo ops', output)
+            if bogo_ops:
+                result["bogo_ops"] = safe_int(bogo_ops.group(1))
+            
+            bogo_ops_per_sec = re.search(r'(\d+\.\d+)\s+bogo ops per second', output)
+            if bogo_ops_per_sec:
+                result["bogo_ops_per_second"] = safe_float(bogo_ops_per_sec.group(1))
+            
+            # 温度分析
+            if temp_after.get("max_c"):
+                result["max_temperature"] = temp_after["max_c"]
+                if temp_after["max_c"] > 95:
+                    result["warnings"] = [f"测试期间温度过高: {temp_after['max_c']}°C"]
+            
+            result["temperature_before"] = temp_before
+            result["temperature_after"] = temp_after
+            
+        except Exception as e:
+            result["passed"] = False
+            result["errors"].append(str(e))
+    
+    # 备选: 使用 stress
+    elif check_command_exists('stress'):
+        result["tool_used"] = "stress"
+        try:
+            result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
+            
+            temp_before = get_cpu_temperature()
+            
+            num_cores = os.cpu_count() or 1
+            _, stdout, stderr = execute_command(
+                ['stress', '--cpu', str(num_cores), '--timeout', str(duration)],
+                timeout=duration + 30,
+                check_returncode=False
+            )
+            
+            result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
+            temp_after = get_cpu_temperature()
+            
+            result["passed"] = True
+            result["temperature_before"] = temp_before
+            result["temperature_after"] = temp_after
+            
+            if temp_after.get("max_c"):
+                result["max_temperature"] = temp_after["max_c"]
+                
+        except Exception as e:
+            result["passed"] = False
+            result["errors"].append(str(e))
+    
+    else:
+        result["passed"] = False
+        result["errors"].append("未找到压力测试工具 (stress-ng 或 stress)")
+        result["note"] = "请安装 stress-ng 或 stress: yum install stress / apt install stress-ng"
+    
+    return result
+
+
+if __name__ == '__main__':
+    import json
+    print(json.dumps(run_cpu_check(stress_test=False), indent=2, ensure_ascii=False))
--- a/modules/gpu.py
+++ b/modules/gpu.py
@@ -0,0 +1,497 @@
+"""
+ServerGuard - 显卡检测模块
+
+检测 GPU 信息、温度、驱动状态等。
+"""
+
+import os
+import re
+from typing import Dict, Any, List, Optional
+
+import sys
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from utils import (
+    execute_command, check_command_exists, parse_key_value_output,
+    safe_int, safe_float, format_bytes
+)
+
+
+def run_gpu_check() -> Dict[str, Any]:
+    """
+    执行 GPU 检测。
+
+    Returns:
+        Dict[str, Any]: 检测结果
+    """
+    result = {
+        "status": "success",
+        "gpus": [],
+        "errors": []
+    }
+    
+    try:
+        # 检测 NVIDIA GPU
+        nvidia_gpus = check_nvidia_gpus()
+        if nvidia_gpus:
+            result["gpus"].extend(nvidia_gpus)
+        
+        # 检测 AMD GPU
+        amd_gpus = check_amd_gpus()
+        if amd_gpus:
+            result["gpus"].extend(amd_gpus)
+        
+        # 检测 Intel GPU
+        intel_gpus = check_intel_gpus()
+        if intel_gpus:
+            result["gpus"].extend(intel_gpus)
+        
+        # 如果没有找到 GPU，使用 lspci 基础检测
+        if not result["gpus"]:
+            result["gpus"] = check_generic_gpus()
+        
+        # 检查系统日志中的 GPU 错误
+        result["dmesg_errors"] = check_gpu_dmesg_errors()
+        
+        # 如果有错误，更新状态
+        if result["dmesg_errors"]:
+            result["status"] = "warning"
+        
+        if not result["gpus"]:
+            result["status"] = "unknown"
+            result["note"] = "未检测到 GPU 设备"
+        
+    except Exception as e:
+        result["status"] = "error"
+        result["error"] = str(e)
+    
+    return result
+
+
+def check_nvidia_gpus() -> List[Dict[str, Any]]:
+    """检测 NVIDIA GPU。"""
+    gpus = []
+    
+    if not check_command_exists('nvidia-smi'):
+        return gpus
+    
+    try:
+        # 获取 GPU 列表和基本信息
+        _, stdout, _ = execute_command(
+            ['nvidia-smi', '--query-gpu=gpu_name,gpu_bus_id,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current',
+             '--format=csv,noheader'],
+            check_returncode=False, timeout=10
+        )
+        
+        for i, line in enumerate(stdout.strip().split('\n')):
+            if not line.strip():
+                continue
+            
+            parts = [p.strip() for p in line.split(',')]
+            if len(parts) >= 4:
+                gpu_info = {
+                    "vendor": "NVIDIA",
+                    "index": i,
+                    "name": parts[0],
+                    "bus_id": parts[1] if len(parts) > 1 else "unknown",
+                    "pci_bus_id": parts[2] if len(parts) > 2 else "unknown",
+                    "driver_version": parts[3],
+                    "pstate": parts[4] if len(parts) > 4 else "unknown",
+                    "pcie_max_gen": parts[5] if len(parts) > 5 else "unknown",
+                    "pcie_current_gen": parts[6] if len(parts) > 6 else "unknown"
+                }
+                
+                # 获取详细信息
+                gpu_info.update(get_nvidia_gpu_details(i))
+                gpus.append(gpu_info)
+        
+    except Exception as e:
+        pass
+    
+    return gpus
+
+
+def get_nvidia_gpu_details(gpu_index: int) -> Dict[str, Any]:
+    """获取单个 NVIDIA GPU 的详细信息。"""
+    details = {}
+    
+    try:
+        # 获取温度和功耗
+        _, stdout, _ = execute_command(
+            ['nvidia-smi', '--query-gpu=temperature.gpu,power.draw,power.limit,clocks.gr,clocks.mem,utilization.gpu,utilization.memory,memory.total,memory.used,memory.free,serial,uuid,vbios_version',
+             '--format=csv,noheader,nounits', '-i', str(gpu_index)],
+            check_returncode=False, timeout=10
+        )
+        
+        parts = [p.strip() for p in stdout.split(',')]
+        if len(parts) >= 10:
+            details["temperature_c"] = safe_int(parts[0]) if parts[0] != '[Not Supported]' else None
+            details["power_draw_w"] = safe_float(parts[1]) if parts[1] != '[Not Supported]' else None
+            details["power_limit_w"] = safe_float(parts[2]) if parts[2] != '[Not Supported]' else None
+            details["graphics_clock_mhz"] = safe_int(parts[3]) if parts[3] != '[Not Supported]' else None
+            details["memory_clock_mhz"] = safe_int(parts[4]) if parts[4] != '[Not Supported]' else None
+            details["gpu_utilization_percent"] = safe_int(parts[5]) if parts[5] != '[Not Supported]' else None
+            details["memory_utilization_percent"] = safe_int(parts[6]) if parts[6] != '[Not Supported]' else None
+            details["memory_total_mb"] = safe_int(parts[7]) if parts[7] != '[Not Supported]' else None
+            details["memory_used_mb"] = safe_int(parts[8]) if parts[8] != '[Not Supported]' else None
+            details["memory_free_mb"] = safe_int(parts[9]) if parts[9] != '[Not Supported]' else None
+            
+            if len(parts) > 10:
+                details["serial"] = parts[10] if parts[10] != '[Not Supported]' else None
+            if len(parts) > 11:
+                details["uuid"] = parts[11] if parts[11] != '[Not Supported]' else None
+            if len(parts) > 12:
+                details["vbios_version"] = parts[12] if parts[12] != '[Not Supported]' else None
+        
+        # 获取 ECC 状态
+        _, ecc_output, _ = execute_command(
+            ['nvidia-smi', '--query-gpu=ecc.mode.current,ecc.mode.pending,ecc.errors.corrected.volatile.total,ecc.errors.uncorrected.volatile.total',
+             '--format=csv,noheader', '-i', str(gpu_index)],
+            check_returncode=False, timeout=10
+        )
+        
+        ecc_parts = [p.strip() for p in ecc_output.split(',')]
+        if len(ecc_parts) >= 4:
+            details["ecc_mode"] = ecc_parts[0] if ecc_parts[0] != '[Not Supported]' else None
+            details["ecc_pending"] = ecc_parts[1] if ecc_parts[1] != '[Not Supported]' else None
+            details["ecc_corrected_errors"] = safe_int(ecc_parts[2]) if ecc_parts[2] != '[Not Supported]' else 0
+            details["ecc_uncorrected_errors"] = safe_int(ecc_parts[3]) if ecc_parts[3] != '[Not Supported]' else 0
+        
+        # 获取进程信息
+        _, proc_output, _ = execute_command(
+            ['nvidia-smi', 'pmon', '-s', 'um', '-c', '1', '-i', str(gpu_index)],
+            check_returncode=False, timeout=5
+        )
+        
+        processes = []
+        for line in proc_output.split('\n')[2:]:  # 跳过表头
+            if line.strip() and not line.startswith('#'):
+                proc_parts = line.split()
+                if len(proc_parts) >= 5:
+                    processes.append({
+                        "pid": proc_parts[1],
+                        "type": proc_parts[2],
+                        "sm_util": proc_parts[3],
+                        "mem_util": proc_parts[4]
+                    })
+        
+        if processes:
+            details["processes"] = processes
+        
+    except:
+        pass
+    
+    return details
+
+
+def check_amd_gpus() -> List[Dict[str, Any]]:
+    """检测 AMD GPU。"""
+    gpus = []
+    
+    # 使用 radeontop 获取信息
+    if check_command_exists('radeontop'):
+        try:
+            # radeontop 需要图形环境，使用 -d 参数输出到文件
+            import tempfile
+            
+            with tempfile.NamedTemporaryFile(mode='r', suffix='.txt', delete=False) as f:
+                dump_file = f.name
+            
+            try:
+                _, stdout, _ = execute_command(
+                    ['radeontop', '-d', dump_file, '-l', '1'],
+                    check_returncode=False, timeout=5
+                )
+                
+                with open(dump_file, 'r') as f:
+                    output = f.read()
+                
+                gpu_info = {"vendor": "AMD"}
+                
+                # 解析 radeontop 输出
+                for line in output.split('\n'):
+                    if 'GPU' in line and ':' in line:
+                        parts = line.split(':')
+                        if len(parts) == 2:
+                            key = parts[0].strip().lower().replace(' ', '_')
+                            value = parts[1].strip()
+                            gpu_info[key] = value
+                
+                if gpu_info:
+                    gpus.append(gpu_info)
+                    
+            finally:
+                if os.path.exists(dump_file):
+                    os.unlink(dump_file)
+                    
+        except:
+            pass
+    
+    # 尝试从 sysfs 获取 AMD GPU 信息
+    try:
+        for card in os.listdir('/sys/class/drm'):
+            if card.startswith('card') and not card[-1].isdigit() or (card.startswith('card') and os.path.exists(f'/sys/class/drm/{card}/device/vendor')):
+                vendor_path = f'/sys/class/drm/{card}/device/vendor'
+                if os.path.exists(vendor_path):
+                    with open(vendor_path, 'r') as f:
+                        vendor_id = f.read().strip()
+                    
+                    # AMD vendor ID 是 0x1002
+                    if vendor_id == '0x1002':
+                        gpu_info = {
+                            "vendor": "AMD",
+                            "card": card
+                        }
+                        
+                        # 获取设备信息
+                        device_path = f'/sys/class/drm/{card}/device/device'
+                        if os.path.exists(device_path):
+                            with open(device_path, 'r') as f:
+                                gpu_info["device_id"] = f.read().strip()
+                        
+                        # 获取驱动
+                        driver_path = f'/sys/class/drm/{card}/device/driver'
+                        if os.path.exists(driver_path):
+                            driver = os.path.basename(os.readlink(driver_path))
+                            gpu_info["driver"] = driver
+                        
+                        # 获取温度
+                        temp_path = f'/sys/class/drm/{card}/device/hwmon/hwmon0/temp1_input'
+                        if os.path.exists(temp_path):
+                            with open(temp_path, 'r') as f:
+                                temp_mc = safe_int(f.read().strip())
+                                gpu_info["temperature_c"] = temp_mc / 1000.0
+                        
+                        # 获取频率
+                        freq_path = f'/sys/class/drm/{card}/device/pp_dpm_sclk'
+                        if os.path.exists(freq_path):
+                            with open(freq_path, 'r') as f:
+                                gpu_info["core_clock_levels"] = f.read().strip()
+                        
+                        gpus.append(gpu_info)
+                        
+    except:
+        pass
+    
+    return gpus
+
+
+def check_intel_gpus() -> List[Dict[str, Any]]:
+    """检测 Intel GPU。"""
+    gpus = []
+    
+    # 从 sysfs 获取 Intel GPU 信息
+    try:
+        for card in os.listdir('/sys/class/drm'):
+            if not card.startswith('card'):
+                continue
+            
+            vendor_path = f'/sys/class/drm/{card}/device/vendor'
+            if not os.path.exists(vendor_path):
+                continue
+            
+            with open(vendor_path, 'r') as f:
+                vendor_id = f.read().strip()
+            
+            # Intel vendor ID 是 0x8086
+            if vendor_id == '0x8086':
+                gpu_info = {
+                    "vendor": "Intel",
+                    "card": card
+                }
+                
+                # 获取设备信息
+                device_path = f'/sys/class/drm/{card}/device/device'
+                if os.path.exists(device_path):
+                    with open(device_path, 'r') as f:
+                        gpu_info["device_id"] = f.read().strip()
+                
+                # 获取驱动
+                driver_path = f'/sys/class/drm/{card}/device/driver'
+                if os.path.exists(driver_path):
+                    driver = os.path.basename(os.readlink(driver_path))
+                    gpu_info["driver"] = driver
+                
+                # Intel GPU 通常集成，标记为集成显卡
+                gpu_info["type"] = "integrated"
+                
+                gpus.append(gpu_info)
+                
+    except:
+        pass
+    
+    return gpus
+
+
+def check_generic_gpus() -> List[Dict[str, Any]]:
+    """使用 lspci 进行通用 GPU 检测。"""
+    gpus = []
+    
+    if not check_command_exists('lspci'):
+        return gpus
+    
+    try:
+        _, stdout, _ = execute_command(
+            ['lspci', '-nn'],
+            check_returncode=False, timeout=10
+        )
+        
+        for line in stdout.split('\n'):
+            if 'VGA' in line or '3D controller' in line or 'Display controller' in line:
+                parts = line.split(': ', 1)
+                if len(parts) == 2:
+                    bus_id = parts[0].split()[0]
+                    description = parts[1]
+                    
+                    gpu_info = {
+                        "bus_id": bus_id,
+                        "description": description
+                    }
+                    
+                    # 识别厂商
+                    desc_lower = description.lower()
+                    if 'nvidia' in desc_lower:
+                        gpu_info["vendor"] = "NVIDIA"
+                    elif 'amd' in desc_lower or 'ati' in desc_lower:
+                        gpu_info["vendor"] = "AMD"
+                    elif 'intel' in desc_lower:
+                        gpu_info["vendor"] = "Intel"
+                    else:
+                        gpu_info["vendor"] = "Unknown"
+                    
+                    # 识别类型
+                    if 'VGA' in line:
+                        gpu_info["type"] = "vga"
+                    elif '3D controller' in line:
+                        gpu_info["type"] = "3d"
+                    elif 'Display controller' in line:
+                        gpu_info["type"] = "display"
+                    
+                    # 获取详细信息
+                    try:
+                        _, detail, _ = execute_command(
+                            ['lspci', '-v', '-s', bus_id],
+                            check_returncode=False, timeout=5
+                        )
+                        
+                        # 提取驱动信息
+                        driver_match = re.search(r'Kernel driver in use:\s*(\S+)', detail)
+                        if driver_match:
+                            gpu_info["driver"] = driver_match.group(1)
+                        
+                        # 提取模块信息
+                        modules_match = re.search(r'Kernel modules:\s*(.+)', detail)
+                        if modules_match:
+                            gpu_info["modules"] = modules_match.group(1).strip()
+                            
+                    except:
+                        pass
+                    
+                    gpus.append(gpu_info)
+                    
+    except:
+        pass
+    
+    return gpus
+
+
+def check_gpu_dmesg_errors() -> List[Dict[str, str]]:
+    """检查 dmesg 中的 GPU 相关错误。"""
+    errors = []
+    
+    if not check_command_exists('dmesg'):
+        return errors
+    
+    try:
+        _, stdout, _ = execute_command(
+            ['dmesg'],
+            check_returncode=False, timeout=10
+        )
+        
+        # GPU 相关错误关键词
+        gpu_error_patterns = [
+            r'GPU has fallen off the bus',
+            r'NVRM: Xid',
+            r'nvidia.*error',
+            r'amdgpu.*error',
+            r'i915.*error',
+            r'GPU hang',
+            r'ring.*timeout',
+            r'Failed to load firmware',
+            r'VRAM lost',
+            r'gpu.*fault',
+            r' thermal ',
+        ]
+        
+        for line in stdout.split('\n'):
+            line_lower = line.lower()
+            
+            # 检查是否包含 GPU 相关错误
+            is_gpu_error = any(
+                re.search(pattern, line, re.IGNORECASE) 
+                for pattern in gpu_error_patterns
+            )
+            
+            if is_gpu_error and ('error' in line_lower or 'fail' in line_lower or 'warn' in line_lower or 'Xid' in line):
+                # 提取时间戳
+                timestamp_match = re.match(r'\[\s*([\d.]+)\]', line)
+                timestamp = timestamp_match.group(1) if timestamp_match else "unknown"
+                
+                errors.append({
+                    "timestamp": timestamp,
+                    "message": line.strip()
+                })
+        
+        # 去重并限制数量
+        seen = set()
+        unique_errors = []
+        for error in errors:
+            msg = error["message"]
+            if msg not in seen and len(unique_errors) < 20:
+                seen.add(msg)
+                unique_errors.append(error)
+        
+        return unique_errors
+        
+    except:
+        return []
+
+
+def get_gpu_processes() -> List[Dict[str, Any]]:
+    """获取使用 GPU 的进程列表（仅 NVIDIA）。"""
+    processes = []
+    
+    if not check_command_exists('nvidia-smi'):
+        return processes
+    
+    try:
+        _, stdout, _ = execute_command(
+            ['nvidia-smi', 'pmon', '-s', 'um', '-c', '1'],
+            check_returncode=False, timeout=5
+        )
+        
+        lines = stdout.strip().split('\n')
+        # 跳过前两行（表头）
+        for line in lines[2:]:
+            if line.strip() and not line.startswith('#'):
+                parts = line.split()
+                if len(parts) >= 8:
+                    processes.append({
+                        "gpu_index": safe_int(parts[0]),
+                        "pid": parts[1],
+                        "type": parts[2],
+                        "sm_util": parts[3],
+                        "mem_util": parts[4],
+                        "enc_util": parts[5],
+                        "dec_util": parts[6],
+                        "command": parts[7]
+                    })
+    except:
+        pass
+    
+    return processes
+
+
+if __name__ == '__main__':
+    import json
+    print(json.dumps(run_gpu_check(), indent=2, ensure_ascii=False))
--- a/modules/log_analyzer.py
+++ b/modules/log_analyzer.py
@@ -0,0 +1,553 @@
+"""
+ServerGuard - 日志分析模块
+
+自动分析系统日志，查找硬件相关错误关键词。
+"""
+
+import os
+import re
+import gzip
+from typing import Dict, Any, List, Optional
+from datetime import datetime, timedelta
+
+import sys
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from utils import execute_command, check_command_exists, safe_int
+
+
+# Hardware error keywords grouped by category.
+#
+# Maps a category name to a list of regular expressions. The analysers in
+# this module apply each expression to a log line with re.search() and
+# re.IGNORECASE, and use the category names as keys of their
+# "error_counts" dicts.
+HARDWARE_ERROR_PATTERNS = {
+    # CPU: machine-check events, thermal/throttling messages, CPU faults
+    "cpu_errors": [
+        r'Machine check events? logged',
+        r'Hardware Error',
+        r'CMCI storm',
+        r'machine check',
+        r'CPU\s*\d+.*temperature',
+        r'thermal.*cpu',
+        r'CPU.*throttl',
+        r'core.*temp',
+        r'CPU.*fault',
+        r'uncorrectable',
+        r'correctable.*error',
+    ],
+    # Memory: EDAC/ECC reports, parity errors, OOM and allocation failures
+    "memory_errors": [
+        r'Hardware error.*memory',
+        r'EDAC.*error',
+        r'memory.*error',
+        r'Memory.*parity',
+        r'ECC.*error',
+        r'ue\s+count',
+        r'ce\s+count',
+        r'Out of memory',
+        r'oom-kill',
+        r'page allocation failure',
+    ],
+    # Storage: block-layer I/O errors, ATA/SATA/NVMe errors, SMART failures
+    "storage_errors": [
+        r'I/O error',
+        r'Buffer I/O error',
+        r'blk_update_request',
+        r'ata\d+.*error',
+        r'SATA.*error',
+        r'NVMe.*error',
+        r'critical.*warning',
+        r'disk error',
+        r'block.*error',
+        r'SMART.*failure',
+        r'medium error',
+        r'uncorrectable error',
+    ],
+    # PCI / PCIe: AER reports and bus errors
+    "pci_errors": [
+        r'PCIe.*error',
+        r'pcieport.*error',
+        r'PCI.*error',
+        r'AER:\s*',
+        r'Corrected error',
+        r'Uncorrected error',
+        r'Non-Fatal error',
+        r'Fatal error',
+        r'Unsupported Request',
+    ],
+    # USB: controller errors, over-current, disconnects, timeouts
+    "usb_errors": [
+        r'usb.*error',
+        r'USB.*over-current',
+        r'usb.*disconnect',
+        r'usb.*timeout',
+        r'ehci.*error',
+        r'xhci.*error',
+    ],
+    # Power / thermal: shutdowns, critical temperatures, voltage problems
+    "power_errors": [
+        r'thermal.*shutdown',
+        r'critical.*temperature',
+        r'overheat',
+        r'power.*fail',
+        r'under.*voltage',
+        r'over.*voltage',
+        r'brownout',
+        r'power.*button',
+    ],
+    # Kernel: panics, lockups, oopses and related traces
+    "kernel_panics": [
+        r'Kernel panic',
+        r'sysrq.*trigger',
+        r'watchdog.*bug',
+        r'softlockup',
+        r'hardlockup',
+        r'BUG:.*spinlock',
+        r'BUG:.*scheduling',
+        r'Oops:',
+        r'Call Trace:',
+        r'general protection fault',
+        r'double fault',
+        r'stack.*corruption',
+    ]
+}
+
+
+def analyze_logs() -> Dict[str, Any]:
+    """
+    Scan the system logs for hardware-related errors.
+
+    Runs the dmesg and journalctl analysers, merges their per-category
+    counts, extracts the critical events and builds a short summary.
+
+    Returns:
+        Dict[str, Any]: The report. ``status`` is "success", "warning"
+        (at least one error was counted) or "error" (an exception was
+        raised; its text is stored under ``error``).
+    """
+    report = {
+        "status": "success",
+        "scan_time": datetime.now().isoformat(),
+        "dmesg_analysis": {},
+        "journal_analysis": {},
+        "hardware_errors": {},
+        "critical_events": [],
+        "summary": {}
+    }
+
+    try:
+        # Per-source analysis first; the aggregation steps read from it.
+        report["dmesg_analysis"] = analyze_dmesg()
+        report["journal_analysis"] = analyze_journalctl()
+
+        # Merge the counts, then pick out the events needing attention.
+        report["hardware_errors"] = summarize_errors(report)
+        report["critical_events"] = identify_critical_events(report)
+
+        error_total = sum(report["hardware_errors"].values())
+        found_any = error_total > 0
+        report["summary"] = {
+            "total_errors_found": error_total,
+            "critical_events": len(report["critical_events"]),
+            "recommend_action": found_any
+        }
+
+        # Any counted error downgrades the overall status to a warning.
+        if found_any:
+            report["status"] = "warning"
+
+    except Exception as exc:
+        report["status"] = "error"
+        report["error"] = str(exc)
+
+    return report
+
+
+def analyze_dmesg() -> Dict[str, Any]:
+    """
+    Classify the kernel ring buffer (dmesg) against HARDWARE_ERROR_PATTERNS.
+
+    Returns:
+        Dict[str, Any]: ``available`` flag, ``error_counts`` per category,
+        up to 50 distinct ``recent_errors`` samples and ``boot_errors``.
+        ``note`` is set when dmesg is missing, ``error`` when an exception
+        was raised.
+    """
+    analysis = {
+        "available": False,
+        "error_counts": {category: 0 for category in HARDWARE_ERROR_PATTERNS.keys()},
+        "recent_errors": [],
+        "boot_errors": []
+    }
+
+    if not check_command_exists('dmesg'):
+        analysis["note"] = "dmesg 不可用"
+        return analysis
+
+    counts = analysis["error_counts"]
+    samples = analysis["recent_errors"]
+
+    try:
+        # Prefer ISO timestamps.
+        _, output, _ = execute_command(
+            ['dmesg', '--time-format=iso'],
+            check_returncode=False, timeout=15
+        )
+
+        analysis["available"] = True
+
+        # Empty output is taken to mean --time-format is unsupported;
+        # retry with the plain format.
+        if not output.strip():
+            _, output, _ = execute_command(
+                ['dmesg'],
+                check_returncode=False, timeout=15
+            )
+
+        log_lines = output.split('\n')
+
+        for log_line in log_lines:
+            message = log_line.strip()
+            if not message:
+                continue
+
+            for category, regexes in HARDWARE_ERROR_PATTERNS.items():
+                # Only the first matching pattern of a category counts, so
+                # a line adds at most one hit per category.
+                matched = next(
+                    (rx for rx in regexes if re.search(rx, log_line, re.IGNORECASE)),
+                    None
+                )
+                if matched is None:
+                    continue
+
+                counts[category] += 1
+
+                # Keep a bounded, de-duplicated sample of the messages.
+                if len(samples) >= 50:
+                    continue
+                candidate = {
+                    "type": category,
+                    "message": message,
+                    "pattern": matched
+                }
+                if candidate not in samples:
+                    samples.append(candidate)
+
+        # Errors logged during the boot phase.
+        analysis["boot_errors"] = extract_boot_errors(log_lines)
+
+    except Exception as exc:
+        analysis["error"] = str(exc)
+
+    return analysis
+
+
+def analyze_journalctl() -> Dict[str, Any]:
+    """
+    Classify recent journalctl entries against HARDWARE_ERROR_PATTERNS.
+
+    Reads the last 1000 entries of priority ``err`` and above.
+
+    Returns:
+        Dict[str, Any]: ``available`` flag, ``error_counts`` per category,
+        up to 50 distinct ``recent_errors`` samples and ``boot_events``.
+        ``note`` is set when journalctl or its journal files are missing,
+        ``error`` when an exception was raised.
+    """
+    analysis = {
+        "available": False,
+        "error_counts": {category: 0 for category in HARDWARE_ERROR_PATTERNS.keys()},
+        "recent_errors": [],
+        "boot_events": []
+    }
+
+    if not check_command_exists('journalctl'):
+        analysis["note"] = "journalctl 不可用"
+        return analysis
+
+    counts = analysis["error_counts"]
+    samples = analysis["recent_errors"]
+
+    try:
+        _, output, err_output = execute_command(
+            ['journalctl', '-n', '1000', '--no-pager', '-p', 'err'],
+            check_returncode=False, timeout=15
+        )
+
+        if 'No journal files were found' in err_output:
+            analysis["note"] = "无 journal 文件"
+            return analysis
+
+        analysis["available"] = True
+
+        for log_line in output.split('\n'):
+            message = log_line.strip()
+            if not message:
+                continue
+
+            for category, regexes in HARDWARE_ERROR_PATTERNS.items():
+                # A line adds at most one hit per category.
+                if not any(re.search(rx, log_line, re.IGNORECASE) for rx in regexes):
+                    continue
+
+                counts[category] += 1
+
+                # Keep a bounded, de-duplicated sample of the messages.
+                if len(samples) >= 50:
+                    continue
+                candidate = {
+                    "type": category,
+                    "message": message
+                }
+                if candidate not in samples:
+                    samples.append(candidate)
+
+        # Warning-level events from the current boot.
+        analysis["boot_events"] = get_journal_boot_events()
+
+    except Exception as exc:
+        analysis["error"] = str(exc)
+
+    return analysis
+
+
+def extract_boot_errors(lines: List[str]) -> List[Dict[str, str]]:
+    """
+    Collect error-like messages logged during the boot phase.
+
+    The boot phase starts at a line containing "Linux version" or
+    "Command line:" and ends after a line containing both "systemd" and
+    "startup". Lines mentioning firmware, EFI or ACPI are skipped as
+    common non-critical noise.
+
+    Args:
+        lines: Log lines in chronological order.
+
+    Returns:
+        List[Dict[str, str]]: At most 20 ``{"stage": "boot", "message": ...}``
+        entries, in log order.
+    """
+    alert_words = ('error', 'fail', 'warn')
+    benign_words = ('firmware', 'efi', 'acpi')
+
+    collected = []
+    booting = False
+
+    for raw in lines:
+        # A kernel banner or command line marks the start of a boot.
+        if 'Linux version' in raw or 'Command line:' in raw:
+            booting = True
+
+        if not booting:
+            continue
+
+        lowered = raw.lower()
+        looks_bad = any(word in lowered for word in alert_words)
+        is_noise = any(word in lowered for word in benign_words)
+        if looks_bad and not is_noise:
+            collected.append({
+                "stage": "boot",
+                "message": raw.strip()
+            })
+
+        # The systemd startup message closes the boot window.
+        if 'systemd' in raw and 'startup' in raw:
+            booting = False
+
+    return collected[:20]  # cap the amount returned
+
+
+def get_journal_boot_events() -> List[Dict[str, str]]:
+    """
+    Return notable journalctl entries from the current boot.
+
+    Reads the current boot's journal at priority ``warning`` and above and
+    keeps lines mentioning "error", "fail" or "hardware" (case-insensitive).
+
+    Returns:
+        List[Dict[str, str]]: At most 20 ``{"message": ...}`` entries in
+        journal order; an empty list if the command fails.
+    """
+    limit = 20
+    keywords = ('error', 'fail', 'hardware')
+    events = []
+
+    try:
+        _, stdout, _ = execute_command(
+            ['journalctl', '-b', '0', '--no-pager', '-p', 'warning'],
+            check_returncode=False, timeout=10
+        )
+
+        for line in stdout.split('\n'):
+            lowered = line.lower()
+            if any(word in lowered for word in keywords):
+                events.append({"message": line.strip()})
+                # Only the first `limit` hits are returned; stop early.
+                if len(events) >= limit:
+                    break
+
+        return events
+
+    except Exception:
+        # Best effort, but let KeyboardInterrupt/SystemExit propagate
+        # (a bare `except:` would swallow them).
+        return []
+
+
+def summarize_errors(analysis_result: Dict[str, Any]) -> Dict[str, int]:
+    """
+    Merge the dmesg and journalctl error counts per category.
+
+    Args:
+        analysis_result: Report holding optional "dmesg_analysis" and
+            "journal_analysis" sections, each with an "error_counts" dict.
+
+    Returns:
+        Dict[str, int]: One total per HARDWARE_ERROR_PATTERNS category;
+        a count missing from either source is treated as 0.
+    """
+    from_dmesg = analysis_result.get("dmesg_analysis", {}).get("error_counts", {})
+    from_journal = analysis_result.get("journal_analysis", {}).get("error_counts", {})
+
+    return {
+        category: from_dmesg.get(category, 0) + from_journal.get(category, 0)
+        for category in HARDWARE_ERROR_PATTERNS.keys()
+    }
+
+
+def identify_critical_events(analysis_result: Dict[str, Any]) -> List[Dict[str, Any]]:
+    """
+    Pick out the events that need immediate attention.
+
+    Every sampled error from the dmesg and journalctl sections is tested
+    against a fixed list of critical patterns (case-insensitive); each
+    match yields one event, and exact duplicates are dropped.
+
+    Args:
+        analysis_result: Report holding optional "dmesg_analysis" and
+            "journal_analysis" sections, each with a "recent_errors" list.
+
+    Returns:
+        List[Dict[str, Any]]: Events with ``type``, ``description``,
+        ``message`` (first 200 characters) and ``source``.
+    """
+    # (regex, event type, human-readable description)
+    critical_patterns = [
+        (r'Kernel panic', 'kernel_panic', '内核崩溃'),
+        (r'hardlockup', 'hard_lockup', 'CPU 硬死锁'),
+        (r'softlockup', 'soft_lockup', 'CPU 软死锁'),
+        (r'thermal.*shutdown', 'thermal_shutdown', '过热关机'),
+        (r'Hardware Error', 'hardware_error', '硬件错误'),
+        (r'Fatal.*PCIe', 'pcie_fatal', 'PCIe 致命错误'),
+        (r'I/O error.*sector', 'disk_io_error', '磁盘 I/O 错误'),
+        (r'Uncorrectable.*error', 'uncorrectable_error', '不可纠正错误'),
+        (r'out of memory.*kill', 'oom_kill', 'OOM 进程杀死'),
+        (r'GPU.*fallen.*bus', 'gpu_disconnect', 'GPU 断开连接'),
+    ]
+
+    dmesg_errors = analysis_result.get("dmesg_analysis", {}).get("recent_errors", [])
+    journal_errors = analysis_result.get("journal_analysis", {}).get("recent_errors", [])
+
+    found = []
+    for record in list(dmesg_errors) + list(journal_errors):
+        text = record.get("message", "")
+        # An entry equal to one of the dmesg samples is attributed to dmesg.
+        origin = "dmesg" if record in dmesg_errors else "journal"
+
+        for regex, kind, label in critical_patterns:
+            if not re.search(regex, text, re.IGNORECASE):
+                continue
+
+            candidate = {
+                "type": kind,
+                "description": label,
+                "message": text[:200],  # cap the length
+                "source": origin
+            }
+
+            # Skip exact duplicates.
+            if candidate not in found:
+                found.append(candidate)
+
+    return found
+
+
+def get_kernel_panic_logs() -> List[Dict[str, str]]:
+    """
+    Search dmesg and the kernel journal for kernel panic messages.
+
+    dmesg lines containing "Kernel panic" or (case-insensitively) "sysrq"
+    are collected, followed by kernel journal lines containing "panic".
+
+    Returns:
+        List[Dict[str, str]]: ``{"source": ..., "message": ...}`` entries,
+        dmesg hits first. A source that is missing or fails is skipped.
+    """
+    panics = []
+
+    # dmesg
+    if check_command_exists('dmesg'):
+        try:
+            _, stdout, _ = execute_command(['dmesg'], check_returncode=False, timeout=10)
+
+            for line in stdout.split('\n'):
+                if 'Kernel panic' in line or 'sysrq' in line.lower():
+                    panics.append({
+                        "source": "dmesg",
+                        "message": line.strip()
+                    })
+        except Exception:
+            # Best effort per source; unlike a bare `except:` this lets
+            # KeyboardInterrupt/SystemExit through.
+            pass
+
+    # journalctl (kernel messages only, pre-filtered with -g)
+    if check_command_exists('journalctl'):
+        try:
+            _, stdout, _ = execute_command(
+                ['journalctl', '-k', '--no-pager', '-g', 'panic'],
+                check_returncode=False, timeout=10
+            )
+
+            for line in stdout.split('\n'):
+                if 'panic' in line.lower():
+                    panics.append({
+                        "source": "journalctl",
+                        "message": line.strip()
+                    })
+        except Exception:
+            pass
+
+    return panics
+
+
+def get_hardware_error_logs() -> Dict[str, List[str]]:
+    """
+    Collect dmesg lines for a few specific hardware error classes.
+
+    Each line is tested (case-insensitively) against one expression per
+    class and may be recorded under several classes.
+
+    Returns:
+        Dict[str, List[str]]: Keys ``mce_errors``, ``ecc_errors``,
+        ``io_errors`` and ``thermal_errors``, each holding at most the
+        first 20 matching lines. All lists are empty when dmesg is missing
+        or fails.
+    """
+    # (result key, expression) in the order the classes are checked
+    classes = (
+        ("mce_errors", r'Machine check|CMCI|hardware error'),
+        ("ecc_errors", r'ECC|EDAC|memory error'),
+        ("io_errors", r'I/O error|ata.*error|blk_update'),
+        ("thermal_errors", r'thermal|overheat|critical temp'),
+    )
+    result = {key: [] for key, _ in classes}
+
+    if check_command_exists('dmesg'):
+        try:
+            _, stdout, _ = execute_command(['dmesg'], check_returncode=False, timeout=10)
+
+            for line in stdout.split('\n'):
+                for key, expression in classes:
+                    if re.search(expression, line, re.IGNORECASE):
+                        result[key].append(line.strip())
+        except Exception:
+            # Best effort; unlike a bare `except:` this lets
+            # KeyboardInterrupt/SystemExit through.
+            pass
+
+    # Cap the amount returned per class.
+    for key in result:
+        result[key] = result[key][:20]
+
+    return result
+
+
+def search_logs_by_keyword(keyword: str, max_lines: int = 100) -> List[str]:
+    """
+    Search dmesg and the journal for a keyword (case-insensitive).
+
+    dmesg is searched first, then the last ``max_lines * 2`` journal
+    entries; the search stops as soon as ``max_lines`` hits are collected.
+
+    Args:
+        keyword: Text to look for.
+        max_lines: Maximum number of lines to return; values below 1
+            yield an empty list.
+
+    Returns:
+        List[str]: Matching lines prefixed with "[dmesg] " or "[journal] ".
+    """
+    results = []
+
+    # Previously a non-positive limit still returned the first hit.
+    if max_lines <= 0:
+        return results
+
+    needle = keyword.lower()
+
+    # dmesg
+    if check_command_exists('dmesg'):
+        try:
+            _, stdout, _ = execute_command(
+                ['dmesg'],
+                check_returncode=False, timeout=10
+            )
+
+            for line in stdout.split('\n'):
+                if needle in line.lower():
+                    results.append(f"[dmesg] {line.strip()}")
+                    if len(results) >= max_lines:
+                        return results
+        except Exception:
+            # Best effort per source; unlike a bare `except:` this lets
+            # KeyboardInterrupt/SystemExit through.
+            pass
+
+    # journalctl
+    if check_command_exists('journalctl'):
+        try:
+            _, stdout, _ = execute_command(
+                ['journalctl', '-n', str(max_lines * 2), '--no-pager'],
+                check_returncode=False, timeout=10
+            )
+
+            for line in stdout.split('\n'):
+                if needle in line.lower():
+                    results.append(f"[journal] {line.strip()}")
+                    if len(results) >= max_lines:
+                        return results
+        except Exception:
+            pass
+
+    return results
+
+
+def get_system_logs(since: Optional[str] = None, until: Optional[str] = None) -> Dict[str, Any]:
+    """
+    Fetch raw system logs from dmesg, journalctl and /var/log/kern.log.
+
+    Args:
+        since: Start time passed to ``journalctl --since``
+            (format: '2024-01-01 00:00:00'); applies to the journal only.
+        until: End time passed to ``journalctl --until``; applies to the
+            journal only.
+
+    Returns:
+        Dict[str, Any]: Keys ``dmesg``, ``journalctl`` and ``kern_log``,
+        each holding the log text, or "" when that source is missing or
+        could not be read. The journal is limited to 5000 entries and
+        kern.log to its last 5000 lines.
+    """
+    from collections import deque
+
+    result = {
+        "dmesg": "",
+        "journalctl": "",
+        "kern_log": ""
+    }
+
+    # dmesg
+    if check_command_exists('dmesg'):
+        try:
+            _, stdout, _ = execute_command(['dmesg'], check_returncode=False, timeout=10)
+            result["dmesg"] = stdout
+        except Exception:
+            # Best effort per source; unlike a bare `except:` this lets
+            # KeyboardInterrupt/SystemExit through.
+            pass
+
+    # journalctl
+    if check_command_exists('journalctl'):
+        try:
+            cmd = ['journalctl', '--no-pager', '-n', '5000']
+            if since:
+                cmd.extend(['--since', since])
+            if until:
+                cmd.extend(['--until', until])
+
+            _, stdout, _ = execute_command(cmd, check_returncode=False, timeout=15)
+            result["journalctl"] = stdout
+        except Exception:
+            pass
+
+    # /var/log/kern.log
+    kern_log_path = '/var/log/kern.log'
+    if os.path.exists(kern_log_path):
+        try:
+            with open(kern_log_path, 'r', encoding='utf-8', errors='ignore') as f:
+                # A bounded deque keeps only the last 5000 lines in memory
+                # instead of loading the whole file.
+                tail = deque(f, maxlen=5000)
+            result["kern_log"] = ''.join(tail)
+        except OSError:
+            # Unreadable file (permissions, race with log rotation).
+            pass
+
+    return result
+
+
+if __name__ == '__main__':
+    # Manual entry point: run the log analysis and dump the report as JSON.
+    import json
+    log_report = analyze_logs()
+    print(json.dumps(log_report, indent=2, ensure_ascii=False))
--- a/modules/memory.py
+++ b/modules/memory.py
@@ -0,0 +1,577 @@
+"""
+ServerGuard - 内存检测与压力测试模块
+
+深度检测内存的读写错误和稳定性。
+"""
+
+import os
+import re
+import time
+from typing import Dict, Any, List, Optional
+
+import sys
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from utils import (
+    execute_command, check_command_exists, safe_int, safe_float,
+    format_bytes, require_root
+)
+
+
+def run_memory_check(stress_test: bool = False, stress_duration: int = 300) -> Dict[str, Any]:
+    """
+    Run the memory checks and, optionally, a stress test.
+
+    Args:
+        stress_test: Whether to run a memory stress test.
+        stress_duration: Stress test duration in seconds.
+
+    Returns:
+        Dict[str, Any]: The report. ``status`` is "success", "warning"
+        (EDAC errors were counted) or "error" (the stress test did not
+        pass, or an exception was raised and stored under ``error``).
+    """
+    report = {
+        "status": "success",
+        "summary": {},
+        "dimm_info": [],
+        "ecc_status": {},
+        "edac_errors": {},
+        "stress_test": {}
+    }
+
+    try:
+        # Passive information gathering.
+        report["summary"] = get_memory_summary()
+        report["dimm_info"] = get_dimm_info()
+        report["ecc_status"] = check_ecc_status()
+
+        # Any EDAC error count downgrades the status to a warning.
+        report["edac_errors"] = check_edac_errors()
+        if report["edac_errors"].get("total_errors", 0) > 0:
+            report["status"] = "warning"
+
+        if stress_test:
+            # Tools in order of preference; the first installed one runs.
+            candidates = (
+                ('memtester', run_memtester),
+                ('stress-ng', run_memory_stress_ng),
+                ('stress', run_memory_stress),
+            )
+            for tool_name, runner in candidates:
+                if check_command_exists(tool_name):
+                    report["stress_test"] = runner(stress_duration)
+                    break
+            else:
+                report["stress_test"] = {
+                    "passed": False,
+                    "error": "未找到内存压力测试工具 (memtester/stress-ng/stress)"
+                }
+
+            # A stress test that did not pass marks the whole check failed.
+            if not report["stress_test"].get("passed", False):
+                report["status"] = "error"
+
+    except Exception as exc:
+        report["status"] = "error"
+        report["error"] = str(exc)
+
+    return report
+
+
+def get_memory_summary() -> Dict[str, Any]:
+    """
+    Summarise memory and swap usage from /proc/meminfo.
+
+    Returns:
+        Dict[str, Any]: Sizes in bytes (``*_bytes``) with matching
+        gigabyte values (``*_gb``, rounded to 2 decimals), plus
+        ``usage_percent`` when the total is non-zero. ``error`` holds the
+        exception text if reading or parsing failed.
+    """
+    gib = 1024 ** 3
+
+    summary = {
+        "total_bytes": 0,
+        "total_gb": 0,
+        "available_bytes": 0,
+        "available_gb": 0,
+        "used_bytes": 0,
+        "used_gb": 0,
+        "free_bytes": 0,
+        "free_gb": 0,
+        "buffers_bytes": 0,
+        "cached_bytes": 0,
+        "swap_total_bytes": 0,
+        "swap_used_bytes": 0,
+        "swap_free_bytes": 0
+    }
+
+    # Result key -> expression capturing the numeric meminfo value
+    # (multiplied by 1024 below to obtain bytes).
+    field_patterns = {
+        "total_bytes": r'MemTotal:\s+(\d+)',
+        "free_bytes": r'MemFree:\s+(\d+)',
+        "available_bytes": r'MemAvailable:\s+(\d+)',
+        "buffers_bytes": r'Buffers:\s+(\d+)',
+        "cached_bytes": r'Cached:\s+(\d+)',
+        "swap_total_bytes": r'SwapTotal:\s+(\d+)',
+        "swap_free_bytes": r'SwapFree:\s+(\d+)'
+    }
+
+    try:
+        with open('/proc/meminfo', 'r') as handle:
+            raw = handle.read()
+
+        for field, regex in field_patterns.items():
+            found = re.search(regex, raw)
+            if not found:
+                continue
+            size = safe_int(found.group(1)) * 1024
+            summary[field] = size
+            # Store the gigabyte twin under the matching *_gb key.
+            summary[field.replace('bytes', 'gb')] = round(size / gib, 2)
+
+        # Used = total minus free, buffers and page cache.
+        used = (summary["total_bytes"] - summary["free_bytes"]
+                - summary["buffers_bytes"] - summary["cached_bytes"])
+        summary["used_bytes"] = used
+        summary["used_gb"] = round(used / gib, 2)
+
+        # Swap usage.
+        summary["swap_used_bytes"] = summary["swap_total_bytes"] - summary["swap_free_bytes"]
+        summary["swap_used_gb"] = round(summary["swap_used_bytes"] / gib, 2)
+        summary["swap_free_gb"] = round(summary["swap_free_bytes"] / gib, 2)
+
+        # Usage percentage; skipped when the total is unknown.
+        if summary["total_bytes"] > 0:
+            summary["usage_percent"] = round((summary["used_bytes"] / summary["total_bytes"]) * 100, 1)
+
+    except Exception as exc:
+        summary["error"] = str(exc)
+
+    return summary
+
+
+def get_dimm_info() -> List[Dict[str, Any]]:
+    """
+    Return details for every populated DIMM reported by dmidecode.
+
+    Parses the "Memory Device" sections of ``dmidecode -t memory``.
+    Placeholder values such as "Not Specified" or "Unknown" are left out,
+    and slots whose size reads "No Module Installed" are skipped entirely.
+
+    Returns:
+        List[Dict[str, Any]]: One dict per populated slot with the raw
+        attribute strings plus parsed ``size_mb``/``size_gb`` and
+        ``speed_mts``/``speed_mhz`` where available. Empty when dmidecode
+        is missing or fails.
+    """
+    dimms = []
+
+    if not check_command_exists('dmidecode'):
+        return dimms
+
+    # Attribute key -> expression capturing its value in a device section.
+    patterns = {
+        "array_handle": r'Array Handle:\s*(\S+)',
+        "error_handle": r'Error Information Handle:\s*(\S+)',
+        "total_width": r'Total Width:\s*(\d+)',
+        "data_width": r'Data Width:\s*(\d+)',
+        "size": r'Size:\s*(.*)',
+        "form_factor": r'Form Factor:\s*(\S+)',
+        "set": r'Set:\s*(\S+)',
+        "locator": r'Locator:\s*(.+)',
+        "bank_locator": r'Bank Locator:\s*(.+)',
+        "type": r'Type:\s*(\S+)',
+        "type_detail": r'Type Detail:\s*(.+)',
+        "speed": r'Speed:\s*(.*)',
+        "manufacturer": r'Manufacturer:\s*(\S+)',
+        "serial_number": r'Serial Number:\s*(\S+)',
+        "asset_tag": r'Asset Tag:\s*(\S+)',
+        "part_number": r'Part Number:\s*(\S+)',
+        "rank": r'Rank:\s*(\d+)',
+        "configured_speed": r'Configured Memory Speed:\s*(.*)',
+        "minimum_voltage": r'Minimum Voltage:\s*(.+)',
+        "maximum_voltage": r'Maximum Voltage:\s*(.+)',
+        "configured_voltage": r'Configured Voltage:\s*(.+)'
+    }
+    # Values that carry no information and are therefore not stored.
+    placeholders = (
+        'Not Specified', 'To be filled by O.E.M.', 'None',
+        'No Module Installed', 'Unknown'
+    )
+
+    try:
+        _, stdout, _ = execute_command(
+            ['dmidecode', '-t', 'memory'],
+            check_returncode=False, timeout=15
+        )
+
+        # The text before the first "Memory Device" heading is preamble.
+        for device in stdout.split('Memory Device')[1:]:
+            # Detect empty slots on the raw size value. Checking after the
+            # placeholder filter never fired, because "No Module Installed"
+            # is itself filtered out, so empty slots were reported as DIMMs.
+            size_match = re.search(patterns["size"], device, re.IGNORECASE)
+            if size_match and 'No Module' in size_match.group(1):
+                continue
+
+            dimm = {}
+            for key, pattern in patterns.items():
+                match = re.search(pattern, device, re.IGNORECASE)
+                if match:
+                    value = match.group(1).strip()
+                    if value not in placeholders:
+                        dimm[key] = value
+
+            # Parse the size into numeric fields.
+            size_str = dimm.get('size', '')
+            if 'MB' in size_str:
+                dimm["size_mb"] = safe_int(size_str.replace('MB', '').strip())
+            elif 'GB' in size_str:
+                dimm["size_gb"] = safe_float(size_str.replace('GB', '').strip())
+                dimm["size_mb"] = int(dimm["size_gb"] * 1024)
+
+            # Parse the speed into a numeric field.
+            speed_str = dimm.get('speed', '')
+            if 'MT/s' in speed_str:
+                dimm["speed_mts"] = safe_int(speed_str.replace('MT/s', '').strip())
+            elif 'MHz' in speed_str:
+                dimm["speed_mhz"] = safe_int(speed_str.replace('MHz', '').strip())
+
+            if dimm:
+                dimms.append(dimm)
+
+    except Exception:
+        # Best effort: return whatever was parsed before the failure.
+        pass
+
+    return dimms
+
+
+def check_ecc_status() -> Dict[str, Any]:
+    """
+    Check the ECC (error-correcting code) memory status.
+
+    Combines three sources:
+      1. /proc/meminfo ``HardwareCorrupted`` (kB) is stored under ``errors``.
+      2. ``dmidecode -t memory`` "Error Correction Type" gives ``mode``;
+         any value other than "None"/"Unknown" marks ECC as supported and
+         enabled.
+      3. EDAC sysfs counters give ``correctable_errors`` and
+         ``uncorrectable_errors``; a registered memory controller also
+         marks ECC as supported.
+
+    Returns:
+        Dict[str, Any]: ``supported``, ``enabled``, ``mode``, ``errors``
+        and, when the EDAC directory exists, ``edac_available`` plus the
+        summed error counters.
+    """
+    result = {
+        "supported": False,
+        "enabled": False,
+        "mode": "unknown",
+        "errors": 0
+    }
+
+    # Source 1: /proc/meminfo. The HardwareCorrupted field is exposed by
+    # the kernel whether or not the DIMMs are ECC, so it is recorded as an
+    # error figure only and no longer sets "supported".
+    try:
+        with open('/proc/meminfo', 'r') as f:
+            content = f.read()
+
+        match = re.search(r'HardwareCorrupted:\s+(\d+)\s+kB', content)
+        if match:
+            result["errors"] = safe_int(match.group(1))
+    except OSError:
+        pass
+
+    # Source 2: the memory array's error-correction type from dmidecode.
+    # dmidecode prints an "Error Correction Type:" line even for non-ECC
+    # systems, so its mere presence proves nothing; the value decides.
+    if check_command_exists('dmidecode'):
+        try:
+            _, stdout, _ = execute_command(
+                ['dmidecode', '-t', 'memory'],
+                check_returncode=False, timeout=10
+            )
+
+            match = re.search(r'Error Correction Type:\s*(.+)', stdout)
+            if match:
+                result["mode"] = match.group(1).strip()
+                has_ecc = result["mode"] not in ('None', 'Unknown')
+                result["enabled"] = has_ecc
+                if has_ecc:
+                    result["supported"] = True
+
+        except Exception:
+            # Best effort; lets KeyboardInterrupt/SystemExit through.
+            pass
+
+    # Source 3: EDAC counters, summed over all memory controllers.
+    edac_path = '/sys/devices/system/edac/mc'
+    if os.path.exists(edac_path):
+        result["edac_available"] = True
+        try:
+            for mc in os.listdir(edac_path):
+                if not mc.startswith('mc'):
+                    continue
+
+                # A registered EDAC memory controller implies ECC reporting.
+                result["supported"] = True
+                mc_path = os.path.join(edac_path, mc)
+                ce_file = os.path.join(mc_path, 'ce_count')  # correctable errors
+                ue_file = os.path.join(mc_path, 'ue_count')  # uncorrectable errors
+
+                if os.path.exists(ce_file):
+                    with open(ce_file, 'r') as f:
+                        ce_count = safe_int(f.read().strip())
+                    result["correctable_errors"] = result.get("correctable_errors", 0) + ce_count
+
+                if os.path.exists(ue_file):
+                    with open(ue_file, 'r') as f:
+                        ue_count = safe_int(f.read().strip())
+                    result["uncorrectable_errors"] = result.get("uncorrectable_errors", 0) + ue_count
+        except OSError:
+            pass
+
+    return result
+
+
+def check_edac_errors() -> Dict[str, Any]:
+    """
+    Read the EDAC (Error Detection and Correction) error counters.
+
+    Walks every ``mc*`` entry under /sys/devices/system/edac/mc and sums
+    its correctable and uncorrectable error counts.
+
+    Returns:
+        Dict[str, Any]: Totals plus a ``memory_controllers`` list with the
+        per-controller counts and attributes. ``note`` is set when EDAC is
+        unavailable, ``error`` when reading failed.
+    """
+    report = {
+        "total_errors": 0,
+        "correctable_errors": 0,
+        "uncorrectable_errors": 0,
+        "memory_controllers": []
+    }
+
+    edac_root = '/sys/devices/system/edac/mc'
+
+    if not os.path.exists(edac_root):
+        report["note"] = "EDAC 不可用"
+        return report
+
+    # (sysfs file, result key) pairs, read in this order
+    counters = (
+        ('ce_count', 'correctable_errors'),
+        ('ue_count', 'uncorrectable_errors'),
+    )
+    # Descriptive attributes copied verbatim when present.
+    attributes = ['mc_name', 'size_mb', 'mem_type', 'edac_mc_mode']
+
+    try:
+        for entry in os.listdir(edac_root):
+            if not entry.startswith('mc'):
+                continue
+
+            controller_dir = os.path.join(edac_root, entry)
+            controller = {"name": entry}
+
+            for filename, field in counters:
+                counter_path = os.path.join(controller_dir, filename)
+                if os.path.exists(counter_path):
+                    with open(counter_path, 'r') as fh:
+                        value = safe_int(fh.read().strip())
+                        controller[field] = value
+                        report[field] += value
+
+            for attribute in attributes:
+                attribute_path = os.path.join(controller_dir, attribute)
+                if os.path.exists(attribute_path):
+                    with open(attribute_path, 'r') as fh:
+                        controller[attribute] = fh.read().strip()
+
+            report["memory_controllers"].append(controller)
+
+        report["total_errors"] = report["correctable_errors"] + report["uncorrectable_errors"]
+
+    except Exception as exc:
+        report["error"] = str(exc)
+
+    return report
+
+
+@require_root
+def run_memtester(duration: int = 300) -> Dict[str, Any]:
+    """
+    Run a memory stress test with memtester.
+
+    Tests 70% of MemAvailable (at least 64 MB; 256 MB when the value
+    cannot be read) for a single iteration.
+
+    Args:
+        duration: Test duration in seconds. Not used by this function:
+            memtester is driven by memory size, not by time.
+
+    Returns:
+        Dict[str, Any]: ``passed`` flag, tested ``size_mb``, timing
+        fields, collected ``errors``, the recognised ``tests_run`` and the
+        first 2000 characters of output under ``raw_output``.
+    """
+    outcome = {
+        "passed": False,
+        "size_mb": 0,
+        "iterations": 1,
+        "start_time": None,
+        "end_time": None,
+        "duration_seconds": 0,
+        "errors": [],
+        "tests_run": []
+    }
+
+    if not check_command_exists('memtester'):
+        outcome["errors"].append("memtester 未安装")
+        return outcome
+
+    # memtester sub-test names looked for in the output.
+    known_tests = [
+        'Stuck Address', 'Random Value', 'Compare XOR',
+        'Compare SUB', 'Compare MUL', 'Compare DIV',
+        'Compare OR', 'Compare AND', 'Sequential Increment',
+        'Solid Bits', 'Block Sequential', 'Checkerboard',
+        'Bit Spread', 'Bit Flip', 'Walking Ones', 'Walking Zeroes'
+    ]
+
+    try:
+        # Size the test from the available memory, leaving headroom for
+        # the rest of the system.
+        with open('/proc/meminfo', 'r') as handle:
+            meminfo = handle.read()
+
+        available = re.search(r'MemAvailable:\s+(\d+)', meminfo)
+        if available:
+            available_mb = safe_int(available.group(1)) // 1024
+            size_mb = max(64, int(available_mb * 0.7))
+        else:
+            size_mb = 256
+
+        outcome["size_mb"] = size_mb
+        outcome["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
+        started = time.time()
+
+        _, stdout, stderr = execute_command(
+            ['memtester', f'{size_mb}M', '1'],
+            timeout=max(300, size_mb),  # timeout grows with the tested size
+            check_returncode=False
+        )
+
+        outcome["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
+        outcome["duration_seconds"] = round(time.time() - started, 2)
+
+        output = stdout + stderr
+        outcome["raw_output"] = output[:2000]  # keep part of the raw output
+
+        upper_out = output.upper()
+        lower_out = output.lower()
+
+        if 'FAILURE' in upper_out:
+            # Any failure marker fails the run; keep the offending lines.
+            outcome["passed"] = False
+            for text in output.split('\n'):
+                if 'FAILURE' in text.upper() or 'error' in text.lower():
+                    outcome["errors"].append(text.strip())
+        elif 'SUCCESS' in upper_out or 'ok' in lower_out or 'finished' in lower_out:
+            outcome["passed"] = True
+        elif 'Done' in output:
+            # No explicit success marker, but the run reports completion.
+            outcome["passed"] = True
+        else:
+            outcome["passed"] = False
+            outcome["errors"].append("测试可能未完成")
+
+        # Record which sub-tests appear in the output.
+        outcome["tests_run"].extend(name for name in known_tests if name in output)
+
+    except Exception as exc:
+        outcome["passed"] = False
+        outcome["errors"].append(str(exc))
+
+    return outcome
+
+
+@require_root
+def run_memory_stress_ng(duration: int = 300) -> Dict[str, Any]:
+    """
+    Run a memory stress test with stress-ng.
+
+    Starts 4 ``vm`` workers with ``--vm-bytes 80%`` and all vm methods for
+    ``duration`` seconds.
+
+    Args:
+        duration: Test duration in seconds.
+
+    Returns:
+        Dict[str, Any]: ``passed`` flag, ``tool``, timing fields,
+        ``errors`` (output lines that indicated a failure, or an exception
+        message) and ``bogo_ops`` when it could be parsed.
+    """
+    result = {
+        "passed": False,
+        "tool": "stress-ng",
+        "duration_seconds": duration,
+        "start_time": None,
+        "end_time": None,
+        "errors": []
+    }
+
+    if not check_command_exists('stress-ng'):
+        result["errors"].append("stress-ng 未安装")
+        return result
+
+    try:
+        result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
+
+        cmd = [
+            'stress-ng',
+            '--vm', '4',  # 4 vm workers
+            '--vm-bytes', '80%',  # memory size expressed as a percentage
+            '--vm-method', 'all',  # cycle through every vm test method
+            '--timeout', str(duration),
+            '--metrics-brief'
+        ]
+
+        _, stdout, stderr = execute_command(
+            cmd,
+            timeout=duration + 30,
+            check_returncode=False
+        )
+
+        result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
+
+        output = stdout + stderr
+
+        # Scan line by line. Recent stress-ng releases print a
+        # "failed: 0" summary line on a clean run, which a plain substring
+        # search for "fail" would misread as a failure.
+        failure_lines = []
+        for line in output.split('\n'):
+            lowered = line.lower()
+            if re.search(r'\bfailed:\s*0\b', lowered):
+                continue
+            if 'error' in lowered or 'fail' in lowered:
+                failure_lines.append(line.strip())
+
+        result["passed"] = not failure_lines
+        result["errors"].extend(failure_lines[:20])
+
+        # Extract the bogo-ops metric: try the legacy sentence form first,
+        # then the tabular metrics row ("stress-ng: info: [pid] vm <ops>").
+        bogo_ops = (
+            re.search(r'stress-ng:\s+vm:\s+(\d+)\s+bogo ops', output)
+            or re.search(r'stress-ng:\s+\w+:\s+\[\d+\]\s+vm\s+(\d+)\s', output)
+        )
+        if bogo_ops:
+            result["bogo_ops"] = safe_int(bogo_ops.group(1))
+
+    except Exception as e:
+        result["passed"] = False
+        result["errors"].append(str(e))
+
+    return result
+
+
+@require_root
+def run_memory_stress(duration: int = 300) -> Dict[str, Any]:
+    """
+    Run a memory stress test with the classic ``stress`` tool (fallback).
+
+    Starts 4 vm workers that together allocate about 80% of MemAvailable
+    (256 MB per worker when that value cannot be read).
+
+    Args:
+        duration: Test duration in seconds.
+
+    Returns:
+        Dict[str, Any]: ``passed`` flag, ``tool``, timing fields,
+        ``workers``, ``vm_bytes_per_worker_mb`` and ``errors`` (output
+        lines that indicated a failure, or an exception message).
+    """
+    workers = 4
+    result = {
+        "passed": False,
+        "tool": "stress",
+        "duration_seconds": duration,
+        "start_time": None,
+        "end_time": None,
+        "workers": workers,
+        "errors": []
+    }
+
+    if not check_command_exists('stress'):
+        result["errors"].append("stress 未安装")
+        return result
+
+    try:
+        # `stress` only accepts B/K/M/G suffixes for --vm-bytes; the former
+        # "80%" argument was rejected as an unrecognized suffix, so the
+        # test could never pass. Compute an explicit per-worker size.
+        per_worker_mb = 256
+        try:
+            with open('/proc/meminfo', 'r') as f:
+                available = re.search(r'MemAvailable:\s+(\d+)', f.read())
+            if available:
+                available_mb = int(available.group(1)) // 1024
+                per_worker_mb = max(1, int(available_mb * 0.8) // workers)
+        except OSError:
+            pass  # keep the conservative default
+        result["vm_bytes_per_worker_mb"] = per_worker_mb
+
+        result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
+
+        # --vm: number of memory-allocating workers
+        # --vm-bytes: memory allocated by each worker
+        # --vm-keep: keep the memory allocated instead of re-allocating
+        # --timeout: stop after this many seconds
+        cmd = [
+            'stress',
+            '--vm', str(workers),
+            '--vm-bytes', f'{per_worker_mb}M',
+            '--vm-keep',
+            '--timeout', str(duration)
+        ]
+
+        _, stdout, stderr = execute_command(
+            cmd,
+            timeout=duration + 30,
+            check_returncode=False
+        )
+
+        result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
+
+        output = stdout + stderr
+
+        # Any error/failure wording in the output fails the test; the
+        # offending lines are kept for the report.
+        failure_lines = [
+            line.strip() for line in output.split('\n')
+            if 'error' in line.lower() or 'fail' in line.lower()
+        ]
+        result["passed"] = not failure_lines
+        result["errors"].extend(failure_lines[:20])
+
+    except Exception as e:
+        result["passed"] = False
+        result["errors"].append(str(e))
+
+    return result
+
+
+if __name__ == '__main__':
+    # Manual entry point: run the passive memory checks (no stress test)
+    # and dump the report as JSON.
+    import json
+    memory_report = run_memory_check(stress_test=False)
+    print(json.dumps(memory_report, indent=2, ensure_ascii=False))
--- a/modules/sensors.py
+++ b/modules/sensors.py
@@ -0,0 +1,545 @@
+"""
+ServerGuard - 电源与主板传感器监控模块
+
+监控电源、主板传感器数据，包括温度、电压、风扇转速等。
+"""
+
+import os
+import re
+from typing import Dict, Any, List, Optional
+
+import sys
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from utils import (
+    execute_command, check_command_exists, parse_key_value_output,
+    safe_int, safe_float, require_root
+)
+
+
+def run_sensors_check() -> Dict[str, Any]:
+    """
+    Run every sensor collector and derive the overall status.
+
+    Returns:
+        Dict[str, Any]: One entry per collector plus "status"; "warnings" is
+        added when any warning condition is found, "error" when a collector
+        raised.
+    """
+    report = {
+        "status": "success",
+        "lm_sensors": {},
+        "ipmi_sensors": {},
+        "thermal_zones": {},
+        "power_supplies": {},
+        "ipmi_sel": {}
+    }
+
+    try:
+        # Collectors run in this fixed order; results gathered before a
+        # failure stay in the report.
+        collectors = (
+            ("lm_sensors", get_lm_sensors_data),
+            ("ipmi_sensors", get_ipmi_sensors_data),
+            ("thermal_zones", get_thermal_zones),
+            ("power_supplies", get_power_supply_info),
+            ("ipmi_sel", get_ipmi_sel_logs),
+        )
+        for section, collect in collectors:
+            report[section] = collect()
+
+        # Evaluate warning conditions over everything collected.
+        found = check_sensor_warnings(report)
+        if found:
+            report["warnings"] = found
+            report["status"] = "warning"
+    except Exception as exc:
+        report["status"] = "error"
+        report["error"] = str(exc)
+
+    return report
+
+
+# Signed decimal number as printed by `sensors -u` (e.g. "-12.125", "45.000").
+_SENSORS_NUMBER_RE = re.compile(r'-?\d+(?:\.\d+)?')
+
+
+def _parse_sensors_raw_output(output: str) -> Dict[str, Any]:
+    """
+    Parse the raw (`-u`) output of lm-sensors into a chip dictionary.
+
+    Expected layout:
+
+        coretemp-isa-0000          <- chip name (not indented, no colon)
+        Adapter: ISA adapter       <- adapter of the chip
+        Package id 0:              <- feature label (not indented, ends ':')
+          temp1_input: 45.000      <- sub-feature (indented "key: value")
+          temp1_crit_alarm: 0.000
+
+    Returns:
+        Dict[str, Any]: chip name -> {"adapter": str, "features": {label: data}}
+        where data may hold value/max/min/critical/alarm, the sub-feature
+        prefix under "sensor" (e.g. "temp1") and any other raw sub-feature.
+    """
+    chips = {}
+    chip = None     # dict of the chip currently being parsed
+    feature = None  # dict of the feature currently being parsed
+
+    for raw_line in output.split('\n'):
+        line = raw_line.rstrip()
+        if not line.strip():
+            feature = None  # a blank line separates chips
+            continue
+
+        if line[0] not in (' ', '\t'):
+            if line.startswith('Adapter:'):
+                if chip is not None:
+                    chip["adapter"] = line.split(':', 1)[1].strip()
+            elif line.endswith(':'):
+                if chip is not None:
+                    feature = {}
+                    chip["features"][line[:-1].strip()] = feature
+            else:
+                chip = chips.setdefault(line.strip(), {"features": {}})
+                feature = None
+            continue
+
+        # Indented line: "<prefix>_<suffix>: <number>"
+        if feature is None or ':' not in line:
+            continue
+        key, _, value_text = line.strip().partition(':')
+        key = key.strip()
+        number = _SENSORS_NUMBER_RE.search(value_text)
+        if not number:
+            continue
+        value = safe_float(number.group(0))
+
+        prefix, _, suffix = key.partition('_')
+        feature.setdefault("sensor", prefix)
+
+        # Alarm flags are tested first: "temp1_crit_alarm" must not be
+        # mistaken for the critical threshold itself.
+        if suffix.endswith('alarm'):
+            feature["alarm"] = feature.get("alarm", False) or value > 0
+        elif suffix == 'input':
+            feature["value"] = value
+        elif suffix == 'max':
+            feature["max"] = value
+        elif suffix == 'min':
+            feature["min"] = value
+        elif suffix == 'crit':
+            feature["critical"] = value
+        else:
+            feature[key] = value
+            if suffix == 'average':
+                # Some power sensors expose an average but no *_input.
+                feature.setdefault("value", value)
+
+    return chips
+
+
+def get_lm_sensors_data() -> Dict[str, Any]:
+    """
+    Collect sensor readings through lm-sensors (`sensors -u`).
+
+    Returns:
+        Dict[str, Any]: "available" flag, parsed "chips", a "summary" built by
+        extract_sensor_summary, and "error" when the tool is missing, prints
+        nothing, or parsing fails.
+    """
+    result = {
+        "available": False,
+        "chips": {}
+    }
+
+    if not check_command_exists('sensors'):
+        result["error"] = "lm-sensors 未安装"
+        return result
+
+    try:
+        _, stdout, _ = execute_command(
+            ['sensors', '-u'],
+            check_returncode=False, timeout=15
+        )
+
+        if not stdout.strip():
+            result["error"] = "无传感器数据，可能需要运行 sensors-detect"
+            return result
+
+        result["available"] = True
+        result["chips"] = _parse_sensors_raw_output(stdout)
+
+        # Condensed view of the commonly used readings.
+        result["summary"] = extract_sensor_summary(result["chips"])
+
+    except Exception as e:
+        result["error"] = str(e)
+
+    return result
+
+
+# lm-sensors sub-feature prefixes ("temp1", "in0", "fan2", ...) -> summary bucket.
+_SENSOR_ID_RE = re.compile(r'^(temp|in|fan|power|curr)(\d+)$')
+_SENSOR_ID_BUCKETS = {
+    "temp": "temperatures",
+    "in": "voltages",
+    "fan": "fans",
+    "power": "powers",
+    "curr": "currents",
+}
+
+
+def _classify_sensor(feature_name: str, sensor_id: Any = None):
+    """
+    Decide which summary bucket a feature belongs to.
+
+    The sub-feature prefix (sensor_id, when the parser recorded one) or a
+    feature name of the form "temp1"/"in0"/... decides first; otherwise the
+    label is matched against keywords.
+
+    Returns:
+        tuple: (bucket name or None, numeric id as str or None).
+    """
+    for candidate in (sensor_id, feature_name):
+        if isinstance(candidate, str):
+            match = _SENSOR_ID_RE.match(candidate.strip().lower())
+            if match:
+                return _SENSOR_ID_BUCKETS[match.group(1)], match.group(2)
+
+    label = feature_name.lower()
+    if 'temp' in label or 'thermal' in label:
+        numbered = re.search(r'temp(\d+)', label)
+        return "temperatures", numbered.group(1) if numbered else None
+    # "in<N>" must be a whole word: a bare substring test would catch labels
+    # such as "SYSTIN" or "Spinning".
+    if re.search(r'\bin\d+\b', label) or 'volt' in label or 'vcc' in label:
+        return "voltages", None
+    if 'fan' in label:
+        numbered = re.search(r'fan(\d+)', label)
+        return "fans", numbered.group(1) if numbered else None
+    if 'power' in label or 'watt' in label:
+        return "powers", None
+    if 'curr' in label or 'amp' in label:
+        return "currents", None
+    return None, None
+
+
+def extract_sensor_summary(chips: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Build a summary of the common readings from parsed lm-sensors data.
+
+    Args:
+        chips: chip name -> {"features": {feature name: feature data}}.
+            Features without a "value" are skipped.
+
+    Returns:
+        Dict[str, Any]: Buckets "temperatures", "voltages", "fans", "powers"
+        and "currents". Numbered temperatures and fans are keyed
+        "<chip>_temp<N>" / "<chip>_fan<N>"; everything else is keyed
+        "<chip>_<feature name>".
+    """
+    summary = {
+        "temperatures": {},
+        "voltages": {},
+        "fans": {},
+        "powers": {},
+        "currents": {}
+    }
+
+    for chip_name, chip_data in chips.items():
+        for feature_name, feature_data in chip_data.get("features", {}).items():
+            value = feature_data.get("value")
+            if value is None:
+                continue
+
+            bucket, index = _classify_sensor(feature_name, feature_data.get("sensor"))
+            fallback_key = f"{chip_name}_{feature_name}"
+
+            if bucket == "temperatures":
+                key = f"{chip_name}_temp{index}" if index is not None else fallback_key
+                summary["temperatures"][key] = {
+                    "label": feature_name,
+                    "value": value,
+                    "max": feature_data.get("max"),
+                    "critical": feature_data.get("critical"),
+                    "alarm": feature_data.get("alarm", False)
+                }
+            elif bucket == "voltages":
+                summary["voltages"][fallback_key] = {
+                    "value": value,
+                    "min": feature_data.get("min"),
+                    "max": feature_data.get("max"),
+                    "alarm": feature_data.get("alarm", False)
+                }
+            elif bucket == "fans":
+                key = f"{chip_name}_fan{index}" if index is not None else fallback_key
+                summary["fans"][key] = {
+                    "label": feature_name,
+                    "rpm": value,
+                    "min": feature_data.get("min"),
+                    "alarm": feature_data.get("alarm", False)
+                }
+            elif bucket == "powers":
+                summary["powers"][fallback_key] = {
+                    "value": value,
+                    "max": feature_data.get("max")
+                }
+            elif bucket == "currents":
+                summary["currents"][fallback_key] = {
+                    "value": value,
+                    "max": feature_data.get("max")
+                }
+
+    return summary
+
+
+def get_ipmi_sensors_data() -> Dict[str, Any]:
+    """
+    Read the sensor table printed by `ipmitool sensor`.
+
+    Returns:
+        Dict[str, Any]: "available" flag, "sensors" (name -> value/unit/status
+        taken from the first four '|' separated columns), "categories" built
+        by categorize_ipmi_sensors, plus "note" or "error" when IPMI cannot
+        be queried.
+    """
+    report = {
+        "available": False,
+        "sensors": {}
+    }
+
+    if not check_command_exists('ipmitool'):
+        report["note"] = "ipmitool 未安装"
+        return report
+
+    try:
+        _, stdout, stderr = execute_command(
+            ['ipmitool', 'sensor'],
+            check_returncode=False, timeout=10
+        )
+
+        # These messages indicate that no usable IPMI device is present.
+        device_missing = 'Could not open device' in stderr or 'Driver not found' in stderr
+        if device_missing:
+            report["note"] = "IPMI 设备不可用"
+            return report
+
+        report["available"] = True
+
+        for row in stdout.split('\n'):
+            if '|' not in row:
+                continue
+            columns = [column.strip() for column in row.split('|')]
+            if len(columns) < 4:
+                continue
+            name, reading, unit, status = columns[:4]
+            report["sensors"][name] = {
+                "value": reading,
+                "unit": unit,
+                "status": status
+            }
+
+        # Group the sensors by kind.
+        report["categories"] = categorize_ipmi_sensors(report["sensors"])
+
+    except Exception as exc:
+        report["error"] = str(exc)
+
+    return report
+
+
+def categorize_ipmi_sensors(sensors: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
+    """
+    Group IPMI sensors by kind using their name and unit.
+
+    Rules are tried in a fixed order (temperature, voltage, fan, power,
+    current); the first match wins and anything unmatched goes to "other".
+    """
+    voltage_markers = ('volt', 'vcc', '3.3v', '5v', '12v')
+
+    def pick_category(label: str, unit: str) -> str:
+        if 'temp' in label or unit == 'degrees c':
+            return "temperatures"
+        if unit == 'volts' or any(marker in label for marker in voltage_markers):
+            return "voltages"
+        if 'fan' in label or 'rpm' in unit:
+            return "fans"
+        if 'power' in label or 'watt' in unit:
+            return "power"
+        if 'current' in label or 'amp' in unit:
+            return "currents"
+        return "other"
+
+    grouped = {
+        bucket: {}
+        for bucket in ("temperatures", "voltages", "fans", "power", "currents", "other")
+    }
+
+    for sensor_name, reading in sensors.items():
+        bucket = pick_category(sensor_name.lower(), reading.get("unit", "").lower())
+        grouped[bucket][sensor_name] = reading
+
+    return grouped
+
+
+def _read_thermal_attr(path: str) -> Optional[str]:
+    """Return the stripped text of a sysfs attribute, or None if missing/unreadable."""
+    try:
+        with open(path, 'r') as f:
+            return f.read().strip()
+    except OSError:
+        return None
+
+
+def _thermal_int(text: Optional[str]) -> Optional[int]:
+    """Convert sysfs text to int; None when the text is absent or not numeric."""
+    try:
+        return int(text)
+    except (TypeError, ValueError):
+        return None
+
+
+def _critical_trip_millicelsius(zone_path: str) -> Optional[int]:
+    """
+    Return the temperature (millidegrees C) of the zone's "critical" trip point.
+
+    Trip points are numbered without a fixed meaning, so the trip whose
+    trip_point_N_type reads "critical" is looked up. Non-positive values are
+    treated as "no limit configured" and ignored.
+    """
+    try:
+        names = sorted(os.listdir(zone_path))
+    except OSError:
+        return None
+
+    for name in names:
+        if not (name.startswith('trip_point_') and name.endswith('_type')):
+            continue
+        if _read_thermal_attr(os.path.join(zone_path, name)) != 'critical':
+            continue
+        temp_name = name[:-len('_type')] + '_temp'
+        temp_mc = _thermal_int(_read_thermal_attr(os.path.join(zone_path, temp_name)))
+        if temp_mc is not None and temp_mc > 0:
+            return temp_mc
+    return None
+
+
+def get_thermal_zones() -> Dict[str, Any]:
+    """
+    Read thermal zones and cooling devices from /sys/class/thermal.
+
+    Returns:
+        Dict[str, Any]: "zones" (zone name -> type, temperature_c, policy and,
+        when a critical trip point exists, critical_temp_c) and "policies"
+        (cooling device name -> type, current_state, max_state). Attributes
+        that are missing or unreadable are omitted; "error" is set if the
+        scan itself fails.
+    """
+    result = {
+        "zones": {},
+        "policies": {}
+    }
+
+    thermal_path = '/sys/class/thermal'
+
+    if not os.path.exists(thermal_path):
+        return result
+
+    try:
+        entries = sorted(os.listdir(thermal_path))
+
+        for zone_name in entries:
+            if not zone_name.startswith('thermal_zone'):
+                continue
+
+            zone_path = os.path.join(thermal_path, zone_name)
+            zone_info = {}
+
+            zone_type = _read_thermal_attr(os.path.join(zone_path, 'type'))
+            if zone_type is not None:
+                zone_info["type"] = zone_type
+
+            # The kernel reports millidegrees Celsius.
+            temp_mc = _thermal_int(_read_thermal_attr(os.path.join(zone_path, 'temp')))
+            if temp_mc is not None:
+                zone_info["temperature_c"] = temp_mc / 1000.0
+
+            policy = _read_thermal_attr(os.path.join(zone_path, 'policy'))
+            if policy is not None:
+                zone_info["policy"] = policy
+
+            critical_mc = _critical_trip_millicelsius(zone_path)
+            if critical_mc is not None:
+                zone_info["critical_temp_c"] = critical_mc / 1000.0
+
+            result["zones"][zone_name] = zone_info
+
+        # Cooling devices (reported under the historical "policies" key).
+        for device_name in entries:
+            if not device_name.startswith('cooling_device'):
+                continue
+
+            device_path = os.path.join(thermal_path, device_name)
+            device_info = {}
+
+            device_type = _read_thermal_attr(os.path.join(device_path, 'type'))
+            if device_type is not None:
+                device_info["type"] = device_type
+
+            for key, attr in (("current_state", 'cur_state'), ("max_state", 'max_state')):
+                state = _thermal_int(_read_thermal_attr(os.path.join(device_path, attr)))
+                if state is not None:
+                    device_info[key] = state
+
+            result["policies"][device_name] = device_info
+
+    except Exception as e:
+        result["error"] = str(e)
+
+    return result
+
+
+def _coerce_power_supply_value(text: str) -> Any:
+    """Return int or float for purely numeric sysfs text, else the text unchanged."""
+    if re.fullmatch(r'-?\d+', text):
+        return int(text)
+    if re.fullmatch(r'-?\d*\.\d+', text):
+        return float(text)
+    return text
+
+
+def get_power_supply_info() -> Dict[str, Any]:
+    """
+    Collect the attributes exposed under /sys/class/power_supply.
+
+    Returns:
+        Dict[str, Any]: "supplies" is a list with one dict per supply holding
+        its "name" and every readable attribute file (numeric text converted
+        to int/float, other text kept as a string). "error" is set if the
+        top-level directory cannot be listed.
+    """
+    result = {
+        "supplies": []
+    }
+
+    power_supply_path = '/sys/class/power_supply'
+
+    if not os.path.exists(power_supply_path):
+        return result
+
+    try:
+        for supply_name in os.listdir(power_supply_path):
+            supply_path = os.path.join(power_supply_path, supply_name)
+            supply_info = {"name": supply_name}
+
+            try:
+                attributes = os.listdir(supply_path)
+            except OSError:
+                # Entry vanished or is not a directory: keep the name only.
+                attributes = []
+
+            for attr in attributes:
+                attr_path = os.path.join(supply_path, attr)
+                if not os.path.isfile(attr_path):
+                    continue
+                try:
+                    with open(attr_path, 'r') as f:
+                        value = f.read().strip()
+                except (OSError, UnicodeDecodeError):
+                    # Best effort: some attributes cannot be read at all.
+                    continue
+                supply_info[attr] = _coerce_power_supply_value(value)
+
+            result["supplies"].append(supply_info)
+
+    except Exception as e:
+        result["error"] = str(e)
+
+    return result
+
+
+def get_ipmi_sel_logs() -> Dict[str, Any]:
+    """
+    Read the IPMI System Event Log via `ipmitool sel elist`.
+
+    Rows are expected as
+    "ID | date | time | sensor | event description | direction".
+    The date and time are merged into "datetime", the sensor goes to
+    "source", and everything after it to "event". Rows with exactly four
+    columns are mapped as ID, datetime, source, event.
+
+    Returns:
+        Dict[str, Any]: "available", "entries", "critical_events",
+        "hardware_errors" and their counts; "note"/"error" when the log
+        cannot be read.
+    """
+    result = {
+        "available": False,
+        "entries": [],
+        "hardware_errors": [],
+        "critical_events": []
+    }
+
+    if not check_command_exists('ipmitool'):
+        result["note"] = "ipmitool 未安装"
+        return result
+
+    try:
+        _, stdout, stderr = execute_command(
+            ['ipmitool', 'sel', 'elist'],
+            check_returncode=False, timeout=15
+        )
+
+        if 'Could not open device' in stderr or 'Driver not found' in stderr:
+            result["note"] = "IPMI 设备不可用"
+            return result
+
+        result["available"] = True
+
+        critical_keywords = ['critical', 'failure', 'error', 'thermal', 'voltage', 'power']
+        hardware_keywords = ['memory', 'processor', 'hard drive', 'fan', 'power supply', 'temperature']
+
+        for line in stdout.split('\n'):
+            if not line.strip():
+                continue
+
+            parts = [p.strip() for p in line.split('|')]
+            if len(parts) < 4:
+                continue
+
+            if len(parts) >= 5:
+                entry = {
+                    "id": parts[0],
+                    "datetime": f"{parts[1]} {parts[2]}".strip(),
+                    "source": parts[3],
+                    "event": ' | '.join(parts[4:])
+                }
+            else:
+                entry = {
+                    "id": parts[0],
+                    "datetime": parts[1],
+                    "source": parts[2],
+                    "event": parts[3]
+                }
+
+            result["entries"].append(entry)
+
+            # Keywords may appear in the sensor name or in the event text.
+            searchable = f"{entry['source']} {entry['event']}".lower()
+
+            if any(kw in searchable for kw in critical_keywords):
+                result["critical_events"].append(entry)
+
+            if any(kw in searchable for kw in hardware_keywords):
+                result["hardware_errors"].append(entry)
+
+        result["total_entries"] = len(result["entries"])
+        result["critical_count"] = len(result["critical_events"])
+        result["hardware_error_count"] = len(result["hardware_errors"])
+
+    except Exception as e:
+        result["error"] = str(e)
+
+    return result
+
+
+def check_sensor_warnings(sensor_data: Dict[str, Any]) -> List[str]:
+    """
+    Derive human-readable warnings from the collected sensor data.
+
+    Args:
+        sensor_data: Report with the optional sections "lm_sensors" (uses its
+            "summary"), "ipmi_sensors", "ipmi_sel" and "thermal_zones".
+
+    Returns:
+        List[str]: One message per warning condition; empty when none apply.
+    """
+    warnings = []
+
+    # lm-sensors summary
+    lm_sensors = sensor_data.get("lm_sensors", {})
+    summary = lm_sensors.get("summary", {})
+
+    # Temperatures: explicit alarm flag, otherwise a fixed 90 degree ceiling.
+    for name, temp_data in summary.get("temperatures", {}).items():
+        value = temp_data.get("value")
+        if temp_data.get("alarm"):
+            warnings.append(f"温度传感器 {name} 告警: {value}°C")
+        elif value is not None and value > 90:
+            warnings.append(f"温度传感器 {name} 温度过高: {value}°C")
+
+    # Voltages
+    for name, volt_data in summary.get("voltages", {}).items():
+        if volt_data.get("alarm"):
+            warnings.append(f"电压传感器 {name} 告警: {volt_data.get('value')}V")
+
+    # Fans: alarm flag, or stopped although a minimum speed is configured.
+    for name, fan_data in summary.get("fans", {}).items():
+        rpm = fan_data.get("rpm")
+        # "min" may be present with the value None, so .get(..., 0) is not enough.
+        min_rpm = fan_data.get("min") or 0
+        if fan_data.get("alarm"):
+            warnings.append(f"风扇 {name} 告警: {rpm} RPM")
+        elif (rpm or 0) == 0 and min_rpm > 0:
+            warnings.append(f"风扇 {name} 可能已停止: {rpm} RPM")
+
+    # IPMI sensor status. ipmitool abbreviates threshold states:
+    # nc = non-critical, cr = critical, nr = non-recoverable.
+    abnormal_states = {'nc', 'cr', 'nr', 'critical', 'non-recoverable', 'warning'}
+    ipmi_sensors = sensor_data.get("ipmi_sensors", {})
+    for name, data in ipmi_sensors.get("sensors", {}).items():
+        status = (data.get("status") or "").strip().lower()
+        if status in abnormal_states:
+            warnings.append(f"IPMI 传感器 {name} 状态异常: {data.get('status')}")
+
+    # IPMI SEL critical events
+    ipmi_sel = sensor_data.get("ipmi_sel", {})
+    if ipmi_sel.get("critical_count", 0) > 0:
+        warnings.append(f"IPMI SEL 中有 {ipmi_sel['critical_count']} 个关键事件")
+
+    # Thermal zones: warn above 90% of the critical limit.
+    thermal_zones = sensor_data.get("thermal_zones", {})
+    for zone_name, zone_data in thermal_zones.get("zones", {}).items():
+        temp = zone_data.get("temperature_c")
+        if temp is None:
+            continue
+        critical = zone_data.get("critical_temp_c")
+        if critical is None or critical <= 0:
+            # Missing or disabled limit: fall back to 100 degrees C.
+            critical = 100
+        if temp > critical * 0.9:
+            warnings.append(f"Thermal zone {zone_name} 温度接近临界值: {temp}°C (临界: {critical}°C)")
+
+    return warnings
+
+
+if __name__ == '__main__':
+    # Manual smoke run: print the sensor report as JSON.
+    import json
+    sensors_report = run_sensors_check()
+    print(json.dumps(sensors_report, indent=2, ensure_ascii=False))
--- a/modules/storage.py
+++ b/modules/storage.py
@@ -0,0 +1,602 @@
+"""
+ServerGuard - 存储设备检测模块
+
+检查硬盘/SSD 的健康状况、SMART 数据、RAID 状态。
+"""
+
+import os
+import re
+import json
+from typing import Dict, Any, List, Optional
+
+import sys
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from utils import (
+    execute_command, check_command_exists, parse_key_value_output,
+    safe_int, safe_float, format_bytes, require_root
+)
+
+
+def run_storage_check() -> Dict[str, Any]:
+    """
+    Check every detected storage device, then RAID state and I/O statistics.
+
+    Returns:
+        Dict[str, Any]: "devices" (one report per disk), "raid_status",
+        "io_stats" and "status": "warning" when any disk reports FAILED or
+        WARNING health, "error" (with an "error" message) when a step raised.
+    """
+    report = {
+        "status": "success",
+        "devices": [],
+        "raid_status": {},
+        "io_stats": {}
+    }
+    unhealthy_states = ('FAILED', 'WARNING')
+
+    try:
+        for disk in get_storage_devices():
+            disk_report = check_device(disk)
+            report["devices"].append(disk_report)
+
+            # One unhealthy disk is enough to flag the whole report.
+            if disk_report.get("health") in unhealthy_states:
+                report["status"] = "warning"
+
+        report["raid_status"] = check_raid_status()
+        report["io_stats"] = get_io_statistics()
+
+    except Exception as exc:
+        report["status"] = "error"
+        report["error"] = str(exc)
+
+    return report
+
+
+def _lsblk_flag_is_set(value: Any) -> bool:
+    """Interpret an lsblk JSON flag, which may be a bool, a number or a string like "0"/"1"."""
+    if isinstance(value, str):
+        return value.strip().lower() in ('1', 'true')
+    return bool(value)
+
+
+def get_storage_devices() -> List[Dict[str, str]]:
+    """
+    List whole-disk block devices.
+
+    `lsblk` JSON output is tried first; if that yields nothing, /sys/block is
+    scanned for well-known disk name prefixes.
+
+    Returns:
+        List[Dict[str, str]]: Dicts with "name", "path" (/dev/<name>) and
+        "type" ("hdd", "ssd" or "unknown").
+    """
+    devices = []
+
+    # Method 1: lsblk
+    if check_command_exists('lsblk'):
+        try:
+            _, stdout, _ = execute_command(
+                ['lsblk', '-d', '-n', '-o', 'NAME,TYPE,ROTA', '-J'],
+                check_returncode=False, timeout=10
+            )
+
+            data = json.loads(stdout)
+            for dev in data.get('blockdevices', []):
+                if dev.get('type') != 'disk' or not dev.get('name'):
+                    continue
+                # The flag can arrive as the string "0", which is truthy.
+                rotational = _lsblk_flag_is_set(dev.get('rota'))
+                devices.append({
+                    "name": dev['name'],
+                    "path": f"/dev/{dev['name']}",
+                    "type": "hdd" if rotational else "ssd"
+                })
+        except Exception:
+            # Best effort: any lsblk failure falls through to the /sys scan.
+            devices = []
+
+    # Method 2: scan /sys/block
+    if not devices:
+        try:
+            names = os.listdir('/sys/block')
+        except OSError:
+            names = []
+
+        for name in names:
+            if not name.startswith(('sd', 'hd', 'nvme', 'vd', 'xvd', 'mmcblk')):
+                continue
+
+            dev_type = "unknown"
+            try:
+                with open(f'/sys/block/{name}/queue/rotational', 'r') as f:
+                    dev_type = "hdd" if f.read().strip() == '1' else "ssd"
+            except OSError:
+                pass  # attribute missing: leave the type as "unknown"
+
+            devices.append({
+                "name": name,
+                "path": f"/dev/{name}",
+                "type": dev_type
+            })
+
+    return devices
+
+
+# Ranking used to keep the worse of two health verdicts.
+_DEVICE_HEALTH_SEVERITY = {"UNKNOWN": 0, "PASSED": 1, "WARNING": 2, "FAILED": 3}
+
+
+def _smart_raw_int(attribute: Dict[str, Any]) -> Optional[int]:
+    """
+    Return the leading integer of a SMART attribute's raw_value.
+
+    Raw values may carry extra text ("35 (Min/Max 20/45)", "1234h+56m+07s");
+    only the leading digits are used. None when there are none.
+    """
+    raw = attribute.get('raw_value')
+    if raw is None:
+        return None
+    match = re.match(r'\s*(\d+)', str(raw))
+    return int(match.group(1)) if match else None
+
+
+def check_device(device: Dict[str, str]) -> Dict[str, Any]:
+    """
+    Build the health report for one storage device.
+
+    Args:
+        device: Dict with "name", "path" and optionally "type".
+
+    Returns:
+        Dict[str, Any]: Identification fields, "smart_status", "health" and
+        key SMART counters (None when unavailable); NVMe devices also carry
+        "nvme_data".
+    """
+    result = {
+        "name": device["name"],
+        "path": device["path"],
+        "type": device.get("type", "unknown"),
+        "model": "Unknown",
+        "serial": "Unknown",
+        "firmware": "Unknown",
+        "size_bytes": 0,
+        "size_human": "Unknown",
+        "health": "UNKNOWN",
+        "smart_status": {},
+        "temperature_c": None,
+        "power_on_hours": None,
+        "start_stop_count": None,
+        "reallocated_sectors": None,
+        "pending_sectors": None,
+        "test_result": None
+    }
+
+    # Basic identification
+    result.update(get_device_info(device["path"]))
+
+    # SMART data and the verdict derived from it
+    smart_data = get_smart_data(device["path"])
+    result["smart_status"] = smart_data
+    result["health"] = analyze_health(smart_data)
+
+    # Key attributes
+    if "attributes" in smart_data:
+        attrs = smart_data["attributes"]
+
+        # Temperature: the first attribute with a usable raw value wins.
+        for temp_attr in ['194 Temperature_Celsius', '190 Airflow_Temperature_Cel', 'Temperature']:
+            if temp_attr in attrs:
+                temperature = _smart_raw_int(attrs[temp_attr])
+                if temperature is not None:
+                    result["temperature_c"] = temperature
+                    break
+
+        counters = {
+            "power_on_hours": '9 Power_On_Hours',
+            "start_stop_count": '4 Start_Stop_Count',
+            "reallocated_sectors": '5 Reallocated_Sector_Ct',
+            "pending_sectors": '197 Current_Pending_Sector',
+        }
+        for field, attr_name in counters.items():
+            if attr_name in attrs:
+                result[field] = _smart_raw_int(attrs[attr_name])
+
+    # NVMe specifics
+    if device["name"].startswith('nvme'):
+        nvme_data = get_nvme_data(device["path"])
+        result["nvme_data"] = nvme_data
+        if nvme_data.get("temperature") is not None:
+            result["temperature_c"] = nvme_data["temperature"]
+
+        # Keep the worse verdict: an "UNKNOWN" NVMe result (e.g. nvme-cli
+        # missing) must not erase what SMART already reported.
+        nvme_health = nvme_data.get("health") or "UNKNOWN"
+        current_rank = _DEVICE_HEALTH_SEVERITY.get(result["health"], 0)
+        if _DEVICE_HEALTH_SEVERITY.get(nvme_health, 0) > current_rank:
+            result["health"] = nvme_health
+
+    return result
+
+
+def get_device_info(device_path: str) -> Dict[str, Any]:
+    """
+    Collect identification data for a device.
+
+    `smartctl -i` is parsed when available (ATA, SCSI and NVMe field names
+    are recognised); the size falls back to /sys/block/<dev>/size.
+
+    Args:
+        device_path: Device node such as /dev/sda.
+
+    Returns:
+        Dict[str, Any]: Only the fields that could be determined (model,
+        serial, firmware, size_human, size_bytes, sector_size, rotation_rate,
+        form_factor, transport, is_ssd).
+    """
+    info = {}
+
+    if check_command_exists('smartctl'):
+        try:
+            _, stdout, _ = execute_command(
+                ['smartctl', '-i', device_path],
+                check_returncode=False, timeout=10
+            )
+        except Exception:
+            # Best effort: without smartctl output only /sys data is reported.
+            stdout = ""
+
+        patterns = {
+            "model": r'^(?:Device Model|Model Number|Product):\s*(.+)',
+            "serial": r'^Serial [Nn]umber:\s*(\S+)',
+            "firmware": r'^(?:Firmware Version|Revision):\s*(\S+)',
+            "size_human": r'^User Capacity:\s*(.+)',
+            "sector_size": r'^Sector Sizes?:\s*(.+)',
+            "rotation_rate": r'^Rotation Rate:\s*(.+)',
+            "form_factor": r'^Form Factor:\s*(.+)',
+            "transport": r'^Transport protocol:\s*(.+)'
+        }
+
+        for key, pattern in patterns.items():
+            match = re.search(pattern, stdout, re.MULTILINE)
+            if match:
+                info[key] = match.group(1).strip()
+
+        # Capacity in bytes, e.g. "User Capacity:    500,107,862,016 bytes [500 GB]"
+        # (NVMe prints "Total NVM Capacity: 512,110,190,592 [512 GB]").
+        size_match = re.search(
+            r'^(?:User Capacity|Total NVM Capacity|Namespace 1 Size/Capacity):\s*([\d,]+)',
+            stdout, re.MULTILINE
+        )
+        if size_match:
+            digits = size_match.group(1).replace(',', '')
+            if digits:
+                info["size_bytes"] = int(digits)
+
+        if 'Solid State Device' in stdout:
+            info["is_ssd"] = True
+        elif 'Rotation Rate' in stdout:
+            info["is_ssd"] = False
+
+    # Fallback: size from /sys (the "size" attribute counts 512-byte sectors).
+    if not info.get("size_bytes"):
+        try:
+            dev_name = os.path.basename(device_path)
+            with open(f'/sys/block/{dev_name}/size', 'r') as f:
+                sectors = int(f.read().strip())
+        except (OSError, ValueError):
+            pass  # size stays unknown
+        else:
+            info["size_bytes"] = sectors * 512
+            info["size_human"] = format_bytes(info["size_bytes"])
+
+    return info
+
+
+def get_smart_data(device_path: str) -> Dict[str, Any]:
+    """
+    Read and parse SMART data with a single `smartctl -a` call.
+
+    Args:
+        device_path: Device node such as /dev/sda.
+
+    Returns:
+        Dict[str, Any]: "supported", "enabled", "overall" (PASSED / FAILED /
+        UNKNOWN), "attributes" keyed "<id> <name>", "self_tests" (raw log
+        rows), optionally "error_count", and "error" on failure.
+    """
+    result = {
+        "supported": False,
+        "enabled": False,
+        "overall": "UNKNOWN",
+        "attributes": {},
+        "self_tests": []
+    }
+
+    if not check_command_exists('smartctl'):
+        result["error"] = "smartctl 未安装"
+        return result
+
+    try:
+        # -a includes the information section, so no separate -i call is needed.
+        _, stdout, _ = execute_command(
+            ['smartctl', '-a', device_path],
+            check_returncode=False, timeout=15
+        )
+
+        if 'SMART support is: Available' in stdout:
+            result["supported"] = True
+        if 'SMART support is: Enabled' in stdout:
+            result["enabled"] = True
+
+        # Overall verdict: read only the dedicated line, never a bare "OK"
+        # or "PASSED" found somewhere else in the output.
+        verdict = re.search(
+            r'(?:self-assessment test result|SMART Health Status):\s*(.+)', stdout
+        )
+        if verdict:
+            verdict_text = verdict.group(1).strip().upper()
+            if verdict_text.startswith('PASSED') or verdict_text.startswith('OK'):
+                result["overall"] = "PASSED"
+            elif verdict_text:
+                result["overall"] = "FAILED"
+            # A drive that returns a verdict supports SMART even when the
+            # "SMART support is:" lines are absent.
+            result["supported"] = True
+
+        lines = stdout.split('\n')
+
+        # Attribute table (ATA devices). Columns:
+        # ID# ATTRIBUTE_NAME FLAG VALUE WORST THRESH TYPE UPDATED WHEN_FAILED RAW_VALUE
+        in_attributes = False
+        for line in lines:
+            if 'ID#' in line and 'ATTRIBUTE_NAME' in line:
+                in_attributes = True
+                continue
+            if not in_attributes:
+                continue
+            if not line.strip() or line.startswith('SMART'):
+                break
+
+            parts = line.split()
+            if len(parts) >= 10:
+                result["attributes"][f"{parts[0]} {parts[1]}"] = {
+                    "flag": parts[2],
+                    "value": safe_int(parts[3]),
+                    "worst": safe_int(parts[4]),
+                    "thresh": safe_int(parts[5]),
+                    "type": parts[6],
+                    "updated": parts[7],
+                    "when_failed": parts[8] if parts[8] != '-' else None,
+                    "raw_value": ' '.join(parts[9:])
+                }
+
+        # Self-test log: rows start with '#', the section ends at a blank line.
+        in_self_tests = False
+        for line in lines:
+            if not in_self_tests:
+                in_self_tests = 'SMART Self-test log' in line
+                continue
+            if not line.strip():
+                break
+            if line.lstrip().startswith('#'):
+                result["self_tests"].append(line.strip())
+
+        # Error log: prefer the explicit counter, else the newest error number.
+        count_match = re.search(r'ATA Error Count:\s*(\d+)', stdout)
+        if not count_match and 'SMART Error Log' in stdout:
+            count_match = re.search(r'Error (\d+)\s+occurred at', stdout)
+        if count_match:
+            result["error_count"] = safe_int(count_match.group(1))
+
+    except Exception as e:
+        result["error"] = str(e)
+
+    return result
+
+
+def _nvme_smart_field(output: str, name_pattern: str) -> Optional[int]:
+    """
+    Return the integer value of a `nvme smart-log` field, or None.
+
+    Field names are matched case-insensitively at the start of a line;
+    thousands separators in the value are removed.
+    """
+    match = re.search(
+        rf'^\s*{name_pattern}\s*:\s*([\d,]+)', output, re.MULTILINE | re.IGNORECASE
+    )
+    if not match:
+        return None
+    digits = match.group(1).replace(',', '')
+    return int(digits) if digits else None
+
+
+def get_nvme_data(device_path: str) -> Dict[str, Any]:
+    """
+    Read NVMe-specific health data via `nvme smart-log`.
+
+    Args:
+        device_path: Device node such as /dev/nvme0n1.
+
+    Returns:
+        Dict[str, Any]: "health" (PASSED / WARNING / UNKNOWN), "temperature"
+        in degrees Celsius, wear indicators and I/O counters (None when not
+        reported), optionally "critical_warning", and "error" when the query
+        failed.
+    """
+    result = {
+        "health": "UNKNOWN",
+        "temperature": None,
+        "available_spare": None,
+        "percentage_used": None,
+        "data_units_read": None,
+        "data_units_written": None,
+        "host_reads": None,
+        "host_writes": None
+    }
+
+    if not check_command_exists('nvme'):
+        return result
+
+    try:
+        _, stdout, _ = execute_command(
+            ['nvme', 'smart-log', device_path],
+            check_returncode=False, timeout=10
+        )
+
+        # The text output is already in Celsius ("temperature : 38 C"); only
+        # an explicit K unit or an implausibly large bare number is Kelvin.
+        temp_match = re.search(
+            r'^\s*temperature\s*:\s*(-?\d+)\s*°?\s*([CK])?',
+            stdout, re.MULTILINE | re.IGNORECASE
+        )
+        if temp_match:
+            temperature = int(temp_match.group(1))
+            unit = (temp_match.group(2) or '').upper()
+            if unit == 'K' or (not unit and temperature > 200):
+                temperature -= 273
+            result["temperature"] = temperature
+
+        # Field names are printed with underscores (e.g. "available_spare").
+        result["available_spare"] = _nvme_smart_field(stdout, r'available[_ ]spare')
+        result["percentage_used"] = _nvme_smart_field(stdout, r'percentage[_ ]used')
+        result["data_units_read"] = _nvme_smart_field(stdout, r'data[_ ]units[_ ]read')
+        result["data_units_written"] = _nvme_smart_field(stdout, r'data[_ ]units[_ ]written')
+        result["host_reads"] = _nvme_smart_field(stdout, r'host[_ ]read[_ ]commands')
+        result["host_writes"] = _nvme_smart_field(stdout, r'host[_ ]write[_ ]commands')
+
+        warning_match = re.search(
+            r'^\s*critical[_ ]warning\s*:\s*(0x[0-9a-f]+|\d+)',
+            stdout, re.MULTILINE | re.IGNORECASE
+        )
+        critical_warning = int(warning_match.group(1), 0) if warning_match else None
+        if critical_warning is not None:
+            result["critical_warning"] = critical_warning
+
+        # Health: wear below 90% passes; low spare capacity or a non-zero
+        # critical warning downgrades to WARNING.
+        if result["percentage_used"] is not None:
+            result["health"] = "PASSED" if result["percentage_used"] < 90 else "WARNING"
+
+        if result["available_spare"] is not None and result["available_spare"] < 10:
+            result["health"] = "WARNING"
+
+        if critical_warning:
+            result["health"] = "WARNING"
+
+    except Exception as e:
+        # Best effort: report what was gathered and note the failure.
+        result["error"] = str(e)
+
+    return result
+
+
+def _smart_leading_int(raw: Any) -> Optional[int]:
+    """Return the leading integer of a SMART raw value ("35 (Min/Max 20/45)" -> 35), or None."""
+    if raw is None:
+        return None
+    match = re.match(r'\s*(\d+)', str(raw))
+    return int(match.group(1)) if match else None
+
+
+def analyze_health(smart_data: Dict[str, Any]) -> str:
+    """
+    Derive a health verdict from parsed SMART data.
+
+    Args:
+        smart_data: Dict with "supported", "overall" and "attributes"
+            (attribute key -> dict containing "raw_value").
+
+    Returns:
+        str: "UNKNOWN" when SMART is unsupported, "FAILED" when the drive's
+        own verdict is FAILED, "WARNING" when a sector-error counter is
+        non-zero or the temperature exceeds 60 degrees C, otherwise "PASSED".
+    """
+    if not smart_data.get("supported"):
+        return "UNKNOWN"
+
+    if smart_data.get("overall") == "FAILED":
+        return "FAILED"
+
+    attrs = smart_data.get("attributes", {})
+
+    # Any non-zero raw count on these attributes indicates media problems.
+    critical_attrs = (
+        '5 Reallocated_Sector_Ct',
+        '197 Current_Pending_Sector',
+        '198 Offline_Uncorrectable',
+        '196 Reallocation_Event_Count',
+    )
+    for attr_name in critical_attrs:
+        if attr_name in attrs:
+            count = _smart_leading_int(attrs[attr_name].get('raw_value', '0'))
+            if count is not None and count > 0:
+                return "WARNING"
+
+    # Temperature: the RAW value holds degrees Celsius; the normalized
+    # "value" column is a vendor-scaled score and must not be compared
+    # against a Celsius limit.
+    for temp_attr in ['194 Temperature_Celsius', '190 Airflow_Temperature_Cel']:
+        if temp_attr in attrs:
+            temp = _smart_leading_int(attrs[temp_attr].get('raw_value'))
+            if temp is not None and temp > 60:
+                return "WARNING"
+
+    return "PASSED"
+
+
+def check_raid_status() -> Dict[str, Any]:
+    """Collect software (mdadm) and hardware (storcli/MegaCli) RAID status.
+
+    Returns:
+        Dict[str, Any]: Always contains ``raid_available``, ``controllers``
+        and ``arrays``. Depending on what is found it may also contain
+        ``software_raid``, ``mdadm_config``, ``mdstat``, ``status``,
+        ``hardware_raid``, ``controller_type`` and ``storcli_output`` or
+        ``megacli_output``.
+    """
+    result = {
+        "raid_available": False,
+        "controllers": [],
+        "arrays": []
+    }
+
+    # Software RAID (mdadm)
+    if check_command_exists('mdadm'):
+        try:
+            _, stdout, _ = execute_command(
+                ['mdadm', '--detail', '--scan'],
+                check_returncode=False, timeout=10
+            )
+
+            if stdout.strip():
+                result["raid_available"] = True
+                result["software_raid"] = True
+                result["mdadm_config"] = stdout.strip()
+
+                _, detail, _ = execute_command(
+                    ['cat', '/proc/mdstat'],
+                    check_returncode=False, timeout=5
+                )
+                result["mdstat"] = detail
+
+                lines = detail.split('\n')
+                for index, line in enumerate(lines):
+                    if not line.startswith('md'):
+                        continue
+
+                    parts = line.split()
+                    # Header looks like "md0 : active raid1 ...". Compare the
+                    # state token exactly, because "inactive" contains "active".
+                    state = parts[2] if len(parts) > 2 else ""
+                    array_info = {
+                        "name": parts[0],
+                        "status": "active" if state == "active" else "inactive"
+                    }
+
+                    # The member map ("[U_]") and recovery progress are on the
+                    # indented lines that follow the header.
+                    stanza = [line]
+                    for follow in lines[index + 1:]:
+                        if not follow.strip() or not follow[0].isspace():
+                            break
+                        stanza.append(follow)
+                    stanza_text = '\n'.join(stanza)
+
+                    if re.search(r'\[[U_]*_[U_]*\]', stanza_text) or 'recovery' in stanza_text:
+                        array_info["degraded"] = True
+                        result["status"] = "warning"
+
+                    result["arrays"].append(array_info)
+
+        except Exception:
+            # Best effort: a failing mdadm query must not abort the whole check.
+            pass
+
+    # Hardware RAID (storcli preferred, MegaCli as fallback)
+    if check_command_exists('storcli'):
+        try:
+            _, stdout, _ = execute_command(
+                ['storcli', '/c0', 'show'],
+                check_returncode=False, timeout=10
+            )
+            result["raid_available"] = True
+            result["hardware_raid"] = True
+            result["controller_type"] = "LSI/Broadcom"
+            result["storcli_output"] = stdout[:500]  # keep only an excerpt
+        except Exception:
+            pass
+    elif check_command_exists('MegaCli'):
+        try:
+            _, stdout, _ = execute_command(
+                ['MegaCli', '-AdpAllInfo', '-aALL'],
+                check_returncode=False, timeout=10
+            )
+            result["raid_available"] = True
+            result["hardware_raid"] = True
+            result["controller_type"] = "LSI"
+            result["megacli_output"] = stdout[:500]
+        except Exception:
+            pass
+
+    return result
+
+
+def get_io_statistics() -> Dict[str, Any]:
+    """Read per-disk I/O counters from /proc/diskstats.
+
+    Returns:
+        Dict[str, Any]: Maps whole-disk device names (partitions are skipped)
+        to a dict of the eleven counters that follow the device name in each
+        line. Empty when the file cannot be read.
+    """
+    result = {}
+
+    # Whole disks only: sda/hdb/vdc, or nvme0n1. Partitions (sda1, nvme0n1p1)
+    # do not match. A plain "name ends with a digit" test would also drop
+    # every NVMe disk.
+    whole_disk = re.compile(r'^(?:(?:sd|hd|vd)[a-z]+|nvme\d+n\d+)$')
+
+    # Names for columns 4..14 of a diskstats line, in file order.
+    field_names = (
+        "reads_completed",
+        "reads_merged",
+        "sectors_read",
+        "time_reading_ms",
+        "writes_completed",
+        "writes_merged",
+        "sectors_written",
+        "time_writing_ms",
+        "ios_in_progress",
+        "time_doing_ios_ms",
+        "weighted_time_ios_ms",
+    )
+
+    try:
+        with open('/proc/diskstats', 'r') as f:
+            for line in f:
+                parts = line.split()
+                if len(parts) < 14:
+                    continue
+                device = parts[2]
+                if not whole_disk.match(device):
+                    continue
+                result[device] = {
+                    name: safe_int(raw)
+                    for name, raw in zip(field_names, parts[3:14])
+                }
+    except OSError:
+        # /proc/diskstats is missing or unreadable: report no statistics.
+        pass
+
+    return result
+
+
+@require_root
+def run_io_test(device_path: str, test_size_mb: int = 100) -> Dict[str, Any]:
+    """
+    Run a short sequential read/write benchmark with fio.
+
+    NOTE: the fio job always works on a scratch file under /tmp;
+    ``device_path`` is only echoed back in the result and does not select
+    the tested device.
+
+    Args:
+        device_path: Device path recorded in the result.
+        test_size_mb: Size of the scratch file in MB.
+
+    Returns:
+        Dict[str, Any]: ``passed``, ``device``, ``test_size_mb``,
+        ``read_speed_mbps``, ``write_speed_mbps`` and a list of ``errors``.
+    """
+    # Imported locally so the function works regardless of which modules the
+    # file imports at top level.
+    import json
+    import os
+    import tempfile
+
+    result = {
+        "passed": False,
+        "device": device_path,
+        "test_size_mb": test_size_mb,
+        "read_speed_mbps": None,
+        "write_speed_mbps": None,
+        "errors": []
+    }
+
+    if not check_command_exists('fio'):
+        result["errors"].append("fio 未安装")
+        return result
+
+    # NOTE(review): direct=1 may be rejected when /tmp is a tmpfs - confirm.
+    fio_config = f"""
+[global]
+directory=/tmp
+filename=serverguard_test
+direct=1
+size={test_size_mb}M
+unlink=1
+
+[seq_read]
+stonewall
+rw=read
+bs=1M
+
+[seq_write]
+stonewall
+rw=write
+bs=1M
+"""
+
+    fio_file = None
+    try:
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.fio', delete=False) as f:
+            fio_file = f.name
+            f.write(fio_config)
+
+        _, stdout, _ = execute_command(
+            ['fio', fio_file, '--output-format=json'],
+            timeout=120,
+            check_returncode=False
+        )
+
+        data = json.loads(stdout)
+
+        for job in data.get('jobs', []):
+            job_name = job.get('jobname', '')
+
+            # A non-zero per-job error code means the measurement is invalid.
+            job_error = job.get('error', 0)
+            if job_error:
+                result["errors"].append(f"fio job {job_name} failed with error {job_error}")
+                continue
+
+            read_bw = job.get('read', {}).get('bw', 0) / 1024  # KiB/s -> MB/s
+            write_bw = job.get('write', {}).get('bw', 0) / 1024
+
+            if 'read' in job_name.lower() and read_bw > 0:
+                result["read_speed_mbps"] = round(read_bw, 2)
+            if 'write' in job_name.lower() and write_bw > 0:
+                result["write_speed_mbps"] = round(write_bw, 2)
+
+        result["passed"] = not result["errors"]
+
+    except Exception as e:
+        result["errors"].append(str(e))
+    finally:
+        # Always remove the temporary job file, even when fio failed.
+        if fio_file is not None:
+            try:
+                os.unlink(fio_file)
+            except OSError:
+                pass
+
+    return result
+
+
+if __name__ == '__main__':
+    # Manual run: print the storage check result as indented JSON.
+    import json
+    storage_report = run_storage_check()
+    print(json.dumps(storage_report, indent=2, ensure_ascii=False))
--- a/modules/system_info.py
+++ b/modules/system_info.py
@@ -0,0 +1,476 @@
+"""
+ServerGuard - 系统信息概览模块
+
+收集服务器的硬件和操作系统基本信息。
+"""
+
+import os
+import re
+import platform
+from typing import Dict, Any, List, Optional
+
+import sys
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from utils import (
+    execute_command, parse_key_value_output, check_command_exists,
+    safe_int, safe_float, format_bytes
+)
+
+
+def get_system_info() -> Dict[str, Any]:
+    """
+    Gather an overview of the host's hardware and operating system.
+
+    Each section is filled by its own collector, in order. The first
+    collector that raises stops the run: ``status`` becomes ``"error"``,
+    the message is stored under ``error`` and the remaining sections keep
+    their empty defaults.
+
+    Returns:
+        Dict[str, Any]: System information dictionary.
+    """
+    result = {"status": "success"}
+    for empty_dict_key in ("os", "cpu", "memory", "motherboard"):
+        result[empty_dict_key] = {}
+    for empty_list_key in ("storage", "network", "gpu"):
+        result[empty_list_key] = []
+
+    try:
+        collectors = (
+            ("os", get_os_info),
+            ("cpu", get_cpu_info),
+            ("memory", get_memory_info),
+            ("motherboard", get_motherboard_info),
+            ("storage", get_storage_list),
+            ("network", get_network_info),
+            ("gpu", get_gpu_list),
+        )
+        for section, collect in collectors:
+            result[section] = collect()
+    except Exception as exc:
+        result["status"] = "error"
+        result["error"] = str(exc)
+
+    return result
+
+
+def get_os_info() -> Dict[str, str]:
+    """Collect operating-system facts: platform, distribution, hostname, uptime.
+
+    Returns:
+        Dict[str, str]: ``platform``, ``release``, ``version``, ``machine``,
+        ``processor``, ``hostname`` and ``uptime``; ``distribution`` is added
+        when /etc/os-release provides a ``PRETTY_NAME``.
+    """
+    info = {
+        "platform": platform.system(),
+        "release": platform.release(),
+        "version": platform.version(),
+        "machine": platform.machine(),
+        "processor": platform.processor()
+    }
+
+    # Linux distribution name, when /etc/os-release is present.
+    try:
+        with open('/etc/os-release', 'r') as f:
+            for line in f:
+                if line.startswith('PRETTY_NAME='):
+                    # Values may be wrapped in double or single quotes.
+                    info["distribution"] = line.split('=', 1)[1].strip().strip('"\'')
+                    break
+    except (OSError, ValueError):
+        pass
+
+    # Hostname: ask the `hostname` command, fall back to the platform module.
+    try:
+        _, hostname, _ = execute_command(['hostname'], check_returncode=False)
+        info["hostname"] = hostname.strip() or platform.node() or "unknown"
+    except Exception:
+        info["hostname"] = platform.node() or "unknown"
+
+    # Uptime: the first field of /proc/uptime is seconds since boot.
+    try:
+        with open('/proc/uptime', 'r') as f:
+            uptime_seconds = float(f.readline().split()[0])
+        days = int(uptime_seconds // 86400)
+        hours = int((uptime_seconds % 86400) // 3600)
+        minutes = int((uptime_seconds % 3600) // 60)
+        info["uptime"] = f"{days}天 {hours}小时 {minutes}分钟"
+    except (OSError, ValueError, IndexError):
+        info["uptime"] = "unknown"
+
+    return info
+
+
+def get_cpu_info() -> Dict[str, Any]:
+    """Collect CPU model, topology and frequency information.
+
+    /proc/cpuinfo is read first; when ``lscpu`` is available its values
+    override or extend the result.
+
+    Returns:
+        Dict[str, Any]: ``model``, ``vendor``, ``architecture``, ``cores``,
+        ``threads``, ``frequency_mhz`` and ``cache_size_kb``, plus
+        ``max_frequency_mhz``, ``min_frequency_mhz`` and ``virtualization``
+        when lscpu reports them.
+    """
+    info = {
+        "model": "Unknown",
+        "vendor": "Unknown",
+        "architecture": "Unknown",
+        "cores": 0,
+        "threads": 0,
+        "frequency_mhz": 0,
+        "cache_size_kb": {}
+    }
+
+    # /proc/cpuinfo repeats the same keys per logical CPU; the last one wins.
+    cpu_data = {}
+    try:
+        with open('/proc/cpuinfo', 'r') as f:
+            for line in f:
+                key, sep, value = line.partition(':')
+                if sep:
+                    cpu_data[key.strip()] = value.strip()
+    except OSError:
+        pass
+    else:
+        info["model"] = cpu_data.get('model name', 'Unknown')
+        info["vendor"] = cpu_data.get('vendor_id', 'Unknown')
+        info["architecture"] = cpu_data.get('cpu architecture', platform.machine())
+        info["cores"] = safe_int(cpu_data.get('cpu cores', 0))
+        info["threads"] = safe_int(cpu_data.get('siblings', 0))
+        info["frequency_mhz"] = safe_int(cpu_data.get('cpu MHz', 0))
+
+        if 'cache size' in cpu_data:
+            info["cache_size_kb"] = {"general": cpu_data['cache size']}
+
+    # lscpu gives more reliable topology data.
+    if check_command_exists('lscpu'):
+        try:
+            _, stdout, _ = execute_command(['lscpu'], check_returncode=False, timeout=10)
+            lscpu_data = parse_key_value_output(stdout)
+        except Exception:
+            # Best effort: keep the /proc/cpuinfo values.
+            lscpu_data = {}
+
+        if 'Model name' in lscpu_data:
+            info["model"] = lscpu_data['Model name']
+        if 'Architecture' in lscpu_data:
+            info["architecture"] = lscpu_data['Architecture']
+        if 'CPU(s)' in lscpu_data:
+            info["threads"] = safe_int(lscpu_data['CPU(s)'])
+        if 'Core(s) per socket' in lscpu_data and 'Socket(s)' in lscpu_data:
+            cores_per_socket = safe_int(lscpu_data['Core(s) per socket'])
+            sockets = safe_int(lscpu_data['Socket(s)'])
+            info["cores"] = cores_per_socket * sockets
+        if 'CPU max MHz' in lscpu_data:
+            info["max_frequency_mhz"] = safe_float(lscpu_data['CPU max MHz'])
+        if 'CPU min MHz' in lscpu_data:
+            info["min_frequency_mhz"] = safe_float(lscpu_data['CPU min MHz'])
+        if 'Virtualization' in lscpu_data:
+            info["virtualization"] = lscpu_data['Virtualization']
+
+    return info
+
+
+def get_memory_info() -> Dict[str, Any]:
+    """Collect total/available memory and per-DIMM details.
+
+    Totals come from /proc/meminfo (``free -m`` as fallback); DIMM details,
+    slot counts and ECC capability come from ``dmidecode -t memory`` when it
+    is available.
+
+    Returns:
+        Dict[str, Any]: ``total_gb``, ``available_gb``, ``slots_total``,
+        ``slots_used``, ``slots``, ``type``, ``speed_mhz`` and
+        ``ecc_supported``.
+    """
+    info = {
+        "total_gb": 0,
+        "available_gb": 0,
+        "slots_total": 0,
+        "slots_used": 0,
+        "slots": [],
+        "type": "Unknown",
+        "speed_mhz": 0,
+        "ecc_supported": False
+    }
+
+    # Read /proc/meminfo once; it is reused for the ECC fallback below.
+    meminfo_text = ""
+    try:
+        with open('/proc/meminfo', 'r') as f:
+            meminfo_text = f.read()
+    except OSError:
+        pass
+
+    for line in meminfo_text.splitlines():
+        fields = line.split()
+        if len(fields) < 2:
+            continue
+        if fields[0] == 'MemTotal:':
+            info["total_gb"] = round(safe_int(fields[1]) / 1024 / 1024, 2)
+        elif fields[0] == 'MemAvailable:':
+            info["available_gb"] = round(safe_int(fields[1]) / 1024 / 1024, 2)
+
+    # Detailed DIMM information from dmidecode.
+    ecc_from_dmi = None
+    if check_command_exists('dmidecode'):
+        try:
+            _, stdout, _ = execute_command(
+                ['dmidecode', '-t', 'memory'],
+                check_returncode=False, timeout=15
+            )
+        except Exception:
+            stdout = ""
+
+        if stdout:
+            # dmidecode reports sizes in different units; normalise to GB.
+            to_gb = {'kB': 1 / 1024 / 1024, 'MB': 1 / 1024, 'GB': 1, 'TB': 1024}
+            slots = []
+
+            # The first chunk is the header / memory array, not a DIMM.
+            for device in stdout.split('Memory Device')[1:]:
+                slot = {}
+
+                size_match = re.search(r'^\s*Size:\s*(\d+)\s*(kB|MB|GB|TB)', device, re.MULTILINE)
+                if size_match:
+                    size = safe_int(size_match.group(1)) * to_gb[size_match.group(2)]
+                    slot["size_gb"] = round(size, 2)
+
+                type_match = re.search(r'^\s*Type:\s*((?:LP)?DDR\d*)', device, re.MULTILINE)
+                if type_match:
+                    slot["type"] = type_match.group(1)
+                    info["type"] = type_match.group(1)
+
+                speed_match = re.search(r'Speed:\s*(\d+)\s*(?:MT/s|MHz)', device)
+                if speed_match:
+                    slot["speed_mhz"] = safe_int(speed_match.group(1))
+
+                manufacturer_match = re.search(r'Manufacturer:\s*(\S+)', device)
+                if manufacturer_match:
+                    slot["manufacturer"] = manufacturer_match.group(1)
+
+                # Anchored so that "Bank Locator:" is not picked up.
+                locator_match = re.search(r'^\s*Locator:\s*(.+)', device, re.MULTILINE)
+                if locator_match:
+                    slot["locator"] = locator_match.group(1).strip()
+
+                # Empty slots have no numeric size and are not counted.
+                if slot.get("size_gb", 0) > 0:
+                    slots.append(slot)
+
+            info["slots"] = slots
+            info["slots_used"] = len(slots)
+
+            for slot in slots:
+                if slot.get("speed_mhz"):
+                    info["speed_mhz"] = slot["speed_mhz"]
+                    break
+
+            # A machine may expose several memory arrays; add them up.
+            device_counts = re.findall(r'Number Of Devices:\s*(\d+)', stdout)
+            if device_counts:
+                info["slots_total"] = sum(safe_int(count) for count in device_counts)
+            else:
+                info["slots_total"] = len(slots)
+
+            ecc_match = re.search(r'Error Correction Type:\s*(.+)', stdout)
+            if ecc_match:
+                ecc_from_dmi = 'ECC' in ecc_match.group(1)
+
+    # Fallback for the total when /proc/meminfo gave nothing.
+    if info["total_gb"] == 0 and check_command_exists('free'):
+        try:
+            _, stdout, _ = execute_command(['free', '-m'], check_returncode=False)
+            lines = stdout.strip().split('\n')
+            if len(lines) > 1:
+                parts = lines[1].split()
+                if len(parts) >= 2:
+                    info["total_gb"] = round(safe_int(parts[1]) / 1024, 2)
+        except Exception:
+            pass
+
+    # Prefer the DMI answer. The HardwareCorrupted counter is only a weak
+    # hint and is used solely when dmidecode gave no answer.
+    if ecc_from_dmi is not None:
+        info["ecc_supported"] = ecc_from_dmi
+    elif 'HardwareCorrupted' in meminfo_text:
+        info["ecc_supported"] = True
+
+    return info
+
+
+def get_motherboard_info() -> Dict[str, str]:
+    """Collect baseboard and BIOS identification via dmidecode.
+
+    The baseboard and BIOS queries are independent: a failure of one does
+    not discard the other's results. Placeholder values are ignored, so the
+    field keeps its ``"Unknown"`` default.
+
+    Returns:
+        Dict[str, str]: ``manufacturer``, ``product_name``, ``version``,
+        ``serial_number``, ``bios_vendor``, ``bios_version`` and
+        ``bios_date``.
+    """
+    info = {
+        "manufacturer": "Unknown",
+        "product_name": "Unknown",
+        "version": "Unknown",
+        "serial_number": "Unknown",
+        "bios_vendor": "Unknown",
+        "bios_version": "Unknown",
+        "bios_date": "Unknown"
+    }
+
+    if not check_command_exists('dmidecode'):
+        return info
+
+    # Values firmware vendors leave in unset fields.
+    placeholders = ('Not Specified', 'To be filled by O.E.M.', 'None')
+
+    # dmidecode type keyword -> {result key: regex for that field}
+    queries = (
+        ('baseboard', {
+            "manufacturer": r'Manufacturer:\s*(.+)',
+            "product_name": r'Product Name:\s*(.+)',
+            "version": r'Version:\s*(.+)',
+            "serial_number": r'Serial Number:\s*(.+)'
+        }),
+        ('bios', {
+            "bios_vendor": r'Vendor:\s*(.+)',
+            "bios_version": r'Version:\s*(.+)',
+            "bios_date": r'Release Date:\s*(.+)'
+        }),
+    )
+
+    for dmi_type, patterns in queries:
+        try:
+            _, stdout, _ = execute_command(
+                ['dmidecode', '-t', dmi_type],
+                check_returncode=False, timeout=10
+            )
+        except Exception:
+            # Best effort: skip this section and keep the defaults.
+            continue
+
+        for key, pattern in patterns.items():
+            match = re.search(pattern, stdout or "")
+            if not match:
+                continue
+            value = match.group(1).strip()
+            if value and value not in placeholders:
+                info[key] = value
+
+    return info
+
+
+def get_storage_list() -> List[Dict[str, Any]]:
+    """List block devices via ``lsblk``, falling back to /sys/block.
+
+    Returns:
+        List[Dict[str, Any]]: One dict per device with at least ``name``,
+        ``path``, ``size`` and ``type``. lsblk entries also carry ``model``,
+        ``vendor`` and ``is_rotational``.
+    """
+    devices = []
+
+    if check_command_exists('lsblk'):
+        try:
+            import json
+
+            _, stdout, _ = execute_command(
+                ['lsblk', '-d', '-o', 'NAME,SIZE,TYPE,MODEL,VENDOR,ROTA', '-n', '-J'],
+                check_returncode=False, timeout=10
+            )
+            data = json.loads(stdout) or {}
+
+            for device in data.get('blockdevices') or []:
+                # lsblk emits JSON null for missing columns, which would
+                # bypass a plain .get() default.
+                name = device.get('name') or 'unknown'
+
+                # ROTA is a boolean in newer lsblk versions and the string
+                # "0"/"1" in older ones.
+                rota = device.get('rota')
+                if rota is None:
+                    is_rotational = True
+                elif isinstance(rota, str):
+                    is_rotational = rota.strip().lower() in ('1', 'true')
+                else:
+                    is_rotational = bool(rota)
+
+                devices.append({
+                    "name": name,
+                    "path": f"/dev/{name}",
+                    "size": device.get('size') or 'unknown',
+                    "type": device.get('type') or 'unknown',
+                    "model": device.get('model') or 'unknown',
+                    "vendor": device.get('vendor') or 'unknown',
+                    "is_rotational": is_rotational
+                })
+
+        except Exception:
+            # Best effort: fall through to the /sys/block scan below.
+            devices = []
+
+    # Fallback: read /sys/block directly.
+    if not devices:
+        try:
+            block_names = os.listdir('/sys/block')
+        except OSError:
+            block_names = []
+
+        for name in block_names:
+            if not name.startswith(('sd', 'hd', 'nvme', 'vd')):
+                continue
+            dev_info = {"name": name, "path": f"/dev/{name}"}
+
+            # The size file counts 512-byte sectors.
+            try:
+                with open(f'/sys/block/{name}/size', 'r') as f:
+                    sectors = safe_int(f.read().strip())
+                dev_info["size"] = format_bytes(sectors * 512)
+            except OSError:
+                dev_info["size"] = "unknown"
+
+            # queue/rotational is "1" for spinning disks, "0" otherwise.
+            try:
+                with open(f'/sys/block/{name}/queue/rotational', 'r') as f:
+                    dev_info["is_rotational"] = f.read().strip() == '1'
+                dev_info["type"] = 'hdd' if dev_info["is_rotational"] else 'ssd'
+            except OSError:
+                dev_info["type"] = 'unknown'
+
+            devices.append(dev_info)
+
+    return devices
+
+
+def get_network_info() -> List[Dict[str, Any]]:
+    """List network interfaces with state, MAC address and IPv4 addresses.
+
+    Uses ``ip -j addr show``. The ``link`` form of the command does not
+    include ``addr_info``, so it could never yield IP addresses.
+
+    Returns:
+        List[Dict[str, Any]]: One dict per interface with ``name``,
+        ``state``, ``mac_address`` and ``type``; ``ip_addresses`` is added
+        when at least one IPv4 address is configured. Empty when ``ip`` is
+        missing or its output cannot be parsed.
+    """
+    interfaces = []
+
+    if not check_command_exists('ip'):
+        return interfaces
+
+    try:
+        import json
+
+        _, stdout, _ = execute_command(
+            ['ip', '-j', 'addr', 'show'],
+            check_returncode=False, timeout=10
+        )
+        data = json.loads(stdout) or []
+    except Exception:
+        # Best effort: no JSON support or command failure -> empty list.
+        return interfaces
+
+    for iface in data:
+        iface_info = {
+            "name": iface.get('ifname', 'unknown'),
+            "state": iface.get('operstate', 'unknown'),
+            "mac_address": iface.get('address', 'unknown'),
+            "type": iface.get('link_type', 'unknown')
+        }
+
+        # Keep IPv4 addresses only, formatted as address/prefix.
+        ips = [
+            f"{addr.get('local')}/{addr.get('prefixlen', '')}"
+            for addr in iface.get('addr_info') or []
+            if addr.get('family') == 'inet'
+        ]
+        if ips:
+            iface_info["ip_addresses"] = ips
+
+        interfaces.append(iface_info)
+
+    return interfaces
+
+
+def get_gpu_list() -> List[Dict[str, Any]]:
+    """List display adapters found by ``lspci``.
+
+    Returns:
+        List[Dict[str, Any]]: One dict per VGA / 3D / display controller
+        with ``bus_id``, ``description`` and ``type``; ``driver`` and
+        ``modules`` are added when ``lspci -v`` reports them.
+    """
+    gpus = []
+
+    if not check_command_exists('lspci'):
+        return gpus
+
+    try:
+        _, stdout, _ = execute_command(
+            ['lspci', '-nn'],
+            check_returncode=False, timeout=10
+        )
+    except Exception:
+        # Best effort: without the device list there is nothing to report.
+        return gpus
+
+    gpu_classes = ('VGA', '3D controller', 'Display controller')
+    for line in stdout.split('\n'):
+        if not any(marker in line for marker in gpu_classes):
+            continue
+
+        # "<bus id> <class> [code]: <description>"
+        parts = line.split(': ', 1)
+        if len(parts) != 2 or not parts[0].split():
+            continue
+
+        bus_id = parts[0].split()[0]
+        description = parts[1]
+
+        gpu_info = {
+            "bus_id": bus_id,
+            "description": description,
+            # Heuristic: any Intel adapter is reported as integrated.
+            "type": "integrated" if "Intel" in description else "discrete"
+        }
+
+        # Per-device details are optional; failure keeps the basic entry.
+        try:
+            _, detail, _ = execute_command(
+                ['lspci', '-v', '-s', bus_id],
+                check_returncode=False, timeout=5
+            )
+        except Exception:
+            detail = ""
+
+        driver_match = re.search(r'Kernel driver in use:\s*(\S+)', detail or "")
+        if driver_match:
+            gpu_info["driver"] = driver_match.group(1)
+
+        modules_match = re.search(r'Kernel modules:\s*(.+)', detail or "")
+        if modules_match:
+            gpu_info["modules"] = modules_match.group(1).strip()
+
+        gpus.append(gpu_info)
+
+    return gpus
+
+
+if __name__ == '__main__':
+    # Module self-test: print the collected system information as JSON.
+    import json
+    system_report = get_system_info()
+    print(json.dumps(system_report, indent=2, ensure_ascii=False))
--- a/quick_test.py
+++ b/quick_test.py
@@ -0,0 +1,189 @@
+#!/usr/bin/env python3
+"""
+ServerGuard - 快速测试脚本
+
+用于快速验证各模块是否正常工作，不进行压力测试。
+"""
+
+import sys
+import os
+
+# Raise the log level to WARNING to keep the test output quiet
+import logging
+logging.basicConfig(level=logging.WARNING)
+
+def test_imports():
+    """Try to import every ServerGuard module and report the outcome.
+
+    Returns:
+        bool: True when all modules import cleanly, False otherwise.
+    """
+    print("测试模块导入...")
+
+    failed = []
+    for module in (
+        'utils',
+        'reporter',
+        'modules.system_info',
+        'modules.cpu',
+        'modules.memory',
+        'modules.storage',
+        'modules.sensors',
+        'modules.gpu',
+        'modules.log_analyzer',
+    ):
+        try:
+            __import__(module)
+            print(f"  ✓ {module}")
+        except Exception as e:
+            print(f"  ✗ {module}: {e}")
+            failed.append(module)
+
+    if not failed:
+        print("\n所有模块导入成功!")
+        return True
+
+    print(f"\n有 {len(failed)} 个模块导入失败")
+    return False
+
+
+def test_basic_functions():
+    """Call one read-only entry point per module and print the outcome."""
+    print("\n测试基本功能...")
+
+    from modules import system_info, cpu, memory, storage, sensors, gpu, log_analyzer
+
+    # Entry points expected to return a dict
+    dict_tests = (
+        ("系统信息", system_info.get_system_info),
+        ("CPU 信息", cpu.get_cpu_details),
+        ("内存信息", memory.get_memory_summary),
+        ("传感器数据", sensors.get_lm_sensors_data),
+        ("日志分析", log_analyzer.analyze_logs),
+    )
+
+    # Entry points expected to return a list
+    list_tests = (
+        ("存储设备", storage.get_storage_devices),
+        ("GPU 信息", gpu.check_generic_gpus),
+    )
+
+    for name, func in dict_tests:
+        try:
+            result = func()
+            if not isinstance(result, dict):
+                print(f"  ✓ {name}: 正常 (返回 {type(result).__name__})")
+            elif result.get("status", "unknown") == "error":
+                print(f"  ⚠ {name}: 有错误 - {result.get('error', 'Unknown')}")
+            else:
+                print(f"  ✓ {name}: 正常")
+        except Exception as e:
+            print(f"  ✗ {name}: 异常 - {e}")
+
+    for name, func in list_tests:
+        try:
+            result = func()
+            if not isinstance(result, list):
+                print(f"  ⚠ {name}: 返回类型异常 - {type(result).__name__}")
+            else:
+                print(f"  ✓ {name}: 正常 (找到 {len(result)} 个项目)")
+        except Exception as e:
+            print(f"  ✗ {name}: 异常 - {e}")
+
+    print("\n基本功能测试完成")
+
+
+def test_utils():
+    """Spot-check the parsing and formatting helpers from ``utils``."""
+    print("\n测试工具函数...")
+
+    from utils import safe_int, safe_float, format_bytes
+
+    # safe_int: plain number, number with unit, explicit default
+    for raw, expected in (("123", 123), ("32 GB", 32)):
+        assert safe_int(raw) == expected
+    assert safe_int("invalid", -1) == -1
+    print("  ✓ safe_int")
+
+    # safe_float: plain number, number with unit suffix
+    for raw, expected in (("123.5", 123.5), ("2.5GHz", 2.5)):
+        assert safe_float(raw) == expected
+    print("  ✓ safe_float")
+
+    # format_bytes: KB and MB boundaries
+    for size, expected in ((1024, "1.00 KB"), (1024**2, "1.00 MB")):
+        assert format_bytes(size) == expected
+    print("  ✓ format_bytes")
+
+    print("\n工具函数测试通过")
+
+
+def test_report_generation():
+    """Render a small sample result in each format and print the length."""
+    print("\n测试报告生成...")
+
+    from reporter import ReportGenerator
+
+    generator = ReportGenerator()
+
+    # Minimal scan result with two module sections
+    cpu_section = {"status": "success", "temperature": {"current_c": 45}}
+    memory_section = {"status": "success", "total_gb": 32}
+    test_data = {
+        "scan_type": "test",
+        "timestamp": "2024-01-01 00:00:00",
+        "modules": {"cpu": cpu_section, "memory": memory_section},
+    }
+
+    for fmt in ('text', 'json', 'html'):
+        try:
+            report = generator.generate_report(test_data, fmt)
+            print(f"  ✓ {fmt.upper()} 格式: {len(report)} 字符")
+        except Exception as e:
+            print(f"  ✗ {fmt.upper()} 格式: {e}")
+
+    print("\n报告生成测试完成")
+
+
+def main():
+    """Run all quick tests in order and print usage hints at the end."""
+    banner = "=" * 60
+
+    print(banner)
+    print("ServerGuard 快速测试")
+    print(banner)
+    print()
+
+    # Import check first: nothing else can run if it fails
+    imports_ok = test_imports()
+    if not imports_ok:
+        print("\n模块导入测试失败，请检查依赖安装")
+        sys.exit(1)
+
+    # Helpers, then report rendering, then the module entry points
+    test_utils()
+    test_report_generation()
+    test_basic_functions()
+
+    print()
+    print(banner)
+    print("测试完成!")
+    print(banner)
+    print()
+    print("运行完整诊断命令:")
+    print("  sudo python3 main.py --quick    # 快速检测")
+    print("  sudo python3 main.py --full     # 全面诊断（含压力测试）")
+    print()
+
+
+# Run the quick test suite only when executed as a script.
+if __name__ == '__main__':
+    main()
--- a/reporter.py
+++ b/reporter.py
@@ -0,0 +1,387 @@
+"""
+ServerGuard - 报告生成模块
+
+负责将检测结果格式化为各种输出格式。
+"""
+
+import json
+import csv
+import os
+from typing import Dict, Any, List
+from datetime import datetime
+from io import StringIO
+
+try:
+    from rich.console import Console
+    from rich.table import Table
+    from rich.panel import Panel
+    from rich import box
+    HAS_RICH = True
+except ImportError:
+    HAS_RICH = False
+
+
+class ReportGenerator:
+    """Render diagnostic results as text, JSON, CSV or HTML.
+
+    Every formatter takes the same result dictionary: optional top-level
+    ``scan_type`` / ``timestamp`` / ``stress_duration`` entries plus a
+    ``modules`` mapping from module name to that module's result dictionary.
+    """
+
+    # Status values that have a matching ``.status-*`` rule in the HTML style
+    # sheet; any other status is drawn with the neutral "unknown" badge.
+    _STYLED_STATUSES = ('success', 'warning', 'error', 'unknown')
+
+    def __init__(self):
+        # The rich console is optional and only built when rich was imported.
+        self.console = Console() if HAS_RICH else None
+
+    @staticmethod
+    def _escape_html(value: Any) -> str:
+        """Return ``value`` as text that is safe to embed in HTML."""
+        # Imported locally so the module's import block stays untouched.
+        from html import escape
+        return escape(str(value), quote=True)
+
+    def generate_report(self, data: Dict[str, Any], format_type: str = 'text') -> str:
+        """
+        Build a report in the requested format.
+
+        Args:
+            data: Diagnostic result data.
+            format_type: Report format (text, json, csv, html). Any other
+                value falls back to the plain-text report.
+
+        Returns:
+            str: The formatted report.
+        """
+        if format_type == 'json':
+            return self._format_json_report(data)
+        elif format_type == 'csv':
+            return self._format_csv_report(data)
+        elif format_type == 'html':
+            return self._format_html_report(data)
+        else:
+            return self._format_text_report(data)
+
+    def save_report(self, data: Dict[str, Any], format_type: str, filepath: str):
+        """
+        Generate a report and write it to a file as UTF-8.
+
+        Args:
+            data: Diagnostic result data.
+            format_type: Report format.
+            filepath: Output file path; missing parent directories are created.
+        """
+        report = self.generate_report(data, format_type)
+
+        # A bare file name has no directory part, hence the '.' fallback.
+        os.makedirs(os.path.dirname(filepath) or '.', exist_ok=True)
+
+        with open(filepath, 'w', encoding='utf-8') as f:
+            f.write(report)
+
+    def _format_json_report(self, data: Dict[str, Any]) -> str:
+        """Serialize the result data as indented JSON."""
+        # default=str stringifies any value json cannot encode natively.
+        return json.dumps(data, indent=2, ensure_ascii=False, default=str)
+
+    def _format_csv_report(self, data: Dict[str, Any]) -> str:
+        """Render the result data as CSV text, one section per module."""
+        output = StringIO()
+        writer = csv.writer(output)
+
+        # Report header rows.
+        writer.writerow(['ServerGuard Diagnostic Report'])
+        writer.writerow(['Scan Type', data.get('scan_type', 'unknown')])
+        writer.writerow(['Timestamp', data.get('timestamp', '')])
+        writer.writerow([])
+
+        # One section per module, separated by an empty row.
+        for module_name, module_data in data.get('modules', {}).items():
+            writer.writerow([f'Module: {str(module_name).upper()}'])
+            writer.writerow(['Status', module_data.get('status', 'unknown')])
+
+            # Nested dictionaries are flattened into dotted keys.
+            self._write_dict_to_csv(writer, module_data, prefix='')
+            writer.writerow([])
+
+        return output.getvalue()
+
+    def _write_dict_to_csv(self, writer, data: Dict[str, Any], prefix: str = ''):
+        """Write a dictionary as key/value rows, recursing into nested dicts."""
+        for key, value in data.items():
+            # The status is already written by the caller.
+            if key == 'status':
+                continue
+            full_key = f"{prefix}.{key}" if prefix else key
+
+            if isinstance(value, dict):
+                self._write_dict_to_csv(writer, value, full_key)
+            elif isinstance(value, list):
+                writer.writerow([full_key, ', '.join(str(v) for v in value)])
+            else:
+                writer.writerow([full_key, value])
+
+    def _format_text_report(self, data: Dict[str, Any]) -> str:
+        """Render the result data as a plain-text report."""
+        lines = []
+
+        # Report header. str() keeps a non-string scan type from raising.
+        lines.append("=" * 70)
+        lines.append("ServerGuard 硬件健康诊断报告")
+        lines.append("=" * 70)
+        lines.append(f"扫描类型: {str(data.get('scan_type', 'unknown')).upper()}")
+        lines.append(f"生成时间: {data.get('timestamp', '')}")
+        if 'stress_duration' in data:
+            lines.append(f"压力测试时长: {data['stress_duration']} 秒")
+        lines.append("=" * 70)
+        lines.append("")
+
+        # Per-module sections.
+        for module_name, module_data in data.get('modules', {}).items():
+            lines.append(f"\n[{str(module_name).upper()}]")
+            lines.append("-" * 70)
+
+            status = module_data.get('status', 'unknown')
+            status_symbol = '✓' if status == 'success' else '⚠' if status == 'warning' else '✗'
+            lines.append(f"状态: {status_symbol} {str(status).upper()}")
+
+            if 'error' in module_data:
+                lines.append(f"错误: {module_data['error']}")
+
+            # Module-specific details.
+            self._format_module_text(lines, module_name, module_data)
+
+            lines.append("")
+
+        # Report footer.
+        lines.append("=" * 70)
+        lines.append("报告结束")
+        lines.append("=" * 70)
+
+        return '\n'.join(lines)
+
+    def _format_module_text(self, lines: List[str], module_name: str, data: Dict[str, Any]):
+        """Append the module-specific text lines for one module to ``lines``."""
+        if module_name == 'system':
+            if 'cpu' in data:
+                cpu = data['cpu']
+                lines.append(f"CPU: {cpu.get('model', 'N/A')}")
+                lines.append(f"  核心数: {cpu.get('cores', 'N/A')} 核 / {cpu.get('threads', 'N/A')} 线程")
+
+            if 'memory' in data:
+                mem = data['memory']
+                lines.append(f"内存: 总计 {mem.get('total_gb', 'N/A')} GB, {mem.get('slots_used', 'N/A')} 个插槽")
+
+            if 'storage' in data:
+                lines.append(f"存储设备: {len(data['storage'])} 个设备")
+
+        elif module_name == 'cpu':
+            if 'temperature' in data:
+                temp = data['temperature']
+                lines.append(f"CPU 温度: {temp.get('current_c', 'N/A')}°C")
+            if 'mce_errors' in data:
+                mce = data['mce_errors']
+                lines.append(f"MCE 错误: {mce.get('count', 0)} 个")
+            if 'stress_test' in data:
+                stress = data['stress_test']
+                lines.append(f"压力测试: {'通过' if stress.get('passed') else '失败'}")
+                lines.append(f"  运行时长: {stress.get('duration_seconds', 'N/A')} 秒")
+
+        elif module_name == 'memory':
+            if 'ecc_status' in data:
+                ecc = data['ecc_status']
+                lines.append(f"ECC 支持: {'是' if ecc.get('supported') else '否'}")
+                # A present-but-None error count is treated as zero instead
+                # of raising on the comparison.
+                ecc_errors = ecc.get('errors') or 0
+                if ecc_errors > 0:
+                    lines.append(f"ECC 错误: {ecc_errors} 个")
+            if 'stress_test' in data:
+                st = data['stress_test']
+                lines.append(f"内存压力测试: {'通过' if st.get('passed') else '失败'}")
+                if st.get('tool'):
+                    lines.append(f"  使用工具: {st.get('tool')}")
+                if st.get('size_mb'):
+                    lines.append(f"  测试大小: {st.get('size_mb')} MB")
+
+        elif module_name == 'storage':
+            for device in data.get('devices', []):
+                lines.append(f"设备 {device.get('name', 'N/A')}:")
+                lines.append(f"  型号: {device.get('model', 'N/A')}")
+                lines.append(f"  健康状态: {device.get('health', 'N/A')}")
+                if 'smart_status' in device:
+                    smart = device['smart_status']
+                    lines.append(f"  SMART: {smart.get('overall', 'N/A')}")
+
+        elif module_name == 'sensors':
+            if 'temperatures' in data:
+                lines.append("温度传感器:")
+                for name, value in data['temperatures'].items():
+                    lines.append(f"  {name}: {value}°C")
+            if 'voltages' in data:
+                lines.append("电压传感器:")
+                for name, value in data['voltages'].items():
+                    lines.append(f"  {name}: {value}V")
+
+        elif module_name == 'logs':
+            if 'hardware_errors' in data:
+                errors = data['hardware_errors']
+                total = sum(errors.values())
+                lines.append(f"硬件错误总计: {total} 个")
+                for error_type, count in errors.items():
+                    if count > 0:
+                        lines.append(f"  {error_type}: {count} 个")
+
+    def _format_html_report(self, data: Dict[str, Any]) -> str:
+        """Render the result data as a standalone HTML page.
+
+        Every value taken from ``data`` is HTML-escaped before it is placed
+        in the page, so characters such as ``<`` or ``&`` in error messages
+        cannot break the markup.
+        """
+        html_parts = []
+
+        # Static document head with the embedded style sheet.
+        html_parts.append("""<!DOCTYPE html>
+<html lang="zh-CN">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>ServerGuard 诊断报告</title>
+    <style>
+        body {
+            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
+            line-height: 1.6;
+            color: #333;
+            max-width: 1200px;
+            margin: 0 auto;
+            padding: 20px;
+            background: #f5f5f5;
+        }
+        .header {
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            color: white;
+            padding: 30px;
+            border-radius: 10px;
+            margin-bottom: 20px;
+        }
+        .header h1 {
+            margin: 0;
+            font-size: 2em;
+        }
+        .header .meta {
+            margin-top: 10px;
+            opacity: 0.9;
+        }
+        .module {
+            background: white;
+            border-radius: 8px;
+            padding: 20px;
+            margin-bottom: 20px;
+            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+        }
+        .module-header {
+            display: flex;
+            justify-content: space-between;
+            align-items: center;
+            border-bottom: 2px solid #eee;
+            padding-bottom: 10px;
+            margin-bottom: 15px;
+        }
+        .module-title {
+            font-size: 1.5em;
+            font-weight: bold;
+            color: #444;
+        }
+        .status {
+            padding: 5px 15px;
+            border-radius: 20px;
+            font-weight: bold;
+            font-size: 0.9em;
+        }
+        .status-success { background: #d4edda; color: #155724; }
+        .status-warning { background: #fff3cd; color: #856404; }
+        .status-error { background: #f8d7da; color: #721c24; }
+        .status-unknown { background: #e2e3e5; color: #383d41; }
+        .info-grid {
+            display: grid;
+            grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
+            gap: 15px;
+        }
+        .info-item {
+            padding: 10px;
+            background: #f8f9fa;
+            border-radius: 5px;
+        }
+        .info-label {
+            font-weight: bold;
+            color: #666;
+            font-size: 0.9em;
+        }
+        .info-value {
+            margin-top: 5px;
+            font-size: 1.1em;
+        }
+        .footer {
+            text-align: center;
+            color: #666;
+            margin-top: 30px;
+            padding: 20px;
+        }
+        .error-box {
+            background: #f8d7da;
+            border: 1px solid #f5c6cb;
+            color: #721c24;
+            padding: 15px;
+            border-radius: 5px;
+            margin: 10px 0;
+        }
+    </style>
+</head>
+<body>""")
+
+        # Page header. Upper-casing happens before escaping so that the
+        # generated entities are not altered.
+        scan_type = self._escape_html(str(data.get('scan_type', 'unknown')).upper())
+        timestamp = self._escape_html(data.get('timestamp', ''))
+        html_parts.append(f"""
+    <div class="header">
+        <h1>🔧 ServerGuard 硬件健康诊断报告</h1>
+        <div class="meta">
+            扫描类型: {scan_type} |
+            生成时间: {timestamp}
+        </div>
+    </div>""")
+
+        # One card per module.
+        for module_name, module_data in data.get('modules', {}).items():
+            status = str(module_data.get('status', 'unknown'))
+            # Only statuses with a CSS rule are used as a class name; this
+            # also keeps arbitrary text out of the class attribute.
+            css_status = status if status in self._STYLED_STATUSES else 'unknown'
+            status_class = f'status-{css_status}'
+            module_title = self._escape_html(str(module_name).upper())
+            status_label = self._escape_html(status.upper())
+
+            html_parts.append(f"""
+    <div class="module">
+        <div class="module-header">
+            <span class="module-title">{module_title}</span>
+            <span class="status {status_class}">{status_label}</span>
+        </div>""")
+
+            if 'error' in module_data:
+                # A failed module shows only its error message.
+                error_text = self._escape_html(module_data['error'])
+                html_parts.append(f"""
+        <div class="error-box">
+            <strong>错误:</strong> {error_text}
+        </div>""")
+            else:
+                html_parts.append('        <div class="info-grid">')
+                self._format_module_html(html_parts, module_name, module_data)
+                html_parts.append('        </div>')
+
+            html_parts.append('    </div>')
+
+        # Page footer.
+        html_parts.append("""
+    <div class="footer">
+        <p>由 ServerGuard 生成</p>
+    </div>
+</body>
+</html>""")
+
+        return '\n'.join(html_parts)
+
+    def _format_module_html(self, html_parts: List[str], module_name: str, data: Dict[str, Any]):
+        """Append one info card per top-level entry of a module's result.
+
+        Dictionaries and lists are summarised by their size; other values are
+        shown as escaped text.
+        """
+        for key, value in data.items():
+            # The status is already shown as the badge in the module header.
+            if key == 'status':
+                continue
+
+            display_key = self._escape_html(str(key).replace('_', ' ').title())
+
+            if isinstance(value, dict):
+                display_value = f"{len(value)} 项数据"
+            elif isinstance(value, list):
+                display_value = f"{len(value)} 个项目"
+            else:
+                display_value = self._escape_html(value)
+
+            html_parts.append(f"""
+            <div class="info-item">
+                <div class="info-label">{display_key}</div>
+                <div class="info-value">{display_value}</div>
+            </div>""")
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+rich>=13.0.0
+psutil>=5.9.0
--- a/tests/init.py
+++ b/tests/init.py
@@ -0,0 +1,3 @@
+"""
+ServerGuard 测试模块
+"""
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -0,0 +1,175 @@
+"""
+测试各个硬件检测模块
+"""
+
+import unittest
+from unittest.mock import patch, MagicMock
+import sys
+import os
+
+# 添加父目录到路径
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from modules import system_info, cpu, memory, storage, sensors, gpu, log_analyzer
+
+
+class TestSystemInfo(unittest.TestCase):
+    """Tests for the system information module."""
+
+    @patch('modules.system_info.execute_command')
+    def test_get_os_info(self, mock_exec):
+        # Command execution is stubbed to report a fixed hostname.
+        mock_exec.return_value = (0, "test-hostname\n", "")
+        os_info = system_info.get_os_info()
+        for expected_key in ("platform", "machine"):
+            self.assertIn(expected_key, os_info)
+
+    def test_get_cpu_info(self):
+        cpu_info = system_info.get_cpu_info()
+        self.assertIn("model", cpu_info)
+        # Whatever was detected, the model is reported as a string.
+        model = cpu_info["model"]
+        self.assertIsInstance(model, str)
+
+    def test_get_memory_info(self):
+        memory_info = system_info.get_memory_info()
+        self.assertIn("total_gb", memory_info)
+        total_gb = memory_info["total_gb"]
+        self.assertIsInstance(total_gb, (int, float))
+
+    def test_get_system_info(self):
+        overview = system_info.get_system_info()
+        for expected_key in ("status", "cpu", "memory"):
+            self.assertIn(expected_key, overview)
+
+
+class TestCPU(unittest.TestCase):
+    """Tests for the CPU module."""
+
+    def test_get_cpu_details(self):
+        details = cpu.get_cpu_details()
+        for expected_key in ("model", "cores"):
+            self.assertIn(expected_key, details)
+        core_count = details["cores"]
+        self.assertIsInstance(core_count, int)
+
+    def test_get_cpu_temperature(self):
+        temperature = cpu.get_cpu_temperature()
+        for expected_key in ("status", "sensors"):
+            self.assertIn(expected_key, temperature)
+
+    def test_get_load_average(self):
+        load = cpu.get_load_average()
+        for window in ("1min", "5min", "15min"):
+            self.assertIn(window, load)
+
+    def test_check_mce_errors(self):
+        mce = cpu.check_mce_errors()
+        for expected_key in ("count", "status"):
+            self.assertIn(expected_key, mce)
+
+
+class TestMemory(unittest.TestCase):
+    """Tests for the memory module."""
+
+    def test_get_memory_summary(self):
+        summary = memory.get_memory_summary()
+        for expected_key in ("total_bytes", "total_gb"):
+            self.assertIn(expected_key, summary)
+        total_gb = summary["total_gb"]
+        self.assertIsInstance(total_gb, (int, float))
+
+    def test_get_dimm_info(self):
+        dimms = memory.get_dimm_info()
+        self.assertIsInstance(dimms, list)
+
+    def test_check_ecc_status(self):
+        ecc = memory.check_ecc_status()
+        self.assertIn("supported", ecc)
+        supported = ecc["supported"]
+        self.assertIsInstance(supported, bool)
+
+    def test_check_edac_errors(self):
+        edac = memory.check_edac_errors()
+        self.assertIn("total_errors", edac)
+        total_errors = edac["total_errors"]
+        self.assertIsInstance(total_errors, int)
+
+
+class TestStorage(unittest.TestCase):
+    """Tests for the storage module."""
+
+    def test_get_storage_devices(self):
+        devices = storage.get_storage_devices()
+        self.assertIsInstance(devices, list)
+
+    def test_check_raid_status(self):
+        raid = storage.check_raid_status()
+        self.assertIn("arrays", raid)
+        arrays = raid["arrays"]
+        self.assertIsInstance(arrays, list)
+
+    def test_get_io_statistics(self):
+        io_stats = storage.get_io_statistics()
+        self.assertIsInstance(io_stats, dict)
+
+
+class TestSensors(unittest.TestCase):
+    """Tests for the sensors module."""
+
+    def test_get_lm_sensors_data(self):
+        lm_data = sensors.get_lm_sensors_data()
+        self.assertIn("available", lm_data)
+
+    def test_get_thermal_zones(self):
+        thermal = sensors.get_thermal_zones()
+        self.assertIn("zones", thermal)
+        zones = thermal["zones"]
+        self.assertIsInstance(zones, dict)
+
+    def test_get_power_supply_info(self):
+        power = sensors.get_power_supply_info()
+        self.assertIn("supplies", power)
+        supplies = power["supplies"]
+        self.assertIsInstance(supplies, list)
+
+
+class TestGPU(unittest.TestCase):
+    """Tests for the GPU module."""
+
+    def test_check_generic_gpus(self):
+        detected = gpu.check_generic_gpus()
+        self.assertIsInstance(detected, list)
+
+    def test_check_gpu_dmesg_errors(self):
+        dmesg_errors = gpu.check_gpu_dmesg_errors()
+        self.assertIsInstance(dmesg_errors, list)
+
+
+class TestLogAnalyzer(unittest.TestCase):
+    """Tests for the log analysis module."""
+
+    def test_get_kernel_panic_logs(self):
+        panics = log_analyzer.get_kernel_panic_logs()
+        self.assertIsInstance(panics, list)
+
+    def test_get_hardware_error_logs(self):
+        hardware_logs = log_analyzer.get_hardware_error_logs()
+        for category in ("mce_errors", "ecc_errors", "io_errors"):
+            self.assertIn(category, hardware_logs)
+
+    def test_summarize_errors(self):
+        # Counts from both sources are expected to be added per category.
+        dmesg_counts = {"cpu_errors": 5, "memory_errors": 3}
+        journal_counts = {"cpu_errors": 2, "memory_errors": 1}
+        analysis = {
+            "dmesg_analysis": {"error_counts": dmesg_counts},
+            "journal_analysis": {"error_counts": journal_counts},
+        }
+        summary = log_analyzer.summarize_errors(analysis)
+        self.assertEqual(summary["cpu_errors"], 7)
+        self.assertEqual(summary["memory_errors"], 4)
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -0,0 +1,94 @@
+"""
+测试 utils 模块
+"""
+
+import unittest
+import sys
+import os
+
+# 添加父目录到路径
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from utils import (
+    parse_key_value_output, parse_table_output, extract_with_regex,
+    safe_int, safe_float, format_bytes, sanitize_filename,
+    merge_dicts, check_command_exists
+)
+
+
+class TestParseFunctions(unittest.TestCase):
+    """Tests for the parsing, conversion and formatting helpers in utils."""
+
+    def test_parse_key_value_output(self):
+        # Covers a comment line and a value that itself contains the delimiter.
+        text = """
+Key1: Value1
+Key2: Value2
+# Comment line
+Key3: Value with: colon
+"""
+        result = parse_key_value_output(text)
+        self.assertEqual(result["Key1"], "Value1")
+        self.assertEqual(result["Key2"], "Value2")
+        self.assertEqual(result["Key3"], "Value with: colon")
+
+    def test_parse_table_output(self):
+        # The text repeats the header row that is also passed explicitly.
+        text = """
+NAME   SIZE TYPE MODEL
+sda    1T   disk Samsung SSD
+sdb    2T   disk WD HDD
+"""
+        result = parse_table_output(text, headers=["NAME", "SIZE", "TYPE", "MODEL"])
+        self.assertEqual(len(result), 2)
+        self.assertEqual(result[0]["NAME"], "sda")
+        self.assertEqual(result[1]["TYPE"], "disk")
+
+    def test_extract_with_regex(self):
+        text = "Temperature: 45.5 degrees"
+        result = extract_with_regex(text, r'Temperature:\s*([\d.]+)')
+        self.assertEqual(result, "45.5")
+
+    def test_safe_int(self):
+        self.assertEqual(safe_int("123"), 123)
+        self.assertEqual(safe_int("123.5"), 123)
+        self.assertEqual(safe_int("1,234"), 1234)
+        self.assertEqual(safe_int("32 GB"), 32)
+        self.assertEqual(safe_int("invalid"), 0)
+        self.assertEqual(safe_int("invalid", -1), -1)
+
+    def test_safe_float(self):
+        self.assertEqual(safe_float("123.5"), 123.5)
+        self.assertEqual(safe_float("2.5GHz"), 2.5)
+        self.assertEqual(safe_float("invalid"), 0.0)
+
+    def test_format_bytes(self):
+        self.assertEqual(format_bytes(0), "0 B")
+        self.assertEqual(format_bytes(1024), "1.00 KB")
+        self.assertEqual(format_bytes(1024**2), "1.00 MB")
+        self.assertEqual(format_bytes(1024**3), "1.00 GB")
+
+    def test_sanitize_filename(self):
+        self.assertEqual(sanitize_filename("file<name>.txt"), "file_name_.txt")
+        # '/' is in the set of characters sanitize_filename replaces, so path
+        # separators do not survive.
+        self.assertEqual(sanitize_filename("path/to/file"), "path_to_file")
+
+    def test_merge_dicts(self):
+        base = {"a": 1, "b": {"c": 2}}
+        update = {"b": {"d": 3}, "e": 4}
+        result = merge_dicts(base, update)
+        self.assertEqual(result["a"], 1)
+        self.assertEqual(result["b"]["c"], 2)
+        self.assertEqual(result["b"]["d"], 3)
+        self.assertEqual(result["e"], 4)
+
+
+class TestCommandFunctions(unittest.TestCase):
+    """Tests for the command-related helpers."""
+
+    def test_check_command_exists(self):
+        # (command, expected): `ls` is taken to be present, the other name is
+        # not expected to exist.
+        present = check_command_exists("ls")
+        self.assertTrue(present)
+        missing = check_command_exists("nonexistent_command_12345")
+        self.assertFalse(missing)
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/utils.py
+++ b/utils.py
@@ -0,0 +1,419 @@
+"""
+ServerGuard - 通用工具库
+
+提供命令执行、日志配置、输出解析等通用功能。
+"""
+
+import subprocess
+import logging
+import sys
+import os
+import re
+import json
+from typing import List, Dict, Any, Optional, Tuple, Union
+from datetime import datetime
+
+
+class ServerGuardError(Exception):
+    """Base class for all ServerGuard exceptions."""
+
+
+class CommandExecutionError(ServerGuardError):
+    """Raised when an external command fails to run or complete."""
+
+
+class PermissionError(ServerGuardError):
+    """Raised when an operation lacks the required privileges.
+
+    NOTE: inside this module the name shadows the builtin ``PermissionError``.
+    """
+
+
+def execute_command(
+    cmd_list: List[str],
+    timeout: int = 60,
+    check_returncode: bool = True,
+    capture_output: bool = True,
+    shell: bool = False,
+    input_data: Optional[str] = None
+) -> Tuple[int, str, str]:
+    """
+    Run an external command and return its exit code and output.
+
+    Args:
+        cmd_list: The command and its arguments as a list. A single command
+            string is also accepted (as used together with ``shell=True``).
+        timeout: Timeout for the command in seconds.
+        check_returncode: Raise when the command exits with a non-zero code.
+        capture_output: Capture stdout/stderr; when False both returned
+            strings are empty.
+        shell: Run the command through the shell (a warning is logged).
+        input_data: Text passed to the command's stdin. An empty string is
+            passed on as empty input; None leaves stdin untouched.
+
+    Returns:
+        Tuple[returncode, stdout, stderr]
+
+    Raises:
+        CommandExecutionError: The command was not found, timed out, or
+            exited non-zero while ``check_returncode`` is True.
+        PermissionError: This module's PermissionError, raised when the
+            operating system denies execution.
+    """
+    # This module defines its own PermissionError, which shadows the builtin
+    # of the same name. The error raised by subprocess is the builtin one, so
+    # it has to be caught through the builtins module.
+    import builtins
+
+    logger = logging.getLogger(__name__)
+
+    # Render the command once for log and error messages. A plain string must
+    # not be joined character by character.
+    if isinstance(cmd_list, str):
+        cmd_str = cmd_list
+        tokens = cmd_list.split()
+        program = tokens[0] if tokens else cmd_list
+    else:
+        cmd_str = ' '.join(str(part) for part in cmd_list)
+        program = cmd_list[0] if cmd_list else ''
+
+    # Shell execution is allowed but always flagged in the log.
+    if shell:
+        logger.warning(f"Using shell=True with command: {cmd_str}")
+
+    kwargs = {
+        'timeout': timeout,
+        'shell': shell,
+        'universal_newlines': True  # Python 3.6 compatible version of text=True
+    }
+    if capture_output:
+        kwargs['stdout'] = subprocess.PIPE
+        kwargs['stderr'] = subprocess.PIPE
+    if input_data is not None:
+        kwargs['input'] = input_data
+
+    try:
+        logger.debug(f"Executing command: {cmd_str}")
+        result = subprocess.run(cmd_list, **kwargs)
+    except subprocess.TimeoutExpired as exc:
+        error_msg = f"Command timed out after {timeout}s: {cmd_str}"
+        logger.error(error_msg)
+        raise CommandExecutionError(error_msg) from exc
+    except FileNotFoundError as exc:
+        error_msg = f"Command not found: {program}"
+        logger.error(error_msg)
+        raise CommandExecutionError(error_msg) from exc
+    except builtins.PermissionError as exc:
+        error_msg = f"Permission denied executing: {cmd_str}"
+        logger.error(error_msg)
+        raise PermissionError(error_msg) from exc
+
+    # stdout/stderr are None when output was not captured.
+    stdout = result.stdout if result.stdout else ""
+    stderr = result.stderr if result.stderr else ""
+
+    if check_returncode and result.returncode != 0:
+        error_msg = f"Command failed with code {result.returncode}: {cmd_str}\nstderr: {stderr}"
+        logger.error(error_msg)
+        raise CommandExecutionError(error_msg)
+
+    return result.returncode, stdout, stderr
+
+
+def check_root_privileges() -> bool:
+    """
+    Tell whether the process runs with an effective user id of 0 (root).
+
+    Returns:
+        bool: True when the effective user id is 0.
+    """
+    effective_uid = os.geteuid()
+    return effective_uid == 0
+
+
+def require_root(func):
+    """
+    Decorator: only run the function when the process has root privileges.
+
+    Without root the wrapped function is not called; a warning is logged and
+    an error dictionary (``status`` / ``error``) is returned instead.
+    """
+    # Imported locally so the module's import block stays untouched.
+    import functools
+
+    # wraps() keeps the decorated function's name and docstring, which also
+    # keeps the name in the log message below meaningful under stacking.
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        if not check_root_privileges():
+            logging.warning(f"Function {func.__name__} requires root privileges")
+            return {
+                "status": "error",
+                "error": "This function requires root privileges. Please run with sudo."
+            }
+        return func(*args, **kwargs)
+    return wrapper
+
+
+def setup_logging(
+    log_file: Optional[str] = None,
+    level: int = logging.INFO,
+    console_output: bool = True
+) -> logging.Logger:
+    """
+    Configure the root logger.
+
+    Handlers already attached to the root logger are removed and closed, so
+    calling this function repeatedly does not leak open log files.
+
+    Args:
+        log_file: Path of the log file; None disables file logging. Missing
+            parent directories are created.
+        level: Logging level.
+        console_output: Whether to log to stdout.
+
+    Returns:
+        logging.Logger: The configured root logger.
+    """
+    logger = logging.getLogger()
+    logger.setLevel(level)
+
+    # Detach and close the old handlers; iterate over a copy because
+    # removeHandler mutates the list.
+    for old_handler in list(logger.handlers):
+        logger.removeHandler(old_handler)
+        old_handler.close()
+
+    formatter = logging.Formatter(
+        '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        datefmt='%Y-%m-%d %H:%M:%S'
+    )
+
+    if console_output:
+        console_handler = logging.StreamHandler(sys.stdout)
+        console_handler.setFormatter(formatter)
+        logger.addHandler(console_handler)
+
+    if log_file:
+        # A bare file name has no directory part, hence the '.' fallback.
+        os.makedirs(os.path.dirname(log_file) or '.', exist_ok=True)
+        # Explicit UTF-8 so non-ASCII messages do not depend on the locale.
+        file_handler = logging.FileHandler(log_file, encoding='utf-8')
+        file_handler.setFormatter(formatter)
+        logger.addHandler(file_handler)
+
+    return logger
+
+
+def parse_key_value_output(text: str, delimiter: str = ':') -> Dict[str, str]:
+    """
+    Parse text made of ``key<delimiter>value`` lines into a dictionary.
+
+    Blank lines, lines starting with ``#`` and lines without the delimiter
+    are skipped. Only the first delimiter on a line splits key from value.
+
+    Args:
+        text: The text to parse.
+        delimiter: Separator between key and value.
+
+    Returns:
+        Dict[str, str]: Stripped keys mapped to stripped values.
+    """
+    parsed = {}
+    for raw_line in text.strip().split('\n'):
+        entry = raw_line.strip()
+        is_skippable = (not entry) or entry.startswith('#')
+        if is_skippable:
+            continue
+
+        pieces = entry.split(delimiter, 1)
+        if len(pieces) != 2:
+            continue
+        raw_key, raw_value = pieces
+        parsed[raw_key.strip()] = raw_value.strip()
+
+    return parsed
+
+
+def parse_table_output(text: str, headers: Optional[List[str]] = None) -> List[Dict[str, str]]:
+    """
+    Parse whitespace-separated tabular text into a list of row dictionaries.
+
+    Args:
+        text: The text to parse.
+        headers: Column names. When None they are taken from the first
+            non-empty line. When given and the first line of the text repeats
+            exactly these names, that line is treated as the header row and
+            not returned as data.
+
+    Returns:
+        List[Dict[str, str]]: One dictionary per data row. Rows with fewer
+        values than headers are skipped; surplus values are ignored.
+    """
+    lines = [line.strip() for line in text.strip().split('\n') if line.strip()]
+    if not lines:
+        return []
+
+    if headers is None:
+        # Auto-detect the column names from the first line.
+        headers = lines[0].split()
+        data_lines = lines[1:]
+    elif lines[0].split() == list(headers):
+        # The text carries its own header row; it is not a data row.
+        data_lines = lines[1:]
+    else:
+        data_lines = lines
+
+    result = []
+    for line in data_lines:
+        values = line.split()
+        if len(values) >= len(headers):
+            # zip stops at the last header, dropping any surplus values.
+            result.append(dict(zip(headers, values)))
+
+    return result
+
+
+def extract_with_regex(text: str, pattern: str, group: int = 1, default: Any = None) -> Any:
+    """
+    Search ``text`` for ``pattern`` and return one capture group.
+
+    Args:
+        text: The text to search.
+        pattern: Regular expression pattern.
+        group: Index of the capture group to return.
+        default: Value returned when nothing matches or the group index does
+            not exist in the pattern.
+
+    Returns:
+        The requested group of the first match, or ``default``.
+    """
+    found = re.search(pattern, text)
+    if found is None:
+        return default
+
+    try:
+        captured = found.group(group)
+    except IndexError:
+        # The pattern has no group with this index.
+        return default
+    return captured
+
+
+def safe_int(value: Any, default: int = 0) -> int:
+    """
+    Convert a value to an integer without raising.
+
+    Strings are reduced to their digits, dots and minus signs first, so
+    separators and unit suffixes are ignored ("1,234" -> 1234,
+    "32 GB" -> 32). Fractional values are truncated ("123.5" -> 123).
+
+    Args:
+        value: The value to convert.
+        default: Value returned when the conversion fails.
+
+    Returns:
+        int: The converted integer, or ``default``.
+    """
+    try:
+        if isinstance(value, str):
+            # Keep only the characters that can be part of a number.
+            value = re.sub(r'[^\d.-]', '', value)
+            try:
+                # Direct conversion keeps large integers exact; going
+                # through float would round them.
+                return int(value)
+            except ValueError:
+                pass
+        elif isinstance(value, int):
+            return int(value)
+        return int(float(value))
+    except (ValueError, TypeError, OverflowError):
+        # OverflowError covers infinite floats, which int() cannot represent.
+        return default
+
+
+def safe_float(value: Any, default: float = 0.0) -> float:
+    """
+    Convert a value to a float without raising on bad input.
+
+    Strings are stripped of whitespace, commas and any other character that
+    is not a digit, dot or minus sign before conversion ("2.5GHz" -> 2.5).
+
+    Args:
+        value: The value to convert.
+        default: Value returned when the conversion fails.
+
+    Returns:
+        float: The converted number, or ``default``.
+    """
+    candidate = value
+    if isinstance(candidate, str):
+        candidate = candidate.strip().lower()
+        # First drop separators, then everything that is not numeric.
+        for junk_pattern in (r'[\s,]', r'[^\d.-]'):
+            candidate = re.sub(junk_pattern, '', candidate)
+
+    try:
+        return float(candidate)
+    except (ValueError, TypeError):
+        return default
+
+
+def get_timestamp() -> str:
+    """
+    Return the current local time as ``YYYY-MM-DD HH:MM:SS``.
+
+    Returns:
+        str: The formatted timestamp.
+    """
+    now = datetime.now()
+    return f"{now:%Y-%m-%d %H:%M:%S}"
+
+
+def get_file_timestamp() -> str:
+    """
+    Return the current local time as ``YYYYMMDD_HHMMSS`` for use in file names.
+
+    Returns:
+        str: The formatted timestamp.
+    """
+    now = datetime.now()
+    return f"{now:%Y%m%d_%H%M%S}"
+
+
+def read_file_lines(filepath: str, max_lines: int = 1000) -> List[str]:
+    """
+    Read up to ``max_lines`` lines of a text file.
+
+    The file is decoded as UTF-8 with undecodable bytes ignored. A file that
+    cannot be read yields an empty list and a logged warning.
+
+    Args:
+        filepath: Path of the file.
+        max_lines: Maximum number of lines to read.
+
+    Returns:
+        List[str]: The lines without their trailing newline.
+    """
+    collected = []
+    try:
+        with open(filepath, 'r', encoding='utf-8', errors='ignore') as handle:
+            for raw_line in handle:
+                # Stop once the limit is reached.
+                if len(collected) >= max_lines:
+                    break
+                collected.append(raw_line.rstrip('\n'))
+    except (IOError, OSError) as e:
+        logging.getLogger(__name__).warning(f"Failed to read file {filepath}: {e}")
+        return []
+    return collected
+
+
+def check_command_exists(command: str) -> bool:
+    """
+    Check whether a command can be found on the executable search path.
+
+    Args:
+        command: Name of the command.
+
+    Returns:
+        bool: True when an executable with this name is found.
+    """
+    # shutil.which does the PATH lookup in-process, so the check does not
+    # depend on an external `which` binary being installed.
+    import shutil
+
+    return shutil.which(command) is not None
+
+
+def format_bytes(size_bytes: int) -> str:
+    """
+    Format a byte count as a human-readable string using 1024-based units.
+
+    Args:
+        size_bytes: Number of bytes. Negative values (such as differences)
+            are scaled by their magnitude and keep their sign.
+
+    Returns:
+        str: The value with two decimals and a unit, e.g. "1.00 KB";
+        exactly zero is returned as "0 B".
+    """
+    if size_bytes == 0:
+        return "0 B"
+
+    units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB']
+    unit_index = 0
+    size = float(size_bytes)
+
+    # Compare the magnitude so that negative sizes are scaled as well; the
+    # largest unit is the cap for very large values.
+    while abs(size) >= 1024 and unit_index < len(units) - 1:
+        size /= 1024
+        unit_index += 1
+
+    return f"{size:.2f} {units[unit_index]}"
+
+
+def sanitize_filename(filename: str) -> str:
+    """
+    Make a string usable as a file name.
+
+    Each of the characters ``< > : " / \\ | ? *`` is replaced by an
+    underscore, then leading and trailing dots and spaces are removed.
+
+    Args:
+        filename: The original file name.
+
+    Returns:
+        str: The cleaned file name.
+    """
+    unsafe_chars = r'[<>:"/\\|?*]'
+    replaced = re.sub(unsafe_chars, '_', filename)
+    return replaced.strip('. ')
+
+
+def merge_dicts(base: Dict[str, Any], update: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Merge ``update`` into a copy of ``base``, recursing into nested dicts.
+
+    When both sides hold a dictionary under the same key the two are merged;
+    in every other case the value from ``update`` wins.
+
+    Args:
+        base: The base dictionary.
+        update: The dictionary whose entries take precedence.
+
+    Returns:
+        Dict[str, Any]: The merged dictionary.
+    """
+    merged = base.copy()
+    for key, incoming in update.items():
+        both_are_dicts = (
+            key in merged
+            and isinstance(merged[key], dict)
+            and isinstance(incoming, dict)
+        )
+        merged[key] = merge_dicts(merged[key], incoming) if both_are_dicts else incoming
+    return merged