From c4f4fefa0a1189e520d58bfd1dc9b958f0814e21 Mon Sep 17 00:00:00 2001 From: zj <1052308357@qq.com> Date: Mon, 2 Mar 2026 14:14:40 +0800 Subject: [PATCH] first commit --- .gitignore | 54 ++++ README.md | 111 ++++++++ config/config.yaml | 121 ++++++++ install.sh | 280 +++++++++++++++++++ main.py | 419 ++++++++++++++++++++++++++++ modules/__init__.py | 15 + modules/cpu.py | 518 ++++++++++++++++++++++++++++++++++ modules/gpu.py | 497 +++++++++++++++++++++++++++++++++ modules/log_analyzer.py | 553 ++++++++++++++++++++++++++++++++++++ modules/memory.py | 577 ++++++++++++++++++++++++++++++++++++++ modules/sensors.py | 545 ++++++++++++++++++++++++++++++++++++ modules/storage.py | 602 ++++++++++++++++++++++++++++++++++++++++ modules/system_info.py | 476 +++++++++++++++++++++++++++++++ quick_test.py | 189 +++++++++++++ reporter.py | 387 ++++++++++++++++++++++++++ requirements.txt | 2 + tests/__init__.py | 3 + tests/test_modules.py | 175 ++++++++++++ tests/test_utils.py | 94 +++++++ utils.py | 419 ++++++++++++++++++++++++++++ 20 files changed, 6037 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 config/config.yaml create mode 100755 install.sh create mode 100755 main.py create mode 100644 modules/__init__.py create mode 100644 modules/cpu.py create mode 100644 modules/gpu.py create mode 100644 modules/log_analyzer.py create mode 100644 modules/memory.py create mode 100644 modules/sensors.py create mode 100644 modules/storage.py create mode 100644 modules/system_info.py create mode 100755 quick_test.py create mode 100644 reporter.py create mode 100644 requirements.txt create mode 100644 tests/__init__.py create mode 100644 tests/test_modules.py create mode 100644 tests/test_utils.py create mode 100644 utils.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0676001 --- /dev/null +++ b/.gitignore @@ -0,0 +1,54 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ 
+downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# 虚拟环境 +venv/ +env/ +ENV/ +.venv/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# 日志和报告 +*.log +reports/ +*.json +*.csv +*.html + +# 配置文件(可能包含敏感信息) +config/local_config.yaml +config/secrets.yaml + +# 操作系统 +.DS_Store +Thumbs.db + +# 临时文件 +tmp/ +temp/ +*.tmp diff --git a/README.md b/README.md new file mode 100644 index 0000000..c4d637c --- /dev/null +++ b/README.md @@ -0,0 +1,111 @@ +# ServerGuard - 服务器硬件健康诊断系统 + +ServerGuard 是一款基于 Python 的 Linux 命令行工具,用于诊断服务器硬件(CPU、内存、存储、电源、显卡等)的潜在故障。 + +## 功能特性 + +- **硬件信息概览**:收集 CPU、内存、主板、存储、显卡等详细信息 +- **CPU 检测**:温度监控、MCE 错误检查、压力测试 +- **内存检测**:DIMM 信息、ECC 状态检查、内存压力测试 +- **存储检测**:SMART 数据分析、I/O 性能测试、RAID 状态检查 +- **传感器监控**:电压、风扇转速、温度监控(支持 IPMI) +- **显卡检测**:GPU 信息、温度、驱动状态检查 +- **日志分析**:自动扫描系统日志中的硬件错误 +- **报告生成**:支持 JSON、CSV、纯文本、HTML 格式 + +## 安装 + +### 系统要求 + +- Python 3.8+ +- Linux 操作系统 +- root 权限(大多数硬件诊断功能需要) + +### 安装系统依赖 + +**Debian/Ubuntu:** +```bash +sudo apt update +sudo apt install -y lshw dmidecode smartmontools lm-sensors stress-ng memtester ipmitool edac-utils fio mdadm pciutils usbutils +``` + +**CentOS/RHEL:** +```bash +sudo yum install -y lshw dmidecode smartmontools lm_sensors stress-ng memtester OpenIPMI edac-utils fio mdadm pciutils usbutils +``` + +### 安装 Python 依赖 + +```bash +python3 -m venv venv +source venv/bin/activate +pip install -r requirements.txt +``` + +## 使用方法 + +### 快速检测(非侵入性) + +```bash +sudo python3 main.py --quick +``` + +### 全面诊断(包含压力测试) + +```bash +sudo python3 main.py --full +``` + +### 运行特定模块 + +```bash +sudo python3 main.py --module cpu +sudo python3 main.py --module memory +sudo python3 main.py --module storage +``` + +### 生成不同格式的报告 + +```bash +sudo python3 main.py --full --format json --output report.json +sudo python3 main.py --full --format html --output report.html +``` + +### 查看帮助 + +```bash +python3 main.py --help +``` + +## 项目结构 + +``` +ServerGuard/ +├── main.py # 程序入口和核心调度器 
+├── utils.py # 通用工具库 +├── reporter.py # 报告生成模块 +├── requirements.txt # Python 依赖 +├── README.md # 项目说明 +├── config/ +│ └── config.yaml # 配置文件 +├── modules/ +│ ├── __init__.py +│ ├── system_info.py # 系统信息概览 +│ ├── cpu.py # CPU 检测 +│ ├── memory.py # 内存检测 +│ ├── storage.py # 存储检测 +│ ├── sensors.py # 传感器监控 +│ ├── gpu.py # 显卡检测 +│ └── log_analyzer.py # 日志分析 +└── tests/ # 测试文件 +``` + +## 注意事项 + +1. **权限要求**:大多数硬件诊断功能需要 root 权限运行 +2. **压力测试**:全面诊断中的压力测试会占用大量系统资源,建议在维护窗口期进行 +3. **数据安全**:存储设备坏块扫描可能破坏数据,请谨慎使用 + +## 许可证 + +MIT License diff --git a/config/config.yaml b/config/config.yaml new file mode 100644 index 0000000..0da1ec2 --- /dev/null +++ b/config/config.yaml @@ -0,0 +1,121 @@ +# ServerGuard 配置文件 + +# 应用设置 +app: + name: "ServerGuard" + version: "1.0.0" + description: "服务器硬件健康诊断系统" + +# 日志设置 +logging: + level: INFO # DEBUG, INFO, WARNING, ERROR + file: "/var/log/serverguard.log" + max_size_mb: 100 + backup_count: 5 + console_output: true + +# 报告设置 +report: + default_format: "text" # text, json, csv, html + output_directory: "./reports" + include_timestamp: true + max_report_size_mb: 10 + +# 检测模块设置 +modules: + # CPU 检测设置 + cpu: + enabled: true + temperature_warning: 85 # 温度警告阈值(摄氏度) + temperature_critical: 95 # 温度危险阈值(摄氏度) + stress_test: + duration_seconds: 300 # 压力测试持续时间 + check_mce: true # 检查 MCE 错误 + + # 内存检测设置 + memory: + enabled: true + memtester: + enabled: true + memory_percent: 70 # 使用可用内存的百分比进行测试 + stress_test: + duration_seconds: 300 + check_ecc: true # 检查 ECC 错误 + + # 存储检测设置 + storage: + enabled: true + smart_check: true + check_reallocated_sectors: true + reallocated_threshold: 1 # 重映射扇区警告阈值 + temperature_warning: 60 # 硬盘温度警告阈值 + temperature_critical: 70 # 硬盘温度危险阈值 + run_io_test: false # 是否运行 I/O 性能测试(耗时) + io_test_size_mb: 100 + check_raid: true # 检查 RAID 状态 + + # 传感器检测设置 + sensors: + enabled: true + lm_sensors: true + ipmi: true + check_fans: true + fan_min_rpm: 500 # 风扇最低转速警告阈值 + voltage_tolerance: 0.1 # 电压偏差容忍度(比例) + + # GPU 检测设置 + gpu: + enabled: true 
+ check_nvidia: true + check_amd: true + check_intel: true + temperature_warning: 85 + + # 日志分析设置 + log_analyzer: + enabled: true + check_dmesg: true + check_journalctl: true + max_lines: 5000 + lookback_days: 7 # 分析最近几天的日志 + +# 告警设置 +alerts: + enabled: false + smtp: + host: "" + port: 587 + username: "" + password: "" + use_tls: true + from_address: "serverguard@example.com" + to_addresses: [] + + webhook: + enabled: false + url: "" + headers: {} + + # 告警阈值 + thresholds: + cpu_temperature: 85 + memory_usage_percent: 90 + disk_usage_percent: 90 + hardware_error_count: 1 + +# 压力测试设置(全面诊断模式) +stress_test: + cpu: + enabled: true + workers: 0 # 0 表示使用所有核心 + timeout_seconds: 300 + + memory: + enabled: true + workers: 4 + timeout_seconds: 300 + + io: + enabled: false # I/O 压力测试可能很危险,默认关闭 + workers: 4 + timeout_seconds: 300 diff --git a/install.sh b/install.sh new file mode 100755 index 0000000..8e77f41 --- /dev/null +++ b/install.sh @@ -0,0 +1,280 @@ +#!/bin/bash +# ServerGuard 安装脚本 +# 支持 Debian/Ubuntu 和 CentOS/RHEL + +echo "========================================" +echo "ServerGuard 安装脚本" +echo "========================================" +echo "" + +# 检查是否为 root +if [ "$EUID" -ne 0 ]; then + echo "错误: 请以 root 权限运行此脚本" + echo " sudo ./install.sh" + exit 1 +fi + +# 检测 Linux 发行版 +if [ -f /etc/os-release ]; then + . /etc/os-release + OS=$NAME + VER=$VERSION_ID +else + echo "无法检测操作系统类型" + exit 1 +fi + +echo "检测到操作系统: $OS $VER" +echo "" + +# 记录安装失败的包 +FAILED_PACKAGES="" + +# 安装单个包的函数 +install_package() { + local pkg=$1 + local pkg_manager=$2 + + if [ "$pkg_manager" = "apt" ]; then + apt-get install -y "$pkg" 2>/dev/null && return 0 + else + yum install -y "$pkg" 2>/dev/null && return 0 + fi + + FAILED_PACKAGES="$FAILED_PACKAGES $pkg" + return 1 +} + +# 安装 Debian/Ubuntu 依赖 +install_debian_deps() { + echo "正在安装 Debian/Ubuntu 依赖..." 
+ apt-get update + + # 核心依赖(必须) + CORE_PKGS="lshw dmidecode smartmontools lm-sensors ipmitool mdadm pciutils usbutils util-linux coreutils grep gawk sed" + + # 可选依赖 + OPTIONAL_PKGS="stress-ng memtester edac-utils fio nvme-cli" + + echo "安装核心依赖..." + for pkg in $CORE_PKGS; do + install_package "$pkg" "apt" || echo "警告: $pkg 安装失败" + done + + echo "安装可选依赖..." + for pkg in $OPTIONAL_PKGS; do + install_package "$pkg" "apt" || echo "注意: $pkg 安装失败(可选)" + done +} + +# 安装 RHEL/CentOS 依赖 +install_redhat_deps() { + echo "正在安装 RHEL/CentOS 依赖..." + + # 尝试启用 EPEL + if ! rpm -qa | grep -q epel-release; then + echo "启用 EPEL 仓库..." + yum install -y epel-release 2>/dev/null || true + fi + + # 对于 CentOS 8/RHEL 8,启用 PowerTools/CRB 仓库 + if [[ "$VER" == 8* ]] || [[ "$VER" == "8" ]]; then + echo "启用 PowerTools 仓库..." + yum config-manager --set-enabled powertools 2>/dev/null || \ + yum config-manager --set-enabled PowerTools 2>/dev/null || true + + # 尝试启用 CRB (CodeReady Builder) 对于 RHEL 8 + subscription-manager repos --enable codeready-builder-for-rhel-8-x86_64-rpms 2>/dev/null || true + fi + + # 核心依赖(必须) + CORE_PKGS="lshw dmidecode smartmontools lm_sensors ipmitool mdadm pciutils usbutils util-linux coreutils grep gawk sed" + + echo "安装核心依赖..." + for pkg in $CORE_PKGS; do + install_package "$pkg" "yum" || echo "警告: $pkg 安装失败" + done + + # 尝试安装 OpenIPMI (替代 ipmitool 的依赖) + install_package "OpenIPMI" "yum" || echo "注意: OpenIPMI 安装失败(可选)" + + # 可选依赖 + OPTIONAL_PKGS="memtester edac-utils fio nvme-cli" + + echo "安装可选依赖..." + for pkg in $OPTIONAL_PKGS; do + install_package "$pkg" "yum" || echo "注意: $pkg 安装失败(可选)" + done + + # 特别处理 stress-ng + echo "尝试安装 stress-ng..." + if ! yum install -y stress-ng 2>/dev/null; then + echo "注意: stress-ng 从默认仓库安装失败" + + # 尝试从 EPEL 安装 stress (备选) + echo "尝试安装 stress 作为备选..." 
+ if yum install -y stress 2>/dev/null; then + echo "stress 安装成功,可作为压力测试备选工具" + else + echo "警告: stress 和 stress-ng 都安装失败" + echo " 压力测试功能将不可用" + FAILED_PACKAGES="$FAILED_PACKAGES stress-ng" + fi + fi + + # 对于 CentOS 8,提供手动安装 stress-ng 的指导 + if [[ "$VER" == 8* ]] && [[ "$FAILED_PACKAGES" == *"stress-ng"* ]]; then + echo "" + echo "============================================" + echo "注意: CentOS 8 中 stress-ng 需要从源码编译安装" + echo "============================================" + echo "手动安装步骤:" + echo " 1. 安装编译依赖:" + echo " yum install -y gcc make libaio-devel libattr-devel libbsd-devel libcap-devel libgcrypt-devel" + echo " 2. 下载并编译 stress-ng:" + echo " cd /tmp" + echo " git clone https://github.com/ColinIanKing/stress-ng.git" + echo " cd stress-ng" + echo " make" + echo " make install" + echo "============================================" + echo "" + fi +} + +# 根据发行版安装 +case "$OS" in + *Debian*|*Ubuntu*) + install_debian_deps + ;; + *CentOS*|*Red*Hat*|*Fedora*|*Alma*|*Rocky*) + install_redhat_deps + ;; + *) + echo "不支持的操作系统: $OS" + echo "请手动安装以下工具:" + echo " lshw, dmidecode, smartmontools, lm-sensors, stress-ng, memtester" + echo " ipmitool, edac-utils, fio, mdadm, pciutils, usbutils" + exit 1 + ;; +esac + +echo "" +echo "系统依赖安装完成" + +# 显示安装失败的包 +if [ -n "$FAILED_PACKAGES" ]; then + echo "" + echo "以下包安装失败: $FAILED_PACKAGES" + echo "某些功能可能受限,ServerGuard 仍可运行基本检测" +fi + +echo "" + +# 检查 Python 版本 +echo "检查 Python 版本..." +if command -v python3 &> /dev/null; then + PYTHON_VERSION=$(python3 --version 2>&1 | awk '{print $2}') + echo "找到 Python $PYTHON_VERSION" +elif command -v python &> /dev/null; then + PYTHON_VERSION=$(python --version 2>&1 | awk '{print $2}') + echo "找到 Python $PYTHON_VERSION" +else + echo "错误: 未找到 Python" + echo "请安装 Python 3.6 或更高版本" + exit 1 +fi + +# 检查 Python 版本号 +PYTHON_MAJOR=$(echo $PYTHON_VERSION | cut -d. -f1) +PYTHON_MINOR=$(echo $PYTHON_VERSION | cut -d. 
-f2) + +if [ "$PYTHON_MAJOR" -lt 3 ] || ([ "$PYTHON_MAJOR" -eq 3 ] && [ "$PYTHON_MINOR" -lt 6 ]); then + echo "错误: Python 版本过低 ($PYTHON_VERSION)" + echo "需要 Python 3.6 或更高版本" + exit 1 +fi + +echo "Python 版本符合要求" +echo "" + +# 安装 Python 依赖 +echo "安装 Python 依赖..." +PIP_CMD="pip3" +if ! command -v pip3 &> /dev/null; then + PIP_CMD="pip" +fi + +$PIP_CMD install -r requirements.txt || { + echo "警告: pip 安装失败,尝试使用 --user 选项" + $PIP_CMD install --user -r requirements.txt +} + +echo "" + +# 配置 lm-sensors +if command -v sensors-detect &> /dev/null; then + echo "" + echo "检测到 lm-sensors 需要配置" + echo "是否要运行 sensors-detect 配置传感器? (y/N)" + read -r response + if [[ "$response" =~ ^([yY][eE][sS]|[yY])$ ]]; then + echo "正在运行 sensors-detect..." + sensors-detect --auto || true + fi +fi + +echo "" +echo "========================================" +echo "安装完成!" +echo "========================================" +echo "" + +# 检查依赖状态 +echo "依赖检查:" +echo "------------" +for cmd in lshw dmidecode smartctl sensors ipmitool; do + if command -v "$cmd" &> /dev/null; then + echo " ✓ $cmd" + else + echo " ✗ $cmd (未安装)" + fi +done + +echo "" +echo "压力测试工具:" +if command -v stress-ng &> /dev/null; then + echo " ✓ stress-ng (推荐)" +elif command -v stress &> /dev/null; then + echo " ✓ stress (备选)" +else + echo " ✗ stress/stress-ng (未安装,压力测试不可用)" +fi + +echo "" +echo "使用方法:" +echo " 快速检测: sudo python3 main.py --quick" +echo " 全面诊断: sudo python3 main.py --full" +echo " 特定模块: sudo python3 main.py --module cpu" +echo " 生成报告: sudo python3 main.py --quick --format json --output report.json" +echo "" +echo "查看帮助: python3 main.py --help" +echo "" + +# 创建快捷方式(可选) +echo "是否要创建 /usr/local/bin/serverguard 快捷方式? 
(y/N)" +read -r response +if [[ "$response" =~ ^([yY][eE][sS]|[yY])$ ]]; then + SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + cat > /usr/local/bin/serverguard << EOF +#!/bin/bash +cd "$SCRIPT_DIR" +python3 main.py "\$@" +EOF + chmod +x /usr/local/bin/serverguard + echo "快捷方式已创建: serverguard" + echo "现在可以直接使用: sudo serverguard --quick" +fi + +echo "" +echo "安装完成!" diff --git a/main.py b/main.py new file mode 100755 index 0000000..86a11a5 --- /dev/null +++ b/main.py @@ -0,0 +1,419 @@ +#!/usr/bin/env python3 +""" +ServerGuard - 服务器硬件健康诊断系统 + +主程序入口,负责命令行参数解析、模块调度和报告生成。 + +使用方法: + sudo python3 main.py --quick # 快速检测 + sudo python3 main.py --full # 全面诊断(含压力测试) + sudo python3 main.py --module cpu # 仅检测 CPU + sudo python3 main.py --full --format json --output report.json +""" + +import argparse +import sys +import os +from typing import Optional, Dict, Any + +from utils import setup_logging, check_root_privileges, get_file_timestamp +from reporter import ReportGenerator + + +def parse_arguments() -> argparse.Namespace: + """ + 解析命令行参数。 + + Returns: + argparse.Namespace: 解析后的参数 + """ + parser = argparse.ArgumentParser( + prog='ServerGuard', + description='服务器硬件健康诊断系统 - 用于诊断 Linux 服务器硬件故障', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +示例: + %(prog)s --quick # 快速硬件检测 + %(prog)s --full # 全面诊断(含压力测试) + %(prog)s --module cpu # 仅检测 CPU + %(prog)s --module memory,storage # 检测内存和存储 + %(prog)s --full --format json # 生成 JSON 格式报告 + %(prog)s --list-modules # 列出所有可用模块 + +注意: 大多数诊断功能需要 root 权限,请使用 sudo 运行。 + """ + ) + + # 主要操作模式(互斥) + mode_group = parser.add_mutually_exclusive_group(required=True) + mode_group.add_argument( + '--quick', '-q', + action='store_true', + help='快速检测模式(非侵入性,仅收集信息)' + ) + mode_group.add_argument( + '--full', '-f', + action='store_true', + help='全面诊断模式(包含压力测试,耗时较长)' + ) + mode_group.add_argument( + '--module', '-m', + type=str, + metavar='MODULE', + help='运行指定模块,多个模块用逗号分隔 (cpu,memory,storage,sensors,gpu,logs)' + ) + 
mode_group.add_argument( + '--list-modules', '-l', + action='store_true', + help='列出所有可用的检测模块' + ) + + # 报告选项 + parser.add_argument( + '--format', + type=str, + choices=['text', 'json', 'csv', 'html'], + default='text', + help='报告格式 (默认: text)' + ) + parser.add_argument( + '--output', '-o', + type=str, + metavar='FILE', + help='输出文件路径(不指定则输出到控制台)' + ) + parser.add_argument( + '--log', + type=str, + metavar='FILE', + default='/var/log/serverguard.log', + help='日志文件路径 (默认: /var/log/serverguard.log)' + ) + + # 测试参数 + parser.add_argument( + '--stress-duration', + type=int, + default=300, + metavar='SECONDS', + help='压力测试持续时间,单位秒 (默认: 300)' + ) + parser.add_argument( + '--verbose', '-v', + action='store_true', + help='显示详细输出' + ) + parser.add_argument( + '--yes', '-y', + action='store_true', + help='自动确认所有警告提示(如压力测试警告)' + ) + + return parser.parse_args() + + +def list_available_modules(): + """列出所有可用的检测模块。""" + modules = { + 'system': '系统信息概览', + 'cpu': 'CPU 检测与压力测试', + 'memory': '内存检测与压力测试', + 'storage': '存储设备检测', + 'sensors': '电源与传感器监控', + 'gpu': '显卡检测', + 'logs': '日志分析' + } + + print("可用的检测模块:") + print("-" * 40) + for name, description in modules.items(): + print(f" {name:12} - {description}") + print("-" * 40) + print("\n使用示例:") + print(" sudo python3 main.py --module cpu") + print(" sudo python3 main.py --module cpu,memory,storage") + + +def confirm_stress_test(duration: int, auto_confirm: bool = False) -> bool: + """ + 确认是否执行压力测试。 + + Args: + duration: 压力测试持续时间 + auto_confirm: 是否自动确认 + + Returns: + bool: 是否继续 + """ + if auto_confirm: + return True + + print("\n" + "=" * 60) + print("警告:即将执行压力测试") + print("=" * 60) + print(f"测试持续时间: {duration} 秒 ({duration // 60} 分钟)") + print("此测试将占用大量系统资源,可能导致:") + print(" - CPU 和内存使用率接近 100%") + print(" - 系统响应变慢") + print(" - 温度升高") + print("建议在维护窗口期进行,并确保服务器可接受高负载。") + print("=" * 60) + + try: + response = input("\n是否继续? 
[y/N]: ").strip().lower() + return response in ('y', 'yes') + except KeyboardInterrupt: + print("\n操作已取消") + return False + + +def run_module(module_name: str, stress_test: bool = False, stress_duration: int = 300) -> Dict[str, Any]: + """ + 运行指定的检测模块。 + + Args: + module_name: 模块名称 + stress_test: 是否执行压力测试 + stress_duration: 压力测试持续时间 + + Returns: + Dict[str, Any]: 模块检测结果 + """ + import logging + logger = logging.getLogger(__name__) + + module_map = { + 'system': 'modules.system_info', + 'cpu': 'modules.cpu', + 'memory': 'modules.memory', + 'storage': 'modules.storage', + 'sensors': 'modules.sensors', + 'gpu': 'modules.gpu', + 'logs': 'modules.log_analyzer' + } + + if module_name not in module_map: + logger.error(f"未知模块: {module_name}") + return {"status": "error", "error": f"未知模块: {module_name}"} + + try: + module = __import__(module_map[module_name], fromlist=['']) + + if module_name == 'system': + return module.get_system_info() + elif module_name == 'cpu': + return module.run_cpu_check(stress_test, stress_duration) + elif module_name == 'memory': + return module.run_memory_check(stress_test, stress_duration) + elif module_name == 'storage': + return module.run_storage_check() + elif module_name == 'sensors': + return module.run_sensors_check() + elif module_name == 'gpu': + return module.run_gpu_check() + elif module_name == 'logs': + return module.analyze_logs() + + except Exception as e: + logger.error(f"运行模块 {module_name} 时出错: {e}") + return {"status": "error", "error": str(e)} + + +def run_quick_check() -> Dict[str, Any]: + """ + 执行快速检测(非侵入性)。 + + Returns: + Dict[str, Any]: 检测结果 + """ + import logging + logger = logging.getLogger(__name__) + + print("正在执行快速硬件检测...") + print("-" * 60) + + results = { + "scan_type": "quick", + "timestamp": get_file_timestamp(), + "modules": {} + } + + modules_to_run = ['system', 'cpu', 'memory', 'storage', 'sensors', 'gpu', 'logs'] + + for module_name in modules_to_run: + print(f"正在检测: {module_name}...", end=' ', flush=True) + 
try: + result = run_module(module_name, stress_test=False) + results["modules"][module_name] = result + status = result.get("status", "unknown") + if status == "success": + print("[完成]") + elif status == "warning": + print("[警告]") + elif status == "error": + print("[错误]") + else: + print(f"[{status}]") + except Exception as e: + logger.error(f"模块 {module_name} 执行失败: {e}") + results["modules"][module_name] = {"status": "error", "error": str(e)} + print("[失败]") + + print("-" * 60) + return results + + +def run_full_diagnostic(stress_duration: int, auto_confirm: bool = False) -> Dict[str, Any]: + """ + 执行全面诊断(包含压力测试)。 + + Args: + stress_duration: 压力测试持续时间 + auto_confirm: 是否自动确认 + + Returns: + Dict[str, Any]: 检测结果 + """ + import logging + logger = logging.getLogger(__name__) + + if not confirm_stress_test(stress_duration, auto_confirm): + print("诊断已取消") + sys.exit(0) + + print("\n正在执行全面硬件诊断...") + print("=" * 60) + + results = { + "scan_type": "full", + "timestamp": get_file_timestamp(), + "stress_duration": stress_duration, + "modules": {} + } + + # 先执行快速检测 + modules_to_run = ['system', 'cpu', 'memory', 'storage', 'sensors', 'gpu', 'logs'] + + for module_name in modules_to_run: + print(f"\n正在检测: {module_name}...") + try: + # CPU 和内存执行压力测试 + do_stress = module_name in ['cpu', 'memory'] + result = run_module(module_name, stress_test=do_stress, stress_duration=stress_duration) + results["modules"][module_name] = result + status = result.get("status", "unknown") + print(f" 状态: {status}") + except Exception as e: + logger.error(f"模块 {module_name} 执行失败: {e}") + results["modules"][module_name] = {"status": "error", "error": str(e)} + print(f" 状态: 失败 - {e}") + + print("\n" + "=" * 60) + return results + + +def run_specific_modules(module_list: str, stress_duration: int) -> Dict[str, Any]: + """ + 运行指定的模块列表。 + + Args: + module_list: 逗号分隔的模块名称 + stress_duration: 压力测试持续时间 + + Returns: + Dict[str, Any]: 检测结果 + """ + modules = [m.strip() for m in module_list.split(',')] + + 
results = { + "scan_type": "custom", + "timestamp": get_file_timestamp(), + "modules": {} + } + + print(f"正在执行自定义模块检测: {', '.join(modules)}") + print("-" * 60) + + for module_name in modules: + print(f"正在检测: {module_name}...", end=' ', flush=True) + try: + result = run_module(module_name, stress_test=False) + results["modules"][module_name] = result + status = result.get("status", "unknown") + print(f"[{status}]") + except Exception as e: + results["modules"][module_name] = {"status": "error", "error": str(e)} + print(f"[失败: {e}]") + + print("-" * 60) + return results + + +def main(): + """程序主入口。""" + args = parse_arguments() + + # 设置日志 + log_level = logging.DEBUG if args.verbose else logging.INFO + setup_logging( + log_file=args.log if check_root_privileges() else None, + level=log_level, + console_output=True + ) + logger = logging.getLogger(__name__) + + # 列出模块 + if args.list_modules: + list_available_modules() + sys.exit(0) + + # 检查 root 权限警告 + if not check_root_privileges(): + logger.warning("未以 root 权限运行,部分功能可能受限") + print("警告: 未检测到 root 权限,部分硬件信息可能无法获取") + print("建议: 使用 sudo 运行以获得完整的诊断信息\n") + + # 执行诊断 + try: + if args.quick: + results = run_quick_check() + elif args.full: + results = run_full_diagnostic(args.stress_duration, args.yes) + elif args.module: + results = run_specific_modules(args.module, args.stress_duration) + else: + print("请指定操作模式: --quick, --full, --module 或 --list-modules") + sys.exit(1) + + # 生成报告 + generator = ReportGenerator() + + if args.output: + generator.save_report(results, args.format, args.output) + print(f"\n报告已保存至: {args.output}") + else: + report = generator.generate_report(results, args.format) + print("\n" + "=" * 60) + print("诊断报告") + print("=" * 60) + print(report) + + # 返回退出码:如果有错误则返回 1 + has_errors = any( + m.get("status") == "error" + for m in results.get("modules", {}).values() + ) + sys.exit(1 if has_errors else 0) + + except KeyboardInterrupt: + print("\n\n操作已被用户中断") + sys.exit(130) + except Exception as e: + 
logger.exception("程序执行过程中发生错误") + print(f"\n错误: {e}") + sys.exit(1) + + +if __name__ == '__main__': + import logging + main() diff --git a/modules/__init__.py b/modules/__init__.py new file mode 100644 index 0000000..9b04e9c --- /dev/null +++ b/modules/__init__.py @@ -0,0 +1,15 @@ +""" +ServerGuard 硬件检测模块 + +包含以下子模块: +- system_info: 系统信息概览 +- cpu: CPU 检测与压力测试 +- memory: 内存检测与压力测试 +- storage: 存储设备检测 +- sensors: 电源与传感器监控 +- gpu: 显卡检测 +- log_analyzer: 日志分析 +""" + +__version__ = "1.0.0" +__author__ = "ServerGuard Team" diff --git a/modules/cpu.py b/modules/cpu.py new file mode 100644 index 0000000..baf738d --- /dev/null +++ b/modules/cpu.py @@ -0,0 +1,518 @@ +""" +ServerGuard - CPU 检测与压力测试模块 + +检查 CPU 状态、温度、错误日志,并执行压力测试。 +""" + +import os +import re +import time +from typing import Dict, Any, List, Optional + +import sys +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from utils import ( + execute_command, check_command_exists, parse_key_value_output, + safe_int, safe_float, require_root +) + + +def run_cpu_check(stress_test: bool = False, stress_duration: int = 300) -> Dict[str, Any]: + """ + 执行 CPU 检测。 + + Args: + stress_test: 是否执行压力测试 + stress_duration: 压力测试持续时间(秒) + + Returns: + Dict[str, Any]: 检测结果 + """ + result = { + "status": "success", + "cpu_info": {}, + "temperature": {}, + "mce_errors": {}, + "load_average": {}, + "stress_test": {} + } + + try: + # 获取基本信息 + result["cpu_info"] = get_cpu_details() + + # 获取温度 + result["temperature"] = get_cpu_temperature() + if result["temperature"].get("status") == "warning": + result["status"] = "warning" + + # 获取负载 + result["load_average"] = get_load_average() + + # 检查 MCE 错误 + result["mce_errors"] = check_mce_errors() + if result["mce_errors"].get("count", 0) > 0: + result["status"] = "warning" + + # 执行压力测试 + if stress_test: + result["stress_test"] = run_cpu_stress_test(stress_duration) + if not result["stress_test"].get("passed", False): + result["status"] = "error" + + except Exception 
as e: + result["status"] = "error" + result["error"] = str(e) + + return result + + +def get_cpu_details() -> Dict[str, Any]: + """获取 CPU 详细信息。""" + info = { + "model": "Unknown", + "architecture": "Unknown", + "cores": 0, + "threads": 0, + "current_frequency_mhz": 0, + "bogomips": 0, + "flags": [] + } + + try: + with open('/proc/cpuinfo', 'r') as f: + content = f.read() + + # 解析第一个 CPU 的信息 + cpu_sections = content.split('\n\n') + if cpu_sections: + first_cpu = cpu_sections[0] + data = {} + for line in first_cpu.split('\n'): + if ':' in line: + key, value = line.split(':', 1) + data[key.strip()] = value.strip() + + info["model"] = data.get('model name', 'Unknown') + info["vendor"] = data.get('vendor_id', 'Unknown') + info["architecture"] = data.get('cpu family', 'Unknown') + info["bogomips"] = safe_float(data.get('bogomips', 0)) + + if 'flags' in data: + info["flags"] = data['flags'].split() + + # 统计核心数和线程数 + info["threads"] = content.count('processor\t:') + info["cores"] = len(set(re.findall(r'physical id\t:\s*(\d+)', content))) + if info["cores"] == 0: + info["cores"] = info["threads"] + + # 获取当前频率 + if os.path.exists('/proc/cpuinfo'): + with open('/proc/cpuinfo', 'r') as f: + for line in f: + if 'cpu MHz' in line: + info["current_frequency_mhz"] = safe_float(line.split(':')[1].strip()) + break + + # 获取缩放频率信息 + freq_info = get_cpu_frequency_info() + if freq_info: + info["frequency_info"] = freq_info + + except Exception as e: + info["error"] = str(e) + + return info + + +def get_cpu_frequency_info() -> Dict[str, Any]: + """获取 CPU 频率信息。""" + info = {} + + # 尝试从 cpufreq 获取 + cpu0_path = '/sys/devices/system/cpu/cpu0/cpufreq' + if os.path.exists(cpu0_path): + try: + files = { + "min_mhz": "scaling_min_freq", + "max_mhz": "scaling_max_freq", + "current_mhz": "scaling_cur_freq", + "governor": "scaling_governor", + "driver": "scaling_driver" + } + + for key, filename in files.items(): + filepath = os.path.join(cpu0_path, filename) + if os.path.exists(filepath): + with 
open(filepath, 'r') as f: + value = f.read().strip() + if 'freq' in filename: + # 频率值通常以 kHz 存储 + info[key] = round(safe_int(value) / 1000, 2) + else: + info[key] = value + except: + pass + + return info + + +def get_cpu_temperature() -> Dict[str, Any]: + """获取 CPU 温度信息。""" + result = { + "status": "success", + "sensors": {}, + "current_c": None, + "high_threshold_c": None, + "critical_threshold_c": None + } + + temperatures = [] + + # 方法 1: 使用 sensors 命令 (lm-sensors) + if check_command_exists('sensors'): + try: + _, stdout, _ = execute_command( + ['sensors', '-u'], + check_returncode=False, timeout=10 + ) + + # 解析 sensors -u 输出 + current_chip = None + current_adapter = None + + for line in stdout.split('\n'): + line = line.strip() + + # 检测芯片名称 + if line and not line.startswith('Adapter:') and not ':' in line and not line.startswith('temp'): + current_chip = line.rstrip(':') + result["sensors"][current_chip] = {} + continue + + if line.startswith('Adapter:'): + current_adapter = line.split(':', 1)[1].strip() + if current_chip: + result["sensors"][current_chip]["adapter"] = current_adapter + continue + + # 解析温度输入值 + if 'temp' in line and '_input' in line: + match = re.match(r'(temp\d+)_input:\s*([\d.]+)', line) + if match: + temp_name = match.group(1) + temp_value = safe_float(match.group(2)) + + if current_chip: + if temp_name not in result["sensors"][current_chip]: + result["sensors"][current_chip][temp_name] = {} + result["sensors"][current_chip][temp_name]["current"] = temp_value + temperatures.append(temp_value) + + # 解析高温阈值 + if 'temp' in line and '_max' in line: + match = re.match(r'(temp\d+)_max:\s*([\d.]+)', line) + if match: + temp_name = match.group(1) + temp_value = safe_float(match.group(2)) + if current_chip and temp_name in result["sensors"][current_chip]: + result["sensors"][current_chip][temp_name]["high"] = temp_value + + # 解析临界温度 + if 'temp' in line and '_crit' in line: + match = re.match(r'(temp\d+)_crit:\s*([\d.]+)', line) + if match: + 
temp_name = match.group(1) + temp_value = safe_float(match.group(2)) + if current_chip and temp_name in result["sensors"][current_chip]: + result["sensors"][current_chip][temp_name]["critical"] = temp_value + + except: + pass + + # 方法 2: 直接读取 thermal zone + if not temperatures: + try: + thermal_path = '/sys/class/thermal' + if os.path.exists(thermal_path): + for zone in os.listdir(thermal_path): + if zone.startswith('thermal_zone'): + zone_path = os.path.join(thermal_path, zone) + + # 读取类型 + type_file = os.path.join(zone_path, 'type') + zone_type = 'unknown' + if os.path.exists(type_file): + with open(type_file, 'r') as f: + zone_type = f.read().strip() + + # 读取温度 (单位是毫摄氏度) + temp_file = os.path.join(zone_path, 'temp') + if os.path.exists(temp_file): + with open(temp_file, 'r') as f: + temp_mc = safe_int(f.read().strip()) + temp_c = temp_mc / 1000.0 + + if 'x86_pkg_temp' in zone_type or 'cpu' in zone_type.lower(): + result["sensors"][zone] = { + "type": zone_type, + "current": temp_c + } + temperatures.append(temp_c) + except: + pass + + # 方法 3: 尝试从 hwmon 读取 + if not temperatures: + try: + hwmon_path = '/sys/class/hwmon' + if os.path.exists(hwmon_path): + for hwmon in os.listdir(hwmon_path): + hwmon_dir = os.path.join(hwmon_path, hwmon) + + # 读取名称 + name_file = os.path.join(hwmon_dir, 'name') + if os.path.exists(name_file): + with open(name_file, 'r') as f: + name = f.read().strip() + else: + name = hwmon + + # 查找温度输入 + for file in os.listdir(hwmon_dir): + if file.startswith('temp') and file.endswith('_input'): + temp_file = os.path.join(hwmon_dir, file) + with open(temp_file, 'r') as f: + temp_mc = safe_int(f.read().strip()) + temp_c = temp_mc / 1000.0 + + sensor_name = file.replace('_input', '') + result["sensors"][f"{name}_{sensor_name}"] = { + "current": temp_c + } + temperatures.append(temp_c) + except: + pass + + # 计算平均温度 + if temperatures: + result["current_c"] = round(sum(temperatures) / len(temperatures), 1) + result["max_c"] = round(max(temperatures), 1) 
+ + # 检查温度警告 + if result["max_c"] > 85: + result["status"] = "warning" + result["warning"] = f"CPU 温度过高: {result['max_c']}°C" + else: + result["status"] = "unknown" + result["warning"] = "无法获取 CPU 温度信息" + + return result + + +def get_load_average() -> Dict[str, Any]: + """获取系统负载信息。""" + result = {} + + try: + with open('/proc/loadavg', 'r') as f: + load_data = f.read().strip().split() + + if len(load_data) >= 3: + result["1min"] = safe_float(load_data[0]) + result["5min"] = safe_float(load_data[1]) + result["15min"] = safe_float(load_data[2]) + + # 获取 CPU 核心数以计算相对负载 + num_cores = os.cpu_count() or 1 + result["cores"] = num_cores + result["relative_1min"] = round(result["1min"] / num_cores, 2) + result["relative_5min"] = round(result["5min"] / num_cores, 2) + result["relative_15min"] = round(result["15min"] / num_cores, 2) + + except: + pass + + return result + + +def check_mce_errors() -> Dict[str, Any]: + """检查 Machine Check Exception (MCE) 错误。""" + result = { + "count": 0, + "errors": [], + "status": "ok" + } + + # 方法 1: 检查 dmesg + if check_command_exists('dmesg'): + try: + _, stdout, _ = execute_command( + ['dmesg'], + check_returncode=False, timeout=10 + ) + + mce_keywords = ['Machine check events logged', 'Hardware Error', 'CMCI storm'] + + for line in stdout.split('\n'): + for keyword in mce_keywords: + if keyword in line: + result["count"] += 1 + if len(result["errors"]) < 10: # 限制错误数量 + result["errors"].append(line.strip()) + result["status"] = "warning" + break + + except: + pass + + # 方法 2: 检查 mcelog + if check_command_exists('mcelog'): + try: + # 尝试读取 mcelog 输出 + _, stdout, _ = execute_command( + ['mcelog', '--client'], + check_returncode=False, timeout=5 + ) + + if stdout.strip() and 'no machine check' not in stdout.lower(): + result["count"] += stdout.count('MCE') + result["status"] = "warning" + result["mcelog_available"] = True + except: + pass + + # 方法 3: 检查 /dev/mcelog + if os.path.exists('/dev/mcelog'): + result["mcelog_device"] = True + + return 
def _stress_ng_reported_failure(output: str) -> bool:
    """Return True when stress-ng output indicates a failed stressor."""
    failed_counts = re.findall(r'\bfailed:\s*(\d+)', output, re.IGNORECASE)
    if any(int(count) > 0 for count in failed_counts):
        return True

    # A run summary such as "failed: 0" contains the word "fail" although
    # nothing failed; remove it before falling back to the keyword scan.
    remaining = re.sub(r'\bfailed:\s*0\b', '', output, flags=re.IGNORECASE).lower()
    return 'error' in remaining or 'fail' in remaining


@require_root
def run_cpu_stress_test(duration: int = 300) -> Dict[str, Any]:
    """
    Run a CPU stress test with stress-ng, falling back to stress.

    Args:
        duration: Test duration in seconds.

    Returns:
        Dict[str, Any]: Test result.  ``passed`` tells whether the run
        completed without reported failures, ``tool_used`` names the tool,
        ``errors`` collects failure messages, and the temperature readings
        taken before and after the run are stored under
        ``temperature_before`` / ``temperature_after``.
    """
    result = {
        "passed": False,
        "duration_seconds": duration,
        "cpu_cores": os.cpu_count() or 1,
        "start_time": None,
        "end_time": None,
        "max_temperature": None,
        "tool_used": None,
        "errors": []
    }

    # Preferred tool: stress-ng.
    if check_command_exists('stress-ng'):
        result["tool_used"] = "stress-ng"
        try:
            result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')

            # Temperature before the run.
            temp_before = get_cpu_temperature()

            # --cpu 0          use all CPU cores
            # --timeout        stop after the given number of seconds
            # --metrics-brief  print brief metrics
            cmd = [
                'stress-ng',
                '--cpu', '0',
                '--timeout', str(duration),
                '--metrics-brief'
            ]

            _, stdout, stderr = execute_command(
                cmd,
                timeout=duration + 30,  # allow some slack beyond the test itself
                check_returncode=False
            )

            result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')

            # Temperature after the run.
            temp_after = get_cpu_temperature()

            output = stdout + stderr

            if _stress_ng_reported_failure(output):
                result["passed"] = False
                result["errors"].append("压力测试过程中发现错误")
            else:
                result["passed"] = True

            # Extract performance metrics when present in the output.
            bogo_ops = re.search(r'stress-ng:\s+cpu:\s+(\d+)\s+bogo ops', output)
            if bogo_ops:
                result["bogo_ops"] = safe_int(bogo_ops.group(1))

            bogo_ops_per_sec = re.search(r'(\d+\.\d+)\s+bogo ops per second', output)
            if bogo_ops_per_sec:
                result["bogo_ops_per_second"] = safe_float(bogo_ops_per_sec.group(1))

            # Temperature analysis (reading taken right after the run).
            if temp_after.get("max_c"):
                result["max_temperature"] = temp_after["max_c"]
                if temp_after["max_c"] > 95:
                    result["warnings"] = [f"测试期间温度过高: {temp_after['max_c']}°C"]

            result["temperature_before"] = temp_before
            result["temperature_after"] = temp_after

        except Exception as e:
            result["passed"] = False
            result["errors"].append(str(e))

    # Fallback tool: stress.
    elif check_command_exists('stress'):
        result["tool_used"] = "stress"
        try:
            result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')

            temp_before = get_cpu_temperature()

            num_cores = os.cpu_count() or 1
            _, stdout, stderr = execute_command(
                ['stress', '--cpu', str(num_cores), '--timeout', str(duration)],
                timeout=duration + 30,
                check_returncode=False
            )

            result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
            temp_after = get_cpu_temperature()

            result["passed"] = True
            result["temperature_before"] = temp_before
            result["temperature_after"] = temp_after

            if temp_after.get("max_c"):
                result["max_temperature"] = temp_after["max_c"]

        except Exception as e:
            result["passed"] = False
            result["errors"].append(str(e))

    else:
        result["passed"] = False
        result["errors"].append("未找到压力测试工具 (stress-ng 或 stress)")
        result["note"] = "请安装 stress-ng 或 stress: yum install stress / apt install stress-ng"

    return result
def check_nvidia_gpus() -> List[Dict[str, Any]]:
    """Enumerate NVIDIA GPUs through ``nvidia-smi``.

    Returns:
        List[Dict[str, Any]]: One dict per GPU row reported by nvidia-smi,
        merged with the output of ``get_nvidia_gpu_details``.  Empty when
        nvidia-smi is not installed or the query fails.
    """
    found = []

    if not check_command_exists('nvidia-smi'):
        return found

    # Trailing columns fall back to "unknown" when nvidia-smi omits them.
    optional_columns = (
        ("pstate", 4),
        ("pcie_max_gen", 5),
        ("pcie_current_gen", 6),
    )

    try:
        # Query the GPU list together with the basic attributes.
        _, listing, _ = execute_command(
            ['nvidia-smi', '--query-gpu=gpu_name,gpu_bus_id,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current',
             '--format=csv,noheader'],
            check_returncode=False, timeout=10
        )

        for position, row in enumerate(listing.strip().split('\n')):
            if not row.strip():
                continue

            columns = [column.strip() for column in row.split(',')]
            if len(columns) < 4:
                continue

            entry = {
                "vendor": "NVIDIA",
                "index": position,
                "name": columns[0],
                "bus_id": columns[1],
                "pci_bus_id": columns[2],
                "driver_version": columns[3],
            }
            for key, column_index in optional_columns:
                entry[key] = columns[column_index] if len(columns) > column_index else "unknown"

            # Merge in the per-GPU details.
            entry.update(get_nvidia_gpu_details(position))
            found.append(entry)

    except Exception:
        pass

    return found
check_returncode=False, timeout=10 + ) + + parts = [p.strip() for p in stdout.split(',')] + if len(parts) >= 10: + details["temperature_c"] = safe_int(parts[0]) if parts[0] != '[Not Supported]' else None + details["power_draw_w"] = safe_float(parts[1]) if parts[1] != '[Not Supported]' else None + details["power_limit_w"] = safe_float(parts[2]) if parts[2] != '[Not Supported]' else None + details["graphics_clock_mhz"] = safe_int(parts[3]) if parts[3] != '[Not Supported]' else None + details["memory_clock_mhz"] = safe_int(parts[4]) if parts[4] != '[Not Supported]' else None + details["gpu_utilization_percent"] = safe_int(parts[5]) if parts[5] != '[Not Supported]' else None + details["memory_utilization_percent"] = safe_int(parts[6]) if parts[6] != '[Not Supported]' else None + details["memory_total_mb"] = safe_int(parts[7]) if parts[7] != '[Not Supported]' else None + details["memory_used_mb"] = safe_int(parts[8]) if parts[8] != '[Not Supported]' else None + details["memory_free_mb"] = safe_int(parts[9]) if parts[9] != '[Not Supported]' else None + + if len(parts) > 10: + details["serial"] = parts[10] if parts[10] != '[Not Supported]' else None + if len(parts) > 11: + details["uuid"] = parts[11] if parts[11] != '[Not Supported]' else None + if len(parts) > 12: + details["vbios_version"] = parts[12] if parts[12] != '[Not Supported]' else None + + # 获取 ECC 状态 + _, ecc_output, _ = execute_command( + ['nvidia-smi', '--query-gpu=ecc.mode.current,ecc.mode.pending,ecc.errors.corrected.volatile.total,ecc.errors.uncorrected.volatile.total', + '--format=csv,noheader', '-i', str(gpu_index)], + check_returncode=False, timeout=10 + ) + + ecc_parts = [p.strip() for p in ecc_output.split(',')] + if len(ecc_parts) >= 4: + details["ecc_mode"] = ecc_parts[0] if ecc_parts[0] != '[Not Supported]' else None + details["ecc_pending"] = ecc_parts[1] if ecc_parts[1] != '[Not Supported]' else None + details["ecc_corrected_errors"] = safe_int(ecc_parts[2]) if ecc_parts[2] != '[Not Supported]' 
def _read_sysfs_text(path: str) -> str:
    """Return the stripped contents of a small text file such as a sysfs attribute."""
    with open(path, 'r') as f:
        return f.read().strip()


def check_amd_gpus() -> List[Dict[str, Any]]:
    """Detect AMD GPUs via radeontop (when installed) and /sys/class/drm.

    Returns:
        List[Dict[str, Any]]: One dict per detected GPU.  sysfs entries carry
        ``vendor``, ``card`` and, when readable, ``device_id``, ``driver``,
        ``temperature_c`` and ``core_clock_levels``.  A radeontop entry is
        only added when its dump produced at least one parsed field.
    """
    gpus = []

    # Source 1: radeontop, dumped to a temporary file via -d.
    if check_command_exists('radeontop'):
        try:
            import tempfile

            with tempfile.NamedTemporaryFile(mode='r', suffix='.txt', delete=False) as f:
                dump_file = f.name

            try:
                execute_command(
                    ['radeontop', '-d', dump_file, '-l', '1'],
                    check_returncode=False, timeout=5
                )

                with open(dump_file, 'r') as f:
                    output = f.read()

                parsed = {}
                for line in output.split('\n'):
                    if 'GPU' in line and ':' in line:
                        parts = line.split(':')
                        if len(parts) == 2:
                            key = parts[0].strip().lower().replace(' ', '_')
                            parsed[key] = parts[1].strip()

                # Only report a GPU when the dump actually yielded data; the
                # dict used to be pre-seeded with "vendor", so an entry was
                # appended even on machines without any AMD GPU.
                if parsed:
                    gpus.append({"vendor": "AMD", **parsed})

            finally:
                if os.path.exists(dump_file):
                    os.unlink(dump_file)

        except Exception:
            # Best effort: radeontop problems must not prevent the sysfs scan.
            pass

    # Source 2: sysfs.
    drm_root = '/sys/class/drm'
    try:
        cards = os.listdir(drm_root)
    except OSError:
        cards = []

    for card in cards:
        if not card.startswith('card'):
            continue

        device_dir = f'{drm_root}/{card}/device'
        try:
            vendor_path = f'{device_dir}/vendor'
            if not os.path.exists(vendor_path):
                continue

            # AMD's PCI vendor ID is 0x1002.
            if _read_sysfs_text(vendor_path) != '0x1002':
                continue

            gpu_info = {
                "vendor": "AMD",
                "card": card
            }

            # Device ID.
            device_path = f'{device_dir}/device'
            if os.path.exists(device_path):
                gpu_info["device_id"] = _read_sysfs_text(device_path)

            # Driver in use (the sysfs entry is a symlink to the driver).
            driver_path = f'{device_dir}/driver'
            if os.path.exists(driver_path):
                gpu_info["driver"] = os.path.basename(os.readlink(driver_path))

            # Temperature: the hwmon directory name is not guaranteed to be
            # "hwmon0", so take the first hwmon* entry exposing temp1_input.
            hwmon_root = f'{device_dir}/hwmon'
            if os.path.isdir(hwmon_root):
                for hwmon in sorted(os.listdir(hwmon_root)):
                    temp_path = f'{hwmon_root}/{hwmon}/temp1_input'
                    if os.path.exists(temp_path):
                        temp_mc = safe_int(_read_sysfs_text(temp_path))
                        # temp1_input is divided by 1000 to obtain degrees C.
                        gpu_info["temperature_c"] = temp_mc / 1000.0
                        break

            # Core clock levels.
            freq_path = f'{device_dir}/pp_dpm_sclk'
            if os.path.exists(freq_path):
                gpu_info["core_clock_levels"] = _read_sysfs_text(freq_path)

            gpus.append(gpu_info)

        except (OSError, TypeError, ValueError):
            # Skip only the card that failed instead of aborting the scan.
            continue

    return gpus
def check_gpu_dmesg_errors() -> List[Dict[str, str]]:
    """Scan ``dmesg`` for GPU-related error lines.

    A line is reported when it matches one of the GPU patterns below and also
    contains ``error``, ``fail`` or ``warn`` (case-insensitive) or the literal
    ``Xid``.

    Returns:
        List[Dict[str, str]]: Up to 20 unique entries, in log order, each
        with ``timestamp`` (the bracketed dmesg timestamp or ``"unknown"``)
        and ``message``.  Empty when dmesg is unavailable or reading it fails.
    """
    if not check_command_exists('dmesg'):
        return []

    # Keywords identifying GPU-related messages.
    gpu_error_patterns = [
        r'GPU has fallen off the bus',
        r'NVRM: Xid',
        r'nvidia.*error',
        r'amdgpu.*error',
        r'i915.*error',
        r'GPU hang',
        r'ring.*timeout',
        r'Failed to load firmware',
        r'VRAM lost',
        r'gpu.*fault',
        r' thermal ',
    ]
    # One compiled alternation instead of re-searching every pattern per line.
    gpu_error_re = re.compile(
        '|'.join(f'(?:{pattern})' for pattern in gpu_error_patterns),
        re.IGNORECASE
    )
    timestamp_re = re.compile(r'\[\s*([\d.]+)\]')
    max_entries = 20

    try:
        _, stdout, _ = execute_command(
            ['dmesg'],
            check_returncode=False, timeout=10
        )

        seen = set()
        unique_errors = []
        for line in stdout.split('\n'):
            if not gpu_error_re.search(line):
                continue

            line_lower = line.lower()
            if not ('error' in line_lower or 'fail' in line_lower
                    or 'warn' in line_lower or 'Xid' in line):
                continue

            # De-duplicate on the stripped message, keeping the first hit.
            message = line.strip()
            if message in seen:
                continue
            seen.add(message)

            timestamp_match = timestamp_re.match(line)
            unique_errors.append({
                "timestamp": timestamp_match.group(1) if timestamp_match else "unknown",
                "message": message
            })
            if len(unique_errors) >= max_entries:
                break

        return unique_errors

    except Exception:
        # Best effort; unlike a bare ``except`` this lets Ctrl-C through.
        return []
def analyze_logs() -> Dict[str, Any]:
    """
    Analyze the system logs for hardware errors.

    Runs the dmesg and journalctl analyzers, merges their per-category
    counts, extracts the critical events and builds a short summary.

    Returns:
        Dict[str, Any]: Analysis report.  ``status`` is ``"success"``,
        ``"warning"`` (at least one error counted) or ``"error"`` (an
        exception was raised; its text is stored under ``error``).
    """
    report = {
        "status": "success",
        "scan_time": datetime.now().isoformat(),
        "dmesg_analysis": {},
        "journal_analysis": {},
        "hardware_errors": {},
        "critical_events": [],
        "summary": {}
    }

    try:
        # Individual log sources.
        report["dmesg_analysis"] = analyze_dmesg()
        report["journal_analysis"] = analyze_journalctl()

        # Combined statistics and the events that need attention.
        report["hardware_errors"] = summarize_errors(report)
        report["critical_events"] = identify_critical_events(report)

        error_total = sum(report["hardware_errors"].values())
        errors_present = error_total > 0

        report["summary"] = {
            "total_errors_found": error_total,
            "critical_events": len(report["critical_events"]),
            "recommend_action": errors_present
        }

        # Any counted error downgrades the overall status to a warning.
        if errors_present:
            report["status"] = "warning"

    except Exception as exc:
        report["status"] = "error"
        report["error"] = str(exc)

    return report
def extract_boot_errors(lines: List[str]) -> List[Dict[str, str]]:
    """Collect error-like messages logged during the boot phase.

    The boot phase starts at the first line containing ``Linux version`` or
    ``Command line:`` and ends after a line containing both ``systemd`` and
    ``startup``.  Within it, lines mentioning error/fail/warn are kept unless
    they also mention firmware, efi or acpi.

    Args:
        lines: Log lines in chronological order.

    Returns:
        List[Dict[str, str]]: At most 20 entries of the form
        ``{"stage": "boot", "message": <stripped line>}``.
    """
    problem_words = ('error', 'fail', 'warn')
    # Common messages considered non-critical.
    benign_words = ('firmware', 'efi', 'acpi')

    collected = []
    booting = False

    for entry in lines:
        if not booting:
            booting = 'Linux version' in entry or 'Command line:' in entry
        if not booting:
            continue

        lowered = entry.lower()
        looks_bad = any(word in lowered for word in problem_words)
        if looks_bad and not any(word in lowered for word in benign_words):
            collected.append({
                "stage": "boot",
                "message": entry.strip()
            })

        # Leave the boot phase once startup completion is logged.
        if 'systemd' in entry and 'startup' in entry:
            booting = False

    return collected[:20]  # cap the number of entries
def identify_critical_events(analysis_result: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Pick out the logged errors that need immediate attention.

    Args:
        analysis_result: Report dict whose ``dmesg_analysis`` and
            ``journal_analysis`` sections may each carry a ``recent_errors``
            list of dicts with a ``message`` key.

    Returns:
        List[Dict[str, Any]]: Unique events in log order (dmesg first, then
        journal), each with ``type``, ``description``, ``message`` (truncated
        to 200 characters) and ``source`` (``"dmesg"`` or ``"journal"``).
    """
    # (regex, event type, description) for every critical condition.
    critical_patterns = [
        (r'Kernel panic', 'kernel_panic', '内核崩溃'),
        (r'hardlockup', 'hard_lockup', 'CPU 硬死锁'),
        (r'softlockup', 'soft_lockup', 'CPU 软死锁'),
        (r'thermal.*shutdown', 'thermal_shutdown', '过热关机'),
        (r'Hardware Error', 'hardware_error', '硬件错误'),
        # Also accept "PCIe Bus Error: severity=Uncorrected (Fatal)", where
        # "PCIe" precedes "Fatal"; "(Non-Fatal)" is deliberately not matched
        # by the added alternative.
        (r'Fatal.*PCIe|PCIe.*\(Fatal\)', 'pcie_fatal', 'PCIe 致命错误'),
        (r'I/O error.*sector', 'disk_io_error', '磁盘 I/O 错误'),
        (r'Uncorrectable.*error', 'uncorrectable_error', '不可纠正错误'),
        (r'out of memory.*kill', 'oom_kill', 'OOM 进程杀死'),
        (r'GPU.*fallen.*bus', 'gpu_disconnect', 'GPU 断开连接'),
    ]
    compiled_patterns = [
        (re.compile(pattern, re.IGNORECASE), event_type, description)
        for pattern, event_type, description in critical_patterns
    ]

    # Walk each source separately so the origin is known without searching
    # the dmesg list again for every single error.
    sources = (
        ("dmesg", "dmesg_analysis"),
        ("journal", "journal_analysis"),
    )

    critical_events = []
    for source, section in sources:
        for error in analysis_result.get(section, {}).get("recent_errors", []):
            message = error.get("message", "")
            for regex, event_type, description in compiled_patterns:
                if not regex.search(message):
                    continue

                event = {
                    "type": event_type,
                    "description": description,
                    "message": message[:200],  # limit the length
                    "source": source
                }

                # Avoid duplicates.
                if event not in critical_events:
                    critical_events.append(event)

    return critical_events
def get_system_logs(since: Optional[str] = None, until: Optional[str] = None) -> Dict[str, Any]:
    """
    Collect raw system logs from dmesg, journalctl and /var/log/kern.log.

    Args:
        since: Start time passed to ``journalctl --since``
            (format: '2024-01-01 00:00:00').
        until: End time passed to ``journalctl --until``.

    Returns:
        Dict[str, Any]: ``dmesg``, ``journalctl`` and ``kern_log`` text.  A
        source that is unavailable or fails to read stays an empty string.
        journalctl is limited to 5000 entries and kern.log to its last 5000
        lines.
    """
    from collections import deque

    max_lines = 5000
    result = {
        "dmesg": "",
        "journalctl": "",
        "kern_log": ""
    }

    # dmesg
    if check_command_exists('dmesg'):
        try:
            _, stdout, _ = execute_command(['dmesg'], check_returncode=False, timeout=10)
            result["dmesg"] = stdout
        except Exception:
            # Best effort; Ctrl-C is no longer swallowed as with bare except.
            pass

    # journalctl
    if check_command_exists('journalctl'):
        try:
            cmd = ['journalctl', '--no-pager', '-n', str(max_lines)]
            if since:
                cmd.extend(['--since', since])
            if until:
                cmd.extend(['--until', until])

            _, stdout, _ = execute_command(cmd, check_returncode=False, timeout=15)
            result["journalctl"] = stdout
        except Exception:
            pass

    # /var/log/kern.log
    kern_log_path = '/var/log/kern.log'
    if os.path.exists(kern_log_path):
        try:
            with open(kern_log_path, 'r', encoding='utf-8', errors='ignore') as f:
                # A bounded deque keeps only the tail while streaming, so the
                # whole file is never held in memory.
                result["kern_log"] = ''.join(deque(f, maxlen=max_lines))
        except OSError:
            pass

    return result
def get_dimm_info() -> List[Dict[str, Any]]:
    """Return details for every populated DIMM reported by ``dmidecode -t memory``.

    Returns:
        List[Dict[str, Any]]: One dict per installed module holding the raw
        dmidecode fields that carry a meaningful value, plus ``size_mb`` /
        ``size_gb`` and ``speed_mts`` / ``speed_mhz`` when they can be
        parsed.  Slots reported as "No Module Installed" are skipped.  Empty
        when dmidecode is unavailable or fails.
    """
    dimms = []

    if not check_command_exists('dmidecode'):
        return dimms

    # Field name -> regex capturing its value inside one "Memory Device" block.
    patterns = {
        "array_handle": r'Array Handle:\s*(\S+)',
        "error_handle": r'Error Information Handle:\s*(\S+)',
        "total_width": r'Total Width:\s*(\d+)',
        "data_width": r'Data Width:\s*(\d+)',
        "size": r'Size:\s*(.*)',
        "form_factor": r'Form Factor:\s*(\S+)',
        "set": r'Set:\s*(\S+)',
        "locator": r'Locator:\s*(.+)',
        "bank_locator": r'Bank Locator:\s*(.+)',
        "type": r'Type:\s*(\S+)',
        "type_detail": r'Type Detail:\s*(.+)',
        "speed": r'Speed:\s*(.*)',
        # Vendor names and part numbers may contain spaces, so capture the
        # rest of the line rather than only the first word.
        "manufacturer": r'Manufacturer:\s*(.+)',
        "serial_number": r'Serial Number:\s*(\S+)',
        "asset_tag": r'Asset Tag:\s*(\S+)',
        "part_number": r'Part Number:\s*(.+)',
        "rank": r'Rank:\s*(\d+)',
        "configured_speed": r'Configured Memory Speed:\s*(.*)',
        "minimum_voltage": r'Minimum Voltage:\s*(.+)',
        "maximum_voltage": r'Maximum Voltage:\s*(.+)',
        "configured_voltage": r'Configured Voltage:\s*(.+)'
    }
    # Placeholder values that carry no information.
    invalid_values = ('Not Specified', 'To be filled by O.E.M.', 'None',
                      'No Module Installed', 'Unknown')

    try:
        _, stdout, _ = execute_command(
            ['dmidecode', '-t', 'memory'],
            check_returncode=False, timeout=15
        )

        # Everything before the first "Memory Device" heading is skipped.
        for device in stdout.split('Memory Device')[1:]:
            # Detect empty slots on the raw size value: the placeholder filter
            # below drops "No Module Installed", so checking the filtered
            # dict (as was done before) could never skip an empty slot.
            size_match = re.search(patterns["size"], device, re.IGNORECASE)
            if size_match and 'No Module' in size_match.group(1):
                continue

            dimm = {}
            for key, pattern in patterns.items():
                match = re.search(pattern, device, re.IGNORECASE)
                if match:
                    value = match.group(1).strip()
                    if value not in invalid_values:
                        dimm[key] = value

            # Parse the size.
            if 'size' in dimm:
                size_str = dimm['size']
                if 'MB' in size_str:
                    dimm["size_mb"] = safe_int(size_str.replace('MB', '').strip())
                elif 'GB' in size_str:
                    dimm["size_gb"] = safe_float(size_str.replace('GB', '').strip())
                    dimm["size_mb"] = int(dimm["size_gb"] * 1024)

            # Parse the speed.
            if 'speed' in dimm:
                speed_str = dimm['speed']
                if 'MT/s' in speed_str:
                    dimm["speed_mts"] = safe_int(speed_str.replace('MT/s', '').strip())
                elif 'MHz' in speed_str:
                    dimm["speed_mhz"] = safe_int(speed_str.replace('MHz', '').strip())

            if dimm:
                dimms.append(dimm)

    except Exception:
        # Best effort: return whatever was parsed before the failure.
        pass

    return dimms
def check_ecc_status() -> Dict[str, Any]:
    """Report ECC (error-correcting) memory status.

    Combines three best-effort sources; a source that is unavailable is
    skipped without failing the whole check:

    1. ``/proc/meminfo`` -- the ``HardwareCorrupted`` value (kB) is stored
       in ``errors``.
    2. ``dmidecode -t memory`` -- the ``Error Correction Type`` field
       becomes ``mode`` and decides ``supported`` / ``enabled``.
    3. EDAC sysfs -- per-controller ``ce_count`` / ``ue_count`` files are
       summed into ``correctable_errors`` / ``uncorrectable_errors``.

    Returns:
        Dict[str, Any]: ``supported``, ``enabled``, ``mode`` and ``errors``,
        plus ``edac_available`` and the EDAC totals when the EDAC directory
        exists.
    """
    result = {
        "supported": False,
        "enabled": False,
        "mode": "unknown",
        "errors": 0
    }

    # Source 1: /proc/meminfo.
    # The HardwareCorrupted line is a kernel counter; its mere presence says
    # nothing about ECC hardware, so it no longer flips "supported".
    try:
        with open('/proc/meminfo', 'r') as f:
            content = f.read()
        match = re.search(r'HardwareCorrupted:\s+(\d+)\s+kB', content)
        if match:
            result["errors"] = safe_int(match.group(1))
    except OSError:
        pass

    # Source 2: dmidecode's physical memory array description.
    if check_command_exists('dmidecode'):
        try:
            _, stdout, _ = execute_command(
                ['dmidecode', '-t', 'memory'],
                check_returncode=False, timeout=10
            )
        except Exception:
            # execute_command's failure modes are defined elsewhere; this
            # probe stays best-effort.
            stdout = ''

        match = re.search(r'Error Correction Type:\s*(.+)', stdout or '')
        if match:
            mode = match.group(1).strip()
            result["mode"] = mode
            # dmidecode prints this field even on non-ECC systems ("None"),
            # so the value, not the field's presence, decides.
            has_ecc = mode.lower() not in ('none', 'unknown')
            result["enabled"] = has_ecc
            if has_ecc:
                result["supported"] = True

    # Source 3: EDAC counters, summed over every memory controller.
    edac_path = '/sys/devices/system/edac/mc'
    if os.path.exists(edac_path):
        result["edac_available"] = True
        counters = (
            ('ce_count', 'correctable_errors'),
            ('ue_count', 'uncorrectable_errors'),
        )
        try:
            for mc in os.listdir(edac_path):
                if not mc.startswith('mc'):
                    continue
                for filename, key in counters:
                    counter_file = os.path.join(edac_path, mc, filename)
                    if os.path.exists(counter_file):
                        with open(counter_file, 'r') as f:
                            count = safe_int(f.read().strip())
                        result[key] = result.get(key, 0) + count
        except OSError:
            pass

    return result
'/sys/devices/system/edac/mc' + + if not os.path.exists(edac_path): + result["note"] = "EDAC 不可用" + return result + + try: + for mc_name in os.listdir(edac_path): + if not mc_name.startswith('mc'): + continue + + mc_path = os.path.join(edac_path, mc_name) + mc_info = {"name": mc_name} + + # 读取 CE 计数 + ce_file = os.path.join(mc_path, 'ce_count') + if os.path.exists(ce_file): + with open(ce_file, 'r') as f: + ce = safe_int(f.read().strip()) + mc_info["correctable_errors"] = ce + result["correctable_errors"] += ce + + # 读取 UE 计数 + ue_file = os.path.join(mc_path, 'ue_count') + if os.path.exists(ue_file): + with open(ue_file, 'r') as f: + ue = safe_int(f.read().strip()) + mc_info["uncorrectable_errors"] = ue + result["uncorrectable_errors"] += ue + + # 读取内存控制器信息 + info_files = ['mc_name', 'size_mb', 'mem_type', 'edac_mc_mode'] + for info_file in info_files: + filepath = os.path.join(mc_path, info_file) + if os.path.exists(filepath): + with open(filepath, 'r') as f: + mc_info[info_file] = f.read().strip() + + result["memory_controllers"].append(mc_info) + + result["total_errors"] = result["correctable_errors"] + result["uncorrectable_errors"] + + except Exception as e: + result["error"] = str(e) + + return result + + +@require_root +def run_memtester(duration: int = 300) -> Dict[str, Any]: + """ + 运行内存压力测试。 + + Args: + duration: 测试持续时间(秒),实际 memtester 是基于大小而非时间 + + Returns: + Dict[str, Any]: 测试结果 + """ + result = { + "passed": False, + "size_mb": 0, + "iterations": 1, + "start_time": None, + "end_time": None, + "duration_seconds": 0, + "errors": [], + "tests_run": [] + } + + if not check_command_exists('memtester'): + result["errors"].append("memtester 未安装") + return result + + try: + # 计算测试内存大小 + # 留出一些内存给系统和 stress-ng 使用 + with open('/proc/meminfo', 'r') as f: + content = f.read() + + match = re.search(r'MemAvailable:\s+(\d+)', content) + if match: + available_mb = safe_int(match.group(1)) // 1024 + # 使用可用内存的 70% + test_size_mb = max(64, int(available_mb * 0.7)) + 
else: + test_size_mb = 256 + + result["size_mb"] = test_size_mb + result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S') + start_ts = time.time() + + # 运行 memtester + cmd = ['memtester', f'{test_size_mb}M', '1'] + + _, stdout, stderr = execute_command( + cmd, + timeout=max(300, test_size_mb), # 根据内存大小调整超时 + check_returncode=False + ) + + result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S') + result["duration_seconds"] = round(time.time() - start_ts, 2) + + output = stdout + stderr + result["raw_output"] = output[:2000] # 保存部分原始输出 + + # 分析结果 + if 'FAILURE' in output.upper(): + result["passed"] = False + # 提取错误信息 + for line in output.split('\n'): + if 'FAILURE' in line.upper() or 'error' in line.lower(): + result["errors"].append(line.strip()) + elif 'SUCCESS' in output.upper() or 'ok' in output.lower() or 'finished' in output.lower(): + result["passed"] = True + else: + # 检查是否完成所有测试 + if 'Done' in output or 'finished' in output.lower(): + result["passed"] = True + else: + result["passed"] = False + result["errors"].append("测试可能未完成") + + # 提取运行的测试 + test_names = [ + 'Stuck Address', 'Random Value', 'Compare XOR', + 'Compare SUB', 'Compare MUL', 'Compare DIV', + 'Compare OR', 'Compare AND', 'Sequential Increment', + 'Solid Bits', 'Block Sequential', 'Checkerboard', + 'Bit Spread', 'Bit Flip', 'Walking Ones', 'Walking Zeroes' + ] + + for test in test_names: + if test in output: + result["tests_run"].append(test) + + except Exception as e: + result["passed"] = False + result["errors"].append(str(e)) + + return result + + +@require_root +def run_memory_stress_ng(duration: int = 300) -> Dict[str, Any]: + """ + 使用 stress-ng 进行内存压力测试。 + + Args: + duration: 测试持续时间(秒) + + Returns: + Dict[str, Any]: 测试结果 + """ + result = { + "passed": False, + "tool": "stress-ng", + "duration_seconds": duration, + "start_time": None, + "end_time": None, + "errors": [] + } + + if not check_command_exists('stress-ng'): + result["errors"].append("stress-ng 未安装") + return result + + try: + 
@require_root
def run_memory_stress(duration: int = 300) -> Dict[str, Any]:
    """Run a memory stress test with the classic ``stress`` tool (fallback).

    Args:
        duration: Test duration in seconds.

    Returns:
        Dict[str, Any]: ``passed``, ``tool``, ``duration_seconds``,
        ``start_time``, ``end_time``, ``workers`` and ``errors``.
    """
    result = {
        "passed": False,
        "tool": "stress",
        "duration_seconds": duration,
        "start_time": None,
        "end_time": None,
        "workers": 4,
        "errors": []
    }

    if not check_command_exists('stress'):
        result["errors"].append("stress 未安装")
        return result

    workers = result["workers"]

    # `stress` takes absolute sizes (B/K/M/G suffix) rather than percentages,
    # so split ~80% of the currently available memory across the workers.
    per_worker_mb = 256  # conservative fallback if meminfo is unreadable
    try:
        with open('/proc/meminfo', 'r') as f:
            available = re.search(r'^MemAvailable:\s+(\d+)', f.read(), re.MULTILINE)
        if available:
            available_mb = int(available.group(1)) // 1024
            per_worker_mb = max(16, int(available_mb * 0.8) // workers)
    except OSError:
        pass

    try:
        result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')

        # --vm: number of allocating workers
        # --vm-bytes: memory per worker
        # --vm-keep: hold the allocation instead of re-allocating
        # --timeout: stop after this many seconds
        cmd = [
            'stress',
            '--vm', str(workers),
            '--vm-bytes', f'{per_worker_mb}M',
            '--vm-keep',
            '--timeout', str(duration)
        ]

        _, stdout, stderr = execute_command(
            cmd,
            timeout=duration + 30,
            check_returncode=False
        )

        result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')

        output = stdout + stderr

        # Any "error"/"fail" text in the output is treated as a failed run;
        # the offending lines are kept so the report explains the failure.
        problem_lines = [
            line.strip() for line in output.split('\n')
            if 'error' in line.lower() or 'fail' in line.lower()
        ]
        if problem_lines:
            result["passed"] = False
            result["errors"].extend(problem_lines)
        else:
            result["passed"] = True

    except Exception as e:
        result["passed"] = False
        result["errors"].append(str(e))

    return result
def extract_sensor_summary(chips: Dict[str, Any]) -> Dict[str, Any]:
    """Group per-chip sensor features into temperature/voltage/fan/... buckets.

    Args:
        chips: Mapping of chip name to ``{"features": {name: data}}`` as built
            by ``get_lm_sensors_data``. Features without a ``value`` are
            ignored.

    Returns:
        Dict[str, Any]: ``temperatures``, ``voltages``, ``fans``, ``powers``
        and ``currents`` dictionaries. Numbered temperature and fan features
        keep their ``<chip>_tempN`` / ``<chip>_fanN`` keys; features without
        a number are keyed ``<chip>_<feature name>`` instead of being dropped.
    """
    summary = {
        "temperatures": {},
        "voltages": {},
        "fans": {},
        "powers": {},
        "currents": {}
    }

    # Match the "inN" voltage token only, not any label that merely contains
    # the letters "in" (e.g. "Fan Intake", "margin1").
    voltage_token = re.compile(r'(?<![a-z])in\d+')

    for chip_name, chip_data in chips.items():
        for feature_name, feature_data in chip_data.get("features", {}).items():
            value = feature_data.get("value")
            if value is None:
                continue

            feature_lower = feature_name.lower()
            alarm = feature_data.get("alarm", False)
            generic_key = f"{chip_name}_{feature_name}"

            # Temperature sensors
            if 'temp' in feature_lower or 'thermal' in feature_lower:
                numbered = re.search(r'temp(\d+)', feature_lower)
                key = f"{chip_name}_temp{numbered.group(1)}" if numbered else generic_key
                summary["temperatures"][key] = {
                    "value": value,
                    "max": feature_data.get("max"),
                    "critical": feature_data.get("critical"),
                    "alarm": alarm
                }

            # Voltage sensors
            elif (voltage_token.search(feature_lower)
                  or 'voltage' in feature_lower
                  or 'vcc' in feature_lower):
                summary["voltages"][generic_key] = {
                    "value": value,
                    "min": feature_data.get("min"),
                    "max": feature_data.get("max"),
                    "alarm": alarm
                }

            # Fan speeds
            elif 'fan' in feature_lower:
                numbered = re.search(r'fan(\d+)', feature_lower)
                key = f"{chip_name}_fan{numbered.group(1)}" if numbered else generic_key
                summary["fans"][key] = {
                    "rpm": value,
                    "min": feature_data.get("min"),
                    "alarm": alarm
                }

            # Power sensors
            elif 'power' in feature_lower or 'watt' in feature_lower:
                summary["powers"][generic_key] = {
                    "value": value,
                    "max": feature_data.get("max")
                }

            # Current sensors
            elif 'curr' in feature_lower or 'amp' in feature_lower:
                summary["currents"][generic_key] = {
                    "value": value,
                    "max": feature_data.get("max")
                }

    return summary
def categorize_ipmi_sensors(sensors: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
    """Sort IPMI sensor readings into categories.

    Each sensor is placed in the first category whose rule matches, tested in
    this order: temperatures, voltages, fans, power, currents, then other.
    Rules look at the lower-cased sensor name and the lower-cased ``unit``.

    Args:
        sensors: Mapping of sensor name to its reading dictionary.

    Returns:
        Dict[str, Dict[str, Any]]: Category name -> {sensor name: reading}.
    """
    voltage_markers = ('volt', 'vcc', '3.3v', '5v', '12v')

    def pick_bucket(label: str, unit: str) -> str:
        # Guard clauses, evaluated in priority order.
        if 'temp' in label or unit == 'degrees c':
            return "temperatures"
        if unit == 'volts' or any(marker in label for marker in voltage_markers):
            return "voltages"
        if 'fan' in label or 'rpm' in unit:
            return "fans"
        if 'power' in label or 'watt' in unit:
            return "power"
        if 'current' in label or 'amp' in unit:
            return "currents"
        return "other"

    buckets = {
        bucket: {}
        for bucket in ("temperatures", "voltages", "fans", "power", "currents", "other")
    }

    for sensor_name, reading in sensors.items():
        bucket = pick_bucket(sensor_name.lower(), reading.get("unit", "").lower())
        buckets[bucket][sensor_name] = reading

    return buckets
def _coerce_sysfs_value(value: str) -> Any:
    """Convert a sysfs attribute to int or float when it is purely numeric."""
    if re.fullmatch(r'[+-]?\d+', value):
        return int(value)
    if re.fullmatch(r'[+-]?(?:\d+\.\d*|\.\d+)', value):
        return float(value)
    # Textual attributes (status, type, model names, ...) are kept verbatim.
    return value


def get_power_supply_info(power_supply_path: str = '/sys/class/power_supply') -> Dict[str, Any]:
    """Collect the attributes of every power supply exposed through sysfs.

    Args:
        power_supply_path: Directory holding one sub-directory per supply.
            Defaults to ``/sys/class/power_supply``.

    Returns:
        Dict[str, Any]: ``{"supplies": [...]}`` where each entry holds the
        supply ``name`` plus one key per readable attribute file. Numeric
        attributes are converted to ``int``/``float``; everything else stays
        a string. An ``error`` key is added if a directory cannot be listed.
    """
    result = {
        "supplies": []
    }

    if not os.path.exists(power_supply_path):
        return result

    try:
        for supply_name in sorted(os.listdir(power_supply_path)):
            supply_path = os.path.join(power_supply_path, supply_name)
            supply_info = {"name": supply_name}

            # Read every regular attribute file of this supply.
            for attr in os.listdir(supply_path):
                attr_path = os.path.join(supply_path, attr)
                if not os.path.isfile(attr_path):
                    continue
                try:
                    with open(attr_path, 'r') as f:
                        value = f.read().strip()
                except (OSError, UnicodeDecodeError):
                    # Unreadable or binary attribute: skip it, keep the rest.
                    continue
                supply_info[attr] = _coerce_sysfs_value(value)

            result["supplies"].append(supply_info)

    except OSError as e:
        result["error"] = str(e)

    return result
def check_sensor_warnings(sensor_data: Dict[str, Any]) -> List[str]:
    """Derive human-readable warnings from collected sensor data.

    Args:
        sensor_data: Result dictionary holding the ``lm_sensors``,
            ``ipmi_sensors``, ``ipmi_sel`` and ``thermal_zones`` sections.
            Missing sections are treated as empty.

    Returns:
        List[str]: One message per detected problem, in the order
        temperatures, voltages, fans, IPMI sensors, IPMI SEL, thermal zones.
    """

    def number(value: Any, default: float = 0) -> float:
        # Summary fields such as a fan's "min" may be None; comparing None
        # with a number raises TypeError, so fall back to a default.
        return value if isinstance(value, (int, float)) else default

    # Short threshold status codes as printed by ipmitool, plus the words
    # used in long-form status text.
    abnormal_codes = {'nc', 'cr', 'nr', 'lnc', 'lcr', 'lnr', 'unc', 'ucr', 'unr'}
    abnormal_words = ('critical', 'non-recoverable', 'warning')

    warnings = []

    # lm-sensors alarms
    lm_sensors = sensor_data.get("lm_sensors", {})
    summary = lm_sensors.get("summary", {})

    # Temperature: explicit alarm flag first, otherwise a fixed 90 degree cap.
    for name, temp_data in summary.get("temperatures", {}).items():
        if temp_data.get("alarm"):
            warnings.append(f"温度传感器 {name} 告警: {temp_data.get('value')}°C")
        elif number(temp_data.get("value")) > 90:
            warnings.append(f"温度传感器 {name} 温度过高: {temp_data.get('value')}°C")

    # Voltage alarms
    for name, volt_data in summary.get("voltages", {}).items():
        if volt_data.get("alarm"):
            warnings.append(f"电压传感器 {name} 告警: {volt_data.get('value')}V")

    # Fans: alarm flag, or a stopped fan that has a positive minimum speed.
    for name, fan_data in summary.get("fans", {}).items():
        if fan_data.get("alarm"):
            warnings.append(f"风扇 {name} 告警: {fan_data.get('rpm')} RPM")
        elif number(fan_data.get("rpm")) == 0 and number(fan_data.get("min")) > 0:
            warnings.append(f"风扇 {name} 可能已停止: {fan_data.get('rpm')} RPM")

    # IPMI sensor status
    ipmi_sensors = sensor_data.get("ipmi_sensors", {})
    for name, data in ipmi_sensors.get("sensors", {}).items():
        status = (data.get("status") or "").lower()
        if status in abnormal_codes or any(word in status for word in abnormal_words):
            warnings.append(f"IPMI 传感器 {name} 状态异常: {data.get('status')}")

    # Critical events recorded in the IPMI SEL
    ipmi_sel = sensor_data.get("ipmi_sel", {})
    if ipmi_sel.get("critical_count", 0) > 0:
        warnings.append(f"IPMI SEL 中有 {ipmi_sel['critical_count']} 个关键事件")

    # Thermal zones: warn above 90% of the critical temperature.
    thermal_zones = sensor_data.get("thermal_zones", {})
    for zone_name, zone_data in thermal_zones.get("zones", {}).items():
        temp = number(zone_data.get("temperature_c"))
        critical = number(zone_data.get("critical_temp_c"), 100)
        if critical <= 0:
            # A zero or negative trip point is not a usable limit; use the
            # same default as a missing one.
            critical = 100
        if temp > critical * 0.9:
            warnings.append(f"Thermal zone {zone_name} 温度接近临界值: {temp}°C (临界: {critical}°C)")

    return warnings
def run_storage_check() -> Dict[str, Any]:
    """Run the storage device check.

    Inspects every device returned by ``get_storage_devices``, then gathers
    RAID status and I/O statistics.

    Returns:
        Dict[str, Any]: ``status`` (``success``, ``warning`` or ``error``),
        ``devices``, ``raid_status`` and ``io_stats``; ``error`` is added
        when an exception aborts the check.
    """
    result = {
        "status": "success",
        "devices": [],
        "raid_status": {},
        "io_stats": {}
    }

    try:
        # Inspect each storage device.
        for device in get_storage_devices():
            device_info = check_device(device)
            result["devices"].append(device_info)

            # An unhealthy device downgrades the overall status.
            if device_info.get("health") in ('FAILED', 'WARNING'):
                result["status"] = "warning"

        # RAID status; check_raid_status marks a degraded or recovering
        # array with status "warning", which must surface at the top level.
        result["raid_status"] = check_raid_status()
        if result["raid_status"].get("status") == "warning":
            result["status"] = "warning"

        # I/O statistics
        result["io_stats"] = get_io_statistics()

    except Exception as e:
        result["status"] = "error"
        result["error"] = str(e)

    return result
name.startswith(('sd', 'hd', 'nvme', 'vd', 'xvd', 'mmcblk')): + dev_type = "unknown" + try: + with open(f'/sys/block/{name}/queue/rotational', 'r') as f: + dev_type = "hdd" if f.read().strip() == '1' else "ssd" + except: + pass + + devices.append({ + "name": name, + "path": f"/dev/{name}", + "type": dev_type + }) + except: + pass + + return devices + + +def check_device(device: Dict[str, str]) -> Dict[str, Any]: + """检查单个存储设备。""" + result = { + "name": device["name"], + "path": device["path"], + "type": device.get("type", "unknown"), + "model": "Unknown", + "serial": "Unknown", + "firmware": "Unknown", + "size_bytes": 0, + "size_human": "Unknown", + "health": "UNKNOWN", + "smart_status": {}, + "temperature_c": None, + "power_on_hours": None, + "start_stop_count": None, + "reallocated_sectors": None, + "pending_sectors": None, + "test_result": None + } + + # 获取设备基本信息 + result.update(get_device_info(device["path"])) + + # 获取 SMART 数据 + smart_data = get_smart_data(device["path"]) + result["smart_status"] = smart_data + + # 分析健康状态 + result["health"] = analyze_health(smart_data) + + # 提取关键属性 + if "attributes" in smart_data: + attrs = smart_data["attributes"] + + # 温度 + for temp_attr in ['194 Temperature_Celsius', '190 Airflow_Temperature_Cel', 'Temperature']: + if temp_attr in attrs: + temp_val = attrs[temp_attr].get('raw_value') + if temp_val: + result["temperature_c"] = safe_int(temp_val.split()[0]) + break + + # 运行时间 + if '9 Power_On_Hours' in attrs: + result["power_on_hours"] = safe_int(attrs['9 Power_On_Hours'].get('raw_value', 0)) + + # 启动次数 + if '4 Start_Stop_Count' in attrs: + result["start_stop_count"] = safe_int(attrs['4 Start_Stop_Count'].get('raw_value', 0)) + + # 重映射扇区 + if '5 Reallocated_Sector_Ct' in attrs: + result["reallocated_sectors"] = safe_int(attrs['5 Reallocated_Sector_Ct'].get('raw_value', 0)) + + # 待处理扇区 + if '197 Current_Pending_Sector' in attrs: + result["pending_sectors"] = safe_int(attrs['197 Current_Pending_Sector'].get('raw_value', 0)) + + 
def get_device_info(device_path: str) -> Dict[str, Any]:
    """Collect basic identity and capacity information for a block device.

    Parses ``smartctl -i`` when smartctl is installed and falls back to
    ``/sys/block/<name>/size`` for the capacity.

    Args:
        device_path: Device node, e.g. ``/dev/sda``.

    Returns:
        Dict[str, Any]: Any of ``model``, ``serial``, ``firmware``,
        ``size_human``, ``sector_size``, ``rotation_rate``, ``form_factor``,
        ``transport``, ``size_bytes`` and ``is_ssd`` that could be
        determined. Keys that were not found are omitted.
    """
    info = {}

    # Primary source: smartctl -i
    if check_command_exists('smartctl'):
        try:
            _, stdout, _ = execute_command(
                ['smartctl', '-i', device_path],
                check_returncode=False, timeout=10
            )

            patterns = {
                # ATA devices report "Device Model", NVMe "Model Number".
                "model": r'(?:Device Model|Model Number):\s*(.+)',
                "serial": r'Serial Number:\s*(\S+)',
                "firmware": r'Firmware Version:\s*(\S+)',
                "size_human": r'User Capacity:\s*(.+)',
                "sector_size": r'Sector Size:\s*(.+)',
                "rotation_rate": r'Rotation Rate:\s*(.+)',
                "form_factor": r'Form Factor:\s*(.+)',
                "transport": r'Transport protocol:\s*(.+)'
            }

            for key, pattern in patterns.items():
                match = re.search(pattern, stdout)
                if match:
                    info[key] = match.group(1).strip()

            # Capacity in bytes: the comma-grouped number before "bytes".
            # The bracketed suffix is a human-readable size such as
            # "[1.00 TB]", not a byte count.
            size_match = re.search(r'User Capacity:\s*([\d,]+)\s*bytes', stdout)
            if size_match:
                info["size_bytes"] = int(size_match.group(1).replace(',', ''))

            # SSD or rotating media?
            if 'Solid State Device' in stdout:
                info["is_ssd"] = True
            elif 'Rotation Rate' in stdout and 'Solid State' not in stdout:
                info["is_ssd"] = False

        except Exception:
            # Best-effort probe: execute_command's failure modes are defined
            # elsewhere, and the sysfs fallback below still applies.
            pass

    # Fallback: sysfs reports the size in 512-byte sectors.
    if not info.get("size_bytes"):
        try:
            dev_name = os.path.basename(device_path)
            with open(f'/sys/block/{dev_name}/size', 'r') as f:
                sectors = safe_int(f.read().strip())
            info["size_bytes"] = sectors * 512
            info["size_human"] = format_bytes(info["size_bytes"])
        except OSError:
            pass

    return info
"self_tests": [] + } + + if not check_command_exists('smartctl'): + result["error"] = "smartctl 未安装" + return result + + try: + # 检查 SMART 支持 + _, stdout, _ = execute_command( + ['smartctl', '-i', device_path], + check_returncode=False, timeout=10 + ) + + if 'SMART support is: Available' in stdout: + result["supported"] = True + if 'SMART support is: Enabled' in stdout: + result["enabled"] = True + + # 获取所有 SMART 数据 + _, stdout, _ = execute_command( + ['smartctl', '-a', device_path], + check_returncode=False, timeout=15 + ) + + # 解析整体健康状态 + if 'PASSED' in stdout or 'OK' in stdout: + result["overall"] = "PASSED" + elif 'FAILED' in stdout: + result["overall"] = "FAILED" + + # 解析 SMART 属性表 (ATA 设备) + if 'ID#' in stdout and 'ATTRIBUTE_NAME' in stdout: + lines = stdout.split('\n') + in_attributes = False + + for line in lines: + if 'ID#' in line and 'ATTRIBUTE_NAME' in line: + in_attributes = True + continue + + if in_attributes: + if not line.strip() or line.startswith('SMART'): + break + + # 解析属性行 + # 格式: ID# ATTRIBUTE_NAME FLAG VALUE WORST THRESH TYPE UPDATED WHEN_FAILED RAW_VALUE + parts = line.split() + if len(parts) >= 10: + attr_id = parts[0] + attr_name = parts[1] + attr_key = f"{attr_id} {attr_name}" + + result["attributes"][attr_key] = { + "flag": parts[2], + "value": safe_int(parts[3]), + "worst": safe_int(parts[4]), + "thresh": safe_int(parts[5]), + "type": parts[6], + "updated": parts[7], + "when_failed": parts[8] if parts[8] != '-' else None, + "raw_value": ' '.join(parts[9:]) + } + + # 解析自检日志 + if 'SMART Self-test log' in stdout: + self_test_section = False + for line in stdout.split('\n'): + if 'SMART Self-test log' in line: + self_test_section = True + continue + if self_test_section and line.strip() and not line.startswith('SMART'): + if '#' in line: + result["self_tests"].append(line.strip()) + + # 解析错误日志 + if 'SMART Error Log' in stdout: + error_match = re.search(r'Error (\d+)\s+occurred at', stdout) + if error_match: + result["error_count"] = 
def _leading_int(raw: Any, default: int = 0) -> int:
    """Return the integer at the start of a SMART raw value, else *default*."""
    match = re.match(r'\s*(-?\d+)', str(raw))
    return int(match.group(1)) if match else default


def analyze_health(smart_data: Dict[str, Any], temp_threshold_c: int = 60) -> str:
    """Classify a device's health from its parsed SMART data.

    Args:
        smart_data: Dictionary with ``supported``, ``overall`` and
            ``attributes`` (attribute key -> parsed table row).
        temp_threshold_c: Raw temperature in degrees Celsius above which the
            device is reported as ``WARNING``.

    Returns:
        str: ``UNKNOWN`` when SMART is unsupported, ``FAILED`` when the
        overall assessment failed, ``WARNING`` when a sector-health counter
        is non-zero or the drive is too hot, otherwise ``PASSED``.
    """
    if not smart_data.get("supported"):
        return "UNKNOWN"

    if smart_data.get("overall") == "FAILED":
        return "FAILED"

    attrs = smart_data.get("attributes", {})

    # Counters whose raw value should stay at zero.
    sector_counters = (
        '5 Reallocated_Sector_Ct',
        '197 Current_Pending_Sector',
        '198 Offline_Uncorrectable',
        '196 Reallocation_Event_Count',
    )

    for attr_name in sector_counters:
        if attr_name in attrs:
            # Raw values may carry trailing text or be empty; only the
            # leading integer matters.
            if _leading_int(attrs[attr_name].get('raw_value', '0')) > 0:
                return "WARNING"

    # Temperature: use the raw value, as check_device does. The normalized
    # "value" column is not a Celsius reading.
    for temp_attr in ('194 Temperature_Celsius', '190 Airflow_Temperature_Cel'):
        if temp_attr in attrs:
            if _leading_int(attrs[temp_attr].get('raw_value', '0')) > temp_threshold_c:
                return "WARNING"

    return "PASSED"
def get_io_statistics(diskstats_path: str = '/proc/diskstats') -> Dict[str, Any]:
    """Read per-disk I/O counters from the kernel's diskstats table.

    Only whole disks are reported: ``sdX`` / ``hdX`` / ``vdX`` names without
    a partition number, and NVMe namespaces such as ``nvme0n1``.  NVMe names
    always end in a digit, so a plain "last character is not a digit" test
    would drop them; partitions such as ``sda1`` or ``nvme0n1p1`` are skipped.

    Args:
        diskstats_path: File to parse.  Defaults to ``/proc/diskstats``.

    Returns:
        A dict mapping device name to its counters.  It is empty when the
        file cannot be read.
    """
    counter_names = (
        "reads_completed",
        "reads_merged",
        "sectors_read",
        "time_reading_ms",
        "writes_completed",
        "writes_merged",
        "sectors_written",
        "time_writing_ms",
        "ios_in_progress",
        "time_doing_ios_ms",
        "weighted_time_ios_ms",
    )
    whole_disk = re.compile(r'^(?:(?:sd|hd|vd)[a-z]+|nvme\d+n\d+)$')

    result = {}
    try:
        with open(diskstats_path, 'r') as f:
            for line in f:
                parts = line.split()
                # major, minor, name + the eleven counters read below
                if len(parts) < 14 or not whole_disk.match(parts[2]):
                    continue
                try:
                    values = [int(field) for field in parts[3:14]]
                except ValueError:
                    continue  # malformed row: ignore it, keep the rest
                result[parts[2]] = dict(zip(counter_names, values))
    except OSError:
        # Missing or unreadable table: report whatever was collected.
        pass

    return result
def get_system_info() -> Dict[str, Any]:
    """Collect hardware and operating-system information.

    Every section is collected independently, so a failure in one
    collector no longer prevents the later sections from being gathered.

    Returns:
        A dict with ``status`` (``"success"`` or ``"error"``) and the
        sections ``os``, ``cpu``, ``memory``, ``motherboard``, ``storage``,
        ``network`` and ``gpu``.  When any collector raised, ``status`` is
        ``"error"`` and ``error`` holds one ``"<section>: <message>"`` entry
        per failure, joined with ``"; "``.  Failed sections keep their empty
        default.
    """
    result = {
        "status": "success",
        "os": {},
        "cpu": {},
        "memory": {},
        "motherboard": {},
        "storage": [],
        "network": [],
        "gpu": []
    }

    collectors = (
        ("os", get_os_info),
        ("cpu", get_cpu_info),
        ("memory", get_memory_info),
        ("motherboard", get_motherboard_info),
        ("storage", get_storage_list),
        ("network", get_network_info),
        ("gpu", get_gpu_list),
    )

    failures = []
    for section, collect in collectors:
        try:
            result[section] = collect()
        except Exception as exc:
            failures.append(f"{section}: {exc}")

    if failures:
        result["status"] = "error"
        result["error"] = "; ".join(failures)

    return result
def get_os_info() -> Dict[str, str]:
    """Collect basic operating-system facts.

    Returns:
        A dict with ``platform``, ``release``, ``version``, ``machine`` and
        ``processor`` (from the ``platform`` module), ``hostname``,
        ``uptime`` and, when ``/etc/os-release`` provides ``PRETTY_NAME``,
        ``distribution``.  ``hostname`` and ``uptime`` fall back to
        ``"unknown"`` when they cannot be determined.
    """
    info = {
        "platform": platform.system(),
        "release": platform.release(),
        "version": platform.version(),
        "machine": platform.machine(),
        "processor": platform.processor()
    }

    # Distribution name; a missing or unreadable file just leaves the key out.
    try:
        with open('/etc/os-release', 'r', encoding='utf-8') as f:
            for line in f:
                if line.startswith('PRETTY_NAME='):
                    info["distribution"] = line.split('=', 1)[1].strip().strip('"')
                    break
    except (OSError, ValueError):
        pass

    # platform.node() needs no subprocess, and an empty answer is mapped to
    # "unknown" instead of being reported as "".
    info["hostname"] = platform.node() or "unknown"

    # The first field of /proc/uptime is the uptime in seconds.
    try:
        with open('/proc/uptime', 'r') as f:
            uptime_seconds = float(f.readline().split()[0])
        days = int(uptime_seconds // 86400)
        hours = int((uptime_seconds % 86400) // 3600)
        minutes = int((uptime_seconds % 3600) // 60)
        info["uptime"] = f"{days}天 {hours}小时 {minutes}分钟"
    except (OSError, ValueError, IndexError):
        info["uptime"] = "unknown"

    return info
# dmidecode prints module sizes in different units depending on version and
# capacity ("16384 MB", "16 GB"); these factors normalise them to GiB.
_DMI_SIZE_TO_GB = {'KB': 1 / (1024 * 1024), 'MB': 1 / 1024, 'GB': 1, 'TB': 1024}


def _parse_dmidecode_memory(stdout: str) -> Dict[str, Any]:
    """Extract slot, type, speed and ECC facts from ``dmidecode -t memory`` text."""
    parsed = {
        "slots": [],
        "slots_total": 0,
        "type": None,
        "speed_mhz": 0,
        "ecc_supported": False
    }
    multiline = re.MULTILINE

    # Each populated or empty slot is described under a "Memory Device" heading.
    sections = re.split(r'^Memory Device[ \t]*$', stdout, flags=multiline)
    for device in sections[1:]:  # the first chunk precedes any device
        slot = {}

        # Anchored so that "Volatile Size:" / "Cache Size:" lines do not match.
        size_match = re.search(
            r'^\s*Size:\s*(\d+)\s*([kKMGT]B)\b', device, multiline
        )
        if size_match:
            factor = _DMI_SIZE_TO_GB[size_match.group(2).upper()]
            slot["size_gb"] = round(safe_int(size_match.group(1)) * factor, 2)

        type_match = re.search(
            r'^\s*Type:\s*((?:LP)?DDR\d*)\b', device, multiline
        )
        if type_match:
            slot["type"] = type_match.group(1)
            parsed["type"] = type_match.group(1)

        # Older dmidecode releases label the speed "MHz", newer ones "MT/s".
        speed_match = re.search(
            r'^\s*Speed:\s*(\d+)\s*(?:MT/s|MHz)', device, multiline
        )
        if speed_match:
            slot["speed_mhz"] = safe_int(speed_match.group(1))

        manufacturer_match = re.search(
            r'^\s*Manufacturer:\s*(\S+)', device, multiline
        )
        if manufacturer_match:
            slot["manufacturer"] = manufacturer_match.group(1)

        # Anchored so that "Bank Locator:" is never picked up instead.
        locator_match = re.search(r'^\s*Locator:\s*(.+)', device, multiline)
        if locator_match:
            slot["locator"] = locator_match.group(1).strip()

        # Empty slots ("No Module Installed") have no size and are skipped.
        if slot.get("size_gb", 0) > 0:
            parsed["slots"].append(slot)
            if not parsed["speed_mhz"] and slot.get("speed_mhz"):
                parsed["speed_mhz"] = slot["speed_mhz"]

    # A board may expose several memory arrays; add up their slot counts.
    parsed["slots_total"] = sum(
        safe_int(count)
        for count in re.findall(r'Number Of Devices:\s*(\d+)', stdout)
    )

    # ECC capability comes from the array's "Error Correction Type" field.
    correction_types = re.findall(
        r'^\s*Error Correction Type:\s*(.+)$', stdout, multiline
    )
    parsed["ecc_supported"] = any(
        value.strip().lower() not in ('none', 'unknown')
        for value in correction_types
    )

    return parsed


def get_memory_info() -> Dict[str, Any]:
    """Collect memory totals and, when dmidecode is usable, per-DIMM details.

    Returns:
        A dict with ``total_gb`` and ``available_gb`` (from
        ``/proc/meminfo``, with ``free -m`` as a fallback for the total),
        ``slots_total``, ``slots_used``, ``slots`` (one dict per populated
        DIMM), ``type``, ``speed_mhz`` and ``ecc_supported``.
        ``ecc_supported`` is derived from dmidecode's "Error Correction
        Type" and therefore stays ``False`` when dmidecode is unavailable.
        An ``error`` key is added when running dmidecode failed.
    """
    info = {
        "total_gb": 0,
        "available_gb": 0,
        "slots_total": 0,
        "slots_used": 0,
        "slots": [],
        "type": "Unknown",
        "speed_mhz": 0,
        "ecc_supported": False
    }

    # Totals from /proc/meminfo (values are in kB).
    try:
        with open('/proc/meminfo', 'r') as f:
            for line in f:
                if line.startswith('MemTotal:'):
                    kb = safe_int(line.split()[1])
                    info["total_gb"] = round(kb / 1024 / 1024, 2)
                elif line.startswith('MemAvailable:'):
                    kb = safe_int(line.split()[1])
                    info["available_gb"] = round(kb / 1024 / 1024, 2)
    except (OSError, IndexError):
        pass

    # Per-DIMM details from dmidecode.
    if check_command_exists('dmidecode'):
        try:
            _, stdout, _ = execute_command(
                ['dmidecode', '-t', 'memory'],
                check_returncode=False, timeout=15
            )
            parsed = _parse_dmidecode_memory(stdout)
        except Exception as exc:
            # Best effort: keep the totals, but record why details are missing.
            info["error"] = str(exc)
        else:
            info["slots"] = parsed["slots"]
            info["slots_used"] = len(parsed["slots"])
            info["slots_total"] = parsed["slots_total"] or len(parsed["slots"])
            if parsed["type"]:
                info["type"] = parsed["type"]
            info["speed_mhz"] = parsed["speed_mhz"]
            info["ecc_supported"] = parsed["ecc_supported"]

    # Fallback for the total when /proc/meminfo gave nothing.
    if info["total_gb"] == 0 and check_command_exists('free'):
        try:
            _, stdout, _ = execute_command(['free', '-m'], check_returncode=False)
            lines = stdout.strip().split('\n')
            if len(lines) > 1:
                parts = lines[1].split()
                if len(parts) >= 2:
                    info["total_gb"] = round(safe_int(parts[1]) / 1024, 2)
        except Exception:
            # The fallback itself is optional; leave total_gb at 0.
            pass

    return info
def get_storage_list() -> List[Dict[str, Any]]:
    """List block devices via ``lsblk``, falling back to ``/sys/block``.

    Returns:
        One dict per device.  From lsblk: ``name``, ``path``, ``size``,
        ``type``, ``model``, ``vendor`` and ``is_rotational``.  From the
        sysfs fallback: ``name``, ``path``, ``size``, ``type`` and, when it
        is readable, ``is_rotational``.  Missing text fields are reported
        as ``'unknown'``.
    """
    import json  # imported locally, as this function did before

    def text(value: Any) -> str:
        # lsblk emits JSON null for absent columns; dict.get's default does
        # not apply to a key that is present with the value None.
        cleaned = str(value).strip() if value is not None else ''
        return cleaned or 'unknown'

    def rotational(value: Any) -> bool:
        # Depending on the lsblk release ROTA is a JSON boolean or the
        # strings "0"/"1"; a plain truthiness test misreads the string "0".
        if value is None:
            return True
        if isinstance(value, str):
            return value.strip().lower() not in ('0', 'false')
        return bool(value)

    devices = []

    # Preferred source: lsblk
    if check_command_exists('lsblk'):
        try:
            _, stdout, _ = execute_command(
                ['lsblk', '-d', '-o', 'NAME,SIZE,TYPE,MODEL,VENDOR,ROTA', '-n', '-J'],
                check_returncode=False, timeout=10
            )
            data = json.loads(stdout)

            for device in data.get('blockdevices', []):
                name = text(device.get('name'))
                devices.append({
                    "name": name,
                    "path": f"/dev/{name}",
                    "size": text(device.get('size')),
                    "type": text(device.get('type')),
                    "model": text(device.get('model')),
                    "vendor": text(device.get('vendor')),
                    "is_rotational": rotational(device.get('rota'))
                })
        except Exception:
            # lsblk missing JSON support, timing out, etc. is not fatal:
            # fall through to the sysfs scan below when nothing was found.
            pass

    # Fallback: read /sys/block directly
    if not devices:
        try:
            names = os.listdir('/sys/block')
        except OSError:
            names = []

        for name in names:
            if not name.startswith(('sd', 'hd', 'nvme', 'vd')):
                continue
            dev_info = {"name": name, "path": f"/dev/{name}"}

            # The size file counts sectors; multiply by 512 for bytes.
            try:
                with open(f'/sys/block/{name}/size', 'r') as f:
                    sectors = safe_int(f.read().strip())
                dev_info["size"] = format_bytes(sectors * 512)
            except OSError:
                dev_info["size"] = "unknown"

            # queue/rotational is "1" for spinning disks, "0" otherwise.
            try:
                with open(f'/sys/block/{name}/queue/rotational', 'r') as f:
                    dev_info["is_rotational"] = f.read().strip() == '1'
                dev_info["type"] = 'hdd' if dev_info["is_rotational"] else 'ssd'
            except OSError:
                dev_info["type"] = 'unknown'

            devices.append(dev_info)

    return devices
def get_network_info() -> List[Dict[str, Any]]:
    """List network interfaces with their state, MAC and IPv4 addresses.

    ``ip -j addr show`` is used instead of ``ip -j link show`` because only
    the address view carries the ``addr_info`` list that ``ip_addresses``
    is built from.

    Returns:
        One dict per interface with ``name``, ``state``, ``mac_address``
        and ``type``, plus ``ip_addresses`` (``"<address>/<prefix>"``
        strings) when the interface has IPv4 addresses.  The list is empty
        when ``ip`` is missing or its output cannot be parsed.
    """
    import json  # imported locally, as this function did before

    interfaces = []

    if not check_command_exists('ip'):
        return interfaces

    try:
        _, stdout, _ = execute_command(
            ['ip', '-j', 'addr', 'show'],
            check_returncode=False, timeout=10
        )
        data = json.loads(stdout)
    except Exception:
        # No JSON support, a timeout, etc.: report no interfaces rather
        # than failing the whole system overview.
        return interfaces

    for iface in data:
        iface_info = {
            "name": iface.get('ifname') or 'unknown',
            "state": iface.get('operstate') or 'unknown',
            "mac_address": iface.get('address') or 'unknown',
            "type": iface.get('link_type') or 'unknown'
        }

        ips = [
            f"{addr.get('local')}/{addr.get('prefixlen', '')}"
            for addr in iface.get('addr_info') or []
            if addr.get('family') == 'inet'
        ]
        if ips:
            iface_info["ip_addresses"] = ips

        interfaces.append(iface_info)

    return interfaces
@@ +#!/usr/bin/env python3 +""" +ServerGuard - 快速测试脚本 + +用于快速验证各模块是否正常工作,不进行压力测试。 +""" + +import sys +import os + +# 设置日志级别为警告,减少输出 +import logging +logging.basicConfig(level=logging.WARNING) + +def test_imports(): + """测试所有模块是否能正常导入""" + print("测试模块导入...") + modules_to_test = [ + 'utils', + 'reporter', + 'modules.system_info', + 'modules.cpu', + 'modules.memory', + 'modules.storage', + 'modules.sensors', + 'modules.gpu', + 'modules.log_analyzer' + ] + + failed = [] + for module in modules_to_test: + try: + __import__(module) + print(f" ✓ {module}") + except Exception as e: + print(f" ✗ {module}: {e}") + failed.append(module) + + if failed: + print(f"\n有 {len(failed)} 个模块导入失败") + return False + else: + print("\n所有模块导入成功!") + return True + + +def test_basic_functions(): + """测试基本功能""" + print("\n测试基本功能...") + + from modules import system_info, cpu, memory, storage, sensors, gpu, log_analyzer + + # 返回字典的测试函数 + dict_tests = [ + ("系统信息", system_info.get_system_info), + ("CPU 信息", cpu.get_cpu_details), + ("内存信息", memory.get_memory_summary), + ("传感器数据", sensors.get_lm_sensors_data), + ("日志分析", log_analyzer.analyze_logs), + ] + + # 返回列表的测试函数 + list_tests = [ + ("存储设备", storage.get_storage_devices), + ("GPU 信息", gpu.check_generic_gpus), + ] + + # 测试返回字典的函数 + for name, func in dict_tests: + try: + result = func() + if isinstance(result, dict): + status = result.get("status", "unknown") + if status == "error": + print(f" ⚠ {name}: 有错误 - {result.get('error', 'Unknown')}") + else: + print(f" ✓ {name}: 正常") + else: + print(f" ✓ {name}: 正常 (返回 {type(result).__name__})") + except Exception as e: + print(f" ✗ {name}: 异常 - {e}") + + # 测试返回列表的函数 + for name, func in list_tests: + try: + result = func() + if isinstance(result, list): + print(f" ✓ {name}: 正常 (找到 {len(result)} 个项目)") + else: + print(f" ⚠ {name}: 返回类型异常 - {type(result).__name__}") + except Exception as e: + print(f" ✗ {name}: 异常 - {e}") + + print("\n基本功能测试完成") + + +def test_utils(): + """测试工具函数""" + print("\n测试工具函数...") 
+ + from utils import safe_int, safe_float, format_bytes + + # 测试 safe_int + assert safe_int("123") == 123 + assert safe_int("32 GB") == 32 + assert safe_int("invalid", -1) == -1 + print(" ✓ safe_int") + + # 测试 safe_float + assert safe_float("123.5") == 123.5 + assert safe_float("2.5GHz") == 2.5 + print(" ✓ safe_float") + + # 测试 format_bytes + assert format_bytes(1024) == "1.00 KB" + assert format_bytes(1024**2) == "1.00 MB" + print(" ✓ format_bytes") + + print("\n工具函数测试通过") + + +def test_report_generation(): + """测试报告生成""" + print("\n测试报告生成...") + + from reporter import ReportGenerator + + generator = ReportGenerator() + + test_data = { + "scan_type": "test", + "timestamp": "2024-01-01 00:00:00", + "modules": { + "cpu": { + "status": "success", + "temperature": {"current_c": 45} + }, + "memory": { + "status": "success", + "total_gb": 32 + } + } + } + + formats = ['text', 'json', 'html'] + for fmt in formats: + try: + report = generator.generate_report(test_data, fmt) + print(f" ✓ {fmt.upper()} 格式: {len(report)} 字符") + except Exception as e: + print(f" ✗ {fmt.upper()} 格式: {e}") + + print("\n报告生成测试完成") + + +def main(): + """主函数""" + print("=" * 60) + print("ServerGuard 快速测试") + print("=" * 60) + print() + + # 测试导入 + if not test_imports(): + print("\n模块导入测试失败,请检查依赖安装") + sys.exit(1) + + # 测试工具函数 + test_utils() + + # 测试报告生成 + test_report_generation() + + # 测试基本功能 + test_basic_functions() + + print() + print("=" * 60) + print("测试完成!") + print("=" * 60) + print() + print("运行完整诊断命令:") + print(" sudo python3 main.py --quick # 快速检测") + print(" sudo python3 main.py --full # 全面诊断(含压力测试)") + print() + + +if __name__ == '__main__': + main() diff --git a/reporter.py b/reporter.py new file mode 100644 index 0000000..7ae2afd --- /dev/null +++ b/reporter.py @@ -0,0 +1,387 @@ +""" +ServerGuard - 报告生成模块 + +负责将检测结果格式化为各种输出格式。 +""" + +import json +import csv +import os +from typing import Dict, Any, List +from datetime import datetime +from io import StringIO + +try: + from 
def generate_report(self, data: Dict[str, Any], format_type: str = 'text') -> str:
    """Render *data* in the requested report format.

    Args:
        data: Diagnostic results to render.
        format_type: One of ``text``, ``json``, ``csv`` or ``html``.  The
            name is matched case-insensitively and surrounding whitespace
            is ignored, so ``"JSON"`` no longer silently produces a text
            report.  Any other value falls back to the text report.

    Returns:
        The formatted report.
    """
    formatters = {
        'json': self._format_json_report,
        'csv': self._format_csv_report,
        'html': self._format_html_report,
    }
    key = format_type.strip().lower() if isinstance(format_type, str) else ''
    return formatters.get(key, self._format_text_report)(data)
Dict[str, Any], prefix: str = ''): + """辅助函数:将字典写入 CSV""" + for key, value in data.items(): + if key == 'status': + continue + full_key = f"{prefix}.{key}" if prefix else key + + if isinstance(value, dict): + self._write_dict_to_csv(writer, value, full_key) + elif isinstance(value, list): + writer.writerow([full_key, ', '.join(str(v) for v in value)]) + else: + writer.writerow([full_key, value]) + + def _format_text_report(self, data: Dict[str, Any]) -> str: + """生成纯文本格式报告。""" + lines = [] + + # 报告头部 + lines.append("=" * 70) + lines.append("ServerGuard 硬件健康诊断报告") + lines.append("=" * 70) + lines.append(f"扫描类型: {data.get('scan_type', 'unknown').upper()}") + lines.append(f"生成时间: {data.get('timestamp', '')}") + if 'stress_duration' in data: + lines.append(f"压力测试时长: {data['stress_duration']} 秒") + lines.append("=" * 70) + lines.append("") + + # 各模块结果 + for module_name, module_data in data.get('modules', {}).items(): + lines.append(f"\n[{module_name.upper()}]") + lines.append("-" * 70) + + status = module_data.get('status', 'unknown') + status_symbol = '✓' if status == 'success' else '⚠' if status == 'warning' else '✗' + lines.append(f"状态: {status_symbol} {status.upper()}") + + if 'error' in module_data: + lines.append(f"错误: {module_data['error']}") + + # 格式化模块特定数据 + self._format_module_text(lines, module_name, module_data) + + lines.append("") + + # 报告尾部 + lines.append("=" * 70) + lines.append("报告结束") + lines.append("=" * 70) + + return '\n'.join(lines) + + def _format_module_text(self, lines: List[str], module_name: str, data: Dict[str, Any]): + """格式化特定模块的文本输出""" + if module_name == 'system': + if 'cpu' in data: + cpu = data['cpu'] + lines.append(f"CPU: {cpu.get('model', 'N/A')}") + lines.append(f" 核心数: {cpu.get('cores', 'N/A')} 核 / {cpu.get('threads', 'N/A')} 线程") + + if 'memory' in data: + mem = data['memory'] + lines.append(f"内存: 总计 {mem.get('total_gb', 'N/A')} GB, {mem.get('slots_used', 'N/A')} 个插槽") + + if 'storage' in data: + lines.append(f"存储设备: 
{len(data['storage'])} 个设备") + + elif module_name == 'cpu': + if 'temperature' in data: + temp = data['temperature'] + lines.append(f"CPU 温度: {temp.get('current_c', 'N/A')}°C") + if 'mce_errors' in data: + mce = data['mce_errors'] + lines.append(f"MCE 错误: {mce.get('count', 0)} 个") + if 'stress_test' in data: + stress = data['stress_test'] + lines.append(f"压力测试: {'通过' if stress.get('passed') else '失败'}") + lines.append(f" 运行时长: {stress.get('duration_seconds', 'N/A')} 秒") + + elif module_name == 'memory': + if 'ecc_status' in data: + ecc = data['ecc_status'] + lines.append(f"ECC 支持: {'是' if ecc.get('supported') else '否'}") + if ecc.get('errors', 0) > 0: + lines.append(f"ECC 错误: {ecc['errors']} 个") + if 'stress_test' in data: + st = data['stress_test'] + lines.append(f"内存压力测试: {'通过' if st.get('passed') else '失败'}") + if st.get('tool'): + lines.append(f" 使用工具: {st.get('tool')}") + if st.get('size_mb'): + lines.append(f" 测试大小: {st.get('size_mb')} MB") + + elif module_name == 'storage': + for device in data.get('devices', []): + lines.append(f"设备 {device.get('name', 'N/A')}:") + lines.append(f" 型号: {device.get('model', 'N/A')}") + lines.append(f" 健康状态: {device.get('health', 'N/A')}") + if 'smart_status' in device: + smart = device['smart_status'] + lines.append(f" SMART: {smart.get('overall', 'N/A')}") + + elif module_name == 'sensors': + if 'temperatures' in data: + lines.append("温度传感器:") + for name, value in data['temperatures'].items(): + lines.append(f" {name}: {value}°C") + if 'voltages' in data: + lines.append("电压传感器:") + for name, value in data['voltages'].items(): + lines.append(f" {name}: {value}V") + + elif module_name == 'logs': + if 'hardware_errors' in data: + errors = data['hardware_errors'] + total = sum(errors.values()) + lines.append(f"硬件错误总计: {total} 个") + for error_type, count in errors.items(): + if count > 0: + lines.append(f" {error_type}: {count} 个") + + def _format_html_report(self, data: Dict[str, Any]) -> str: + """生成 HTML 格式报告。""" + html_parts 
= [] + + # HTML 头部 + html_parts.append(""" + + + + + ServerGuard 诊断报告 + + +""") + + # 报告头部 + html_parts.append(f""" +
+

🔧 ServerGuard 硬件健康诊断报告

+
+ 扫描类型: {data.get('scan_type', 'unknown').upper()} | + 生成时间: {data.get('timestamp', '')} +
+
""") + + # 各模块结果 + for module_name, module_data in data.get('modules', {}).items(): + status = module_data.get('status', 'unknown') + status_class = f'status-{status}' + + html_parts.append(f""" +
+
+ {module_name.upper()} + {status.upper()} +
""") + + if 'error' in module_data: + html_parts.append(f""" +
+ 错误: {module_data['error']} +
""") + else: + html_parts.append('
') + self._format_module_html(html_parts, module_name, module_data) + html_parts.append('
') + + html_parts.append('
') + + # 报告尾部 + html_parts.append(""" + + +""") + + return '\n'.join(html_parts) + + def _format_module_html(self, html_parts: List[str], module_name: str, data: Dict[str, Any]): + """格式化特定模块的 HTML 输出""" + for key, value in data.items(): + if key == 'status': + continue + + display_key = key.replace('_', ' ').title() + + if isinstance(value, dict): + html_parts.append(f""" +
+
{display_key}
+
{len(value)} 项数据
+
""") + elif isinstance(value, list): + html_parts.append(f""" +
+
{display_key}
+
{len(value)} 个项目
+
""") + else: + html_parts.append(f""" +
+
{display_key}
+
{value}
+
""") diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e41ba85 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +rich>=13.0.0 +psutil>=5.9.0 diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..06c9bff --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1,3 @@ +""" +ServerGuard 测试模块 +""" diff --git a/tests/test_modules.py b/tests/test_modules.py new file mode 100644 index 0000000..b19cefd --- /dev/null +++ b/tests/test_modules.py @@ -0,0 +1,175 @@ +""" +测试各个硬件检测模块 +""" + +import unittest +from unittest.mock import patch, MagicMock +import sys +import os + +# 添加父目录到路径 +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from modules import system_info, cpu, memory, storage, sensors, gpu, log_analyzer + + +class TestSystemInfo(unittest.TestCase): + """测试系统信息模块""" + + @patch('modules.system_info.execute_command') + def test_get_os_info(self, mock_exec): + mock_exec.return_value = (0, "test-hostname\n", "") + result = system_info.get_os_info() + self.assertIn("platform", result) + self.assertIn("machine", result) + + def test_get_cpu_info(self): + result = system_info.get_cpu_info() + self.assertIn("model", result) + # 在大多数系统上应该能获取到一些信息 + self.assertIsInstance(result["model"], str) + + def test_get_memory_info(self): + result = system_info.get_memory_info() + self.assertIn("total_gb", result) + self.assertIsInstance(result["total_gb"], (int, float)) + + def test_get_system_info(self): + result = system_info.get_system_info() + self.assertIn("status", result) + self.assertIn("cpu", result) + self.assertIn("memory", result) + + +class TestCPU(unittest.TestCase): + """测试 CPU 模块""" + + def test_get_cpu_details(self): + result = cpu.get_cpu_details() + self.assertIn("model", result) + self.assertIn("cores", result) + self.assertIsInstance(result["cores"], int) + + def test_get_cpu_temperature(self): + result = cpu.get_cpu_temperature() + self.assertIn("status", result) + 
self.assertIn("sensors", result) + + def test_get_load_average(self): + result = cpu.get_load_average() + self.assertIn("1min", result) + self.assertIn("5min", result) + self.assertIn("15min", result) + + def test_check_mce_errors(self): + result = cpu.check_mce_errors() + self.assertIn("count", result) + self.assertIn("status", result) + + +class TestMemory(unittest.TestCase): + """测试内存模块""" + + def test_get_memory_summary(self): + result = memory.get_memory_summary() + self.assertIn("total_bytes", result) + self.assertIn("total_gb", result) + self.assertIsInstance(result["total_gb"], (int, float)) + + def test_get_dimm_info(self): + result = memory.get_dimm_info() + self.assertIsInstance(result, list) + + def test_check_ecc_status(self): + result = memory.check_ecc_status() + self.assertIn("supported", result) + self.assertIsInstance(result["supported"], bool) + + def test_check_edac_errors(self): + result = memory.check_edac_errors() + self.assertIn("total_errors", result) + self.assertIsInstance(result["total_errors"], int) + + +class TestStorage(unittest.TestCase): + """测试存储模块""" + + def test_get_storage_devices(self): + result = storage.get_storage_devices() + self.assertIsInstance(result, list) + + def test_check_raid_status(self): + result = storage.check_raid_status() + self.assertIn("arrays", result) + self.assertIsInstance(result["arrays"], list) + + def test_get_io_statistics(self): + result = storage.get_io_statistics() + self.assertIsInstance(result, dict) + + +class TestSensors(unittest.TestCase): + """测试传感器模块""" + + def test_get_lm_sensors_data(self): + result = sensors.get_lm_sensors_data() + self.assertIn("available", result) + + def test_get_thermal_zones(self): + result = sensors.get_thermal_zones() + self.assertIn("zones", result) + self.assertIsInstance(result["zones"], dict) + + def test_get_power_supply_info(self): + result = sensors.get_power_supply_info() + self.assertIn("supplies", result) + self.assertIsInstance(result["supplies"], list) 
+ + +class TestGPU(unittest.TestCase): + """测试 GPU 模块""" + + def test_check_generic_gpus(self): + result = gpu.check_generic_gpus() + self.assertIsInstance(result, list) + + def test_check_gpu_dmesg_errors(self): + result = gpu.check_gpu_dmesg_errors() + self.assertIsInstance(result, list) + + +class TestLogAnalyzer(unittest.TestCase): + """测试日志分析模块""" + + def test_get_kernel_panic_logs(self): + result = log_analyzer.get_kernel_panic_logs() + self.assertIsInstance(result, list) + + def test_get_hardware_error_logs(self): + result = log_analyzer.get_hardware_error_logs() + self.assertIn("mce_errors", result) + self.assertIn("ecc_errors", result) + self.assertIn("io_errors", result) + + def test_summarize_errors(self): + test_data = { + "dmesg_analysis": { + "error_counts": { + "cpu_errors": 5, + "memory_errors": 3 + } + }, + "journal_analysis": { + "error_counts": { + "cpu_errors": 2, + "memory_errors": 1 + } + } + } + result = log_analyzer.summarize_errors(test_data) + self.assertEqual(result["cpu_errors"], 7) + self.assertEqual(result["memory_errors"], 4) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..58d0fa8 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,94 @@ +""" +测试 utils 模块 +""" + +import unittest +import sys +import os + +# 添加父目录到路径 +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from utils import ( + parse_key_value_output, parse_table_output, extract_with_regex, + safe_int, safe_float, format_bytes, sanitize_filename, + merge_dicts, check_command_exists +) + + +class TestParseFunctions(unittest.TestCase): + """测试解析函数""" + + def test_parse_key_value_output(self): + text = """ +Key1: Value1 +Key2: Value2 +# Comment line +Key3: Value with: colon +""" + result = parse_key_value_output(text) + self.assertEqual(result["Key1"], "Value1") + self.assertEqual(result["Key2"], "Value2") + self.assertEqual(result["Key3"], "Value with: 
colon") + + def test_parse_table_output(self): + text = """ +NAME SIZE TYPE MODEL +sda 1T disk Samsung SSD +sdb 2T disk WD HDD +""" + result = parse_table_output(text, headers=["NAME", "SIZE", "TYPE", "MODEL"]) + self.assertEqual(len(result), 2) + self.assertEqual(result[0]["NAME"], "sda") + self.assertEqual(result[1]["TYPE"], "disk") + + def test_extract_with_regex(self): + text = "Temperature: 45.5 degrees" + result = extract_with_regex(text, r'Temperature:\s*([\d.]+)') + self.assertEqual(result, "45.5") + + def test_safe_int(self): + self.assertEqual(safe_int("123"), 123) + self.assertEqual(safe_int("123.5"), 123) + self.assertEqual(safe_int("1,234"), 1234) + self.assertEqual(safe_int("32 GB"), 32) + self.assertEqual(safe_int("invalid"), 0) + self.assertEqual(safe_int("invalid", -1), -1) + + def test_safe_float(self): + self.assertEqual(safe_float("123.5"), 123.5) + self.assertEqual(safe_float("2.5GHz"), 2.5) + self.assertEqual(safe_float("invalid"), 0.0) + + def test_format_bytes(self): + self.assertEqual(format_bytes(0), "0 B") + self.assertEqual(format_bytes(1024), "1.00 KB") + self.assertEqual(format_bytes(1024**2), "1.00 MB") + self.assertEqual(format_bytes(1024**3), "1.00 GB") + + def test_sanitize_filename(self): + self.assertEqual(sanitize_filename("file.txt"), "file_name_.txt") + self.assertEqual(sanitize_filename("path/to/file"), "path/to/file") + + def test_merge_dicts(self): + base = {"a": 1, "b": {"c": 2}} + update = {"b": {"d": 3}, "e": 4} + result = merge_dicts(base, update) + self.assertEqual(result["a"], 1) + self.assertEqual(result["b"]["c"], 2) + self.assertEqual(result["b"]["d"], 3) + self.assertEqual(result["e"], 4) + + +class TestCommandFunctions(unittest.TestCase): + """测试命令相关函数""" + + def test_check_command_exists(self): + # ls 应该存在 + self.assertTrue(check_command_exists("ls")) + # 不存在的命令 + self.assertFalse(check_command_exists("nonexistent_command_12345")) + + +if __name__ == '__main__': + unittest.main() diff --git a/utils.py b/utils.py 
"""
ServerGuard - common utility library.

Provides shared helpers for running external commands, configuring logging,
and parsing command output.
"""

import builtins
import functools
import json
import logging
import os
import re
import shutil
import subprocess
import sys
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple, Union

# Matches every character that cannot be part of a plain decimal number.
# Used to drop unit suffixes, thousands separators and whitespace
# ("32 GB" -> "32", "1,234" -> "1234", "2.5GHz" -> "2.5").
_NON_NUMERIC_RE = re.compile(r'[^\d.-]')


class ServerGuardError(Exception):
    """Base class for all ServerGuard exceptions."""


class CommandExecutionError(ServerGuardError):
    """Raised when an external command fails, times out, or cannot be found."""


class PermissionError(ServerGuardError):
    """Raised when a command cannot be executed due to insufficient privileges.

    NOTE: inside this module the name shadows the built-in ``PermissionError``.
    Code that needs the OS-level exception must refer to it explicitly as
    ``builtins.PermissionError``.
    """


def _command_to_str(cmd: Any) -> str:
    """Render a command (string or sequence of arguments) for logs and errors."""
    if isinstance(cmd, str):
        return cmd
    return ' '.join(str(part) for part in cmd)


def execute_command(
    cmd_list: List[str],
    timeout: int = 60,
    check_returncode: bool = True,
    capture_output: bool = True,
    shell: bool = False,
    input_data: Optional[str] = None
) -> Tuple[int, str, str]:
    """
    Run an external command and return its exit code and output.

    Args:
        cmd_list: The command and its arguments as a list. A plain string is
            also accepted and passed to ``subprocess.run`` unchanged.
        timeout: Maximum run time in seconds.
        check_returncode: Raise ``CommandExecutionError`` on a non-zero exit code.
        capture_output: Capture stdout/stderr. When False both returned strings
            are empty.
        shell: Run the command through the shell. A list is joined with single
            spaces into one command line; the caller is responsible for any
            quoting the shell requires.
        input_data: Text written to the command's stdin. ``None`` leaves stdin
            untouched; an empty string sends an immediate EOF.

    Returns:
        Tuple[returncode, stdout, stderr]

    Raises:
        CommandExecutionError: The command is empty, was not found, timed out,
            or exited non-zero while ``check_returncode`` is True.
        PermissionError: (this module's class) the OS refused to execute the
            command.
    """
    logger = logging.getLogger(__name__)

    if not cmd_list:
        error_msg = "No command specified"
        logger.error(error_msg)
        raise CommandExecutionError(error_msg)

    cmd_str = _command_to_str(cmd_list)

    run_args: Any = cmd_list
    if shell:
        logger.warning(f"Using shell=True with command: {cmd_str}")
        # On POSIX, subprocess treats only the FIRST list element as the shell
        # command line and hands the rest to the shell as positional
        # parameters, so a list must be collapsed into one string.
        run_args = cmd_str

    kwargs: Dict[str, Any] = {
        'timeout': timeout,
        'shell': shell,
        'universal_newlines': True  # Python 3.6 compatible version of text=True
    }
    if capture_output:
        kwargs['stdout'] = subprocess.PIPE
        kwargs['stderr'] = subprocess.PIPE
    if input_data is not None:
        kwargs['input'] = input_data

    try:
        logger.debug(f"Executing command: {cmd_str}")
        result = subprocess.run(run_args, **kwargs)
    except subprocess.TimeoutExpired as exc:
        error_msg = f"Command timed out after {timeout}s: {cmd_str}"
        logger.error(error_msg)
        raise CommandExecutionError(error_msg) from exc
    except FileNotFoundError as exc:
        program = cmd_list if isinstance(cmd_list, str) else cmd_list[0]
        error_msg = f"Command not found: {program}"
        logger.error(error_msg)
        raise CommandExecutionError(error_msg) from exc
    except builtins.PermissionError as exc:
        # Must name the built-in explicitly: the bare name PermissionError
        # refers to this module's own exception class.
        error_msg = f"Permission denied executing: {cmd_str}"
        logger.error(error_msg)
        raise PermissionError(error_msg) from exc

    # stdout/stderr are None when output is not captured.
    stdout = result.stdout if result.stdout else ""
    stderr = result.stderr if result.stderr else ""

    if check_returncode and result.returncode != 0:
        error_msg = f"Command failed with code {result.returncode}: {cmd_str}\nstderr: {stderr}"
        logger.error(error_msg)
        raise CommandExecutionError(error_msg)

    return result.returncode, stdout, stderr


def check_root_privileges() -> bool:
    """
    Check whether the process is running as root.

    Returns:
        bool: True when the effective user id is 0.
    """
    return os.geteuid() == 0


def require_root(func):
    """
    Decorator: only call the wrapped function when running as root.

    Without root privileges the wrapped function is not called; a warning is
    logged and an error dict (``{"status": "error", "error": ...}``) is
    returned instead.
    """
    @functools.wraps(func)  # keep __name__/__doc__ of the wrapped function
    def wrapper(*args, **kwargs):
        if not check_root_privileges():
            logging.warning(f"Function {func.__name__} requires root privileges")
            return {
                "status": "error",
                "error": "This function requires root privileges. Please run with sudo."
            }
        return func(*args, **kwargs)
    return wrapper


def setup_logging(
    log_file: Optional[str] = None,
    level: int = logging.INFO,
    console_output: bool = True
) -> logging.Logger:
    """
    Configure the root logger.

    Any handlers already attached to the root logger are removed and closed
    before the new ones are installed.

    Args:
        log_file: Path of the log file; None disables file logging.
        level: Logging level.
        console_output: Also write log records to stdout.

    Returns:
        logging.Logger: The configured root logger.
    """
    logger = logging.getLogger()
    logger.setLevel(level)

    # Detach AND close old handlers so repeated calls do not leak open log files.
    for old_handler in list(logger.handlers):
        logger.removeHandler(old_handler)
        old_handler.close()

    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )

    if console_output:
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    if log_file:
        # dirname is '' for a bare file name; fall back to the current directory.
        os.makedirs(os.path.dirname(log_file) or '.', exist_ok=True)
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    return logger


def parse_key_value_output(text: str, delimiter: str = ':') -> Dict[str, str]:
    """
    Parse ``key: value`` style text into a dictionary.

    Blank lines, lines starting with ``#`` and lines without the delimiter are
    skipped. Each line is split on the first delimiter only, so values may
    contain the delimiter themselves. A repeated key keeps its last value.

    Args:
        text: Text to parse.
        delimiter: Separator between key and value.

    Returns:
        Dict[str, str]: Parsed key/value pairs, both stripped of whitespace.
    """
    result = {}
    for raw_line in text.strip().split('\n'):
        line = raw_line.strip()
        if not line or line.startswith('#'):
            continue

        key, found, value = line.partition(delimiter)
        if found:
            result[key.strip()] = value.strip()

    return result


def parse_table_output(text: str, headers: Optional[List[str]] = None) -> List[Dict[str, str]]:
    """
    Parse whitespace-separated tabular text into a list of row dictionaries.

    Args:
        text: Text to parse.
        headers: Column names. When None they are taken from the first
            non-empty line. When given and the first non-empty line repeats
            exactly these names, that line is treated as the header row and
            skipped rather than returned as data.

    Returns:
        List[Dict[str, str]]: One dict per data row. Rows with fewer fields
        than headers are dropped; extra trailing fields are ignored.
    """
    lines = [line.strip() for line in text.strip().split('\n') if line.strip()]
    if not lines:
        return []

    if headers is None:
        headers = lines[0].split()
        data_lines = lines[1:]
    elif lines[0].split() == list(headers):
        # Explicit headers and the text still carries its own header row:
        # do not report the header row as a data row.
        data_lines = lines[1:]
    else:
        data_lines = lines

    result = []
    for line in data_lines:
        values = line.split()
        if len(values) >= len(headers):
            # zip stops at the shorter sequence, i.e. at len(headers).
            result.append(dict(zip(headers, values)))

    return result


def extract_with_regex(text: str, pattern: str, group: int = 1, default: Any = None) -> Any:
    """
    Extract one capture group of the first regex match in a text.

    Args:
        text: Text to search.
        pattern: Regular expression pattern.
        group: Capture group to return.
        default: Value returned when there is no match or no such group.

    Returns:
        The matched group, or ``default``.
    """
    match = re.search(pattern, text)
    if match is None:
        return default
    try:
        return match.group(group)
    except IndexError:
        # The pattern matched but has no such capture group.
        return default


def safe_int(value: Any, default: int = 0) -> int:
    """
    Convert a value to an integer without raising.

    Strings are stripped of everything except digits, ``.`` and ``-`` first,
    so unit suffixes and thousands separators are tolerated
    (``"32 GB"`` -> 32, ``"1,234"`` -> 1234). Fractional values are truncated
    towards zero.

    Args:
        value: Value to convert.
        default: Value returned when conversion fails.

    Returns:
        int: The converted integer, or ``default``.
    """
    try:
        if isinstance(value, str):
            cleaned = _NON_NUMERIC_RE.sub('', value)
            try:
                # Direct conversion keeps full precision for large integers.
                return int(cleaned)
            except ValueError:
                return int(float(cleaned))
        if isinstance(value, int):
            return int(value)
        return int(float(value))
    except (ValueError, TypeError, OverflowError):
        # OverflowError: int(float('inf')).
        return default


def safe_float(value: Any, default: float = 0.0) -> float:
    """
    Convert a value to a float without raising.

    Strings are stripped of everything except digits, ``.`` and ``-`` first,
    so unit suffixes are tolerated (``"2.5GHz"`` -> 2.5).

    Args:
        value: Value to convert.
        default: Value returned when conversion fails.

    Returns:
        float: The converted number, or ``default``.
    """
    try:
        if isinstance(value, str):
            value = _NON_NUMERIC_RE.sub('', value)
        return float(value)
    except (ValueError, TypeError, OverflowError):
        # OverflowError: an int too large to be represented as a float.
        return default


def get_timestamp() -> str:
    """
    Return the current local time as a display string.

    Returns:
        str: Timestamp formatted as ``YYYY-MM-DD HH:MM:SS``.
    """
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S')


def get_file_timestamp() -> str:
    """
    Return the current local time in a form suitable for file names.

    Returns:
        str: Timestamp formatted as ``YYYYMMDD_HHMMSS``.
    """
    return datetime.now().strftime('%Y%m%d_%H%M%S')


def read_file_lines(filepath: str, max_lines: int = 1000) -> List[str]:
    """
    Read up to ``max_lines`` lines from a text file.

    The file is decoded as UTF-8 and undecodable bytes are ignored. A file
    that cannot be opened or read yields an empty list and a logged warning.

    Args:
        filepath: Path of the file.
        max_lines: Maximum number of lines to read.

    Returns:
        List[str]: The lines, without their trailing newline.
    """
    try:
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            lines = []
            for index, line in enumerate(f):
                if index >= max_lines:
                    break
                lines.append(line.rstrip('\n'))
            return lines
    except OSError as e:
        logging.getLogger(__name__).warning(f"Failed to read file {filepath}: {e}")
        return []


def check_command_exists(command: str) -> bool:
    """
    Check whether an executable with the given name can be found on PATH.

    Args:
        command: Command name.

    Returns:
        bool: True when the command is available.
    """
    # shutil.which avoids spawning the external `which` binary, which is
    # itself missing on some minimal installations.
    return shutil.which(command) is not None


def format_bytes(size_bytes: int) -> str:
    """
    Format a byte count as a human-readable string (1024-based units).

    Args:
        size_bytes: Number of bytes; negative values keep their sign.

    Returns:
        str: For example ``"1.00 KB"``; exactly ``"0 B"`` for zero.
    """
    if size_bytes == 0:
        return "0 B"

    units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB']
    unit_index = 0
    size = float(size_bytes)

    # Compare the magnitude so negative sizes are scaled as well.
    while abs(size) >= 1024 and unit_index < len(units) - 1:
        size /= 1024
        unit_index += 1

    return f"{size:.2f} {units[unit_index]}"


def sanitize_filename(filename: str) -> str:
    """
    Make a string safe to use as a file name.

    Each of the characters ``< > : " / \\ | ? *`` is replaced with an
    underscore, then leading and trailing dots and spaces are removed.

    Args:
        filename: Original file name.

    Returns:
        str: Sanitized file name.
    """
    filename = re.sub(r'[<>:"/\\|?*]', '_', filename)
    return filename.strip('. ')


def merge_dicts(base: Dict[str, Any], update: Dict[str, Any]) -> Dict[str, Any]:
    """
    Recursively merge two dictionaries into a new one.

    When a key holds a dict on both sides the two are merged recursively;
    otherwise the value from ``update`` wins. Neither input is modified.

    Args:
        base: Base dictionary.
        update: Dictionary whose values take precedence.

    Returns:
        Dict[str, Any]: The merged dictionary.
    """
    result = base.copy()
    for key, value in update.items():
        if key in result and isinstance(result[key], dict) and isinstance(value, dict):
            result[key] = merge_dicts(result[key], value)
        else:
            result[key] = value
    return result