first commit

This commit is contained in:
zj
2026-03-02 14:14:40 +08:00
commit c4f4fefa0a
20 changed files with 6037 additions and 0 deletions

497
modules/gpu.py Normal file
View File

@@ -0,0 +1,497 @@
"""
ServerGuard - 显卡检测模块
检测 GPU 信息、温度、驱动状态等。
"""
import os
import re
from typing import Dict, Any, List, Optional
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils import (
execute_command, check_command_exists, parse_key_value_output,
safe_int, safe_float, format_bytes
)
def run_gpu_check() -> Dict[str, Any]:
    """
    Run every GPU probe and merge the findings into a single report.

    The vendor-specific probes (NVIDIA, AMD, Intel) run first; the generic
    ``lspci`` probe is used only when none of them found a device. The kernel
    log is then scanned for GPU-related errors.

    Returns:
        Dict[str, Any]: report with the keys
            ``status``        "success", "warning" (kernel log errors found),
                              "unknown" (no GPU found) or "error" (a probe raised);
            ``gpus``          list of per-GPU dicts;
            ``errors``        list of failure messages raised while probing;
            ``dmesg_errors``  list of GPU-related kernel log entries;
            ``note``          only present when no GPU was found;
            ``error``         only present when a probe raised (same text as the
                              last item of ``errors``; kept for existing callers).
    """
    result: Dict[str, Any] = {
        "status": "success",
        "gpus": [],
        "errors": [],
        # Present from the start so consumers can rely on the key even when a
        # probe raises before the kernel log has been scanned.
        "dmesg_errors": [],
    }
    try:
        # Vendor-specific probes, in the same order as before.
        for probe in (check_nvidia_gpus, check_amd_gpus, check_intel_gpus):
            found = probe()
            if found:
                result["gpus"].extend(found)

        # Nothing found by the vendor probes: fall back to plain lspci detection.
        if not result["gpus"]:
            result["gpus"] = check_generic_gpus()

        # Look for GPU errors in the kernel log.
        result["dmesg_errors"] = check_gpu_dmesg_errors()
        if result["dmesg_errors"]:
            result["status"] = "warning"

        # "No GPU at all" takes precedence over the warning status.
        if not result["gpus"]:
            result["status"] = "unknown"
            result["note"] = "未检测到 GPU 设备"
    except Exception as e:
        result["status"] = "error"
        result["error"] = str(e)
        # The "errors" list was declared but never filled; record the failure
        # there as well so both keys agree.
        result["errors"].append(str(e))
    return result
def check_nvidia_gpus() -> List[Dict[str, Any]]:
    """
    Enumerate NVIDIA GPUs through ``nvidia-smi``.

    Returns:
        List[Dict[str, Any]]: one dict per CSV row that has at least four
        columns, merged with the output of ``get_nvidia_gpu_details``. The
        list is empty when ``nvidia-smi`` is not installed or the query fails.
    """
    found: List[Dict[str, Any]] = []
    if not check_command_exists('nvidia-smi'):
        return found

    def column(values: List[str], position: int) -> str:
        # Optional trailing columns fall back to "unknown" when absent.
        return values[position] if len(values) > position else "unknown"

    try:
        # One CSV row per GPU with the basic identification fields.
        _, listing, _ = execute_command(
            ['nvidia-smi', '--query-gpu=gpu_name,gpu_bus_id,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current',
             '--format=csv,noheader'],
            check_returncode=False, timeout=10
        )
        for row_number, row in enumerate(listing.strip().split('\n')):
            if not row.strip():
                continue
            columns = [cell.strip() for cell in row.split(',')]
            if len(columns) < 4:
                # Not a device row (for example an error message).
                continue
            entry = {
                "vendor": "NVIDIA",
                # The row position is used as the GPU index for the detail query.
                "index": row_number,
                "name": columns[0],
                "bus_id": columns[1],
                "pci_bus_id": columns[2],
                "driver_version": columns[3],
                "pstate": column(columns, 4),
                "pcie_max_gen": column(columns, 5),
                "pcie_current_gen": column(columns, 6)
            }
            # Add temperature, power, memory, ECC and process details.
            entry.update(get_nvidia_gpu_details(row_number))
            found.append(entry)
    except Exception:
        # Best effort: a failing query yields whatever was collected so far.
        pass
    return found
def _nvsmi_value(raw: str) -> Optional[str]:
    """Return the stripped field text, or None when nvidia-smi printed a placeholder.

    Unavailable fields are reported as bracketed markers such as
    ``[Not Supported]`` or ``[N/A]``; any fully bracketed value, a bare
    ``N/A`` and an empty field are all treated as "no value".
    """
    text = raw.strip()
    if not text or text.upper() == 'N/A':
        return None
    if text.startswith('[') and text.endswith(']'):
        return None
    return text


def _query_nvidia_metrics(gpu_index: int) -> Dict[str, Any]:
    """Query temperature, power, clocks, utilisation, memory and identity fields."""
    _, stdout, _ = execute_command(
        ['nvidia-smi', '--query-gpu=temperature.gpu,power.draw,power.limit,clocks.gr,clocks.mem,utilization.gpu,utilization.memory,memory.total,memory.used,memory.free,serial,uuid,vbios_version',
         '--format=csv,noheader,nounits', '-i', str(gpu_index)],
        check_returncode=False, timeout=10
    )
    parts = [p.strip() for p in stdout.split(',')]
    if len(parts) < 10:
        # Not a data row (for example an error message).
        return {}

    # Output key and converter for the first ten columns, in query order.
    numeric_fields = (
        ("temperature_c", safe_int),
        ("power_draw_w", safe_float),
        ("power_limit_w", safe_float),
        ("graphics_clock_mhz", safe_int),
        ("memory_clock_mhz", safe_int),
        ("gpu_utilization_percent", safe_int),
        ("memory_utilization_percent", safe_int),
        ("memory_total_mb", safe_int),
        ("memory_used_mb", safe_int),
        ("memory_free_mb", safe_int),
    )
    metrics: Dict[str, Any] = {}
    for (key, convert), raw in zip(numeric_fields, parts):
        value = _nvsmi_value(raw)
        metrics[key] = convert(value) if value is not None else None

    # Identity columns are optional; zip() stops at whichever side is shorter.
    for key, raw in zip(("serial", "uuid", "vbios_version"), parts[10:]):
        metrics[key] = _nvsmi_value(raw)
    return metrics


def _query_nvidia_ecc(gpu_index: int) -> Dict[str, Any]:
    """Query the ECC mode and the volatile ECC error counters."""
    _, stdout, _ = execute_command(
        ['nvidia-smi', '--query-gpu=ecc.mode.current,ecc.mode.pending,ecc.errors.corrected.volatile.total,ecc.errors.uncorrected.volatile.total',
         '--format=csv,noheader', '-i', str(gpu_index)],
        check_returncode=False, timeout=10
    )
    parts = [p.strip() for p in stdout.split(',')]
    if len(parts) < 4:
        return {}
    corrected = _nvsmi_value(parts[2])
    uncorrected = _nvsmi_value(parts[3])
    return {
        "ecc_mode": _nvsmi_value(parts[0]),
        "ecc_pending": _nvsmi_value(parts[1]),
        # Counters keep their historical fallback of 0 when unavailable.
        "ecc_corrected_errors": safe_int(corrected) if corrected is not None else 0,
        "ecc_uncorrected_errors": safe_int(uncorrected) if uncorrected is not None else 0,
    }


def _query_nvidia_processes(gpu_index: int) -> List[Dict[str, str]]:
    """Return the processes listed by one ``nvidia-smi pmon`` sample for the GPU."""
    _, stdout, _ = execute_command(
        ['nvidia-smi', 'pmon', '-s', 'um', '-c', '1', '-i', str(gpu_index)],
        check_returncode=False, timeout=5
    )
    processes = []
    for line in stdout.split('\n')[2:]:  # skip the two header rows
        if not line.strip() or line.lstrip().startswith('#'):
            continue
        columns = line.split()
        if len(columns) < 5:
            continue
        if columns[1] == '-':
            # pmon prints a row of dashes for a GPU with no running process.
            continue
        processes.append({
            "pid": columns[1],
            "type": columns[2],
            "sm_util": columns[3],
            "mem_util": columns[4]
        })
    return processes


def get_nvidia_gpu_details(gpu_index: int) -> Dict[str, Any]:
    """
    Collect detailed information for a single NVIDIA GPU.

    Three independent ``nvidia-smi`` queries are issued (metrics, ECC state,
    process list). Each one is best effort: a failing query only omits its
    own keys instead of discarding what the others returned.

    Args:
        gpu_index: GPU index passed to ``nvidia-smi -i``.

    Returns:
        Dict[str, Any]: the collected keys; fields that nvidia-smi reports as
        unavailable are None (ECC error counters are 0). ``processes`` is only
        present when at least one process was listed.
    """
    details: Dict[str, Any] = {}
    for query in (_query_nvidia_metrics, _query_nvidia_ecc):
        try:
            details.update(query(gpu_index))
        except Exception:
            # Deliberately tolerant: keep going with the remaining queries.
            continue

    try:
        processes = _query_nvidia_processes(gpu_index)
    except Exception:
        processes = []
    if processes:
        details["processes"] = processes
    return details
def _read_amd_sysfs_attr(path: str) -> Optional[str]:
    """Return the stripped content of a sysfs attribute, or None if unreadable."""
    try:
        with open(path, 'r') as handle:
            return handle.read().strip()
    except OSError:
        return None


def _radeontop_sample() -> Dict[str, Any]:
    """Take one radeontop dump and return the ``GPU...: value`` pairs it contains."""
    import tempfile

    # Reserve a file name for the dump; radeontop writes to it via -d.
    with tempfile.NamedTemporaryFile(mode='r', suffix='.txt', delete=False) as f:
        dump_file = f.name
    try:
        execute_command(
            ['radeontop', '-d', dump_file, '-l', '1'],
            check_returncode=False, timeout=5
        )
        with open(dump_file, 'r') as f:
            output = f.read()
    finally:
        if os.path.exists(dump_file):
            os.unlink(dump_file)

    sample: Dict[str, Any] = {}
    for line in output.split('\n'):
        if 'GPU' in line and ':' in line:
            parts = line.split(':')
            if len(parts) == 2:
                key = parts[0].strip().lower().replace(' ', '_')
                sample[key] = parts[1].strip()
    return sample


def _describe_amd_card(card: str, device_dir: str) -> Dict[str, Any]:
    """Build the info dict for one AMD DRM card from its sysfs device directory."""
    gpu_info: Dict[str, Any] = {"vendor": "AMD", "card": card}

    device_id = _read_amd_sysfs_attr(os.path.join(device_dir, 'device'))
    if device_id is not None:
        gpu_info["device_id"] = device_id

    # The driver entry is a symlink whose target name is the driver name.
    driver_link = os.path.join(device_dir, 'driver')
    if os.path.islink(driver_link):
        gpu_info["driver"] = os.path.basename(os.readlink(driver_link))

    # The hwmon directory name (hwmon0, hwmon1, ...) is not fixed, so look the
    # sensor up in whichever hwmonN directory exists instead of assuming hwmon0.
    hwmon_root = os.path.join(device_dir, 'hwmon')
    try:
        hwmon_dirs = sorted(d for d in os.listdir(hwmon_root) if d.startswith('hwmon'))
    except OSError:
        hwmon_dirs = []
    for hwmon_dir in hwmon_dirs:
        raw_temp = _read_amd_sysfs_attr(os.path.join(hwmon_root, hwmon_dir, 'temp1_input'))
        if raw_temp is None:
            continue
        temp_mc = safe_int(raw_temp)
        if temp_mc is not None:
            # temp1_input is in millidegrees Celsius.
            gpu_info["temperature_c"] = temp_mc / 1000.0
        break

    clock_levels = _read_amd_sysfs_attr(os.path.join(device_dir, 'pp_dpm_sclk'))
    if clock_levels is not None:
        gpu_info["core_clock_levels"] = clock_levels
    return gpu_info


def check_amd_gpus(drm_root: str = '/sys/class/drm') -> List[Dict[str, Any]]:
    """
    Detect AMD GPUs.

    Two sources are used: one ``radeontop`` dump (when the tool is installed)
    and the DRM sysfs tree, where every ``card*`` entry whose PCI vendor ID is
    ``0x1002`` is described.

    Args:
        drm_root: directory holding the DRM class entries; the default is the
            live sysfs tree.

    Returns:
        List[Dict[str, Any]]: one dict per source entry; empty when nothing
        was found. All probing is best effort and never raises for an
        unreadable card.
    """
    gpus: List[Dict[str, Any]] = []

    if check_command_exists('radeontop'):
        try:
            sample = _radeontop_sample()
        except Exception:
            sample = {}
        # Only report an entry when the dump actually yielded data. Previously
        # a bare {"vendor": "AMD"} entry was appended on every host that had
        # radeontop installed, even without any AMD GPU.
        if sample:
            gpus.append({"vendor": "AMD", **sample})

    try:
        cards = sorted(os.listdir(drm_root))
    except OSError:
        cards = []
    for card in cards:
        if not card.startswith('card'):
            continue
        device_dir = os.path.join(drm_root, card, 'device')
        # AMD's PCI vendor ID is 0x1002; entries without a vendor file are skipped.
        if _read_amd_sysfs_attr(os.path.join(device_dir, 'vendor')) != '0x1002':
            continue
        try:
            gpus.append(_describe_amd_card(card, device_dir))
        except Exception:
            # One unreadable card must not hide the remaining ones.
            continue
    return gpus
def check_intel_gpus(drm_root: str = '/sys/class/drm') -> List[Dict[str, Any]]:
    """
    Detect Intel GPUs from the DRM sysfs tree.

    Every ``card*`` entry whose ``device/vendor`` file contains ``0x8086`` is
    reported. Entries are visited in sorted order so the result is stable.

    Args:
        drm_root: directory holding the DRM class entries; the default is the
            live sysfs tree.

    Returns:
        List[Dict[str, Any]]: one dict per Intel card with ``vendor``,
        ``card``, ``type`` and, when readable, ``device_id`` and ``driver``.
        Empty when the directory is missing or holds no Intel card.
    """
    gpus: List[Dict[str, Any]] = []
    try:
        entries = sorted(os.listdir(drm_root))
    except OSError:
        # No DRM tree at all (no GPU driver loaded, or not a Linux host).
        return gpus

    for card in entries:
        if not card.startswith('card'):
            continue
        device_dir = os.path.join(drm_root, card, 'device')
        try:
            with open(os.path.join(device_dir, 'vendor'), 'r') as f:
                vendor_id = f.read().strip()
        except OSError:
            # Entries without a readable vendor file are skipped.
            continue
        # Intel's PCI vendor ID is 0x8086.
        if vendor_id != '0x8086':
            continue

        gpu_info: Dict[str, Any] = {"vendor": "Intel", "card": card}

        # Failures below only omit the affected key; they no longer abort the
        # scan of the remaining cards.
        try:
            with open(os.path.join(device_dir, 'device'), 'r') as f:
                gpu_info["device_id"] = f.read().strip()
        except OSError:
            pass

        # The driver entry is a symlink whose target name is the driver name.
        driver_link = os.path.join(device_dir, 'driver')
        try:
            if os.path.islink(driver_link):
                gpu_info["driver"] = os.path.basename(os.readlink(driver_link))
        except OSError:
            pass

        # NOTE(review): every Intel device is labelled as integrated; discrete
        # Intel cards would be mislabelled - confirm whether that matters.
        gpu_info["type"] = "integrated"
        gpus.append(gpu_info)
    return gpus
# PCI vendor IDs of the GPU vendors this module distinguishes.
_PCI_GPU_VENDOR_IDS = {'10de': 'NVIDIA', '1002': 'AMD', '8086': 'Intel'}

# Substring that selects a GPU-class lspci line, and the "type" it maps to.
_LSPCI_GPU_CLASSES = (
    ('VGA', 'vga'),
    ('3D controller', '3d'),
    ('Display controller', 'display'),
)

_PCI_ID_RE = re.compile(r'\[([0-9a-fA-F]{4}):[0-9a-fA-F]{4}\]')
_VENDOR_NAME_RES = (
    (re.compile(r'\bnvidia\b', re.IGNORECASE), 'NVIDIA'),
    (re.compile(r'\b(?:amd|ati)\b', re.IGNORECASE), 'AMD'),
    (re.compile(r'\bintel\b', re.IGNORECASE), 'Intel'),
)


def _identify_gpu_vendor(description: str) -> str:
    """Map an lspci device description to NVIDIA / AMD / Intel / Unknown."""
    # Prefer the numeric [vendor:device] ID that `lspci -nn` appends.
    for vendor_id in _PCI_ID_RE.findall(description):
        vendor = _PCI_GPU_VENDOR_IDS.get(vendor_id.lower())
        if vendor:
            return vendor
    # Fall back to whole-word name matching. A plain substring test for "ati"
    # also matches "Corporation", which labelled Intel devices as AMD.
    for pattern, vendor in _VENDOR_NAME_RES:
        if pattern.search(description):
            return vendor
    return "Unknown"


def _lspci_driver_details(bus_id: str) -> Dict[str, str]:
    """Return the kernel driver / module lines of ``lspci -v`` for one device."""
    details: Dict[str, str] = {}
    try:
        _, detail, _ = execute_command(
            ['lspci', '-v', '-s', bus_id],
            check_returncode=False, timeout=5
        )
    except Exception:
        # Optional extra information; the basic entry is still reported.
        return details
    driver_match = re.search(r'Kernel driver in use:\s*(\S+)', detail)
    if driver_match:
        details["driver"] = driver_match.group(1)
    modules_match = re.search(r'Kernel modules:\s*(.+)', detail)
    if modules_match:
        details["modules"] = modules_match.group(1).strip()
    return details


def check_generic_gpus(lspci_output: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    Detect GPUs of any vendor from ``lspci -nn`` output.

    Args:
        lspci_output: previously captured ``lspci -nn`` text to analyse. When
            omitted, ``lspci`` is run on this host and each device is also
            queried with ``lspci -v`` for its driver and kernel modules. With
            captured text no command is executed, so those two keys are absent.

    Returns:
        List[Dict[str, Any]]: one dict per VGA / 3D / display controller line
        with ``bus_id``, ``description``, ``vendor``, ``type`` and, for live
        queries, ``driver`` / ``modules`` when lspci reports them. Empty when
        lspci is unavailable or fails.
    """
    gpus: List[Dict[str, Any]] = []
    live_query = lspci_output is None
    if live_query:
        if not check_command_exists('lspci'):
            return gpus
        try:
            _, lspci_output, _ = execute_command(
                ['lspci', '-nn'],
                check_returncode=False, timeout=10
            )
        except Exception:
            return gpus

    for line in (lspci_output or '').split('\n'):
        device_type = next(
            (label for marker, label in _LSPCI_GPU_CLASSES if marker in line),
            None
        )
        if device_type is None:
            continue
        # "<bus id> <class>: <description>"
        parts = line.split(': ', 1)
        if len(parts) != 2:
            continue
        location = parts[0].split()
        if not location:
            continue
        bus_id = location[0]
        description = parts[1]

        gpu_info: Dict[str, Any] = {
            "bus_id": bus_id,
            "description": description,
            "vendor": _identify_gpu_vendor(description),
            "type": device_type,
        }
        if live_query:
            gpu_info.update(_lspci_driver_details(bus_id))
        gpus.append(gpu_info)
    return gpus
# Messages that are failures by themselves; they are reported even when the
# line carries no "error" / "fail" / "warn" wording.
_GPU_DMESG_FATAL_RE = re.compile(
    '|'.join(f'(?:{p})' for p in (
        r'GPU has fallen off the bus',
        r'NVRM: Xid',
        r'GPU hang',
        r'VRAM lost',
        r'gpu.*\bfault',   # word boundary keeps "default" from matching
    )),
    re.IGNORECASE
)

# Broader GPU-related patterns; a hit only counts together with a severity
# keyword on the same line.
_GPU_DMESG_RELATED_RE = re.compile(
    '|'.join(f'(?:{p})' for p in (
        r'GPU has fallen off the bus',
        r'NVRM: Xid',
        r'nvidia.*error',
        r'amdgpu.*error',
        r'i915.*error',
        r'GPU hang',
        r'ring.*timeout',
        r'Failed to load firmware',
        r'VRAM lost',
        r'gpu.*fault',
        r' thermal ',
    )),
    re.IGNORECASE
)

_DMESG_TIMESTAMP_RE = re.compile(r'\[\s*([\d.]+)\]')
_GPU_DMESG_MAX_ENTRIES = 20


def check_gpu_dmesg_errors(dmesg_output: Optional[str] = None) -> List[Dict[str, str]]:
    """
    Extract GPU-related errors from the kernel log.

    A line is reported when it matches a definitive failure pattern (GPU
    fallen off the bus, NVIDIA Xid, GPU hang, VRAM lost, GPU fault), or when
    it matches one of the broader GPU patterns and also contains a severity
    keyword. Earlier, the keyword was required for every pattern, which
    dropped definitive failures whose text has no such keyword.

    Args:
        dmesg_output: previously captured ``dmesg`` text to analyse. When
            omitted, ``dmesg`` is run on this host.

    Returns:
        List[Dict[str, str]]: at most 20 unique entries, in log order, each
        with ``timestamp`` (the leading ``[seconds]`` value or "unknown") and
        ``message`` (the stripped line). Empty when dmesg is unavailable or
        fails.
    """
    if dmesg_output is None:
        if not check_command_exists('dmesg'):
            return []
        try:
            _, dmesg_output, _ = execute_command(
                ['dmesg'],
                check_returncode=False, timeout=10
            )
        except Exception:
            return []

    unique_errors: List[Dict[str, str]] = []
    seen = set()
    for line in (dmesg_output or '').split('\n'):
        if not _GPU_DMESG_FATAL_RE.search(line):
            if not _GPU_DMESG_RELATED_RE.search(line):
                continue
            line_lower = line.lower()
            has_severity = (
                'error' in line_lower or 'fail' in line_lower
                or 'warn' in line_lower or 'Xid' in line
            )
            if not has_severity:
                continue

        message = line.strip()
        if message in seen:
            continue
        seen.add(message)

        timestamp_match = _DMESG_TIMESTAMP_RE.match(line)
        unique_errors.append({
            "timestamp": timestamp_match.group(1) if timestamp_match else "unknown",
            "message": message
        })
        if len(unique_errors) >= _GPU_DMESG_MAX_ENTRIES:
            # Cap the report size; later entries are dropped.
            break
    return unique_errors
def _parse_pmon_output(text: str) -> List[Dict[str, Any]]:
    """Turn ``nvidia-smi pmon`` text into a list of process dicts."""
    columns: List[str] = []
    processes: List[Dict[str, Any]] = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith('#'):
            # The header row that starts with "gpu" names the columns; the
            # following "#" row only carries units.
            header = line.lstrip('#').split()
            if header and header[0] == 'gpu':
                columns = header
            continue

        if columns:
            # Let the last column absorb the rest of the line.
            fields = line.split(None, len(columns) - 1)
            if len(fields) != len(columns):
                continue
            row = dict(zip(columns, fields))
        else:
            # No header seen: the first seven columns are positional and the
            # command is the last one.
            fields = line.split()
            if len(fields) < 8:
                continue
            row = dict(zip(("gpu", "pid", "type", "sm", "mem", "enc", "dec"), fields))
            row["command"] = fields[-1]

        # Skip anything that is not a real process row, including the row of
        # dashes printed for a GPU without running processes.
        if not (row.get("gpu", "").isdigit() and row.get("pid", "").isdigit()):
            continue

        processes.append({
            "gpu_index": int(row["gpu"]),
            "pid": row["pid"],
            "type": row.get("type", "-"),
            "sm_util": row.get("sm", "-"),
            "mem_util": row.get("mem", "-"),
            "enc_util": row.get("enc", "-"),
            "dec_util": row.get("dec", "-"),
            "command": row.get("command", "-")
        })
    return processes


def get_gpu_processes(pmon_output: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    List the processes that are using a GPU (NVIDIA only).

    Columns are located through the header row of ``nvidia-smi pmon`` rather
    than by fixed position: with ``-s um`` the output contains extra columns
    before the command name, so position 7 is not the command.

    Args:
        pmon_output: previously captured ``nvidia-smi pmon`` text to analyse.
            When omitted, one sample is taken with ``nvidia-smi pmon -s um -c 1``.

    Returns:
        List[Dict[str, Any]]: one dict per process with ``gpu_index`` (int)
        and the string fields ``pid``, ``type``, ``sm_util``, ``mem_util``,
        ``enc_util``, ``dec_util`` and ``command``. Empty when nvidia-smi is
        unavailable, fails, or no process is running.
    """
    if pmon_output is None:
        if not check_command_exists('nvidia-smi'):
            return []
        try:
            _, pmon_output, _ = execute_command(
                ['nvidia-smi', 'pmon', '-s', 'um', '-c', '1'],
                check_returncode=False, timeout=5
            )
        except Exception:
            return []
    return _parse_pmon_output(pmon_output or '')
if __name__ == '__main__':
    # Script entry point: run the full GPU check and print the report as
    # indented JSON, keeping non-ASCII text readable.
    import json

    report = run_gpu_check()
    print(json.dumps(report, indent=2, ensure_ascii=False))