Files
ServerGuard/modules/gpu.py
2026-03-02 14:14:40 +08:00

498 lines
18 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
ServerGuard - 显卡检测模块
检测 GPU 信息、温度、驱动状态等。
"""
import os
import re
from typing import Dict, Any, List, Optional
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils import (
execute_command, check_command_exists, parse_key_value_output,
safe_int, safe_float, format_bytes
)
def run_gpu_check() -> Dict[str, Any]:
    """
    Run the complete GPU inspection and assemble a single report.

    The vendor-specific probes (NVIDIA, AMD, Intel) are tried in that order
    and their results concatenated. The generic lspci scan is used only when
    none of them reports a device. Kernel log findings are attached under
    ``dmesg_errors``.

    Returns:
        Dict[str, Any]: Report with the keys ``status``, ``gpus``, ``errors``
        and ``dmesg_errors``. ``status`` is ``"success"``, ``"warning"``
        (kernel log findings), ``"unknown"`` (no GPU found, with a ``note``)
        or ``"error"`` (unexpected exception, with an ``error`` message).
        The ``errors`` list is created empty and is not filled in here.
    """
    report: Dict[str, Any] = {
        "status": "success",
        "gpus": [],
        "errors": []
    }

    try:
        # Vendor-specific probes, in priority order.
        for probe in (check_nvidia_gpus, check_amd_gpus, check_intel_gpus):
            found = probe()
            if found:
                report["gpus"].extend(found)

        # Nothing vendor-specific was found: fall back to the basic lspci scan.
        if not report["gpus"]:
            report["gpus"] = check_generic_gpus()

        # Attach GPU-related findings from the kernel log.
        kernel_findings = check_gpu_dmesg_errors()
        report["dmesg_errors"] = kernel_findings
        if kernel_findings:
            report["status"] = "warning"

        # "No device at all" takes precedence over the warning status.
        if not report["gpus"]:
            report["status"] = "unknown"
            report["note"] = "未检测到 GPU 设备"
    except Exception as exc:
        report["status"] = "error"
        report["error"] = str(exc)

    return report
def check_nvidia_gpus() -> List[Dict[str, Any]]:
    """
    Enumerate NVIDIA GPUs through ``nvidia-smi``.

    Each CSV row with at least four columns becomes one entry holding the
    basic identification fields, merged with the per-GPU details returned by
    ``get_nvidia_gpu_details``. The row position is used as the GPU index.

    Returns:
        List[Dict[str, Any]]: One dict per GPU. Empty when ``nvidia-smi`` is
        not installed or the query fails (failures are swallowed on purpose).
    """
    detected: List[Dict[str, Any]] = []
    if not check_command_exists('nvidia-smi'):
        return detected

    query_fields = (
        'gpu_name,gpu_bus_id,pci.bus_id,driver_version,pstate,'
        'pcie.link.gen.max,pcie.link.gen.current'
    )

    try:
        # One CSV row per GPU, without a header line.
        _, listing, _ = execute_command(
            ['nvidia-smi', '--query-gpu=' + query_fields, '--format=csv,noheader'],
            check_returncode=False, timeout=10
        )

        for position, row in enumerate(listing.strip().split('\n')):
            if not row.strip():
                continue

            columns = [column.strip() for column in row.split(',')]
            column_count = len(columns)
            if column_count < 4:
                continue

            entry = {
                "vendor": "NVIDIA",
                "index": position,
                "name": columns[0],
                "bus_id": columns[1],
                "pci_bus_id": columns[2],
                "driver_version": columns[3],
                "pstate": columns[4] if column_count > 4 else "unknown",
                "pcie_max_gen": columns[5] if column_count > 5 else "unknown",
                "pcie_current_gen": columns[6] if column_count > 6 else "unknown"
            }

            # Merge in temperature, power, memory, ECC and process details.
            entry.update(get_nvidia_gpu_details(position))
            detected.append(entry)
    except Exception:
        # Best effort: keep whatever was collected before the failure.
        pass

    return detected
def _nvidia_field(raw: str, convert=None, missing=None):
    """Return one nvidia-smi CSV value, mapping bracketed placeholders to *missing*."""
    # nvidia-smi reports fields it cannot provide as bracketed placeholders,
    # e.g. "[Not Supported]" or "[N/A]"; none of the queried fields uses
    # brackets for a real value.
    if raw.startswith('[') and raw.endswith(']'):
        return missing
    return convert(raw) if convert is not None else raw


def get_nvidia_gpu_details(gpu_index: int) -> Dict[str, Any]:
    """
    Collect telemetry, ECC state and process usage for one NVIDIA GPU.

    Three independent ``nvidia-smi`` calls are made. Each is best effort, so
    a failure in one does not discard the results of the others.

    Args:
        gpu_index: GPU index passed to ``nvidia-smi -i``.

    Returns:
        Dict[str, Any]: Any subset of the keys below, depending on which
        queries succeeded.

        * telemetry: ``temperature_c``, ``power_draw_w``, ``power_limit_w``,
          ``graphics_clock_mhz``, ``memory_clock_mhz``,
          ``gpu_utilization_percent``, ``memory_utilization_percent``,
          ``memory_total_mb``, ``memory_used_mb``, ``memory_free_mb``,
          ``serial``, ``uuid``, ``vbios_version`` (``None`` when the GPU
          does not report the field);
        * ECC: ``ecc_mode``, ``ecc_pending``, ``ecc_corrected_errors``,
          ``ecc_uncorrected_errors`` (counters fall back to ``0``);
        * ``processes``: list of dicts with ``pid``, ``type``, ``sm_util``
          and ``mem_util``, present only when at least one process is found.
    """
    details: Dict[str, Any] = {}
    gpu_selector = ['-i', str(gpu_index)]

    # --- Temperature, power, clocks, utilisation, memory, identity ---------
    try:
        _, stdout, _ = execute_command(
            ['nvidia-smi', '--query-gpu=temperature.gpu,power.draw,power.limit,clocks.gr,clocks.mem,utilization.gpu,utilization.memory,memory.total,memory.used,memory.free,serial,uuid,vbios_version',
             '--format=csv,noheader,nounits'] + gpu_selector,
            check_returncode=False, timeout=10
        )
        parts = [p.strip() for p in stdout.split(',')]
        if len(parts) >= 10:
            # Column order mirrors the --query-gpu field list above.
            numeric_columns = (
                ("temperature_c", safe_int),
                ("power_draw_w", safe_float),
                ("power_limit_w", safe_float),
                ("graphics_clock_mhz", safe_int),
                ("memory_clock_mhz", safe_int),
                ("gpu_utilization_percent", safe_int),
                ("memory_utilization_percent", safe_int),
                ("memory_total_mb", safe_int),
                ("memory_used_mb", safe_int),
                ("memory_free_mb", safe_int),
            )
            for raw, (key, convert) in zip(parts, numeric_columns):
                details[key] = _nvidia_field(raw, convert)

            # Identity columns are optional trailing fields.
            for position, key in enumerate(("serial", "uuid", "vbios_version"), start=10):
                if len(parts) > position:
                    details[key] = _nvidia_field(parts[position])
    except Exception:
        # Best effort: telemetry is optional.
        pass

    # --- ECC mode and volatile error counters -------------------------------
    try:
        _, ecc_output, _ = execute_command(
            ['nvidia-smi', '--query-gpu=ecc.mode.current,ecc.mode.pending,ecc.errors.corrected.volatile.total,ecc.errors.uncorrected.volatile.total',
             '--format=csv,noheader'] + gpu_selector,
            check_returncode=False, timeout=10
        )
        ecc_parts = [p.strip() for p in ecc_output.split(',')]
        if len(ecc_parts) >= 4:
            details["ecc_mode"] = _nvidia_field(ecc_parts[0])
            details["ecc_pending"] = _nvidia_field(ecc_parts[1])
            # Counters default to 0 rather than None when ECC is unavailable.
            details["ecc_corrected_errors"] = _nvidia_field(ecc_parts[2], safe_int, 0)
            details["ecc_uncorrected_errors"] = _nvidia_field(ecc_parts[3], safe_int, 0)
    except Exception:
        # Best effort: ECC data is optional.
        pass

    # --- Processes currently using this GPU ---------------------------------
    try:
        _, proc_output, _ = execute_command(
            ['nvidia-smi', 'pmon', '-s', 'um', '-c', '1'] + gpu_selector,
            check_returncode=False, timeout=5
        )
        processes = []
        for line in proc_output.split('\n'):
            # Header rows start with '#'.
            if not line.strip() or line.lstrip().startswith('#'):
                continue
            proc_parts = line.split()
            # Rows whose pid is not numeric are skipped: the "-" placeholder
            # row printed for an idle GPU, and any error text.
            if len(proc_parts) < 5 or not proc_parts[1].isdigit():
                continue
            processes.append({
                "pid": proc_parts[1],
                "type": proc_parts[2],
                "sm_util": proc_parts[3],
                "mem_util": proc_parts[4]
            })
        if processes:
            details["processes"] = processes
    except Exception:
        # Best effort: the process list is optional.
        pass

    return details
def _probe_radeontop() -> Optional[Dict[str, Any]]:
    """Run radeontop once in dump mode; return parsed 'GPU...: value' pairs or None."""
    import tempfile

    # Reserve a file name for the dump; the file is removed again below.
    with tempfile.NamedTemporaryFile(mode='r', suffix='.txt', delete=False) as f:
        dump_file = f.name

    try:
        # Per the original author's note, radeontop needs a graphical
        # environment, so -d is used to write the output to a file.
        execute_command(
            ['radeontop', '-d', dump_file, '-l', '1'],
            check_returncode=False, timeout=5
        )
        with open(dump_file, 'r') as f:
            output = f.read()
    finally:
        if os.path.exists(dump_file):
            os.unlink(dump_file)

    gpu_info: Dict[str, Any] = {"vendor": "AMD"}
    # NOTE(review): this expects "<name containing GPU>: <value>" lines.
    # Confirm that radeontop's dump format really produces such lines.
    for line in output.split('\n'):
        if 'GPU' in line and ':' in line:
            parts = line.split(':')
            if len(parts) == 2:
                key = parts[0].strip().lower().replace(' ', '_')
                gpu_info[key] = parts[1].strip()

    # Only the "vendor" key means nothing was parsed, so report no device
    # rather than an empty AMD entry.
    return gpu_info if len(gpu_info) > 1 else None


def _read_amd_sysfs_gpus(drm_root: str = '/sys/class/drm') -> List[Dict[str, Any]]:
    """List DRM cards under *drm_root* whose PCI vendor ID is AMD (0x1002)."""
    gpus: List[Dict[str, Any]] = []
    try:
        cards = sorted(os.listdir(drm_root))
    except OSError:
        return gpus

    for card in cards:
        if not card.startswith('card'):
            continue
        device_dir = os.path.join(drm_root, card, 'device')

        # Entries without a readable vendor file are skipped.
        try:
            with open(os.path.join(device_dir, 'vendor'), 'r') as f:
                vendor_id = f.read().strip()
        except (OSError, ValueError):
            continue

        # The AMD PCI vendor ID is 0x1002.
        if vendor_id != '0x1002':
            continue

        gpu_info: Dict[str, Any] = {"vendor": "AMD", "card": card}
        try:
            # PCI device ID.
            device_path = os.path.join(device_dir, 'device')
            if os.path.exists(device_path):
                with open(device_path, 'r') as f:
                    gpu_info["device_id"] = f.read().strip()

            # Bound kernel driver (name of the symlink target).
            driver_path = os.path.join(device_dir, 'driver')
            if os.path.exists(driver_path):
                gpu_info["driver"] = os.path.basename(os.readlink(driver_path))

            # Temperature: the hwmon directory number is not fixed, so use
            # the first hwmonN that exposes temp1_input.
            hwmon_root = os.path.join(device_dir, 'hwmon')
            if os.path.isdir(hwmon_root):
                for hwmon in sorted(os.listdir(hwmon_root)):
                    temp_path = os.path.join(hwmon_root, hwmon, 'temp1_input')
                    if not os.path.exists(temp_path):
                        continue
                    with open(temp_path, 'r') as f:
                        temp_mc = safe_int(f.read().strip())
                    # safe_int's failure value is not visible from here, so
                    # divide only when a number came back.
                    if isinstance(temp_mc, (int, float)):
                        gpu_info["temperature_c"] = temp_mc / 1000.0
                    break

            # Core clock levels, stored as the raw file content.
            freq_path = os.path.join(device_dir, 'pp_dpm_sclk')
            if os.path.exists(freq_path):
                with open(freq_path, 'r') as f:
                    gpu_info["core_clock_levels"] = f.read().strip()
        except (OSError, ValueError):
            # Keep the fields read so far; one bad attribute must not hide
            # the card or stop the scan of the remaining cards.
            pass

        gpus.append(gpu_info)

    return gpus


def check_amd_gpus() -> List[Dict[str, Any]]:
    """
    Detect AMD GPUs via radeontop (when installed) and sysfs.

    Returns:
        List[Dict[str, Any]]: An optional entry parsed from radeontop,
        followed by one entry per AMD card found under ``/sys/class/drm``.
        Sysfs entries hold ``vendor`` and ``card`` and, when readable,
        ``device_id``, ``driver``, ``temperature_c`` and
        ``core_clock_levels``. Empty when nothing is found; probe failures
        are swallowed on purpose.
    """
    gpus: List[Dict[str, Any]] = []

    if check_command_exists('radeontop'):
        try:
            radeontop_info = _probe_radeontop()
        except Exception:
            # Best effort: radeontop is an optional data source.
            radeontop_info = None
        if radeontop_info:
            gpus.append(radeontop_info)

    gpus.extend(_read_amd_sysfs_gpus())
    return gpus
def check_intel_gpus(drm_root: str = '/sys/class/drm') -> List[Dict[str, Any]]:
    """
    Detect Intel GPUs by scanning DRM card entries in sysfs.

    Args:
        drm_root: Directory holding the ``cardN`` entries. Defaults to the
            standard sysfs location; override it to scan another tree.

    Returns:
        List[Dict[str, Any]]: One dict per card whose PCI vendor ID is
        ``0x8086``, in sorted card-name order. Each holds ``vendor``,
        ``card`` and ``type`` and, when readable, ``device_id`` and
        ``driver``. Empty when *drm_root* cannot be listed.
    """
    gpus: List[Dict[str, Any]] = []

    try:
        cards = sorted(os.listdir(drm_root))
    except OSError:
        # No DRM subsystem (or no permission): nothing to report.
        return gpus

    for card in cards:
        if not card.startswith('card'):
            continue
        device_dir = os.path.join(drm_root, card, 'device')

        # Entries without a readable vendor file are skipped.
        try:
            with open(os.path.join(device_dir, 'vendor'), 'r') as f:
                vendor_id = f.read().strip()
        except (OSError, ValueError):
            continue

        # The Intel PCI vendor ID is 0x8086.
        if vendor_id != '0x8086':
            continue

        gpu_info: Dict[str, Any] = {"vendor": "Intel", "card": card}
        try:
            # PCI device ID.
            device_path = os.path.join(device_dir, 'device')
            if os.path.exists(device_path):
                with open(device_path, 'r') as f:
                    gpu_info["device_id"] = f.read().strip()

            # Bound kernel driver (name of the symlink target).
            driver_path = os.path.join(device_dir, 'driver')
            if os.path.exists(driver_path):
                gpu_info["driver"] = os.path.basename(os.readlink(driver_path))
        except (OSError, ValueError):
            # Keep the fields read so far; one unreadable attribute must
            # not hide the card or stop the scan of the remaining cards.
            pass

        # Every Intel device is labelled integrated without inspecting it.
        # NOTE(review): a discrete Intel card would get the same label.
        gpu_info["type"] = "integrated"
        gpus.append(gpu_info)

    return gpus
# PCI vendor IDs as they appear in the "[vvvv:dddd]" suffix of `lspci -nn`.
_PCI_GPU_VENDORS = {
    '10de': 'NVIDIA',
    '1002': 'AMD',
    '8086': 'Intel',
}


def _identify_gpu_vendor(description: str) -> str:
    """Map an lspci device description to "NVIDIA", "AMD", "Intel" or "Unknown"."""
    # Prefer the numeric vendor ID: it cannot be confused by the wording of
    # the human-readable name.
    id_match = re.search(r'\[([0-9a-f]{4}):[0-9a-f]{4}\]', description, re.IGNORECASE)
    if id_match:
        vendor = _PCI_GPU_VENDORS.get(id_match.group(1).lower())
        if vendor:
            return vendor

    # Fall back to whole-word name matching. A plain substring test for
    # "ati" would also hit "corporation" and label Intel devices as AMD.
    desc_lower = description.lower()
    if re.search(r'\bnvidia\b', desc_lower):
        return "NVIDIA"
    if re.search(r'\b(?:amd|ati)\b', desc_lower) or 'advanced micro devices' in desc_lower:
        return "AMD"
    if re.search(r'\bintel\b', desc_lower):
        return "Intel"
    return "Unknown"


def check_generic_gpus() -> List[Dict[str, Any]]:
    """
    Detect display adapters with ``lspci`` when no vendor tool found any.

    Returns:
        List[Dict[str, Any]]: One dict per line of ``lspci -nn`` that
        mentions a VGA, 3D or display controller. Each holds ``bus_id``,
        ``description``, ``vendor`` and ``type`` (``"vga"``, ``"3d"`` or
        ``"display"``) and, when ``lspci -v`` reports them, ``driver`` and
        ``modules``. Empty when ``lspci`` is missing or fails.
    """
    gpus: List[Dict[str, Any]] = []
    if not check_command_exists('lspci'):
        return gpus

    try:
        _, stdout, _ = execute_command(
            ['lspci', '-nn'],
            check_returncode=False, timeout=10
        )

        for line in stdout.split('\n'):
            if not ('VGA' in line or '3D controller' in line or 'Display controller' in line):
                continue

            # "<bus id> <class>: <description>"
            parts = line.split(': ', 1)
            if len(parts) != 2 or not parts[0].split():
                continue

            bus_id = parts[0].split()[0]
            description = parts[1]
            gpu_info: Dict[str, Any] = {
                "bus_id": bus_id,
                "description": description,
                "vendor": _identify_gpu_vendor(description),
            }

            # Controller type.
            if 'VGA' in line:
                gpu_info["type"] = "vga"
            elif '3D controller' in line:
                gpu_info["type"] = "3d"
            elif 'Display controller' in line:
                gpu_info["type"] = "display"

            # Driver and module names are optional extras; if this lookup
            # fails, the basic entry is still reported.
            try:
                _, detail, _ = execute_command(
                    ['lspci', '-v', '-s', bus_id],
                    check_returncode=False, timeout=5
                )
                driver_match = re.search(r'Kernel driver in use:\s*(\S+)', detail)
                if driver_match:
                    gpu_info["driver"] = driver_match.group(1)

                modules_match = re.search(r'Kernel modules:\s*(.+)', detail)
                if modules_match:
                    gpu_info["modules"] = modules_match.group(1).strip()
            except Exception:
                pass

            gpus.append(gpu_info)
    except Exception:
        # Best effort: return whatever was collected before the failure.
        pass

    return gpus
# Signatures that are GPU problems on their own. The last four already
# contain a severity word, so they always passed the keyword check too.
_GPU_DECISIVE_ERROR = re.compile(
    '|'.join([
        r'GPU has fallen off the bus',
        r'NVRM: Xid',
        r'GPU hang',
        r'VRAM (?:is )?lost',
        r'nvidia.*error',
        r'amdgpu.*error',
        r'i915.*error',
        r'Failed to load firmware',
    ]),
    re.IGNORECASE,
)

# Loose signatures that also match harmless lines (e.g. "gpu ... default"),
# so they count only together with a severity word.
_GPU_AMBIGUOUS_ERROR = re.compile(
    '|'.join([
        r'ring.*timeout',
        r'gpu.*fault',
        r' thermal ',
    ]),
    re.IGNORECASE,
)

_DMESG_SEVERITY_WORDS = ('error', 'fail', 'warn')
_DMESG_TIMESTAMP = re.compile(r'\[\s*([\d.]+)\]')


def _extract_gpu_errors(dmesg_text: str, limit: int = 20) -> List[Dict[str, str]]:
    """Return up to *limit* distinct GPU-related error lines found in *dmesg_text*."""
    findings: List[Dict[str, str]] = []
    seen = set()

    for line in dmesg_text.split('\n'):
        if _GPU_DECISIVE_ERROR.search(line):
            relevant = True
        elif _GPU_AMBIGUOUS_ERROR.search(line):
            line_lower = line.lower()
            relevant = any(word in line_lower for word in _DMESG_SEVERITY_WORDS)
        else:
            relevant = False
        if not relevant:
            continue

        # De-duplicate on the full line text.
        message = line.strip()
        if message in seen:
            continue
        seen.add(message)

        # A leading "[ 123.456]" is taken as the timestamp.
        timestamp_match = _DMESG_TIMESTAMP.match(line)
        findings.append({
            "timestamp": timestamp_match.group(1) if timestamp_match else "unknown",
            "message": message
        })
        if len(findings) >= limit:
            break

    return findings


def check_gpu_dmesg_errors() -> List[Dict[str, str]]:
    """
    Scan the kernel log (``dmesg``) for GPU-related errors.

    Decisive signatures (Xid events, "fallen off the bus", GPU hangs, lost
    VRAM, driver error lines, firmware load failures) are reported as soon
    as they match. Looser ones (ring timeouts, faults, thermal messages)
    are reported only when the line also contains "error", "fail" or "warn".

    Returns:
        List[Dict[str, str]]: At most 20 distinct entries in log order, each
        with ``timestamp`` (or ``"unknown"``) and ``message``. Empty when
        ``dmesg`` is unavailable or cannot be run.
    """
    if not check_command_exists('dmesg'):
        return []

    try:
        _, stdout, _ = execute_command(
            ['dmesg'],
            check_returncode=False, timeout=10
        )
        return _extract_gpu_errors(stdout)
    except Exception:
        # Best effort: an unreadable kernel log yields no findings.
        return []
def _parse_pmon_output(pmon_text: str) -> List[Dict[str, Any]]:
    """Turn `nvidia-smi pmon` text into process dicts, using its header row for column names."""
    processes: List[Dict[str, Any]] = []
    columns = None

    for line in pmon_text.split('\n'):
        stripped = line.strip()
        if not stripped:
            continue

        if stripped.startswith('#'):
            # The first header row names the columns; later ones (units)
            # are ignored. The set of columns depends on the -s selection
            # and the driver version, so read it instead of assuming it.
            names = stripped.lstrip('#').split()
            if columns is None and 'pid' in names:
                columns = names
            continue

        if columns:
            values = stripped.split(None, len(columns) - 1)
            if len(values) < len(columns):
                continue
            row = dict(zip(columns, values))
        else:
            # No header seen: use the fixed leading columns and take the
            # command from the last column.
            values = stripped.split()
            if len(values) < 8:
                continue
            row = dict(zip(('gpu', 'pid', 'type', 'sm', 'mem', 'enc', 'dec'), values))
            row['command'] = values[-1]

        # Rows whose pid is not numeric are skipped: the "-" placeholder
        # row printed for an idle GPU, and any error text.
        if not row.get('pid', '').isdigit():
            continue

        entry: Dict[str, Any] = {
            "gpu_index": safe_int(row.get('gpu', values[0])),
            "pid": row['pid'],
            "type": row.get('type'),
            "sm_util": row.get('sm'),
            "mem_util": row.get('mem'),
            "enc_util": row.get('enc'),
            "dec_util": row.get('dec'),
            "command": row.get('command', values[-1])
        }
        # Present only when the header includes an "fb" column.
        if 'fb' in row:
            entry["fb_mem_mb"] = row['fb']
        processes.append(entry)

    return processes


def get_gpu_processes() -> List[Dict[str, Any]]:
    """
    List the processes currently using a GPU (NVIDIA only).

    Returns:
        List[Dict[str, Any]]: One dict per process with ``gpu_index``,
        ``pid``, ``type``, ``sm_util``, ``mem_util``, ``enc_util``,
        ``dec_util`` and ``command``, plus ``fb_mem_mb`` when pmon reports
        an ``fb`` column. Empty when ``nvidia-smi`` is missing, fails, or
        no process is using a GPU.
    """
    if not check_command_exists('nvidia-smi'):
        return []

    try:
        _, stdout, _ = execute_command(
            ['nvidia-smi', 'pmon', '-s', 'um', '-c', '1'],
            check_returncode=False, timeout=5
        )
        return _parse_pmon_output(stdout)
    except Exception:
        # Best effort: the process list is optional information.
        return []
if __name__ == '__main__':
    # Script entry point: run the check and print the report as
    # human-readable JSON, keeping non-ASCII text unescaped.
    import json

    report = run_gpu_check()
    print(json.dumps(report, indent=2, ensure_ascii=False))