498 lines
18 KiB
Python
498 lines
18 KiB
Python
"""
|
||
ServerGuard - 显卡检测模块
|
||
|
||
检测 GPU 信息、温度、驱动状态等。
|
||
"""
|
||
|
||
import os
|
||
import re
|
||
from typing import Dict, Any, List, Optional
|
||
|
||
import sys
|
||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||
|
||
from utils import (
|
||
execute_command, check_command_exists, parse_key_value_output,
|
||
safe_int, safe_float, format_bytes
|
||
)
|
||
|
||
|
||
def run_gpu_check() -> Dict[str, Any]:
    """
    Run the full GPU check and return a report dictionary.

    The NVIDIA, AMD and Intel detectors are called in that order and their
    results are concatenated. When none of them reports a device, the generic
    lspci-based scan is used instead. The kernel log is then scanned for
    GPU-related error lines.

    Returns:
        Dict[str, Any]: Report with the keys

            - "status": "success"; "warning" when dmesg errors were found;
              "unknown" when no GPU was detected (this overrides "warning");
              "error" when the check itself raised.
            - "gpus": list of per-GPU dictionaries.
            - "errors": messages of exceptions raised while checking.
            - "dmesg_errors": GPU-related kernel log entries (missing if the
              check raised before the scan).
            - "note": present only when no GPU was detected.
            - "error": message of the exception, present only on failure.
    """
    result: Dict[str, Any] = {
        "status": "success",
        "gpus": [],
        "errors": []
    }

    try:
        # Vendor-specific detectors, tried in a fixed order.
        for detect in (check_nvidia_gpus, check_amd_gpus, check_intel_gpus):
            found = detect()
            if found:
                result["gpus"].extend(found)

        # Nothing found by the vendor detectors: fall back to basic lspci detection.
        if not result["gpus"]:
            result["gpus"] = check_generic_gpus()

        # Look for GPU errors in the kernel log.
        result["dmesg_errors"] = check_gpu_dmesg_errors()
        if result["dmesg_errors"]:
            result["status"] = "warning"

        # Checked last on purpose: "unknown" takes precedence over "warning".
        if not result["gpus"]:
            result["status"] = "unknown"
            result["note"] = "未检测到 GPU 设备"

    except Exception as e:
        message = str(e)
        result["status"] = "error"
        # "error" is kept for existing consumers; "errors" is the list that
        # the report declares up front and was previously never filled.
        result["error"] = message
        result["errors"].append(message)

    return result
|
||
|
||
|
||
def check_nvidia_gpus() -> List[Dict[str, Any]]:
    """
    Detect NVIDIA GPUs through ``nvidia-smi``.

    One CSV row per GPU is requested. The GPU index is queried explicitly
    (``index`` field) instead of being derived from the row position, so that
    blank or diagnostic lines in the output cannot shift the index that is
    later passed to ``nvidia-smi -i``. Rows whose first field is not an integer
    are treated as non-data lines and skipped.

    Returns:
        List[Dict[str, Any]]: One dictionary per GPU with the keys "vendor",
        "index", "name", "bus_id", "pci_bus_id", "driver_version", "pstate",
        "pcie_max_gen", "pcie_current_gen", merged with whatever
        get_nvidia_gpu_details() returns for that index. Empty when
        nvidia-smi is missing or the query fails.
    """
    gpus: List[Dict[str, Any]] = []

    if not check_command_exists('nvidia-smi'):
        return gpus

    query_fields = (
        'index,gpu_name,gpu_bus_id,pci.bus_id,driver_version,'
        'pstate,pcie.link.gen.max,pcie.link.gen.current'
    )

    try:
        _, stdout, _ = execute_command(
            ['nvidia-smi', f'--query-gpu={query_fields}', '--format=csv,noheader'],
            check_returncode=False, timeout=10
        )
    except Exception:
        # Best-effort probe: a failing nvidia-smi simply means "no NVIDIA GPUs".
        return gpus

    for line in (stdout or '').splitlines():
        parts = [p.strip() for p in line.split(',')]
        # index, name, bus id, pci bus id and driver version are mandatory.
        if len(parts) < 5:
            continue

        try:
            gpu_index = int(parts[0])
        except ValueError:
            # Not a data row (e.g. an error message that contains commas).
            continue

        gpu_info: Dict[str, Any] = {
            "vendor": "NVIDIA",
            "index": gpu_index,
            "name": parts[1],
            "bus_id": parts[2],
            "pci_bus_id": parts[3],
            "driver_version": parts[4],
            "pstate": parts[5] if len(parts) > 5 else "unknown",
            "pcie_max_gen": parts[6] if len(parts) > 6 else "unknown",
            "pcie_current_gen": parts[7] if len(parts) > 7 else "unknown"
        }

        # Per-GPU metrics (temperature, power, memory, ECC, processes).
        try:
            gpu_info.update(get_nvidia_gpu_details(gpu_index))
        except Exception:
            # Keep the basic entry even if the detail query breaks.
            pass

        gpus.append(gpu_info)

    return gpus
|
||
|
||
|
||
def get_nvidia_gpu_details(gpu_index: int) -> Dict[str, Any]:
    """
    Collect detailed metrics for a single NVIDIA GPU.

    Three independent ``nvidia-smi`` calls are made (metrics, ECC state,
    process monitor). Each one is best-effort: a failure in one of them does
    not prevent the others from being attempted.

    Values that nvidia-smi reports as a bracketed placeholder (for example
    ``[Not Supported]`` or ``[N/A]``) are mapped to None, except for the two
    ECC error counters which are mapped to 0 as before.

    Args:
        gpu_index: Index passed to ``nvidia-smi -i``.

    Returns:
        Dict[str, Any]: Any subset of the keys "temperature_c", "power_draw_w",
        "power_limit_w", "graphics_clock_mhz", "memory_clock_mhz",
        "gpu_utilization_percent", "memory_utilization_percent",
        "memory_total_mb", "memory_used_mb", "memory_free_mb", "serial",
        "uuid", "vbios_version", "ecc_mode", "ecc_pending",
        "ecc_corrected_errors", "ecc_uncorrected_errors" and "processes"
        (only when at least one process row was found).
    """
    details: Dict[str, Any] = {}
    target = str(gpu_index)

    def is_placeholder(value: str) -> bool:
        # nvidia-smi wraps "no value" markers in square brackets.
        return value.startswith('[') and value.endswith(']')

    def query(fields: str, csv_format: str) -> List[str]:
        _, out, _ = execute_command(
            ['nvidia-smi', f'--query-gpu={fields}',
             f'--format={csv_format}', '-i', target],
            check_returncode=False, timeout=10
        )
        return [p.strip() for p in (out or '').split(',')]

    # --- temperature, power, clocks, utilisation, memory, identifiers ---
    numeric_fields = (
        ("temperature_c", safe_int),
        ("power_draw_w", safe_float),
        ("power_limit_w", safe_float),
        ("graphics_clock_mhz", safe_int),
        ("memory_clock_mhz", safe_int),
        ("gpu_utilization_percent", safe_int),
        ("memory_utilization_percent", safe_int),
        ("memory_total_mb", safe_int),
        ("memory_used_mb", safe_int),
        ("memory_free_mb", safe_int),
    )
    try:
        parts = query(
            'temperature.gpu,power.draw,power.limit,clocks.gr,clocks.mem,'
            'utilization.gpu,utilization.memory,memory.total,memory.used,'
            'memory.free,serial,uuid,vbios_version',
            'csv,noheader,nounits'
        )
        if len(parts) >= len(numeric_fields):
            for (key, convert), raw in zip(numeric_fields, parts):
                details[key] = None if is_placeholder(raw) else convert(raw)

            # The identifier columns are optional; take whichever are present.
            text_keys = ("serial", "uuid", "vbios_version")
            for key, raw in zip(text_keys, parts[10:13]):
                details[key] = None if is_placeholder(raw) else raw
    except Exception:
        # Best-effort: leave the metric keys out when the query fails.
        pass

    # --- ECC state ---
    try:
        ecc_parts = query(
            'ecc.mode.current,ecc.mode.pending,'
            'ecc.errors.corrected.volatile.total,'
            'ecc.errors.uncorrected.volatile.total',
            'csv,noheader'
        )
        if len(ecc_parts) >= 4:
            details["ecc_mode"] = None if is_placeholder(ecc_parts[0]) else ecc_parts[0]
            details["ecc_pending"] = None if is_placeholder(ecc_parts[1]) else ecc_parts[1]
            details["ecc_corrected_errors"] = (
                0 if is_placeholder(ecc_parts[2]) else safe_int(ecc_parts[2])
            )
            details["ecc_uncorrected_errors"] = (
                0 if is_placeholder(ecc_parts[3]) else safe_int(ecc_parts[3])
            )
    except Exception:
        pass

    # --- processes currently using this GPU ---
    try:
        _, proc_output, _ = execute_command(
            ['nvidia-smi', 'pmon', '-s', 'um', '-c', '1', '-i', target],
            check_returncode=False, timeout=5
        )

        processes = []
        for line in (proc_output or '').splitlines():
            if line.lstrip().startswith('#'):
                continue  # header rows
            proc_parts = line.split()
            # A real row carries a numeric pid; this drops the "-" placeholder
            # row printed for an idle GPU as well as any diagnostic text.
            if len(proc_parts) < 5 or not proc_parts[1].isdigit():
                continue
            processes.append({
                "pid": proc_parts[1],
                "type": proc_parts[2],
                "sm_util": proc_parts[3],
                "mem_util": proc_parts[4]
            })

        if processes:
            details["processes"] = processes
    except Exception:
        pass

    return details
|
||
|
||
|
||
def _read_amd_sysfs_text(path: str) -> Optional[str]:
    """Return the stripped contents of *path*, or None when it cannot be read."""
    try:
        with open(path, 'r') as f:
            return f.read().strip()
    except OSError:
        return None


def _probe_amd_radeontop() -> Optional[Dict[str, Any]]:
    """Run radeontop once and return the parsed key/value pairs, or None if nothing was parsed."""
    import tempfile

    # radeontop is told to dump into a file (-d) and to stop after one sample (-l 1).
    with tempfile.NamedTemporaryFile(mode='r', suffix='.txt', delete=False) as tmp:
        dump_file = tmp.name

    try:
        execute_command(
            ['radeontop', '-d', dump_file, '-l', '1'],
            check_returncode=False, timeout=5
        )
        with open(dump_file, 'r') as f:
            output = f.read()
    finally:
        if os.path.exists(dump_file):
            os.unlink(dump_file)

    gpu_info: Dict[str, Any] = {"vendor": "AMD"}

    # NOTE(review): this expects "Key containing GPU: value" lines. radeontop's
    # dump format may instead be a single comma-separated line with lower-case
    # field names - confirm against a real dump before relying on this path.
    for line in output.split('\n'):
        if 'GPU' in line and ':' in line:
            parts = line.split(':')
            if len(parts) == 2:
                key = parts[0].strip().lower().replace(' ', '_')
                gpu_info[key] = parts[1].strip()

    # Only the seeded "vendor" key means nothing was parsed: report no device
    # instead of a phantom AMD entry.
    return gpu_info if len(gpu_info) > 1 else None


def _read_amd_temperature_c(device_dir: str) -> Optional[float]:
    """Return temp1_input of the first readable hwmon node under *device_dir*, divided by 1000."""
    hwmon_root = f'{device_dir}/hwmon'
    try:
        nodes = sorted(os.listdir(hwmon_root))
    except OSError:
        return None

    # The hwmon node number is not fixed, so every hwmon* entry is tried.
    for node in nodes:
        raw = _read_amd_sysfs_text(f'{hwmon_root}/{node}/temp1_input')
        if raw is None:
            continue
        temp_mc = safe_int(raw)
        if temp_mc is not None:
            return temp_mc / 1000.0
    return None


def check_amd_gpus() -> List[Dict[str, Any]]:
    """
    Detect AMD GPUs.

    Two sources are used: a single radeontop sample (only when the command is
    installed and its output yields at least one parsed value) and the DRM
    entries under /sys/class/drm whose PCI vendor ID is 0x1002.

    Returns:
        List[Dict[str, Any]]: One dictionary per detection. sysfs entries carry
        "vendor" and "card" plus, when readable, "device_id", "driver",
        "temperature_c" and "core_clock_levels".
    """
    gpus: List[Dict[str, Any]] = []

    # --- radeontop ---
    if check_command_exists('radeontop'):
        try:
            sample = _probe_amd_radeontop()
        except Exception:
            # Best-effort probe; sysfs below is the primary source.
            sample = None
        if sample:
            gpus.append(sample)

    # --- sysfs ---
    try:
        cards = sorted(os.listdir('/sys/class/drm'))
    except OSError:
        return gpus

    for card in cards:
        if not card.startswith('card'):
            continue

        device_dir = f'/sys/class/drm/{card}/device'

        # Entries without a readable vendor file are skipped. The AMD vendor
        # ID is 0x1002.
        if _read_amd_sysfs_text(f'{device_dir}/vendor') != '0x1002':
            continue

        gpu_info: Dict[str, Any] = {
            "vendor": "AMD",
            "card": card
        }

        # PCI device ID
        device_id = _read_amd_sysfs_text(f'{device_dir}/device')
        if device_id is not None:
            gpu_info["device_id"] = device_id

        # Bound kernel driver (the "driver" entry is a symlink)
        driver_path = f'{device_dir}/driver'
        if os.path.exists(driver_path):
            try:
                gpu_info["driver"] = os.path.basename(os.readlink(driver_path))
            except OSError:
                pass

        # Temperature
        temperature = _read_amd_temperature_c(device_dir)
        if temperature is not None:
            gpu_info["temperature_c"] = temperature

        # Core clock levels, stored verbatim
        clock_levels = _read_amd_sysfs_text(f'{device_dir}/pp_dpm_sclk')
        if clock_levels is not None:
            gpu_info["core_clock_levels"] = clock_levels

        gpus.append(gpu_info)

    return gpus
|
||
|
||
|
||
def check_intel_gpus(drm_root: str = '/sys/class/drm') -> List[Dict[str, Any]]:
    """
    Detect Intel GPUs from the DRM sysfs tree.

    Every ``card*`` entry under *drm_root* whose ``device/vendor`` file reads
    ``0x8086`` is reported. Entries are visited in sorted order, and an
    unreadable entry is skipped without aborting the scan.

    Args:
        drm_root: Directory holding the DRM class entries. Defaults to the
            real sysfs location; mainly overridable for testing.

    Returns:
        List[Dict[str, Any]]: One dictionary per Intel GPU with the keys
        "vendor", "card" and "type" (always "integrated"), plus "device_id"
        and "driver" when they can be read. Empty when *drm_root* cannot be
        listed.
    """
    gpus: List[Dict[str, Any]] = []

    try:
        cards = sorted(os.listdir(drm_root))
    except OSError:
        return gpus

    for card in cards:
        if not card.startswith('card'):
            continue

        device_dir = os.path.join(drm_root, card, 'device')

        try:
            with open(os.path.join(device_dir, 'vendor'), 'r') as f:
                vendor_id = f.read().strip()
        except OSError:
            # No vendor file or not readable: skip this entry.
            continue

        # The Intel vendor ID is 0x8086.
        if vendor_id != '0x8086':
            continue

        gpu_info: Dict[str, Any] = {
            "vendor": "Intel",
            "card": card
        }

        # PCI device ID
        try:
            with open(os.path.join(device_dir, 'device'), 'r') as f:
                gpu_info["device_id"] = f.read().strip()
        except OSError:
            pass

        # Bound kernel driver (the "driver" entry is a symlink)
        driver_path = os.path.join(device_dir, 'driver')
        if os.path.exists(driver_path):
            try:
                gpu_info["driver"] = os.path.basename(os.readlink(driver_path))
            except OSError:
                pass

        # Intel GPUs are usually integrated, so every entry is labelled as such.
        # NOTE(review): discrete Intel cards would also get this label.
        gpu_info["type"] = "integrated"

        gpus.append(gpu_info)

    return gpus
|
||
|
||
|
||
def _classify_gpu_vendor(description: str) -> str:
    """Map an ``lspci -nn`` device description to "NVIDIA", "AMD", "Intel" or "Unknown"."""
    # Prefer the numeric [vendor:device] pair that -nn appends to the description.
    pci_vendors = {'10de': "NVIDIA", '1002': "AMD", '8086': "Intel"}
    id_match = re.search(r'\[([0-9a-f]{4}):[0-9a-f]{4}\]', description, re.IGNORECASE)
    if id_match:
        vendor = pci_vendors.get(id_match.group(1).lower())
        if vendor:
            return vendor

    # Fall back to the vendor name. Whole-word matching matters here: a plain
    # substring test for "ati" also hits the word "Corporation".
    desc_lower = description.lower()
    if re.search(r'\bnvidia\b', desc_lower):
        return "NVIDIA"
    if re.search(r'\b(?:amd|ati)\b', desc_lower) or 'advanced micro devices' in desc_lower:
        return "AMD"
    if re.search(r'\bintel\b', desc_lower):
        return "Intel"
    return "Unknown"


def _read_lspci_driver_info(bus_id: str) -> Dict[str, str]:
    """Return the "driver" / "modules" values found in ``lspci -v -s <bus_id>`` output."""
    info: Dict[str, str] = {}
    _, detail, _ = execute_command(
        ['lspci', '-v', '-s', bus_id],
        check_returncode=False, timeout=5
    )
    detail = detail or ''

    driver_match = re.search(r'Kernel driver in use:\s*(\S+)', detail)
    if driver_match:
        info["driver"] = driver_match.group(1)

    modules_match = re.search(r'Kernel modules:\s*(.+)', detail)
    if modules_match:
        info["modules"] = modules_match.group(1).strip()

    return info


def check_generic_gpus() -> List[Dict[str, Any]]:
    """
    Detect display devices generically with ``lspci``.

    Lines of ``lspci -nn`` that mention "VGA", "3D controller" or
    "Display controller" are reported. The vendor is derived from the PCI
    vendor ID when it is a known one, otherwise from the vendor name in the
    description.

    Returns:
        List[Dict[str, Any]]: One dictionary per device with the keys "bus_id",
        "description", "vendor" and "type" ("vga", "3d" or "display"), plus
        "driver" and "modules" when ``lspci -v`` reports them. Empty when
        lspci is missing or fails.
    """
    gpus: List[Dict[str, Any]] = []

    if not check_command_exists('lspci'):
        return gpus

    try:
        _, stdout, _ = execute_command(
            ['lspci', '-nn'],
            check_returncode=False, timeout=10
        )
    except Exception:
        # Best-effort probe: no usable lspci output means no devices.
        return gpus

    # Checked in this order; the first marker found in the line wins.
    type_markers = (
        ('VGA', "vga"),
        ('3D controller', "3d"),
        ('Display controller', "display"),
    )

    for line in (stdout or '').split('\n'):
        gpu_type = next(
            (label for marker, label in type_markers if marker in line), None
        )
        if gpu_type is None:
            continue

        # "<bus id> <class> [code]: <description>"
        parts = line.split(': ', 1)
        if len(parts) != 2:
            continue
        head_tokens = parts[0].split()
        if not head_tokens:
            continue

        bus_id = head_tokens[0]
        description = parts[1]

        gpu_info: Dict[str, Any] = {
            "bus_id": bus_id,
            "description": description,
            "vendor": _classify_gpu_vendor(description),
            "type": gpu_type
        }

        # Driver and module details are optional extras.
        try:
            gpu_info.update(_read_lspci_driver_info(bus_id))
        except Exception:
            pass

        gpus.append(gpu_info)

    return gpus
|
||
|
||
|
||
def check_gpu_dmesg_errors() -> List[Dict[str, str]]:
    """
    Scan ``dmesg`` output for GPU-related error lines.

    A line is reported when it matches one of the GPU patterns (case
    insensitive) and also contains "error", "fail" or "warn" in any case, or
    the literal "Xid". Repeated messages are reported once: the leading
    ``[timestamp]`` is ignored when comparing, and the first occurrence is
    kept. At most 20 entries are returned.

    Returns:
        List[Dict[str, str]]: Entries with "timestamp" (the leading bracketed
        number, or "unknown") and "message" (the stripped log line). Empty when
        dmesg is missing or cannot be run.
    """
    errors: List[Dict[str, str]] = []

    if not check_command_exists('dmesg'):
        return errors

    try:
        _, stdout, _ = execute_command(
            ['dmesg'],
            check_returncode=False, timeout=10
        )
    except Exception:
        # Best-effort: an unreadable kernel log is reported as "no errors".
        return errors

    # GPU-related error keywords
    gpu_error_patterns = [
        r'GPU has fallen off the bus',
        r'NVRM: Xid',
        r'nvidia.*error',
        r'amdgpu.*error',
        r'i915.*error',
        r'GPU hang',
        r'ring.*timeout',
        r'Failed to load firmware',
        r'VRAM lost',
        r'gpu.*fault',
        r' thermal ',
    ]
    # One alternation, compiled once, instead of a search per pattern per line.
    gpu_error_re = re.compile(
        '|'.join(f'(?:{pattern})' for pattern in gpu_error_patterns),
        re.IGNORECASE
    )
    timestamp_re = re.compile(r'\[\s*([\d.]+)\]')
    max_errors = 20

    seen = set()
    for line in (stdout or '').split('\n'):
        if not gpu_error_re.search(line):
            continue

        # The severity keyword filters out informational lines that merely
        # mention a GPU driver.
        line_lower = line.lower()
        if not ('error' in line_lower or 'fail' in line_lower
                or 'warn' in line_lower or 'Xid' in line):
            continue

        timestamp_match = timestamp_re.match(line)
        if timestamp_match:
            timestamp = timestamp_match.group(1)
            text = line[timestamp_match.end():].strip()
        else:
            timestamp = "unknown"
            text = line.strip()

        # Compare without the timestamp, otherwise a repeating error never
        # counts as a duplicate.
        if text in seen:
            continue
        seen.add(text)

        errors.append({
            "timestamp": timestamp,
            "message": line.strip()
        })
        if len(errors) >= max_errors:
            break

    return errors
|
||
|
||
|
||
def get_gpu_processes() -> List[Dict[str, Any]]:
    """
    List the processes that are using a GPU (NVIDIA only).

    Runs ``nvidia-smi pmon -s um -c 1`` and maps each data row to its columns
    by the names in the ``#`` header line rather than by fixed position, since
    the ``m`` selector inserts extra columns before the command name. When no
    header is found, the layout gpu/pid/type/sm/mem/enc/dec/command is assumed.
    Rows without a numeric pid (the "-" placeholder printed for an idle GPU,
    or diagnostic text) are skipped.

    Returns:
        List[Dict[str, Any]]: One dictionary per process with the keys
        "gpu_index", "pid", "type", "sm_util", "mem_util", "enc_util",
        "dec_util" and "command". Empty when nvidia-smi is missing or fails.
    """
    processes: List[Dict[str, Any]] = []

    if not check_command_exists('nvidia-smi'):
        return processes

    try:
        _, stdout, _ = execute_command(
            ['nvidia-smi', 'pmon', '-s', 'um', '-c', '1'],
            check_returncode=False, timeout=5
        )
    except Exception:
        # Best-effort probe.
        return processes

    # Layout assumed until a header line says otherwise.
    columns = ['gpu', 'pid', 'type', 'sm', 'mem', 'enc', 'dec', 'command']
    header_seen = False

    for line in (stdout or '').splitlines():
        text = line.strip()
        if not text:
            continue

        if text.startswith('#'):
            names = text.lstrip('#').split()
            # Only the first header line names the columns; the second one
            # carries the units.
            if not header_seen and 'gpu' in names and 'pid' in names:
                columns = names
                header_seen = True
            continue

        # Split into at most len(columns) fields so the last column keeps any
        # embedded spaces.
        fields = text.split(None, len(columns) - 1)
        if len(fields) < len(columns):
            continue

        row = dict(zip(columns, fields))
        if not row['pid'].isdigit():
            continue

        processes.append({
            "gpu_index": safe_int(row['gpu']),
            "pid": row['pid'],
            "type": row.get('type', '-'),
            "sm_util": row.get('sm', '-'),
            "mem_util": row.get('mem', '-'),
            "enc_util": row.get('enc', '-'),
            "dec_util": row.get('dec', '-'),
            "command": row.get('command', fields[-1])
        })

    return processes
|
||
|
||
|
||
if __name__ == '__main__':
    # Script entry point: run the check and print the report as indented JSON,
    # keeping non-ASCII characters as they are.
    import json

    report = run_gpu_check()
    print(json.dumps(report, indent=2, ensure_ascii=False))
|