first commit
This commit is contained in:
497
modules/gpu.py
Normal file
497
modules/gpu.py
Normal file
@@ -0,0 +1,497 @@
|
||||
"""
|
||||
ServerGuard - 显卡检测模块
|
||||
|
||||
检测 GPU 信息、温度、驱动状态等。
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
from typing import Dict, Any, List, Optional
|
||||
|
||||
import sys
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from utils import (
|
||||
execute_command, check_command_exists, parse_key_value_output,
|
||||
safe_int, safe_float, format_bytes
|
||||
)
|
||||
|
||||
|
||||
def run_gpu_check() -> Dict[str, Any]:
    """
    Run the complete GPU check.

    Probes NVIDIA, AMD and Intel GPUs in that order, falls back to a generic
    ``lspci`` scan when none of the probes reports a device, and finally
    scans the kernel log for GPU-related errors.

    Returns:
        Dict[str, Any]: Report with the following keys:

        * ``status``: ``"success"``, ``"warning"`` (GPU errors found in the
          kernel log), ``"unknown"`` (no GPU detected) or ``"error"`` (the
          check itself raised an exception).
        * ``gpus``: list of per-GPU dictionaries.
        * ``errors``: messages of exceptions raised while running the check.
        * ``dmesg_errors``: GPU-related kernel log entries; only present
          once the kernel log scan has run.
        * ``note``: only present when no GPU was detected.
        * ``error``: only present when the check raised; the same text is
          also appended to ``errors``.
    """
    result: Dict[str, Any] = {
        "status": "success",
        "gpus": [],
        "errors": []
    }

    try:
        # Vendor-specific probes, in the same order as before.
        for probe in (check_nvidia_gpus, check_amd_gpus, check_intel_gpus):
            detected = probe()
            if detected:
                result["gpus"].extend(detected)

        # Nothing found by the vendor probes: fall back to a plain lspci scan.
        if not result["gpus"]:
            result["gpus"] = check_generic_gpus()

        # GPU-related errors in the kernel log downgrade the status.
        result["dmesg_errors"] = check_gpu_dmesg_errors()
        if result["dmesg_errors"]:
            result["status"] = "warning"

        # NOTE(review): this overrides a "warning" set just above; confirm
        # that "unknown" is meant to take precedence over kernel log errors.
        if not result["gpus"]:
            result["status"] = "unknown"
            result["note"] = "未检测到 GPU 设备"

    except Exception as e:
        message = str(e)
        result["status"] = "error"
        # "error" is kept for existing consumers; "errors" was initialised
        # above but never filled, so the message is recorded there as well.
        result["error"] = message
        result["errors"].append(message)

    return result
|
||||
|
||||
|
||||
def check_nvidia_gpus() -> List[Dict[str, Any]]:
    """
    Detect NVIDIA GPUs through ``nvidia-smi``.

    Returns:
        List[Dict[str, Any]]: One dictionary per GPU containing ``vendor``,
        ``index``, ``name``, ``bus_id``, ``pci_bus_id``, ``driver_version``,
        ``pstate``, ``pcie_max_gen`` and ``pcie_current_gen``, merged with
        whatever ``get_nvidia_gpu_details`` returns for that GPU. The list
        is empty when ``nvidia-smi`` is not installed or its output cannot
        be parsed.
    """
    gpus: List[Dict[str, Any]] = []

    if not check_command_exists('nvidia-smi'):
        return gpus

    # "index" is queried explicitly so that the value later passed to
    # "nvidia-smi -i" is the index nvidia-smi itself reports, rather than the
    # position of the line in the output (which shifts if any non-GPU line,
    # such as a driver message, is printed).
    query_fields = (
        'index,gpu_name,gpu_bus_id,pci.bus_id,driver_version,pstate,'
        'pcie.link.gen.max,pcie.link.gen.current'
    )
    optional_keys = ("pstate", "pcie_max_gen", "pcie_current_gen")

    try:
        # Fetch the GPU list together with the basic per-GPU information.
        _, stdout, _ = execute_command(
            ['nvidia-smi', f'--query-gpu={query_fields}', '--format=csv,noheader'],
            check_returncode=False, timeout=10
        )

        for line in stdout.splitlines():
            fields = [field.strip() for field in line.split(',')]

            # A GPU row starts with the numeric index and carries at least
            # the name, both bus ids and the driver version.
            if len(fields) < 5 or not fields[0].isdigit():
                continue

            gpu_index = int(fields[0])
            gpu_info: Dict[str, Any] = {
                "vendor": "NVIDIA",
                "index": gpu_index,
                "name": fields[1],
                "bus_id": fields[2],
                "pci_bus_id": fields[3],
                "driver_version": fields[4],
            }
            for position, key in enumerate(optional_keys, start=5):
                gpu_info[key] = fields[position] if len(fields) > position else "unknown"

            # Merge in temperature, power, memory, ECC and process details.
            gpu_info.update(get_nvidia_gpu_details(gpu_index))
            gpus.append(gpu_info)

    except Exception:
        # Best-effort probe: a failing nvidia-smi must not abort the overall
        # GPU check. GPUs collected before the failure are still returned.
        pass

    return gpus
|
||||
|
||||
|
||||
def _nvidia_smi_value(raw: str) -> Optional[str]:
    """
    Clean one CSV field printed by nvidia-smi.

    Returns the stripped text, or None when the field is empty or is a
    bracketed placeholder such as ``[Not Supported]`` or ``[N/A]``.
    """
    text = raw.strip()
    if not text or (text.startswith('[') and text.endswith(']')):
        return None
    return text


def _nvidia_sensor_details(gpu_id: str) -> Dict[str, Any]:
    """Query temperature, power, clocks, utilisation, memory and identity."""
    _, stdout, _ = execute_command(
        ['nvidia-smi', '--query-gpu=temperature.gpu,power.draw,power.limit,clocks.gr,clocks.mem,utilization.gpu,utilization.memory,memory.total,memory.used,memory.free,serial,uuid,vbios_version',
         '--format=csv,noheader,nounits', '-i', gpu_id],
        check_returncode=False, timeout=10
    )

    values = [_nvidia_smi_value(field) for field in stdout.split(',')]
    if len(values) < 10:
        return {}

    # Result key and converter for the first ten fields, in query order.
    numeric_fields = (
        ("temperature_c", safe_int),
        ("power_draw_w", safe_float),
        ("power_limit_w", safe_float),
        ("graphics_clock_mhz", safe_int),
        ("memory_clock_mhz", safe_int),
        ("gpu_utilization_percent", safe_int),
        ("memory_utilization_percent", safe_int),
        ("memory_total_mb", safe_int),
        ("memory_used_mb", safe_int),
        ("memory_free_mb", safe_int),
    )
    details: Dict[str, Any] = {}
    for (key, convert), value in zip(numeric_fields, values):
        details[key] = convert(value) if value is not None else None

    # The identity fields are optional; only the ones present are recorded.
    for key, value in zip(("serial", "uuid", "vbios_version"), values[10:]):
        details[key] = value

    return details


def _nvidia_ecc_details(gpu_id: str) -> Dict[str, Any]:
    """Query the ECC mode and the volatile ECC error counters."""
    _, ecc_output, _ = execute_command(
        ['nvidia-smi', '--query-gpu=ecc.mode.current,ecc.mode.pending,ecc.errors.corrected.volatile.total,ecc.errors.uncorrected.volatile.total',
         '--format=csv,noheader', '-i', gpu_id],
        check_returncode=False, timeout=10
    )

    values = [_nvidia_smi_value(field) for field in ecc_output.split(',')]
    if len(values) < 4:
        return {}

    return {
        "ecc_mode": values[0],
        "ecc_pending": values[1],
        # Counters fall back to 0 when the GPU does not report them.
        "ecc_corrected_errors": safe_int(values[2]) if values[2] is not None else 0,
        "ecc_uncorrected_errors": safe_int(values[3]) if values[3] is not None else 0,
    }


def _nvidia_process_details(gpu_id: str) -> Dict[str, Any]:
    """List the processes ``nvidia-smi pmon`` reports for one GPU."""
    _, proc_output, _ = execute_command(
        ['nvidia-smi', 'pmon', '-s', 'um', '-c', '1', '-i', gpu_id],
        check_returncode=False, timeout=5
    )

    processes = []
    for line in proc_output.splitlines():
        columns = line.split()
        # Data rows start with the numeric GPU index followed by a numeric
        # pid. This skips the '#' header lines and the placeholder row
        # (pid "-") that pmon prints for a GPU without any process.
        if len(columns) < 5 or not columns[0].isdigit() or not columns[1].isdigit():
            continue
        processes.append({
            "pid": columns[1],
            "type": columns[2],
            "sm_util": columns[3],
            "mem_util": columns[4]
        })

    return {"processes": processes} if processes else {}


def get_nvidia_gpu_details(gpu_index: int) -> Dict[str, Any]:
    """
    Collect detailed information about a single NVIDIA GPU.

    Runs three independent ``nvidia-smi`` queries (sensors and identity,
    ECC state, running processes). A failure of one query does not prevent
    the others from being collected.

    Args:
        gpu_index: GPU index as understood by ``nvidia-smi -i``.

    Returns:
        Dict[str, Any]: The collected values. Sensor values the GPU cannot
        report are None, ECC counters default to 0, and ``processes`` is
        only present when at least one process was found. The dictionary is
        empty when every query failed.
    """
    details: Dict[str, Any] = {}
    gpu_id = str(gpu_index)

    for collect in (_nvidia_sensor_details, _nvidia_ecc_details, _nvidia_process_details):
        try:
            details.update(collect(gpu_id))
        except Exception:
            # Best-effort: skip the failing query, keep what was collected.
            continue

    return details
|
||||
|
||||
|
||||
def _read_amd_sysfs_attr(path: str) -> Optional[str]:
    """Return the stripped content of a sysfs file, or None if unreadable."""
    try:
        with open(path, 'r') as handle:
            return handle.read().strip()
    except OSError:
        return None


def _amd_radeontop_info() -> Dict[str, str]:
    """Run radeontop once and return the ``GPU...: value`` pairs it dumped."""
    import tempfile

    # Only the file name is needed here; radeontop writes the dump itself
    # ("-d" selects the dump file, "-l 1" limits the dump to one line).
    with tempfile.NamedTemporaryFile(mode='r', suffix='.txt', delete=False) as tmp:
        dump_file = tmp.name

    parsed: Dict[str, str] = {}
    try:
        execute_command(
            ['radeontop', '-d', dump_file, '-l', '1'],
            check_returncode=False, timeout=5
        )

        with open(dump_file, 'r') as dump:
            for line in dump:
                if 'GPU' not in line or ':' not in line:
                    continue
                parts = line.split(':')
                if len(parts) == 2:
                    key = parts[0].strip().lower().replace(' ', '_')
                    parsed[key] = parts[1].strip()
    finally:
        if os.path.exists(dump_file):
            os.unlink(dump_file)

    return parsed


def _amd_sysfs_gpu(card: str) -> Optional[Dict[str, Any]]:
    """Describe one DRM card from sysfs; None when it is not an AMD device."""
    device_dir = f'/sys/class/drm/{card}/device'

    # The AMD PCI vendor ID is 0x1002.
    if _read_amd_sysfs_attr(f'{device_dir}/vendor') != '0x1002':
        return None

    gpu_info: Dict[str, Any] = {
        "vendor": "AMD",
        "card": card
    }

    device_id = _read_amd_sysfs_attr(f'{device_dir}/device')
    if device_id is not None:
        gpu_info["device_id"] = device_id

    # "driver" is a symlink whose target name is the bound kernel driver.
    try:
        gpu_info["driver"] = os.path.basename(os.readlink(f'{device_dir}/driver'))
    except OSError:
        pass

    # The hwmon directory number is assigned at runtime, so every hwmon*
    # entry is tried instead of assuming hwmon0.
    hwmon_root = f'{device_dir}/hwmon'
    try:
        hwmon_entries = sorted(os.listdir(hwmon_root))
    except OSError:
        hwmon_entries = []
    for hwmon in hwmon_entries:
        raw_temp = _read_amd_sysfs_attr(f'{hwmon_root}/{hwmon}/temp1_input')
        if raw_temp is None:
            continue
        try:
            # temp1_input is divided by 1000 to obtain degrees Celsius.
            gpu_info["temperature_c"] = int(raw_temp) / 1000.0
        except ValueError:
            continue
        break

    clock_levels = _read_amd_sysfs_attr(f'{device_dir}/pp_dpm_sclk')
    if clock_levels is not None:
        gpu_info["core_clock_levels"] = clock_levels

    return gpu_info


def check_amd_gpus() -> List[Dict[str, Any]]:
    """
    Detect AMD GPUs.

    Two sources are used: a one-shot ``radeontop`` dump (when the tool is
    installed) and the DRM entries under ``/sys/class/drm``.

    Returns:
        List[Dict[str, Any]]: A radeontop entry (only when its dump yielded
        at least one value) followed by one entry per AMD card found in
        sysfs. sysfs entries contain ``vendor`` and ``card`` and, when
        available, ``device_id``, ``driver``, ``temperature_c`` and
        ``core_clock_levels``.
    """
    gpus: List[Dict[str, Any]] = []

    if check_command_exists('radeontop'):
        try:
            radeontop_info = _amd_radeontop_info()
        except Exception:
            # Best-effort: radeontop failing must not block the sysfs scan.
            radeontop_info = {}

        # Only report an entry when something was actually parsed; the
        # preset vendor alone used to produce a phantom GPU on any machine
        # with radeontop installed.
        if radeontop_info:
            gpus.append({"vendor": "AMD", **radeontop_info})

    try:
        cards = sorted(os.listdir('/sys/class/drm'))
    except OSError:
        cards = []

    for card in cards:
        # Primary nodes only (card0, card1, ...); connector entries such as
        # "card0-HDMI-A-1" are skipped.
        if not re.fullmatch(r'card\d+', card):
            continue
        gpu_info = _amd_sysfs_gpu(card)
        if gpu_info is not None:
            gpus.append(gpu_info)

    return gpus
|
||||
|
||||
|
||||
def _read_intel_sysfs_attr(path: str) -> Optional[str]:
    """Return the stripped content of a sysfs file, or None if unreadable."""
    try:
        with open(path, 'r') as handle:
            return handle.read().strip()
    except OSError:
        return None


def check_intel_gpus() -> List[Dict[str, Any]]:
    """
    Detect Intel GPUs from the DRM entries under ``/sys/class/drm``.

    Returns:
        List[Dict[str, Any]]: One dictionary per Intel card with ``vendor``,
        ``card`` and ``type`` and, when available, ``device_id`` and
        ``driver``. Empty when sysfs is not available or no Intel card is
        present.
    """
    gpus: List[Dict[str, Any]] = []

    try:
        cards = sorted(os.listdir('/sys/class/drm'))
    except OSError:
        return gpus

    for card in cards:
        # Primary nodes only (card0, card1, ...); connector entries such as
        # "card0-eDP-1" are skipped.
        if not re.fullmatch(r'card\d+', card):
            continue

        device_dir = f'/sys/class/drm/{card}/device'

        # The Intel PCI vendor ID is 0x8086. An unreadable vendor file only
        # skips this card instead of aborting the whole scan.
        if _read_intel_sysfs_attr(f'{device_dir}/vendor') != '0x8086':
            continue

        gpu_info: Dict[str, Any] = {
            "vendor": "Intel",
            "card": card
        }

        device_id = _read_intel_sysfs_attr(f'{device_dir}/device')
        if device_id is not None:
            gpu_info["device_id"] = device_id

        # "driver" is a symlink whose target name is the bound kernel driver.
        try:
            gpu_info["driver"] = os.path.basename(os.readlink(f'{device_dir}/driver'))
        except OSError:
            pass

        # Every Intel device is labelled as integrated graphics.
        # NOTE(review): discrete Intel GPUs would be mislabelled; confirm
        # whether a finer distinction is needed.
        gpu_info["type"] = "integrated"

        gpus.append(gpu_info)

    return gpus
|
||||
|
||||
|
||||
def _lspci_vendor_name(description: str) -> str:
    """
    Derive the GPU vendor from the description part of an ``lspci -nn`` line.

    The numeric ``[vendor:device]`` ID is preferred. The name-based fallback
    matches "amd"/"ati" as whole words only, because a plain substring test
    for "ati" also matches "Corporation".
    """
    pci_vendors = {'10de': "NVIDIA", '1002': "AMD", '8086': "Intel"}

    ids = re.findall(r'\[([0-9a-fA-F]{4}):[0-9a-fA-F]{4}\]', description)
    if ids and ids[-1].lower() in pci_vendors:
        return pci_vendors[ids[-1].lower()]

    lowered = description.lower()
    if 'nvidia' in lowered:
        return "NVIDIA"
    if re.search(r'\b(?:amd|ati)\b', lowered) or 'advanced micro devices' in lowered:
        return "AMD"
    if 'intel' in lowered:
        return "Intel"
    return "Unknown"


def _lspci_driver_details(bus_id: str) -> Dict[str, str]:
    """Return ``driver`` / ``modules`` from ``lspci -v`` for one device."""
    info: Dict[str, str] = {}
    try:
        _, detail, _ = execute_command(
            ['lspci', '-v', '-s', bus_id],
            check_returncode=False, timeout=5
        )

        driver_match = re.search(r'Kernel driver in use:\s*(\S+)', detail)
        if driver_match:
            info["driver"] = driver_match.group(1)

        modules_match = re.search(r'Kernel modules:\s*(.+)', detail)
        if modules_match:
            info["modules"] = modules_match.group(1).strip()
    except Exception:
        # Best-effort: the basic entry is still useful without driver data.
        pass
    return info


def check_generic_gpus() -> List[Dict[str, Any]]:
    """
    Detect GPUs of any vendor by scanning ``lspci -nn``.

    Devices whose class is "VGA", "3D controller" or "Display controller"
    are reported.

    Returns:
        List[Dict[str, Any]]: One dictionary per device with ``bus_id``,
        ``description``, ``vendor`` (``"NVIDIA"``, ``"AMD"``, ``"Intel"`` or
        ``"Unknown"``) and ``type`` (``"vga"``, ``"3d"`` or ``"display"``),
        plus ``driver`` and ``modules`` when ``lspci -v`` reports them.
        Empty when ``lspci`` is not installed or fails.
    """
    gpus: List[Dict[str, Any]] = []

    if not check_command_exists('lspci'):
        return gpus

    # Class marker in the lspci line -> reported "type", checked in order.
    gpu_classes = (
        ('VGA', "vga"),
        ('3D controller', "3d"),
        ('Display controller', "display"),
    )

    try:
        _, stdout, _ = execute_command(
            ['lspci', '-nn'],
            check_returncode=False, timeout=10
        )

        for line in stdout.splitlines():
            # "<bus id> <class> [cccc]: <description>"
            parts = line.split(': ', 1)
            if len(parts) != 2 or not parts[0].split():
                continue
            device_class, description = parts

            # Match on the class field only, so that a marker appearing in
            # the free-text description cannot select a non-GPU device.
            gpu_type = next(
                (label for marker, label in gpu_classes if marker in device_class),
                None
            )
            if gpu_type is None:
                continue

            bus_id = device_class.split()[0]
            gpu_info: Dict[str, Any] = {
                "bus_id": bus_id,
                "description": description,
                "vendor": _lspci_vendor_name(description),
                "type": gpu_type
            }
            gpu_info.update(_lspci_driver_details(bus_id))

            gpus.append(gpu_info)

    except Exception:
        # Best-effort probe: devices collected so far are still returned.
        pass

    return gpus
|
||||
|
||||
|
||||
def check_gpu_dmesg_errors(max_errors: int = 20) -> List[Dict[str, str]]:
    """
    Scan ``dmesg`` output for GPU-related errors.

    Args:
        max_errors: Maximum number of distinct entries to return.

    Returns:
        List[Dict[str, str]]: Entries with ``timestamp`` (the bracketed
        kernel timestamp, or ``"unknown"``) and ``message`` (the full log
        line), in log order. Lines that differ only in their timestamp are
        reported once. Empty when ``dmesg`` is unavailable or fails.
    """
    errors: List[Dict[str, str]] = []

    if max_errors <= 0 or not check_command_exists('dmesg'):
        return errors

    # Messages that describe a GPU fault on their own. They are reported
    # even when the line contains none of the words error/fail/warn, which
    # is often the case for e.g. "GPU HANG" or "fallen off the bus" lines.
    definite = re.compile(
        r'GPU has fallen off the bus|NVRM: Xid|GPU hang|VRAM (?:is )?lost',
        re.IGNORECASE
    )
    # Broader patterns that only count together with a severity keyword.
    contextual = re.compile(
        r'nvidia.*error|amdgpu.*error|i915.*error|ring.*timeout'
        r'|Failed to load firmware|gpu.*fault| thermal ',
        re.IGNORECASE
    )
    severity = re.compile(r'error|fail|warn', re.IGNORECASE)
    timestamp_pattern = re.compile(r'\[\s*([\d.]+)\]')

    try:
        _, stdout, _ = execute_command(
            ['dmesg'],
            check_returncode=False, timeout=10
        )

        seen = set()
        for line in stdout.splitlines():
            message = line.strip()
            if not message:
                continue

            is_gpu_error = bool(definite.search(message)) or (
                bool(contextual.search(message)) and bool(severity.search(message))
            )
            if not is_gpu_error:
                continue

            timestamp_match = timestamp_pattern.match(message)
            if timestamp_match:
                timestamp = timestamp_match.group(1)
                # De-duplicate on the text after the timestamp; otherwise
                # repeated messages would never compare equal.
                dedup_key = message[timestamp_match.end():].strip()
            else:
                timestamp = "unknown"
                dedup_key = message

            if dedup_key in seen:
                continue
            seen.add(dedup_key)

            errors.append({
                "timestamp": timestamp,
                "message": message
            })
            if len(errors) >= max_errors:
                break

        return errors

    except Exception:
        # Best-effort: an unreadable kernel log yields no entries.
        return []
|
||||
|
||||
|
||||
def get_gpu_processes() -> List[Dict[str, Any]]:
    """
    List the processes currently using a GPU (NVIDIA only).

    Parses one sample of ``nvidia-smi pmon -s um``. Columns are located
    through the header line that pmon prints, because with ``-s um`` a
    frame-buffer column sits between ``dec`` and ``command``; reading the
    command from a fixed eighth position returns the frame-buffer value.

    Returns:
        List[Dict[str, Any]]: One dictionary per process with ``gpu_index``,
        ``pid``, ``type``, ``sm_util``, ``mem_util``, ``enc_util``,
        ``dec_util`` and ``command``. Empty when ``nvidia-smi`` is not
        installed, fails, or no process is running.
    """
    processes: List[Dict[str, Any]] = []

    if not check_command_exists('nvidia-smi'):
        return processes

    # Column layout assumed when the header line cannot be read.
    columns = ['gpu', 'pid', 'type', 'sm', 'mem', 'enc', 'dec', 'fb', 'command']
    required = {'gpu', 'pid', 'command'}

    try:
        _, stdout, _ = execute_command(
            ['nvidia-smi', 'pmon', '-s', 'um', '-c', '1'],
            check_returncode=False, timeout=5
        )

        header_seen = False
        for line in stdout.splitlines():
            text = line.strip()
            if not text:
                continue

            if text.startswith('#'):
                # The first '#' line names the columns; later '#' lines
                # (units) are ignored.
                if not header_seen:
                    header_seen = True
                    names = text.lstrip('#').split()
                    if required.issubset(names):
                        columns = names
                continue

            # Split into at most len(columns) fields so that everything
            # after the last separator stays in the final column.
            values = text.split(None, len(columns) - 1)
            if len(values) < len(columns):
                continue
            row = dict(zip(columns, values))

            # pmon prints "-" as pid for a GPU without any process.
            if not row['pid'].isdigit():
                continue

            processes.append({
                "gpu_index": safe_int(row['gpu']),
                "pid": row['pid'],
                "type": row.get('type', '-'),
                "sm_util": row.get('sm', '-'),
                "mem_util": row.get('mem', '-'),
                "enc_util": row.get('enc', '-'),
                "dec_util": row.get('dec', '-'),
                "command": row['command']
            })
    except Exception:
        # Best-effort: processes parsed before the failure are returned.
        pass

    return processes
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Stand-alone run: print the GPU report as indented JSON, leaving
    # non-ASCII text unescaped.
    import json

    gpu_report = run_gpu_check()
    print(json.dumps(gpu_report, ensure_ascii=False, indent=2))
|
||||
Reference in New Issue
Block a user