"""
ServerGuard - GPU detection module.

Collects GPU inventory, temperature and driver state, and scans the kernel
log for GPU-related errors, using nvidia-smi, radeontop, sysfs, lspci and
dmesg. Every probe is best-effort: a missing tool or an unreadable file
yields less data, never an exception to the caller.
"""

import glob
import logging
import os
import re
import sys
import tempfile
from typing import Any, Callable, Dict, List, Optional, Tuple

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from utils import (
    execute_command,
    check_command_exists,
    parse_key_value_output,
    safe_int,
    safe_float,
    format_bytes
)

logger = logging.getLogger(__name__)

# sysfs directory holding one entry per DRM card (and per connector).
_DRM_CLASS_DIR = '/sys/class/drm'
_AMD_VENDOR_ID = '0x1002'
_INTEL_VENDOR_ID = '0x8086'

# Placeholders nvidia-smi prints in CSV output when a field has no value.
_NVIDIA_MISSING_VALUES = frozenset({'[Not Supported]', '[N/A]'})

_NVIDIA_LIST_QUERY = (
    'gpu_name,gpu_bus_id,pci.bus_id,driver_version,pstate,'
    'pcie.link.gen.max,pcie.link.gen.current'
)

# (nvidia-smi query field, result key, converter). The first
# _NVIDIA_REQUIRED_DETAIL_COUNT entries must all be present in the output for
# it to be accepted; the remaining ones are recorded only when present.
_NVIDIA_DETAIL_FIELDS = (
    ('temperature.gpu', 'temperature_c', safe_int),
    ('power.draw', 'power_draw_w', safe_float),
    ('power.limit', 'power_limit_w', safe_float),
    ('clocks.gr', 'graphics_clock_mhz', safe_int),
    ('clocks.mem', 'memory_clock_mhz', safe_int),
    ('utilization.gpu', 'gpu_utilization_percent', safe_int),
    ('utilization.memory', 'memory_utilization_percent', safe_int),
    ('memory.total', 'memory_total_mb', safe_int),
    ('memory.used', 'memory_used_mb', safe_int),
    ('memory.free', 'memory_free_mb', safe_int),
    ('serial', 'serial', None),
    ('uuid', 'uuid', None),
    ('vbios_version', 'vbios_version', None),
)
_NVIDIA_REQUIRED_DETAIL_COUNT = 10

_NVIDIA_ECC_QUERY = (
    'ecc.mode.current,ecc.mode.pending,'
    'ecc.errors.corrected.volatile.total,ecc.errors.uncorrected.volatile.total'
)

# Position of the "command" column in `nvidia-smi pmon` rows when the header
# row cannot be read.
_PMON_DEFAULT_COMMAND_INDEX = 7

# Whole-word match: a plain substring test for "ati" also hits "corporATIon".
_AMD_NAME_RE = re.compile(r'\b(?:amd|ati)\b')

# lspci device-class markers, in priority order, and the type label for each.
_LSPCI_GPU_CLASSES = (
    ('VGA', 'vga'),
    ('3D controller', '3d'),
    ('Display controller', 'display'),
)

# Kernel messages that always indicate a GPU fault, whatever else the line says.
_DMESG_FAULT_RE = re.compile(
    '|'.join((
        r'GPU has fallen off the bus',
        r'NVRM: Xid',
        r'GPU hang',
        r'VRAM (?:is )?lost',
    )),
    re.IGNORECASE,
)

# Broader patterns that only count when the line also carries a severity word.
_DMESG_SUSPECT_RE = re.compile(
    '|'.join((
        r'nvidia.*error',
        r'amdgpu.*error',
        r'i915.*error',
        r'ring.*timeout',
        r'Failed to load firmware',
        r'gpu.*fault',
        r' thermal ',
    )),
    re.IGNORECASE,
)
_DMESG_SEVERITY_WORDS = ('error', 'fail', 'warn')
_DMESG_TIMESTAMP_RE = re.compile(r'\[\s*([\d.]+)\]')
_DMESG_MAX_ERRORS = 20


def run_gpu_check() -> Dict[str, Any]:
    """
    Run the full GPU check.

    Vendor-specific probes (NVIDIA, AMD, Intel) run first; lspci is used as a
    fallback only when none of them finds a device. The kernel log is then
    scanned for GPU errors.

    Returns:
        Dict[str, Any]: Result with the keys
            "status": "success", "warning" (dmesg errors found), "unknown"
                (no GPU found) or "error" (unexpected exception);
            "gpus": list of per-GPU dicts;
            "dmesg_errors": list of matching kernel log lines;
            "errors": list of exception messages (empty on success);
            "error": message of the exception, only when status is "error";
            "note": only when no GPU was detected.
    """
    result: Dict[str, Any] = {
        "status": "success",
        "gpus": [],
        "errors": []
    }

    try:
        for detect in (check_nvidia_gpus, check_amd_gpus, check_intel_gpus):
            result["gpus"].extend(detect())

        # Nothing vendor-specific found: fall back to plain lspci detection.
        if not result["gpus"]:
            result["gpus"] = check_generic_gpus()

        result["dmesg_errors"] = check_gpu_dmesg_errors()
        if result["dmesg_errors"]:
            result["status"] = "warning"

        # "unknown" deliberately overrides "warning" when no device was found.
        if not result["gpus"]:
            result["status"] = "unknown"
            result["note"] = "未检测到 GPU 设备"

    except Exception as e:
        result["status"] = "error"
        # "error" is kept for existing consumers; "errors" is the declared list.
        result["error"] = str(e)
        result["errors"].append(str(e))

    return result


def _column_or_unknown(parts: List[str], position: int) -> str:
    """Return parts[position], or "unknown" when the row is too short."""
    return parts[position] if len(parts) > position else "unknown"


def check_nvidia_gpus() -> List[Dict[str, Any]]:
    """
    Detect NVIDIA GPUs through nvidia-smi.

    Returns:
        List[Dict[str, Any]]: One dict per GPU (basic info merged with the
        output of get_nvidia_gpu_details); empty when nvidia-smi is missing
        or fails.
    """
    gpus: List[Dict[str, Any]] = []

    if not check_command_exists('nvidia-smi'):
        return gpus

    try:
        _, stdout, _ = execute_command(
            ['nvidia-smi',
             f'--query-gpu={_NVIDIA_LIST_QUERY}',
             '--format=csv,noheader'],
            check_returncode=False, timeout=10
        )

        # The line position is used as the GPU index passed to `-i`.
        for i, line in enumerate(stdout.strip().split('\n')):
            if not line.strip():
                continue

            parts = [p.strip() for p in line.split(',')]
            if len(parts) < 4:
                continue

            gpu_info = {
                "vendor": "NVIDIA",
                "index": i,
                "name": parts[0],
                "bus_id": parts[1],
                "pci_bus_id": parts[2],
                "driver_version": parts[3],
                "pstate": _column_or_unknown(parts, 4),
                "pcie_max_gen": _column_or_unknown(parts, 5),
                "pcie_current_gen": _column_or_unknown(parts, 6)
            }
            gpu_info.update(get_nvidia_gpu_details(i))
            gpus.append(gpu_info)

    except Exception:
        logger.debug("nvidia-smi GPU enumeration failed", exc_info=True)

    return gpus


def _nvidia_value(raw: str,
                  convert: Optional[Callable[[str], Any]] = None,
                  missing: Any = None) -> Any:
    """Convert one nvidia-smi CSV cell, mapping placeholder cells to *missing*."""
    if raw in _NVIDIA_MISSING_VALUES:
        return missing
    return convert(raw) if convert is not None else raw


def _collect_nvidia_metrics(gpu_id: str) -> Dict[str, Any]:
    """Query temperature, power, clocks, utilization, memory and identity."""
    query = ','.join(field for field, _, _ in _NVIDIA_DETAIL_FIELDS)
    _, stdout, _ = execute_command(
        ['nvidia-smi',
         f'--query-gpu={query}',
         '--format=csv,noheader,nounits',
         '-i', gpu_id],
        check_returncode=False, timeout=10
    )

    parts = [p.strip() for p in stdout.split(',')]
    if len(parts) < _NVIDIA_REQUIRED_DETAIL_COUNT:
        return {}

    # zip() stops at the shorter sequence, so trailing optional fields are
    # simply absent when nvidia-smi returned fewer columns.
    return {
        key: _nvidia_value(raw, convert)
        for (_, key, convert), raw in zip(_NVIDIA_DETAIL_FIELDS, parts)
    }


def _collect_nvidia_ecc(gpu_id: str) -> Dict[str, Any]:
    """Query ECC mode and volatile ECC error counters."""
    _, ecc_output, _ = execute_command(
        ['nvidia-smi',
         f'--query-gpu={_NVIDIA_ECC_QUERY}',
         '--format=csv,noheader',
         '-i', gpu_id],
        check_returncode=False, timeout=10
    )

    ecc_parts = [p.strip() for p in ecc_output.split(',')]
    if len(ecc_parts) < 4:
        return {}

    return {
        "ecc_mode": _nvidia_value(ecc_parts[0]),
        "ecc_pending": _nvidia_value(ecc_parts[1]),
        # Counters default to 0 (not None) when ECC is unavailable.
        "ecc_corrected_errors": _nvidia_value(ecc_parts[2], safe_int, missing=0),
        "ecc_uncorrected_errors": _nvidia_value(ecc_parts[3], safe_int, missing=0),
    }


def _parse_pmon_output(output: str) -> Tuple[List[List[str]], int]:
    """
    Split `nvidia-smi pmon` output into data rows and locate the command column.

    Returns:
        Tuple[List[List[str]], int]: The whitespace-split data rows (header
        lines and idle placeholder rows removed) and the index of the
        "command" column, taken from the header row when it is present.
    """
    lines = output.strip().split('\n')

    command_index = _PMON_DEFAULT_COMMAND_INDEX
    for line in lines:
        if line.startswith('#'):
            names = line.lstrip('#').split()
            if 'command' in names:
                # Extra columns (e.g. "fb" with `-s um`) shift the command
                # column, so trust the header over a fixed position.
                command_index = names.index('command')
                break

    rows = []
    for line in lines[2:]:  # the first two lines are the table header
        if not line.strip() or line.startswith('#'):
            continue
        fields = line.split()
        # pmon prints a row of "-" for a GPU with no running process.
        if len(fields) > 1 and fields[1] == '-':
            continue
        rows.append(fields)

    return rows, command_index


def _collect_nvidia_processes(gpu_id: str) -> Dict[str, Any]:
    """Query the processes currently using the GPU."""
    _, proc_output, _ = execute_command(
        ['nvidia-smi', 'pmon', '-s', 'um', '-c', '1', '-i', gpu_id],
        check_returncode=False, timeout=5
    )

    rows, _ = _parse_pmon_output(proc_output)
    processes = [
        {
            "pid": fields[1],
            "type": fields[2],
            "sm_util": fields[3],
            "mem_util": fields[4]
        }
        for fields in rows
        if len(fields) >= 5
    ]
    return {"processes": processes} if processes else {}


def get_nvidia_gpu_details(gpu_index: int) -> Dict[str, Any]:
    """
    Collect detailed information for one NVIDIA GPU.

    Args:
        gpu_index: GPU index as accepted by `nvidia-smi -i`.

    Returns:
        Dict[str, Any]: Metrics, ECC state and, when any process is running,
        a "processes" list. The three queries are independent: a failure in
        one leaves the results of the others intact.
    """
    details: Dict[str, Any] = {}
    gpu_id = str(gpu_index)

    for collect in (_collect_nvidia_metrics,
                    _collect_nvidia_ecc,
                    _collect_nvidia_processes):
        try:
            details.update(collect(gpu_id))
        except Exception:
            logger.debug("%s failed for GPU %s", collect.__name__, gpu_id,
                         exc_info=True)

    return details


def _read_sysfs_text(path: str) -> Optional[str]:
    """Return the stripped content of *path*, or None when it cannot be read."""
    try:
        with open(path, 'r') as f:
            return f.read().strip()
    except OSError:
        return None


def _find_drm_cards(vendor_id: str) -> List[Tuple[str, str]]:
    """
    List DRM cards whose PCI vendor ID equals *vendor_id*.

    Returns:
        List[Tuple[str, str]]: (card name, path of its "device" directory),
        sorted by card name.

    Raises:
        OSError: when the DRM class directory cannot be listed.
    """
    matches = []
    for entry in sorted(os.listdir(_DRM_CLASS_DIR)):
        if not entry.startswith('card'):
            continue
        device_dir = os.path.join(_DRM_CLASS_DIR, entry, 'device')
        # Entries without a readable vendor file yield None and are skipped.
        if _read_sysfs_text(os.path.join(device_dir, 'vendor')) == vendor_id:
            matches.append((entry, device_dir))
    return matches


def _drm_card_info(vendor: str, card: str, device_dir: str) -> Dict[str, Any]:
    """Build the common sysfs record (vendor, card, device ID, driver)."""
    gpu_info: Dict[str, Any] = {
        "vendor": vendor,
        "card": card
    }

    device_id = _read_sysfs_text(os.path.join(device_dir, 'device'))
    if device_id is not None:
        gpu_info["device_id"] = device_id

    driver_path = os.path.join(device_dir, 'driver')
    if os.path.exists(driver_path):
        try:
            # "driver" is a symlink; its target's basename is the driver name.
            gpu_info["driver"] = os.path.basename(os.readlink(driver_path))
        except OSError:
            logger.debug("Cannot resolve %s", driver_path, exc_info=True)

    return gpu_info


def _read_amd_temperature(device_dir: str) -> Optional[float]:
    """Return the GPU temperature in degrees Celsius, or None if unavailable."""
    # The hwmon index is assigned dynamically, so it must not be hard-coded.
    pattern = os.path.join(device_dir, 'hwmon', 'hwmon*', 'temp1_input')
    for temp_path in sorted(glob.glob(pattern)):
        raw = _read_sysfs_text(temp_path)
        if raw is None:
            continue
        temp_mc = safe_int(raw)
        if isinstance(temp_mc, (int, float)):
            return temp_mc / 1000.0  # sysfs reports millidegrees
    return None


def _probe_radeontop() -> List[Dict[str, Any]]:
    """Run radeontop once and return what could be parsed from its dump."""
    if not check_command_exists('radeontop'):
        return []

    try:
        # radeontop is told to dump into a file (-d) for a single sample (-l 1).
        with tempfile.NamedTemporaryFile(mode='r', suffix='.txt',
                                         delete=False) as f:
            dump_file = f.name
        try:
            execute_command(
                ['radeontop', '-d', dump_file, '-l', '1'],
                check_returncode=False, timeout=5
            )
            with open(dump_file, 'r') as f:
                output = f.read()
        finally:
            if os.path.exists(dump_file):
                os.unlink(dump_file)
    except Exception:
        logger.debug("radeontop probe failed", exc_info=True)
        return []

    gpu_info: Dict[str, Any] = {"vendor": "AMD"}
    for line in output.split('\n'):
        if 'GPU' in line and ':' in line:
            parts = line.split(':')
            if len(parts) == 2:
                key = parts[0].strip().lower().replace(' ', '_')
                gpu_info[key] = parts[1].strip()

    # The dict always holds "vendor"; report a GPU only if something was parsed.
    return [gpu_info] if len(gpu_info) > 1 else []


def check_amd_gpus() -> List[Dict[str, Any]]:
    """
    Detect AMD GPUs via radeontop (when installed) and sysfs.

    Returns:
        List[Dict[str, Any]]: One dict per detected GPU; sysfs entries carry
        the card name, device ID, driver, temperature and core clock levels
        when those files are readable.
    """
    gpus = _probe_radeontop()

    try:
        for card, device_dir in _find_drm_cards(_AMD_VENDOR_ID):
            gpu_info = _drm_card_info("AMD", card, device_dir)

            temperature = _read_amd_temperature(device_dir)
            if temperature is not None:
                gpu_info["temperature_c"] = temperature

            clock_levels = _read_sysfs_text(
                os.path.join(device_dir, 'pp_dpm_sclk'))
            if clock_levels is not None:
                gpu_info["core_clock_levels"] = clock_levels

            gpus.append(gpu_info)
    except Exception:
        logger.debug("AMD sysfs probe failed", exc_info=True)

    return gpus


def check_intel_gpus() -> List[Dict[str, Any]]:
    """
    Detect Intel GPUs via sysfs.

    Returns:
        List[Dict[str, Any]]: One dict per detected GPU. Every entry is
        labelled "integrated", which is the common case for Intel graphics
        but is not verified against the hardware.
    """
    gpus: List[Dict[str, Any]] = []

    try:
        for card, device_dir in _find_drm_cards(_INTEL_VENDOR_ID):
            gpu_info = _drm_card_info("Intel", card, device_dir)
            gpu_info["type"] = "integrated"
            gpus.append(gpu_info)
    except Exception:
        logger.debug("Intel sysfs probe failed", exc_info=True)

    return gpus


def _identify_gpu_vendor(description: str) -> str:
    """Map an lspci device description to "NVIDIA", "AMD", "Intel" or "Unknown"."""
    desc_lower = description.lower()
    if 'nvidia' in desc_lower:
        return "NVIDIA"
    if _AMD_NAME_RE.search(desc_lower):
        return "AMD"
    if 'intel' in desc_lower:
        return "Intel"
    return "Unknown"


def check_generic_gpus() -> List[Dict[str, Any]]:
    """
    Detect GPUs generically with lspci.

    Returns:
        List[Dict[str, Any]]: One dict per VGA / 3D / display controller with
        bus ID, description, vendor, type and, when lspci reports them, the
        kernel driver and modules.
    """
    gpus: List[Dict[str, Any]] = []

    if not check_command_exists('lspci'):
        return gpus

    try:
        _, stdout, _ = execute_command(
            ['lspci', '-nn'],
            check_returncode=False, timeout=10
        )

        for line in stdout.split('\n'):
            gpu_type = next(
                (label for marker, label in _LSPCI_GPU_CLASSES if marker in line),
                None
            )
            if gpu_type is None:
                continue

            parts = line.split(': ', 1)
            if len(parts) != 2:
                continue

            bus_id = parts[0].split()[0]
            description = parts[1]
            gpu_info = {
                "bus_id": bus_id,
                "description": description,
                "vendor": _identify_gpu_vendor(description),
                "type": gpu_type
            }

            # Driver details are optional; failing here must not drop the GPU.
            try:
                _, detail, _ = execute_command(
                    ['lspci', '-v', '-s', bus_id],
                    check_returncode=False, timeout=5
                )

                driver_match = re.search(
                    r'Kernel driver in use:\s*(\S+)', detail)
                if driver_match:
                    gpu_info["driver"] = driver_match.group(1)

                modules_match = re.search(r'Kernel modules:\s*(.+)', detail)
                if modules_match:
                    gpu_info["modules"] = modules_match.group(1).strip()
            except Exception:
                logger.debug("lspci -v failed for %s", bus_id, exc_info=True)

            gpus.append(gpu_info)

    except Exception:
        logger.debug("lspci GPU detection failed", exc_info=True)

    return gpus


def check_gpu_dmesg_errors() -> List[Dict[str, str]]:
    """
    Scan dmesg for GPU-related errors.

    A line is reported when it matches an unambiguous fault message (Xid,
    GPU hang, fallen off the bus, VRAM lost), or when it matches one of the
    broader GPU patterns and also contains "error", "fail" or "warn".

    Returns:
        List[Dict[str, str]]: Up to 20 distinct entries with "timestamp"
        ("unknown" when the line has none) and "message"; empty when dmesg is
        unavailable or fails.
    """
    if not check_command_exists('dmesg'):
        return []

    try:
        _, stdout, _ = execute_command(
            ['dmesg'],
            check_returncode=False, timeout=10
        )

        seen = set()
        unique_errors: List[Dict[str, str]] = []

        for line in stdout.split('\n'):
            if not _DMESG_FAULT_RE.search(line):
                if not _DMESG_SUSPECT_RE.search(line):
                    continue
                line_lower = line.lower()
                if not any(word in line_lower for word in _DMESG_SEVERITY_WORDS):
                    continue

            message = line.strip()
            if message in seen:
                continue
            seen.add(message)

            timestamp_match = _DMESG_TIMESTAMP_RE.match(line)
            unique_errors.append({
                "timestamp": timestamp_match.group(1) if timestamp_match else "unknown",
                "message": message
            })
            if len(unique_errors) >= _DMESG_MAX_ERRORS:
                break

        return unique_errors

    except Exception:
        logger.debug("dmesg scan failed", exc_info=True)
        return []


def get_gpu_processes() -> List[Dict[str, Any]]:
    """
    List the processes using a GPU (NVIDIA only).

    Returns:
        List[Dict[str, Any]]: One dict per process with "gpu_index", "pid",
        "type", "sm_util", "mem_util", "enc_util", "dec_util" and "command";
        empty when nvidia-smi is missing, fails, or no process is running.
    """
    processes: List[Dict[str, Any]] = []

    if not check_command_exists('nvidia-smi'):
        return processes

    try:
        _, stdout, _ = execute_command(
            ['nvidia-smi', 'pmon', '-s', 'um', '-c', '1'],
            check_returncode=False, timeout=5
        )

        rows, command_index = _parse_pmon_output(stdout)
        # Columns 0-6 are positional; the command column comes from the header.
        min_fields = max(command_index, 6) + 1

        for fields in rows:
            if len(fields) < min_fields:
                continue
            processes.append({
                "gpu_index": safe_int(fields[0]),
                "pid": fields[1],
                "type": fields[2],
                "sm_util": fields[3],
                "mem_util": fields[4],
                "enc_util": fields[5],
                "dec_util": fields[6],
                # The command is the last column, so keep any embedded spaces.
                "command": ' '.join(fields[command_index:])
            })

    except Exception:
        logger.debug("nvidia-smi pmon failed", exc_info=True)

    return processes


if __name__ == '__main__':
    import json
    print(json.dumps(run_gpu_check(), indent=2, ensure_ascii=False))