"""ServerGuard - memory inspection and stress-test module.

Collects memory facts from ``/proc/meminfo``, ``dmidecode`` and the EDAC sysfs
counters, and optionally runs a memory stress test with memtester, stress-ng
or stress (tried in that order).
"""

import os
import re
import sys
import time
from typing import Dict, Any, List, Optional

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from utils import (
    execute_command, check_command_exists, safe_int, safe_float,
    format_bytes, require_root
)

_MEMINFO_PATH = '/proc/meminfo'
_EDAC_MC_PATH = '/sys/devices/system/edac/mc'
_BYTES_PER_GB = 1024 ** 3

# Values dmidecode prints for fields the firmware left unfilled.
_DIMM_PLACEHOLDERS = frozenset({
    'Not Specified',
    'To be filled by O.E.M.',
    'None',
    'No Module Installed',
    'Unknown',
})

# (result key, dmidecode label, how much of the value is kept)
#   "line"   -> the whole value
#   "token"  -> the first whitespace-delimited token
#   "digits" -> the leading run of digits
_DIMM_FIELDS = (
    ("array_handle", "Array Handle", "token"),
    ("error_handle", "Error Information Handle", "token"),
    ("total_width", "Total Width", "digits"),
    ("data_width", "Data Width", "digits"),
    ("size", "Size", "line"),
    ("form_factor", "Form Factor", "token"),
    ("set", "Set", "token"),
    ("locator", "Locator", "line"),
    ("bank_locator", "Bank Locator", "line"),
    ("type", "Type", "token"),
    ("type_detail", "Type Detail", "line"),
    ("speed", "Speed", "line"),
    ("manufacturer", "Manufacturer", "token"),
    ("serial_number", "Serial Number", "token"),
    ("asset_tag", "Asset Tag", "token"),
    ("part_number", "Part Number", "token"),
    ("rank", "Rank", "digits"),
    ("configured_speed", "Configured Memory Speed", "line"),
    ("minimum_voltage", "Minimum Voltage", "line"),
    ("maximum_voltage", "Maximum Voltage", "line"),
    ("configured_voltage", "Configured Voltage", "line"),
)


def _read_available_mb() -> Optional[int]:
    """Return ``MemAvailable`` from /proc/meminfo in MiB, or None if the field is absent."""
    with open(_MEMINFO_PATH, 'r') as f:
        content = f.read()
    match = re.search(r'^MemAvailable:\s+(\d+)', content, re.MULTILINE)
    if match is None:
        return None
    return safe_int(match.group(1)) // 1024


def _read_edac_counter(path: str) -> Optional[int]:
    """Return the integer stored in an EDAC counter file, or None if the file does not exist."""
    if not os.path.exists(path):
        return None
    with open(path, 'r') as f:
        return safe_int(f.read().strip())


def _output_reports_failure(output: str) -> bool:
    """Tell whether stress / stress-ng output mentions an error or a failure.

    A "failed: N" summary line only counts as a failure when N is non-zero;
    recent stress-ng releases print "failed: 0" on a fully successful run.
    """
    lowered = output.lower()
    if any(int(count) > 0 for count in re.findall(r'\bfailed:\s*(\d+)', lowered)):
        return True
    # Drop the harmless zero-count summary before the plain substring scan.
    remaining = re.sub(r'\bfailed:\s*0\b', '', lowered)
    return 'error' in remaining or 'fail' in remaining


def _run_stress_test(duration: int) -> Dict[str, Any]:
    """Run the first available stress tool (memtester, stress-ng, stress)."""
    runners = (
        ('memtester', run_memtester),
        ('stress-ng', run_memory_stress_ng),
        ('stress', run_memory_stress),
    )
    for tool, runner in runners:
        if check_command_exists(tool):
            return runner(duration)
    return {
        "passed": False,
        "error": "未找到内存压力测试工具 (memtester/stress-ng/stress)"
    }


def run_memory_check(stress_test: bool = False, stress_duration: int = 300) -> Dict[str, Any]:
    """
    Run the memory check.

    Args:
        stress_test: Whether to run a stress test after the passive checks.
        stress_duration: Stress test duration in seconds.

    Returns:
        Dict[str, Any]: Result with the keys "status", "summary", "dimm_info",
        "ecc_status", "edac_errors" and "stress_test". "status" is "warning"
        when EDAC reports errors and "error" when the stress test did not pass
        or an exception was raised (its text is stored under "error").
    """
    result = {
        "status": "success",
        "summary": {},
        "dimm_info": [],
        "ecc_status": {},
        "edac_errors": {},
        "stress_test": {}
    }

    try:
        result["summary"] = get_memory_summary()
        result["dimm_info"] = get_dimm_info()
        result["ecc_status"] = check_ecc_status()

        result["edac_errors"] = check_edac_errors()
        if result["edac_errors"].get("total_errors", 0) > 0:
            result["status"] = "warning"

        if stress_test:
            result["stress_test"] = _run_stress_test(stress_duration)
            if not result["stress_test"].get("passed", False):
                result["status"] = "error"

    except Exception as e:
        result["status"] = "error"
        result["error"] = str(e)

    return result


def get_memory_summary() -> Dict[str, Any]:
    """Return a memory usage summary parsed from /proc/meminfo.

    Every parsed field is stored both in bytes (``*_bytes``) and in GiB rounded
    to two decimals (``*_gb``). On failure the exception text is stored under
    "error" and the values gathered so far are returned.
    """
    result = {
        "total_bytes": 0,
        "total_gb": 0,
        "available_bytes": 0,
        "available_gb": 0,
        "used_bytes": 0,
        "used_gb": 0,
        "free_bytes": 0,
        "free_gb": 0,
        "buffers_bytes": 0,
        "cached_bytes": 0,
        "swap_total_bytes": 0,
        "swap_used_bytes": 0,
        "swap_free_bytes": 0
    }

    # result key -> /proc/meminfo field name
    fields = {
        "total_bytes": "MemTotal",
        "free_bytes": "MemFree",
        "available_bytes": "MemAvailable",
        "buffers_bytes": "Buffers",
        "cached_bytes": "Cached",
        "swap_total_bytes": "SwapTotal",
        "swap_free_bytes": "SwapFree",
    }

    try:
        with open(_MEMINFO_PATH, 'r') as f:
            meminfo = f.read()

        for key, label in fields.items():
            # Anchored at line start so "Cached" can never match "SwapCached".
            match = re.search(rf'^{label}:\s+(\d+)', meminfo, re.MULTILINE)
            if match is None:
                continue
            # The captured number is treated as kB.
            bytes_val = safe_int(match.group(1)) * 1024
            result[key] = bytes_val
            result[key.replace('bytes', 'gb')] = round(bytes_val / _BYTES_PER_GB, 2)

        # Used memory excludes buffers and page cache.
        result["used_bytes"] = (
            result["total_bytes"]
            - result["free_bytes"]
            - result["buffers_bytes"]
            - result["cached_bytes"]
        )
        result["used_gb"] = round(result["used_bytes"] / _BYTES_PER_GB, 2)

        result["swap_used_bytes"] = result["swap_total_bytes"] - result["swap_free_bytes"]
        result["swap_used_gb"] = round(result["swap_used_bytes"] / _BYTES_PER_GB, 2)
        result["swap_free_gb"] = round(result["swap_free_bytes"] / _BYTES_PER_GB, 2)

        if result["total_bytes"] > 0:
            result["usage_percent"] = round(
                (result["used_bytes"] / result["total_bytes"]) * 100, 1
            )

    except Exception as e:
        result["error"] = str(e)

    return result


def _dimm_field(section: str, label: str) -> Optional[str]:
    """Return the stripped value of the ``label: value`` line in a dmidecode section, or None."""
    # Line-anchored, and only blanks after the colon, so an empty value cannot
    # swallow the following line and "Size" cannot match "Volatile Size".
    match = re.search(
        rf'^[ \t]*{re.escape(label)}:[ \t]*(.*)$',
        section,
        re.IGNORECASE | re.MULTILINE,
    )
    if match is None:
        return None
    return match.group(1).strip()


def _parse_dimm_section(section: str) -> Optional[Dict[str, Any]]:
    """Parse one "Memory Device" section; return None for an unpopulated slot."""
    raw_size = _dimm_field(section, "Size")
    if raw_size is not None and 'No Module' in raw_size:
        return None  # empty slot

    dimm: Dict[str, Any] = {}
    for key, label, kind in _DIMM_FIELDS:
        value = _dimm_field(section, label)
        # Filter on the full value, before it is reduced to a token, so
        # multi-word placeholders such as "Not Specified" are rejected too.
        if not value or value in _DIMM_PLACEHOLDERS:
            continue
        if kind == "token":
            value = value.split()[0]
        elif kind == "digits":
            digits = re.match(r'\d+', value)
            if digits is None:
                continue
            value = digits.group(0)
        dimm[key] = value

    if 'size' in dimm:
        size_str = dimm['size']
        if 'MB' in size_str:
            dimm["size_mb"] = safe_int(size_str.replace('MB', '').strip())
        elif 'GB' in size_str:
            dimm["size_gb"] = safe_float(size_str.replace('GB', '').strip())
            dimm["size_mb"] = int(dimm["size_gb"] * 1024)

    if 'speed' in dimm:
        speed_str = dimm['speed']
        if 'MT/s' in speed_str:
            dimm["speed_mts"] = safe_int(speed_str.replace('MT/s', '').strip())
        elif 'MHz' in speed_str:
            dimm["speed_mhz"] = safe_int(speed_str.replace('MHz', '').strip())

    return dimm


def get_dimm_info() -> List[Dict[str, Any]]:
    """Return details of each populated DIMM as reported by ``dmidecode -t memory``.

    Returns an empty list when dmidecode is not installed. Parsing is
    best-effort: on an exception the DIMMs parsed so far are returned.
    """
    dimms: List[Dict[str, Any]] = []

    if not check_command_exists('dmidecode'):
        return dimms

    try:
        _, stdout, _ = execute_command(
            ['dmidecode', '-t', 'memory'],
            check_returncode=False,
            timeout=15
        )

        # The text before the first "Memory Device" heading is not a device.
        for section in stdout.split('Memory Device')[1:]:
            dimm = _parse_dimm_section(section)
            if dimm:
                dimms.append(dimm)
    except Exception:
        # Deliberately best-effort: the list return type has no error channel.
        pass

    return dimms


def check_ecc_status() -> Dict[str, Any]:
    """Check the ECC (error-correcting code) memory status.

    Combines three sources: /proc/meminfo, ``dmidecode -t memory`` and the EDAC
    sysfs counters. Each source is best-effort; a failure in one is ignored.

    Returns:
        Dict[str, Any]: "supported", "enabled", "mode" and "errors", plus
        "edac_available", "correctable_errors" and "uncorrectable_errors" when
        the EDAC directory exists and the counters could be read.
    """
    result = {
        "supported": False,
        "enabled": False,
        "mode": "unknown",
        "errors": 0
    }

    # Source 1: /proc/meminfo
    # NOTE(review): a HardwareCorrupted line may be present on machines without
    # ECC memory, and its value is a size in kB rather than an error count -
    # confirm that this heuristic is intended.
    try:
        with open(_MEMINFO_PATH, 'r') as f:
            content = f.read()
        if 'HardwareCorrupted' in content:
            result["supported"] = True
            match = re.search(r'HardwareCorrupted:\s+(\d+)\s+kB', content)
            if match:
                result["errors"] = safe_int(match.group(1))
    except Exception:
        pass

    # Source 2: dmidecode
    # NOTE(review): "Error Correction Type: None" also satisfies the substring
    # test below, so "supported" can be True while "enabled" is False - confirm.
    if check_command_exists('dmidecode'):
        try:
            _, stdout, _ = execute_command(
                ['dmidecode', '-t', 'memory'],
                check_returncode=False,
                timeout=10
            )
            if 'ECC' in stdout or 'Error Correction' in stdout:
                result["supported"] = True
                match = re.search(r'Error Correction Type:\s*(.+)', stdout)
                if match:
                    result["mode"] = match.group(1).strip()
                    result["enabled"] = result["mode"] != 'None'
        except Exception:
            pass

    # Source 3: EDAC counters, summed over all memory controllers
    if os.path.exists(_EDAC_MC_PATH):
        result["edac_available"] = True
        counters = (
            ("correctable_errors", "ce_count"),
            ("uncorrectable_errors", "ue_count"),
        )
        try:
            for mc in sorted(os.listdir(_EDAC_MC_PATH)):
                if not mc.startswith('mc'):
                    continue
                mc_path = os.path.join(_EDAC_MC_PATH, mc)
                for key, filename in counters:
                    count = _read_edac_counter(os.path.join(mc_path, filename))
                    if count is not None:
                        result[key] = result.get(key, 0) + count
        except Exception:
            pass

    return result


def check_edac_errors() -> Dict[str, Any]:
    """Check the EDAC (Error Detection and Correction) error counters.

    Returns:
        Dict[str, Any]: Totals of correctable and uncorrectable errors plus one
        entry per memory controller under "memory_controllers". "note" is set
        when the EDAC directory does not exist; "error" holds the exception
        text when reading fails part-way.
    """
    result = {
        "total_errors": 0,
        "correctable_errors": 0,
        "uncorrectable_errors": 0,
        "memory_controllers": []
    }

    if not os.path.exists(_EDAC_MC_PATH):
        result["note"] = "EDAC 不可用"
        return result

    counters = (
        ("correctable_errors", "ce_count"),
        ("uncorrectable_errors", "ue_count"),
    )
    info_files = ['mc_name', 'size_mb', 'mem_type', 'edac_mc_mode']

    try:
        for mc_name in sorted(os.listdir(_EDAC_MC_PATH)):
            if not mc_name.startswith('mc'):
                continue

            mc_path = os.path.join(_EDAC_MC_PATH, mc_name)
            mc_info = {"name": mc_name}

            for key, filename in counters:
                count = _read_edac_counter(os.path.join(mc_path, filename))
                if count is not None:
                    mc_info[key] = count
                    result[key] += count

            # Descriptive attributes are copied verbatim when present.
            for info_file in info_files:
                filepath = os.path.join(mc_path, info_file)
                if os.path.exists(filepath):
                    with open(filepath, 'r') as f:
                        mc_info[info_file] = f.read().strip()

            result["memory_controllers"].append(mc_info)

    except Exception as e:
        result["error"] = str(e)

    # Computed outside the try so the total always agrees with the counters,
    # even when reading stopped part-way.
    result["total_errors"] = result["correctable_errors"] + result["uncorrectable_errors"]

    return result


@require_root
def run_memtester(duration: int = 300) -> Dict[str, Any]:
    """
    Run a memory stress test with memtester.

    Args:
        duration: Test duration in seconds. Not used: the memtester command is
            given a memory size and a loop count (one loop over 70% of the
            available memory), not a time limit.

    Returns:
        Dict[str, Any]: "passed", the tested size, timing information, the
        failure lines found in the output ("errors"), the names of the tests
        seen in the output ("tests_run") and the first 2000 characters of the
        raw output.
    """
    result = {
        "passed": False,
        "size_mb": 0,
        "iterations": 1,
        "start_time": None,
        "end_time": None,
        "duration_seconds": 0,
        "errors": [],
        "tests_run": []
    }

    if not check_command_exists('memtester'):
        result["errors"].append("memtester 未安装")
        return result

    try:
        # Test 70% of the available memory (at least 64 MB) so the rest of the
        # system keeps some headroom; fall back to 256 MB when unknown.
        available_mb = _read_available_mb()
        if available_mb is not None:
            test_size_mb = max(64, int(available_mb * 0.7))
        else:
            test_size_mb = 256

        result["size_mb"] = test_size_mb
        result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
        start_ts = time.time()

        _, stdout, stderr = execute_command(
            ['memtester', f'{test_size_mb}M', '1'],
            timeout=max(300, test_size_mb),  # larger regions get a longer timeout
            check_returncode=False
        )

        result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
        result["duration_seconds"] = round(time.time() - start_ts, 2)

        output = stdout + stderr
        result["raw_output"] = output[:2000]  # keep only part of the raw output

        # NOTE(review): the verdict relies on output text only; the value
        # returned first by execute_command is ignored - confirm whether it is
        # an exit status that should be checked as well.
        upper = output.upper()
        lower = output.lower()
        if 'FAILURE' in upper:
            result["passed"] = False
            for line in output.split('\n'):
                if 'FAILURE' in line.upper() or 'error' in line.lower():
                    result["errors"].append(line.strip())
        elif ('SUCCESS' in upper or 'ok' in lower
              or 'finished' in lower or 'Done' in output):
            result["passed"] = True
        else:
            result["passed"] = False
            result["errors"].append("测试可能未完成")

        test_names = [
            'Stuck Address', 'Random Value', 'Compare XOR', 'Compare SUB',
            'Compare MUL', 'Compare DIV', 'Compare OR', 'Compare AND',
            'Sequential Increment', 'Solid Bits', 'Block Sequential',
            'Checkerboard', 'Bit Spread', 'Bit Flip', 'Walking Ones',
            'Walking Zeroes'
        ]
        result["tests_run"].extend(name for name in test_names if name in output)

    except Exception as e:
        result["passed"] = False
        result["errors"].append(str(e))

    return result


@require_root
def run_memory_stress_ng(duration: int = 300) -> Dict[str, Any]:
    """
    Run a memory stress test with stress-ng.

    Args:
        duration: Test duration in seconds.

    Returns:
        Dict[str, Any]: "passed", tool name, timing information, "errors" and,
        when it can be parsed from the metrics output, "bogo_ops".
    """
    result = {
        "passed": False,
        "tool": "stress-ng",
        "duration_seconds": duration,
        "start_time": None,
        "end_time": None,
        "errors": []
    }

    if not check_command_exists('stress-ng'):
        result["errors"].append("stress-ng 未安装")
        return result

    try:
        result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')

        cmd = [
            'stress-ng',
            '--vm', '4',             # 4 vm workers
            '--vm-bytes', '80%',     # percentage form; see the stress-ng manual
            '--vm-method', 'all',    # cycle through all vm test methods
            '--timeout', str(duration),
            '--metrics-brief'
        ]

        _, stdout, stderr = execute_command(
            cmd,
            timeout=duration + 30,   # grace period on top of the tool's own timeout
            check_returncode=False
        )

        result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')

        output = stdout + stderr
        result["passed"] = not _output_reports_failure(output)

        # Try the original pattern first, then the "[pid] vm <ops>" layout of
        # the --metrics-brief table.
        bogo_ops = (
            re.search(r'stress-ng:\s+vm:\s+(\d+)\s+bogo ops', output)
            or re.search(r'stress-ng:\s+\w+:\s+\[\d+\]\s+vm\s+(\d+)', output)
        )
        if bogo_ops:
            result["bogo_ops"] = safe_int(bogo_ops.group(1))

    except Exception as e:
        result["passed"] = False
        result["errors"].append(str(e))

    return result


@require_root
def run_memory_stress(duration: int = 300) -> Dict[str, Any]:
    """
    Run a memory stress test with stress (fallback tool).

    Args:
        duration: Test duration in seconds.

    Returns:
        Dict[str, Any]: "passed", tool name, worker count, timing information
        and "errors".
    """
    result = {
        "passed": False,
        "tool": "stress",
        "duration_seconds": duration,
        "start_time": None,
        "end_time": None,
        "workers": 4,
        "errors": []
    }

    if not check_command_exists('stress'):
        result["errors"].append("stress 未安装")
        return result

    try:
        workers = result["workers"]

        # stress takes --vm-bytes as a size with a B/K/M/G suffix and has no
        # percentage form, so 80% of the available memory is split across the
        # workers here. Fall back to 256 MB per worker when it is unknown.
        available_mb = _read_available_mb()
        if available_mb:
            per_worker_mb = max(1, int(available_mb * 0.8) // workers)
        else:
            per_worker_mb = 256

        result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')

        # --vm: number of allocating workers
        # --vm-bytes: memory allocated per worker
        # --vm-keep: keep the memory allocated instead of re-allocating it
        # --timeout: stop after this many seconds
        cmd = [
            'stress',
            '--vm', str(workers),
            '--vm-bytes', f'{per_worker_mb}M',
            '--vm-keep',
            '--timeout', str(duration)
        ]

        _, stdout, stderr = execute_command(
            cmd,
            timeout=duration + 30,
            check_returncode=False
        )

        result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')

        # Any mention of an error or failure in the output marks the run as failed.
        output = stdout + stderr
        result["passed"] = not _output_reports_failure(output)

    except Exception as e:
        result["passed"] = False
        result["errors"].append(str(e))

    return result


if __name__ == '__main__':
    import json
    print(json.dumps(run_memory_check(stress_test=False), indent=2, ensure_ascii=False))