Files
ServerGuard/modules/memory.py
2026-03-02 14:14:40 +08:00

578 lines
19 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
ServerGuard - 内存检测与压力测试模块
深度检测内存的读写错误和稳定性。
"""
import os
import re
import time
from typing import Dict, Any, List, Optional
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils import (
execute_command, check_command_exists, safe_int, safe_float,
format_bytes, require_root
)
def run_memory_check(stress_test: bool = False, stress_duration: int = 300) -> Dict[str, Any]:
    """
    Run the memory inspection and, optionally, a memory stress test.

    Gathers the memory summary, per-DIMM details, ECC status and EDAC error
    counters. When ``stress_test`` is true, the first available tool among
    memtester, stress-ng and stress (in that order) is used.

    Args:
        stress_test: Whether to run a stress test after the inspection.
        stress_duration: Stress test duration in seconds, forwarded to the
            selected runner.

    Returns:
        Dict[str, Any]: Report whose ``status`` is "success", "warning" (EDAC
        errors were counted) or "error" (the stress test did not pass, no tool
        was found, or an exception was raised; in the last case the exception
        text is stored under ``error``).
    """
    report: Dict[str, Any] = {
        "status": "success",
        "summary": {},
        "dimm_info": [],
        "ecc_status": {},
        "edac_errors": {},
        "stress_test": {}
    }

    try:
        # Passive inspection first: none of these steps load the machine.
        report["summary"] = get_memory_summary()
        report["dimm_info"] = get_dimm_info()
        report["ecc_status"] = check_ecc_status()
        report["edac_errors"] = check_edac_errors()

        # Any counted EDAC error downgrades the report to a warning.
        if report["edac_errors"].get("total_errors", 0) > 0:
            report["status"] = "warning"

        if stress_test:
            # Ordered by preference; the first installed tool wins.
            runners = (
                ('memtester', run_memtester),
                ('stress-ng', run_memory_stress_ng),
                ('stress', run_memory_stress),
            )
            for tool_name, runner in runners:
                if check_command_exists(tool_name):
                    report["stress_test"] = runner(stress_duration)
                    break
            else:
                report["stress_test"] = {
                    "passed": False,
                    "error": "未找到内存压力测试工具 (memtester/stress-ng/stress)"
                }

            # A stress test that did not pass overrides any earlier status.
            if not report["stress_test"].get("passed", False):
                report["status"] = "error"
    except Exception as exc:
        report["status"] = "error"
        report["error"] = str(exc)

    return report
def get_memory_summary(meminfo_path: str = '/proc/meminfo') -> Dict[str, Any]:
    """
    Summarise RAM and swap usage from a meminfo-style file.

    Args:
        meminfo_path: File to parse. Defaults to ``/proc/meminfo``; a different
            path allows a saved snapshot to be analysed.

    Returns:
        Dict[str, Any]: ``*_bytes`` values (the kB figures in the file
        multiplied by 1024) with matching ``*_gb`` values rounded to two
        decimals, plus ``usage_percent`` when the total is non-zero. If the
        file cannot be read or parsed, the defaults (zeros) are returned with
        the exception text under ``error``.
    """
    result = {
        "total_bytes": 0,
        "total_gb": 0,
        "available_bytes": 0,
        "available_gb": 0,
        "used_bytes": 0,
        "used_gb": 0,
        "free_bytes": 0,
        "free_gb": 0,
        "buffers_bytes": 0,
        "cached_bytes": 0,
        "swap_total_bytes": 0,
        "swap_used_bytes": 0,
        "swap_free_bytes": 0
    }

    # Result key -> meminfo field name. Each field is matched at the start of
    # a line so that "Cached" can never pick up the "SwapCached" line.
    fields = {
        "total_bytes": "MemTotal",
        "free_bytes": "MemFree",
        "available_bytes": "MemAvailable",
        "buffers_bytes": "Buffers",
        "cached_bytes": "Cached",
        "swap_total_bytes": "SwapTotal",
        "swap_free_bytes": "SwapFree"
    }

    try:
        with open(meminfo_path, 'r') as f:
            meminfo = f.read()

        for key, field in fields.items():
            match = re.search(rf'^{field}:\s+(\d+)', meminfo, re.MULTILINE)
            if not match:
                continue
            # The capture group is digits only, so int() cannot fail here.
            bytes_val = int(match.group(1)) * 1024
            result[key] = bytes_val
            # Also publish the same figure in GB.
            result[key.replace('bytes', 'gb')] = round(bytes_val / (1024**3), 2)

        # Used memory excludes free pages, buffers and page cache.
        result["used_bytes"] = (
            result["total_bytes"] - result["free_bytes"]
            - result["buffers_bytes"] - result["cached_bytes"]
        )
        result["used_gb"] = round(result["used_bytes"] / (1024**3), 2)

        # Swap usage.
        result["swap_used_bytes"] = result["swap_total_bytes"] - result["swap_free_bytes"]
        result["swap_used_gb"] = round(result["swap_used_bytes"] / (1024**3), 2)
        result["swap_free_gb"] = round(result["swap_free_bytes"] / (1024**3), 2)

        # Usage percentage; skipped when the total is unknown to avoid
        # dividing by zero.
        if result["total_bytes"] > 0:
            result["usage_percent"] = round((result["used_bytes"] / result["total_bytes"]) * 100, 1)
    except Exception as e:
        result["error"] = str(e)

    return result
def get_dimm_info(dmidecode_output: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    Collect per-DIMM details from ``dmidecode -t memory`` output.

    Args:
        dmidecode_output: Pre-captured dmidecode text to parse. When omitted,
            ``dmidecode -t memory`` is executed (if the command exists).

    Returns:
        List[Dict[str, Any]]: One dict per populated memory device. Values are
        the raw strings printed by dmidecode, plus derived ``size_mb`` /
        ``size_gb`` and ``speed_mts`` / ``speed_mhz`` numbers when the size or
        speed could be parsed. Empty slots are skipped. An empty list is
        returned when dmidecode is unavailable or fails (best effort).
    """
    if dmidecode_output is None:
        if not check_command_exists('dmidecode'):
            return []
        try:
            _, dmidecode_output, _ = execute_command(
                ['dmidecode', '-t', 'memory'],
                check_returncode=False, timeout=15
            )
        except Exception:
            # Deliberately best effort: DIMM details are optional in the report.
            return []
    if not isinstance(dmidecode_output, str):
        return []

    any_text = r'(.*)'
    digits = r'(\d+)'
    # (result key, label regex, value regex). Text values capture the whole
    # line so multi-word values ("SK Hynix") stay intact and multi-word
    # placeholders ("Not Specified") can be recognised and dropped.
    field_specs = (
        ("array_handle", r'Array Handle', any_text),
        ("error_handle", r'Error Information Handle', any_text),
        ("total_width", r'Total Width', digits),
        ("data_width", r'Data Width', digits),
        ("size", r'Size', any_text),
        ("form_factor", r'Form Factor', any_text),
        ("set", r'Set', any_text),
        ("locator", r'Locator', any_text),
        ("bank_locator", r'Bank Locator', any_text),
        ("type", r'Type', any_text),
        ("type_detail", r'Type Detail', any_text),
        ("speed", r'Speed', any_text),
        ("manufacturer", r'Manufacturer', any_text),
        ("serial_number", r'Serial Number', any_text),
        ("asset_tag", r'Asset Tag', any_text),
        ("part_number", r'Part Number', any_text),
        ("rank", r'Rank', digits),
        # Older dmidecode releases label this field "Configured Clock Speed".
        ("configured_speed", r'Configured (?:Memory|Clock) Speed', any_text),
        ("minimum_voltage", r'Minimum Voltage', any_text),
        ("maximum_voltage", r'Maximum Voltage', any_text),
        ("configured_voltage", r'Configured Voltage', any_text),
    )
    flags = re.MULTILINE | re.IGNORECASE
    # Labels are anchored to the start of a line so "Locator" cannot match
    # "Bank Locator", "Size" cannot match "Volatile Size", and so on.
    compiled = [
        (key, re.compile(rf'^[ \t]*{label}:[ \t]*{value}', flags))
        for key, label, value in field_specs
    ]
    placeholders = {
        'Not Specified', 'Not Provided', 'To be filled by O.E.M.',
        'None', 'No Module Installed', 'Unknown'
    }

    dimms = []
    # Each memory device record starts with a "Memory Device" title line; the
    # text before the first one is the dmidecode preamble.
    sections = re.split(r'^[ \t]*Memory Device[ \t]*\r?$', dmidecode_output, flags=re.MULTILINE)
    for section in sections[1:]:
        # Drop whatever follows this record (the next "Handle 0x..." header),
        # so fields from a neighbouring record are never attributed to it.
        record = re.split(r'^[ \t]*Handle[ \t]+0x', section, maxsplit=1, flags=re.MULTILINE)[0]

        raw = {}
        for key, pattern in compiled:
            match = pattern.search(record)
            if match:
                raw[key] = match.group(1).strip()

        # Skip empty slots. This must look at the raw size, before the
        # placeholder filter removes "No Module Installed".
        if 'no module' in raw.get("size", "").lower():
            continue

        dimm = {
            key: value for key, value in raw.items()
            if value and value not in placeholders
        }

        # Derive numeric sizes.
        size_match = re.match(r'(\d+(?:\.\d+)?)\s*(MB|GB|TB)\b', dimm.get("size", ""))
        if size_match:
            amount = float(size_match.group(1))
            unit = size_match.group(2)
            if unit == 'MB':
                dimm["size_mb"] = int(amount)
            else:
                size_gb = amount * 1024 if unit == 'TB' else amount
                dimm["size_gb"] = size_gb
                dimm["size_mb"] = int(size_gb * 1024)

        # Derive numeric speed.
        speed_match = re.match(r'(\d+)\s*(MT/s|MHz)', dimm.get("speed", ""))
        if speed_match:
            speed_key = "speed_mts" if speed_match.group(2) == 'MT/s' else "speed_mhz"
            dimm[speed_key] = int(speed_match.group(1))

        if dimm:
            dimms.append(dimm)

    return dimms
def check_ecc_status() -> Dict[str, Any]:
    """
    Report whether ECC memory appears to be present and active.

    Three best-effort sources are combined; a failure in any one of them is
    ignored so the others can still contribute:

    1. ``/proc/meminfo``: the ``HardwareCorrupted`` figure (kB) is stored
       under ``errors``.
    2. ``dmidecode -t memory``: the first "Error Correction Type" value is
       stored under ``mode``.
    3. ``/sys/devices/system/edac/mc``: correctable / uncorrectable error
       counters are summed over all ``mc*`` controllers.

    Returns:
        Dict[str, Any]: ``supported``, ``enabled``, ``mode`` and ``errors``,
        plus ``edac_available``, ``correctable_errors`` and
        ``uncorrectable_errors`` when the EDAC sysfs tree exposes them.
    """
    result = {
        "supported": False,
        "enabled": False,
        "mode": "unknown",
        "errors": 0
    }

    # Source 1: /proc/meminfo. HardwareCorrupted is only recorded as a figure:
    # the line is present on machines without ECC too, so it says nothing
    # about ECC support.
    try:
        with open('/proc/meminfo', 'r') as f:
            content = f.read()
        match = re.search(r'^HardwareCorrupted:\s+(\d+)\s+kB', content, re.MULTILINE)
        if match:
            result["errors"] = int(match.group(1))
    except OSError:
        pass

    # Source 2: SMBIOS data via dmidecode.
    if check_command_exists('dmidecode'):
        try:
            _, stdout, _ = execute_command(
                ['dmidecode', '-t', 'memory'],
                check_returncode=False, timeout=10
            )
            match = re.search(r'Error Correction Type:[ \t]*(.+)', stdout)
            if match:
                mode = match.group(1).strip()
                result["mode"] = mode
                # The label is printed even without ECC (value "None"), so
                # only a concrete correction type counts as ECC being present.
                ecc_active = mode.lower() not in ('none', 'unknown')
                result["enabled"] = ecc_active
                if ecc_active:
                    result["supported"] = True
        except Exception:
            # Best effort: dmidecode may be unusable (e.g. without privileges).
            pass

    # Source 3: EDAC sysfs counters.
    edac_path = '/sys/devices/system/edac/mc'
    if os.path.exists(edac_path):
        result["edac_available"] = True
        counters = (
            ('ce_count', "correctable_errors"),      # correctable errors
            ('ue_count', "uncorrectable_errors"),    # uncorrectable errors
        )
        try:
            for mc in os.listdir(edac_path):
                if not mc.startswith('mc'):
                    continue
                # NOTE(review): a registered EDAC memory controller is taken
                # as evidence that error detection hardware is present —
                # confirm this holds for the EDAC drivers in use.
                result["supported"] = True
                for file_name, key in counters:
                    counter_file = os.path.join(edac_path, mc, file_name)
                    if os.path.exists(counter_file):
                        with open(counter_file, 'r') as f:
                            count = safe_int(f.read().strip())
                        result[key] = result.get(key, 0) + count
        except Exception:
            # Best effort: keep whatever was accumulated so far.
            pass

    return result
def check_edac_errors() -> Dict[str, Any]:
    """
    Read EDAC (Error Detection and Correction) counters from sysfs.

    Every ``mc*`` entry under ``/sys/devices/system/edac/mc`` is inspected for
    its correctable / uncorrectable error counters and a few descriptive
    attribute files.

    Returns:
        Dict[str, Any]: Aggregated ``correctable_errors``,
        ``uncorrectable_errors`` and ``total_errors`` plus one entry per
        controller in ``memory_controllers``. ``note`` is set when the EDAC
        tree does not exist; ``error`` holds the exception text when reading
        fails.
    """
    summary = {
        "total_errors": 0,
        "correctable_errors": 0,
        "uncorrectable_errors": 0,
        "memory_controllers": []
    }

    edac_root = '/sys/devices/system/edac/mc'
    if not os.path.exists(edac_root):
        summary["note"] = "EDAC 不可用"
        return summary

    # sysfs counter file -> key used both per controller and in the totals.
    counter_files = (
        ('ce_count', "correctable_errors"),
        ('ue_count', "uncorrectable_errors"),
    )
    # Descriptive attributes copied verbatim under their file names.
    attribute_files = ('mc_name', 'size_mb', 'mem_type', 'edac_mc_mode')

    try:
        for entry in os.listdir(edac_root):
            if not entry.startswith('mc'):
                continue

            controller_dir = os.path.join(edac_root, entry)
            controller = {"name": entry}

            for file_name, key in counter_files:
                counter_path = os.path.join(controller_dir, file_name)
                if os.path.exists(counter_path):
                    with open(counter_path, 'r') as handle:
                        count = safe_int(handle.read().strip())
                    controller[key] = count
                    summary[key] += count

            for attribute in attribute_files:
                attribute_path = os.path.join(controller_dir, attribute)
                if os.path.exists(attribute_path):
                    with open(attribute_path, 'r') as handle:
                        controller[attribute] = handle.read().strip()

            summary["memory_controllers"].append(controller)

        summary["total_errors"] = summary["correctable_errors"] + summary["uncorrectable_errors"]
    except Exception as exc:
        summary["error"] = str(exc)

    return summary
@require_root
def run_memtester(duration: int = 300) -> Dict[str, Any]:
    """
    Run one memtester loop over roughly 70% of the available memory.

    Args:
        duration: Requested duration in seconds. Not used: memtester is
            driven by memory size and loop count, not by time.

    Returns:
        Dict[str, Any]: ``passed``, the tested ``size_mb``, start / end
        timestamps, ``duration_seconds``, the first 2000 characters of the
        tool output under ``raw_output``, the names of the tests seen in the
        output under ``tests_run``, and any failure details under ``errors``.
    """
    result = {
        "passed": False,
        "size_mb": 0,
        "iterations": 1,
        "start_time": None,
        "end_time": None,
        "duration_seconds": 0,
        "errors": [],
        "tests_run": []
    }

    if not check_command_exists('memtester'):
        result["errors"].append("memtester 未安装")
        return result

    try:
        # Size the test from MemAvailable, leaving headroom for the system.
        with open('/proc/meminfo', 'r') as f:
            content = f.read()
        match = re.search(r'MemAvailable:\s+(\d+)', content)
        if match:
            available_mb = safe_int(match.group(1)) // 1024
            # Use 70% of the available memory, but never less than 64 MB.
            test_size_mb = max(64, int(available_mb * 0.7))
        else:
            test_size_mb = 256
        result["size_mb"] = test_size_mb

        result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
        start_ts = time.time()

        cmd = ['memtester', f'{test_size_mb}M', '1']
        # NOTE(review): the first element is assumed to be the process exit
        # status — confirm against utils.execute_command.
        returncode, stdout, stderr = execute_command(
            cmd,
            timeout=max(300, test_size_mb),  # scale the timeout with the tested size
            check_returncode=False
        )

        result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
        result["duration_seconds"] = round(time.time() - start_ts, 2)

        output = stdout + stderr
        result["raw_output"] = output[:2000]  # keep only part of the raw output

        # bool is excluded so a True/False first element is never read as an
        # exit status.
        nonzero_exit = (
            isinstance(returncode, int)
            and not isinstance(returncode, bool)
            and returncode != 0
        )
        # A completion marker is required. The per-test "ok" text is NOT
        # accepted as proof of success: it also appears in runs cut short by
        # the timeout, and as a substring of words such as "locked".
        completed = (
            'SUCCESS' in output.upper()
            or 'finished' in output.lower()
            or 'Done' in output
        )

        if 'FAILURE' in output.upper():
            result["passed"] = False
            # Collect the lines describing the failure.
            for line in output.split('\n'):
                if 'FAILURE' in line.upper() or 'error' in line.lower():
                    result["errors"].append(line.strip())
        elif nonzero_exit:
            result["passed"] = False
            result["errors"].append(f"memtester 退出码: {returncode}")
        elif completed:
            result["passed"] = True
        else:
            result["passed"] = False
            result["errors"].append("测试可能未完成")

        # Record which memtester tests appear in the output.
        test_names = [
            'Stuck Address', 'Random Value', 'Compare XOR',
            'Compare SUB', 'Compare MUL', 'Compare DIV',
            'Compare OR', 'Compare AND', 'Sequential Increment',
            'Solid Bits', 'Block Sequential', 'Checkerboard',
            'Bit Spread', 'Bit Flip', 'Walking Ones', 'Walking Zeroes'
        ]
        result["tests_run"].extend(name for name in test_names if name in output)
    except Exception as e:
        result["passed"] = False
        result["errors"].append(str(e))

    return result
@require_root
def run_memory_stress_ng(duration: int = 300) -> Dict[str, Any]:
    """
    Stress memory with stress-ng's ``vm`` stressor.

    Args:
        duration: Test duration in seconds (passed to ``--timeout``).

    Returns:
        Dict[str, Any]: ``passed``, ``tool``, ``duration_seconds``, start /
        end timestamps, ``errors`` (output lines that indicate a problem) and
        ``bogo_ops`` when the metrics line could be parsed.
    """
    result = {
        "passed": False,
        "tool": "stress-ng",
        "duration_seconds": duration,
        "start_time": None,
        "end_time": None,
        "errors": []
    }

    if not check_command_exists('stress-ng'):
        result["errors"].append("stress-ng 未安装")
        return result

    try:
        result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')

        cmd = [
            'stress-ng',
            '--vm', '4',             # 4 vm workers
            # NOTE(review): whether a percentage applies per worker or is
            # split across the workers depends on the stress-ng version —
            # confirm for the deployed release.
            '--vm-bytes', '80%',
            '--vm-method', 'all',    # cycle through all vm test methods
            '--timeout', str(duration),
            '--metrics-brief'
        ]

        # NOTE(review): the first element is assumed to be the process exit
        # status — confirm against utils.execute_command.
        returncode, stdout, stderr = execute_command(
            cmd,
            timeout=duration + 30,
            check_returncode=False
        )

        result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')

        output = stdout + stderr

        # Collect lines that mention an error or failure. The run summary
        # line "failed: 0" reports zero failures, so it is blanked out before
        # the substring test instead of being counted as a failure.
        problem_lines = []
        for line in output.split('\n'):
            scrubbed = re.sub(r'failed:[ \t]*0\b', '', line.lower())
            if 'error' in scrubbed or 'fail' in scrubbed:
                problem_lines.append(line.strip())

        # bool is excluded so a True/False first element is never read as an
        # exit status.
        nonzero_exit = (
            isinstance(returncode, int)
            and not isinstance(returncode, bool)
            and returncode != 0
        )

        result["errors"].extend(problem_lines)
        if nonzero_exit:
            result["errors"].append(f"stress-ng 退出码: {returncode}")
        result["passed"] = not problem_lines and not nonzero_exit

        # Extract the bogo-ops figure. The metrics row has the form
        # "stress-ng: <level>: [pid] vm <bogo ops> ..."; the second pattern is
        # kept as a fallback for the layout the previous code expected.
        bogo_ops = (
            re.search(r'^stress-ng:\s+\w+:\s+\[\d+\]\s+vm\s+(\d+)', output, re.MULTILINE)
            or re.search(r'stress-ng:\s+vm:\s+(\d+)\s+bogo ops', output)
        )
        if bogo_ops:
            result["bogo_ops"] = safe_int(bogo_ops.group(1))
    except Exception as e:
        result["passed"] = False
        result["errors"].append(str(e))

    return result
@require_root
def run_memory_stress(duration: int = 300) -> Dict[str, Any]:
    """
    Stress memory with the classic ``stress`` tool (last-resort fallback).

    About 80% of the currently available memory is split evenly between the
    workers. ``stress`` takes ``--vm-bytes`` as a byte count with an optional
    B/K/M/G suffix, so the size is computed here rather than passed as a
    percentage.

    Args:
        duration: Test duration in seconds (passed to ``--timeout``).

    Returns:
        Dict[str, Any]: ``passed``, ``tool``, ``duration_seconds``, start /
        end timestamps, ``workers``, ``vm_bytes_per_worker_mb`` and
        ``errors`` (output lines that indicate a problem).
    """
    result = {
        "passed": False,
        "tool": "stress",
        "duration_seconds": duration,
        "start_time": None,
        "end_time": None,
        "workers": 4,
        "errors": []
    }

    if not check_command_exists('stress'):
        result["errors"].append("stress 未安装")
        return result

    try:
        workers = result["workers"]

        # Size each worker from MemAvailable; fall back to 256 MB per worker
        # when the figure cannot be read.
        per_worker_mb = 256
        with open('/proc/meminfo', 'r') as f:
            meminfo = f.read()
        match = re.search(r'^MemAvailable:\s+(\d+)', meminfo, re.MULTILINE)
        if match:
            available_mb = int(match.group(1)) // 1024
            per_worker_mb = max(1, int(available_mb * 0.8) // workers)
        result["vm_bytes_per_worker_mb"] = per_worker_mb

        result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')

        # --vm:       number of memory-allocating workers
        # --vm-bytes: memory allocated by each worker
        # --vm-keep:  keep the allocation instead of freeing and reallocating
        # --timeout:  stop after this many seconds
        cmd = [
            'stress',
            '--vm', str(workers),
            '--vm-bytes', f'{per_worker_mb}M',
            '--vm-keep',
            '--timeout', str(duration)
        ]

        # NOTE(review): the first element is assumed to be the process exit
        # status — confirm against utils.execute_command.
        returncode, stdout, stderr = execute_command(
            cmd,
            timeout=duration + 30,
            check_returncode=False
        )

        result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')

        output = stdout + stderr

        # Any line mentioning an error or failure marks the run as failed.
        problem_lines = [
            line.strip() for line in output.split('\n')
            if 'error' in line.lower() or 'fail' in line.lower()
        ]
        # bool is excluded so a True/False first element is never read as an
        # exit status.
        nonzero_exit = (
            isinstance(returncode, int)
            and not isinstance(returncode, bool)
            and returncode != 0
        )

        result["errors"].extend(problem_lines)
        if nonzero_exit:
            result["errors"].append(f"stress 退出码: {returncode}")
        result["passed"] = not problem_lines and not nonzero_exit
    except Exception as e:
        result["passed"] = False
        result["errors"].append(str(e))

    return result
if __name__ == '__main__':
    # Manual entry point: run the inspection without the stress test and
    # print the report as indented JSON, keeping non-ASCII text readable.
    import json

    report = run_memory_check(stress_test=False)
    rendered = json.dumps(report, indent=2, ensure_ascii=False)
    print(rendered)