"""
ServerGuard - memory inspection and stress-test module.

Performs in-depth checks for memory read/write errors and stability.
"""

import os
import re
import time
from typing import Dict, Any, List, Optional

import sys
# Put the parent of this file's directory at the front of the import path;
# presumably this is what lets the project-level ``utils`` module below
# resolve when the file is run directly (verify against the repo layout).
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from utils import (
    execute_command, check_command_exists, safe_int, safe_float,
    format_bytes, require_root
)


def run_memory_check(stress_test: bool = False, stress_duration: int = 300) -> Dict[str, Any]:
    """
    Run the full memory inspection and, optionally, a stress test.

    Args:
        stress_test: Whether to run a memory stress test after the passive checks.
        stress_duration: Duration in seconds handed to the stress-test runner.

    Returns:
        Dict[str, Any]: Report with the keys ``status`` ("success", "warning"
        or "error"), ``summary``, ``dimm_info``, ``ecc_status``,
        ``edac_errors`` and ``stress_test``; an ``error`` key holds the
        exception text when any step raised.
    """
    report = {
        "status": "success",
        "summary": {},
        "dimm_info": [],
        "ecc_status": {},
        "edac_errors": {},
        "stress_test": {}
    }

    try:
        # Passive checks, in the same order as the report keys.
        report["summary"] = get_memory_summary()
        report["dimm_info"] = get_dimm_info()
        report["ecc_status"] = check_ecc_status()

        edac = check_edac_errors()
        report["edac_errors"] = edac
        if edac.get("total_errors", 0) > 0:
            report["status"] = "warning"

        if stress_test:
            # Preference order: memtester, then stress-ng, then stress.
            candidates = (
                ('memtester', run_memtester),
                ('stress-ng', run_memory_stress_ng),
                ('stress', run_memory_stress),
            )
            for tool, runner in candidates:
                if check_command_exists(tool):
                    report["stress_test"] = runner(stress_duration)
                    break
            else:
                report["stress_test"] = {
                    "passed": False,
                    "error": "未找到内存压力测试工具 (memtester/stress-ng/stress)"
                }

            # A failed or missing stress test overrides any earlier warning.
            if not report["stress_test"].get("passed", False):
                report["status"] = "error"

    except Exception as exc:
        report["status"] = "error"
        report["error"] = str(exc)

    return report


def get_memory_summary(meminfo_path: str = '/proc/meminfo') -> Dict[str, Any]:
    """
    Summarise RAM and swap usage from a ``/proc/meminfo``-style file.

    Args:
        meminfo_path: File to parse. Defaults to the live ``/proc/meminfo``;
            a captured copy can be passed instead.

    Returns:
        Dict[str, Any]: ``*_bytes`` values, with a matching ``*_gb`` value
        (GiB, two decimals) for every field found in the file, plus derived
        ``used_*``, ``swap_used_*`` and ``usage_percent``. All keys listed
        below are always present; an ``error`` key is added when the file
        cannot be read.
    """
    result: Dict[str, Any] = {
        "total_bytes": 0,
        "total_gb": 0,
        "available_bytes": 0,
        "available_gb": 0,
        "used_bytes": 0,
        "used_gb": 0,
        "free_bytes": 0,
        "free_gb": 0,
        "buffers_bytes": 0,
        "cached_bytes": 0,
        "swap_total_bytes": 0,
        "swap_used_bytes": 0,
        "swap_free_bytes": 0,
        "swap_used_gb": 0,
        "swap_free_gb": 0,
        "usage_percent": 0.0
    }

    # Result key -> exact meminfo field name.
    fields = {
        "total_bytes": "MemTotal",
        "free_bytes": "MemFree",
        "available_bytes": "MemAvailable",
        "buffers_bytes": "Buffers",
        "cached_bytes": "Cached",
        "swap_total_bytes": "SwapTotal",
        "swap_free_bytes": "SwapFree"
    }
    gib = 1024 ** 3

    try:
        with open(meminfo_path, 'r') as f:
            meminfo = f.read()

        # Match whole field names at line start so that "Cached" can never be
        # satisfied by the "SwapCached" line.
        values_kb: Dict[str, int] = {}
        for entry in re.finditer(r'^(\w+):[ \t]*(\d+)', meminfo, re.MULTILINE):
            values_kb.setdefault(entry.group(1), int(entry.group(2)))

        for key, field in fields.items():
            if field not in values_kb:
                continue
            bytes_val = values_kb[field] * 1024  # meminfo reports kB
            result[key] = bytes_val
            result[key.replace('bytes', 'gb')] = round(bytes_val / gib, 2)

        # Used = total minus free, buffers and page cache; never negative.
        result["used_bytes"] = max(
            0,
            result["total_bytes"] - result["free_bytes"]
            - result["buffers_bytes"] - result["cached_bytes"],
        )
        result["used_gb"] = round(result["used_bytes"] / gib, 2)

        result["swap_used_bytes"] = max(
            0, result["swap_total_bytes"] - result["swap_free_bytes"]
        )
        result["swap_used_gb"] = round(result["swap_used_bytes"] / gib, 2)
        result["swap_free_gb"] = round(result["swap_free_bytes"] / gib, 2)

        if result["total_bytes"] > 0:
            result["usage_percent"] = round(
                (result["used_bytes"] / result["total_bytes"]) * 100, 1
            )

    except (OSError, ValueError) as e:
        result["error"] = str(e)

    return result


# Values dmidecode prints for fields the firmware left unset.
_DIMM_PLACEHOLDER_VALUES = frozenset({
    'Not Specified', 'To be filled by O.E.M.', 'None',
    'No Module Installed', 'Unknown',
})

# Lower-cased dmidecode label -> result key; the whole value is kept.
_DIMM_TEXT_FIELDS = {
    'array handle': 'array_handle',
    'error information handle': 'error_handle',
    'size': 'size',
    'form factor': 'form_factor',
    'set': 'set',
    'locator': 'locator',
    'bank locator': 'bank_locator',
    'type': 'type',
    'type detail': 'type_detail',
    'speed': 'speed',
    'manufacturer': 'manufacturer',
    'serial number': 'serial_number',
    'asset tag': 'asset_tag',
    'part number': 'part_number',
    'configured memory speed': 'configured_speed',
    # Label used by older dmidecode releases for the same field.
    'configured clock speed': 'configured_speed',
    'minimum voltage': 'minimum_voltage',
    'maximum voltage': 'maximum_voltage',
    'configured voltage': 'configured_voltage',
}

# Lower-cased dmidecode label -> result key; only the leading digits are kept.
_DIMM_NUMERIC_FIELDS = {
    'total width': 'total_width',
    'data width': 'data_width',
    'rank': 'rank',
}

_DIMM_SIZE_UNIT_TO_MB = {'KB': 1 / 1024, 'MB': 1, 'GB': 1024, 'TB': 1024 * 1024}


def _parse_dimm_section(section: str) -> Optional[Dict[str, Any]]:
    """Parse one "Memory Device" section; return None for an empty slot."""
    dimm: Dict[str, Any] = {}

    for line in section.splitlines():
        label, sep, value = line.partition(':')
        if not sep:
            continue
        label = label.strip().lower()
        value = value.strip()

        # Decide on the raw value, before placeholder filtering removes it.
        if label == 'size' and 'no module' in value.lower():
            return None
        if not value or value in _DIMM_PLACEHOLDER_VALUES:
            continue

        if label in _DIMM_NUMERIC_FIELDS:
            digits = re.match(r'\d+', value)
            if digits:
                dimm.setdefault(_DIMM_NUMERIC_FIELDS[label], digits.group())
        elif label in _DIMM_TEXT_FIELDS:
            # First occurrence wins, e.g. "Configured Memory Speed" over the
            # legacy label should both ever appear.
            dimm.setdefault(_DIMM_TEXT_FIELDS[label], value)

    size = re.match(r'(\d+(?:\.\d+)?)\s*([KMGT]B)\b', dimm.get('size', ''), re.IGNORECASE)
    if size:
        megabytes = float(size.group(1)) * _DIMM_SIZE_UNIT_TO_MB[size.group(2).upper()]
        dimm["size_mb"] = int(megabytes)
        dimm["size_gb"] = round(megabytes / 1024, 2)

    speed = re.match(r'(\d+)\s*(MT/s|MHz)', dimm.get('speed', ''), re.IGNORECASE)
    if speed:
        speed_key = "speed_mts" if speed.group(2).lower() == 'mt/s' else "speed_mhz"
        dimm[speed_key] = int(speed.group(1))

    return dimm


def get_dimm_info(dmidecode_output: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    Return details for every populated DIMM slot reported by dmidecode.

    Args:
        dmidecode_output: Text of ``dmidecode -t memory`` to parse. When None
            (the default) the command is executed; if dmidecode is not
            installed an empty list is returned.

    Returns:
        List[Dict[str, Any]]: One dict per populated slot. String fields hold
        the full dmidecode value (placeholders such as "Not Specified" are
        omitted); ``size_mb``/``size_gb`` and ``speed_mts``/``speed_mhz`` are
        added when the size or speed can be parsed. Empty slots are skipped.
    """
    import logging
    logger = logging.getLogger(__name__)

    dimms: List[Dict[str, Any]] = []

    try:
        if dmidecode_output is None:
            if not check_command_exists('dmidecode'):
                return dimms
            _, dmidecode_output, _ = execute_command(
                ['dmidecode', '-t', 'memory'],
                check_returncode=False, timeout=15
            )

        # Split on the section title line only; the text before the first
        # title is the dmidecode banner / other structures.
        chunks = re.split(
            r'^Memory Device[ \t]*\r?$', dmidecode_output or '', flags=re.MULTILINE
        )
        for chunk in chunks[1:]:
            # A section ends at the first blank line; anything after it
            # belongs to the next structure.
            section = re.split(r'\n[ \t\r]*\n', chunk, maxsplit=1)[0]
            dimm = _parse_dimm_section(section)
            if dimm:
                dimms.append(dimm)

    except Exception as e:
        # Best effort: DIMM details are optional, but do not hide the cause.
        logger.warning("Failed to collect DIMM information: %s", e)

    return dimms


def check_ecc_status(dmidecode_output: Optional[str] = None) -> Dict[str, Any]:
    """
    Report whether ECC memory is in use and collect related error counters.

    Args:
        dmidecode_output: Text of ``dmidecode -t memory`` to inspect. When
            None (the default) the command is executed if it is installed.

    Returns:
        Dict[str, Any]:
            - ``mode``: the "Error Correction Type" reported by dmidecode, or
              "unknown" when none was found.
            - ``supported`` / ``enabled``: True only when that type is a real
              ECC mode (anything other than "None" / "Unknown").
            - ``errors``: the ``HardwareCorrupted`` value from /proc/meminfo,
              in kB. This is a kernel memory-failure counter; it is reported
              as-is and does not influence ``supported``.
            - ``edac_available``, ``correctable_errors``,
              ``uncorrectable_errors``: present when the EDAC sysfs tree and
              the respective counter files exist.
    """
    result: Dict[str, Any] = {
        "supported": False,
        "enabled": False,
        "mode": "unknown",
        "errors": 0
    }

    def read_count(path: str) -> Optional[int]:
        """Read one sysfs counter; None if unreadable, 0 if not numeric."""
        try:
            with open(path, 'r') as f:
                text = f.read().strip()
        except OSError:
            return None
        return int(text) if text.isdigit() else 0

    # Source 1: /proc/meminfo (corrupted-memory counter only).
    try:
        with open('/proc/meminfo', 'r') as f:
            content = f.read()
    except OSError:
        content = ''
    corrupted = re.search(r'^HardwareCorrupted:\s+(\d+)\s+kB', content, re.MULTILINE)
    if corrupted:
        result["errors"] = int(corrupted.group(1))

    # Source 2: dmidecode "Error Correction Type".
    if dmidecode_output is None and check_command_exists('dmidecode'):
        try:
            _, dmidecode_output, _ = execute_command(
                ['dmidecode', '-t', 'memory'],
                check_returncode=False, timeout=10
            )
        except Exception:
            dmidecode_output = None  # best effort: fall through without it

    if dmidecode_output:
        modes = [
            mode.strip()
            for mode in re.findall(r'Error Correction Type:[ \t]*(.+)', dmidecode_output)
        ]
        if modes:
            # The label is printed even on non-ECC systems (value "None"),
            # so only a real mode counts as ECC being present.
            active = [mode for mode in modes if mode not in ('None', 'Unknown')]
            result["mode"] = active[0] if active else modes[0]
            result["enabled"] = bool(active)
            result["supported"] = bool(active)

    # Source 3: EDAC per-controller error counters.
    edac_path = '/sys/devices/system/edac/mc'
    if os.path.exists(edac_path):
        result["edac_available"] = True
        try:
            controllers = os.listdir(edac_path)
        except OSError:
            controllers = []

        counters = (
            ('ce_count', "correctable_errors"),
            ('ue_count', "uncorrectable_errors"),
        )
        for mc in controllers:
            if not mc.startswith('mc'):
                continue
            for filename, key in counters:
                # An unreadable counter is skipped without aborting the scan.
                count = read_count(os.path.join(edac_path, mc, filename))
                if count is not None:
                    result[key] = result.get(key, 0) + count

    return result


def check_edac_errors(edac_path: str = '/sys/devices/system/edac/mc') -> Dict[str, Any]:
    """
    Collect EDAC (Error Detection and Correction) counters per memory controller.

    Args:
        edac_path: Directory holding the ``mc*`` controller directories.
            Defaults to the kernel's EDAC sysfs location.

    Returns:
        Dict[str, Any]: ``correctable_errors``, ``uncorrectable_errors`` and
        their sum ``total_errors``, plus ``memory_controllers`` (one dict per
        ``mc*`` entry, sorted by name). ``note`` is set when ``edac_path``
        does not exist; ``error`` is set when the directory cannot be listed.
    """
    result: Dict[str, Any] = {
        "total_errors": 0,
        "correctable_errors": 0,
        "uncorrectable_errors": 0,
        "memory_controllers": []
    }

    if not os.path.exists(edac_path):
        result["note"] = "EDAC 不可用"
        return result

    def read_text(path: str) -> Optional[str]:
        """Return the stripped file content, or None if it cannot be read."""
        try:
            with open(path, 'r') as f:
                return f.read().strip()
        except OSError:
            return None

    counters = (
        ('ce_count', "correctable_errors"),
        ('ue_count', "uncorrectable_errors"),
    )
    info_files = ('mc_name', 'size_mb', 'mem_type', 'edac_mc_mode')

    try:
        # Sorted so the report order is stable between runs.
        for mc_name in sorted(os.listdir(edac_path)):
            if not mc_name.startswith('mc'):
                continue

            mc_path = os.path.join(edac_path, mc_name)
            mc_info: Dict[str, Any] = {"name": mc_name}

            for filename, key in counters:
                raw = read_text(os.path.join(mc_path, filename))
                if raw is None:
                    continue  # counter missing or unreadable for this controller
                count = int(raw) if raw.isdigit() else 0
                mc_info[key] = count
                result[key] += count

            for info_file in info_files:
                raw = read_text(os.path.join(mc_path, info_file))
                if raw is not None:
                    mc_info[info_file] = raw

            result["memory_controllers"].append(mc_info)

    except Exception as e:
        result["error"] = str(e)

    # Computed outside the try block so that counts gathered before a failure
    # are still reflected in the total.
    result["total_errors"] = result["correctable_errors"] + result["uncorrectable_errors"]

    return result


@require_root
def run_memtester(duration: int = 300) -> Dict[str, Any]:
    """
    Run one memtester loop over roughly 70% of the currently available memory.

    Args:
        duration: Requested test duration in seconds. memtester is driven by
            a memory size and loop count, not a time budget, so this value is
            not used here; the command timeout is derived from the test size.

    Returns:
        Dict[str, Any]: ``passed`` is True only when the output contains no
        FAILURE line and shows that memtester ran to completion. Also holds
        ``tool``, ``size_mb``, ``iterations``, start/end timestamps,
        ``duration_seconds``, ``errors``, ``tests_run`` and (after a run) the
        first 2000 characters of output as ``raw_output``.
    """
    import logging
    logger = logging.getLogger(__name__)

    result = {
        "passed": False,
        "tool": "memtester",  # same key the stress-ng / stress runners report
        "size_mb": 0,
        "iterations": 1,
        "start_time": None,
        "end_time": None,
        "duration_seconds": 0,
        "errors": [],
        "tests_run": []
    }

    if not check_command_exists('memtester'):
        result["errors"].append("memtester 未安装")
        logger.warning("[MEMORY STRESS TEST] memtester 未安装")
        return result

    try:
        logger.info("[MEMORY STRESS TEST] 开始使用 memtester 进行内存测试")

        # Size the test from MemAvailable, leaving headroom for the system.
        with open('/proc/meminfo', 'r') as f:
            content = f.read()

        match = re.search(r'MemAvailable:\s+(\d+)', content)
        if match:
            available_mb = safe_int(match.group(1)) // 1024
            # Use 70% of available memory, but never less than 64 MB.
            test_size_mb = max(64, int(available_mb * 0.7))
        else:
            test_size_mb = 256

        result["size_mb"] = test_size_mb
        result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
        start_ts = time.time()

        logger.info(f"[MEMORY STRESS TEST] 测试内存大小: {test_size_mb}MB")

        cmd = ['memtester', f'{test_size_mb}M', '1']
        logger.info(f"[MEMORY STRESS TEST] 执行命令: {' '.join(cmd)}")

        _, stdout, stderr = execute_command(
            cmd,
            timeout=max(300, test_size_mb),  # scale the timeout with the test size
            check_returncode=False
        )

        result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
        result["duration_seconds"] = round(time.time() - start_ts, 2)

        logger.info(f"[MEMORY STRESS TEST] memtester 执行完成,耗时: {result['duration_seconds']}秒")

        output = (stdout or '') + (stderr or '')
        result["raw_output"] = output[:2000]  # keep only the head of the output

        if 'FAILURE' in output.upper():
            result["passed"] = False
            logger.error("[MEMORY STRESS TEST] 测试失败: 发现 FAILURE")
            for line in output.split('\n'):
                if 'FAILURE' in line.upper() or 'error' in line.lower():
                    result["errors"].append(line.strip())
                    logger.error(f"[MEMORY STRESS TEST] 错误详情: {line.strip()}")
        elif 'Done' in output or 'finished' in output.lower():
            # Require the completion marker: a per-test "ok" alone also shows
            # up in a run that was cut short, which must not count as a pass.
            result["passed"] = True
            logger.info("[MEMORY STRESS TEST] 测试通过")
        else:
            result["passed"] = False
            result["errors"].append("测试可能未完成")
            logger.warning("[MEMORY STRESS TEST] 测试可能未完成")

        # Record which memtester sub-tests appear in the output.
        test_names = [
            'Stuck Address', 'Random Value', 'Compare XOR',
            'Compare SUB', 'Compare MUL', 'Compare DIV',
            'Compare OR', 'Compare AND', 'Sequential Increment',
            'Solid Bits', 'Block Sequential', 'Checkerboard',
            'Bit Spread', 'Bit Flip', 'Walking Ones', 'Walking Zeroes'
        ]
        result["tests_run"].extend(name for name in test_names if name in output)

        logger.info(f"[MEMORY STRESS TEST] 执行的测试项: {', '.join(result['tests_run'])}")

    except Exception as e:
        result["passed"] = False
        result["errors"].append(str(e))
        logger.exception(f"[MEMORY STRESS TEST] memtester 执行异常: {e}")

    return result


@require_root
def run_memory_stress_ng(duration: int = 300) -> Dict[str, Any]:
    """
    Run a memory stress test with stress-ng.

    Args:
        duration: Test duration in seconds (passed to ``--timeout``).

    Returns:
        Dict[str, Any]: ``passed`` is False when any output line mentions an
        error or failure (the benign "failed: 0" run summary excluded); those
        lines are copied into ``errors``. ``bogo_ops`` is added when the vm
        metrics line can be parsed. ``duration_seconds`` is the requested
        duration, not a measured one.
    """
    result = {
        "passed": False,
        "tool": "stress-ng",
        "duration_seconds": duration,
        "start_time": None,
        "end_time": None,
        "errors": []
    }

    if not check_command_exists('stress-ng'):
        result["errors"].append("stress-ng 未安装")
        return result

    try:
        result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')

        cmd = [
            'stress-ng',
            '--vm', '4',              # 4 vm workers
            '--vm-bytes', '80%',      # 80% of available memory
            '--vm-method', 'all',     # cycle through every vm test method
            '--timeout', str(duration),
            '--metrics-brief'
        ]

        _, stdout, stderr = execute_command(
            cmd,
            timeout=duration + 30,    # grace period for startup and teardown
            check_returncode=False
        )

        result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')

        output = (stdout or '') + (stderr or '')

        # Newer stress-ng releases always print a "failed: 0" summary line on
        # a clean run; it must not be mistaken for a failure.
        benign = re.compile(r'\bfailed:\s*0\b', re.IGNORECASE)
        problems = []
        for line in output.splitlines():
            lowered = line.lower()
            if ('error' in lowered or 'fail' in lowered) and not benign.search(line):
                problems.append(line.strip())

        result["errors"].extend(problems)
        result["passed"] = not problems

        # Metrics: accept the legacy "vm: N bogo ops" form as well as the
        # tabular "--metrics-brief" row ("stress-ng: info: [pid] vm  N ...").
        bogo_ops = (
            re.search(r'stress-ng:\s+vm:\s+(\d+)\s+bogo ops', output)
            or re.search(r'stress-ng:\s+\w+:\s+\[\d+\]\s+vm\s+(\d+)', output)
        )
        if bogo_ops:
            result["bogo_ops"] = safe_int(bogo_ops.group(1))

    except Exception as e:
        result["passed"] = False
        result["errors"].append(str(e))

    return result


@require_root
def run_memory_stress(duration: int = 300) -> Dict[str, Any]:
    """
    Run a memory stress test with the classic ``stress`` tool (last-resort fallback).

    Args:
        duration: Test duration in seconds (passed to ``--timeout``).

    Returns:
        Dict[str, Any]: ``passed`` is False when any output line mentions an
        error or failure; those lines are copied into ``errors``.
        ``vm_bytes_mb`` is the per-worker allocation that was requested.
        ``duration_seconds`` is the requested duration, not a measured one.
    """
    workers = 4
    result = {
        "passed": False,
        "tool": "stress",
        "duration_seconds": duration,
        "start_time": None,
        "end_time": None,
        "workers": workers,
        "errors": []
    }

    if not check_command_exists('stress'):
        result["errors"].append("stress 未安装")
        return result

    try:
        # `stress` only understands absolute sizes (B/K/M/G suffixes), so turn
        # "80% of available memory" into megabytes split across the workers.
        per_worker_mb = 256  # fallback when MemAvailable cannot be read
        try:
            with open('/proc/meminfo', 'r') as f:
                available = re.search(r'^MemAvailable:\s+(\d+)', f.read(), re.MULTILINE)
            if available:
                available_mb = int(available.group(1)) // 1024
                per_worker_mb = max(1, int(available_mb * 0.8) // workers)
        except OSError:
            pass  # keep the fallback size
        result["vm_bytes_mb"] = per_worker_mb

        result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')

        cmd = [
            'stress',
            '--vm', str(workers),                  # number of allocating workers
            '--vm-bytes', f'{per_worker_mb}M',     # memory per worker
            '--vm-keep',                           # hold the allocation instead of re-allocating
            '--timeout', str(duration)
        ]

        _, stdout, stderr = execute_command(
            cmd,
            timeout=duration + 30,                 # grace period for startup and teardown
            check_returncode=False
        )

        result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')

        output = (stdout or '') + (stderr or '')

        # Any line mentioning an error or failure marks the run as failed.
        problems = [
            line.strip()
            for line in output.splitlines()
            if 'error' in line.lower() or 'fail' in line.lower()
        ]
        result["errors"].extend(problems)
        result["passed"] = not problems

    except Exception as e:
        result["passed"] = False
        result["errors"].append(str(e))

    return result


if __name__ == '__main__':
    # Script entry point: run the passive checks only and print the report as JSON.
    import json

    report = run_memory_check(stress_test=False)
    print(json.dumps(report, indent=2, ensure_ascii=False))