# ServerGuard/modules/memory.py

"""
ServerGuard - 内存检测与压力测试模块

深度检测内存的读写错误和稳定性。
"""

import os
import re
import time
from typing import Dict, Any, List, Optional

import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from utils import (
    execute_command, check_command_exists, safe_int, safe_float,
    format_bytes, require_root
)


def run_memory_check(stress_test: bool = False, stress_duration: int = 300) -> Dict[str, Any]:
    """
    Run the memory checks and, optionally, a memory stress test.

    Args:
        stress_test: Whether to run a stress test after the passive checks.
        stress_duration: Stress-test duration in seconds, handed to the
            selected stress runner.

    Returns:
        Dict[str, Any]: Report with the keys "status", "summary",
        "dimm_info", "ecc_status", "edac_errors" and "stress_test"; an
        "error" key is added when an exception was caught.
    """
    report = {
        "status": "success",
        "summary": {},
        "dimm_info": [],
        "ecc_status": {},
        "edac_errors": {},
        "stress_test": {}
    }

    try:
        # Passive probes: usage summary, DIMM inventory, ECC state.
        report["summary"] = get_memory_summary()
        report["dimm_info"] = get_dimm_info()
        report["ecc_status"] = check_ecc_status()

        # Any EDAC error recorded so far downgrades the status to a warning.
        edac = check_edac_errors()
        report["edac_errors"] = edac
        if edac.get("total_errors", 0) > 0:
            report["status"] = "warning"

        if not stress_test:
            return report

        # Pick the first available tool: memtester, then stress-ng, then stress.
        if check_command_exists('memtester'):
            outcome = run_memtester(stress_duration)
        elif check_command_exists('stress-ng'):
            outcome = run_memory_stress_ng(stress_duration)
        elif check_command_exists('stress'):
            outcome = run_memory_stress(stress_duration)
        else:
            outcome = {
                "passed": False,
                "error": "未找到内存压力测试工具 (memtester/stress-ng/stress)"
            }
        report["stress_test"] = outcome

        # A stress test that did not pass overrides any earlier status.
        if not outcome.get("passed", False):
            report["status"] = "error"

    except Exception as exc:
        report["status"] = "error"
        report["error"] = str(exc)

    return report


def get_memory_summary() -> Dict[str, Any]:
    """
    Summarise memory and swap usage from /proc/meminfo.

    Returns:
        Dict[str, Any]: Byte counts plus matching "*_gb" values (rounded to
        two decimals), and "usage_percent" when the total is non-zero. An
        "error" key holds the exception text if reading or parsing failed.
    """
    summary = {
        "total_bytes": 0,
        "total_gb": 0,
        "available_bytes": 0,
        "available_gb": 0,
        "used_bytes": 0,
        "used_gb": 0,
        "free_bytes": 0,
        "free_gb": 0,
        "buffers_bytes": 0,
        "cached_bytes": 0,
        "swap_total_bytes": 0,
        "swap_used_bytes": 0,
        "swap_free_bytes": 0
    }

    # (result key, pattern capturing the kB figure of the meminfo line)
    meminfo_fields = (
        ("total_bytes", r'MemTotal:\s+(\d+)'),
        ("free_bytes", r'MemFree:\s+(\d+)'),
        ("available_bytes", r'MemAvailable:\s+(\d+)'),
        ("buffers_bytes", r'Buffers:\s+(\d+)'),
        ("cached_bytes", r'Cached:\s+(\d+)'),
        ("swap_total_bytes", r'SwapTotal:\s+(\d+)'),
        ("swap_free_bytes", r'SwapFree:\s+(\d+)'),
    )

    def _as_gb(num_bytes):
        return round(num_bytes / (1024**3), 2)

    try:
        with open('/proc/meminfo', 'r') as handle:
            text = handle.read()

        for field, regex in meminfo_fields:
            found = re.search(regex, text)
            if not found:
                continue
            # The captured figure is in kB; store bytes and the GB twin.
            size = safe_int(found.group(1)) * 1024
            summary[field] = size
            summary[field.replace('bytes', 'gb')] = _as_gb(size)

        # Used = total minus free, buffers and page cache.
        used = (summary["total_bytes"] - summary["free_bytes"]
                - summary["buffers_bytes"] - summary["cached_bytes"])
        summary["used_bytes"] = used
        summary["used_gb"] = _as_gb(used)

        # Swap usage.
        summary["swap_used_bytes"] = summary["swap_total_bytes"] - summary["swap_free_bytes"]
        summary["swap_used_gb"] = _as_gb(summary["swap_used_bytes"])
        summary["swap_free_gb"] = _as_gb(summary["swap_free_bytes"])

        # Usage percentage, only when the total is known.
        if summary["total_bytes"] > 0:
            summary["usage_percent"] = round((summary["used_bytes"] / summary["total_bytes"]) * 100, 1)

    except Exception as exc:
        summary["error"] = str(exc)

    return summary


# Placeholder strings dmidecode prints when the firmware left a field blank.
_DIMM_PLACEHOLDERS = frozenset((
    'Not Specified', 'Not Provided', 'To be filled by O.E.M.', 'None',
    'No Module Installed', 'Unknown',
))

# (result key, accepted dmidecode labels in lower case, keep leading digits only)
_DIMM_FIELDS = (
    ("array_handle", ("array handle",), False),
    ("error_handle", ("error information handle",), False),
    ("total_width", ("total width",), True),
    ("data_width", ("data width",), True),
    ("size", ("size",), False),
    ("form_factor", ("form factor",), False),
    ("set", ("set",), False),
    ("locator", ("locator",), False),
    ("bank_locator", ("bank locator",), False),
    ("type", ("type",), False),
    ("type_detail", ("type detail",), False),
    ("speed", ("speed",), False),
    ("manufacturer", ("manufacturer",), False),
    ("serial_number", ("serial number",), False),
    ("asset_tag", ("asset tag",), False),
    ("part_number", ("part number",), False),
    ("rank", ("rank",), True),
    # Older dmidecode releases label this field "Configured Clock Speed".
    ("configured_speed", ("configured memory speed", "configured clock speed"), False),
    ("minimum_voltage", ("minimum voltage",), False),
    ("maximum_voltage", ("maximum voltage",), False),
    ("configured_voltage", ("configured voltage",), False),
)


def _parse_memory_device(section: str) -> Optional[Dict[str, Any]]:
    """Parse the text following one "Memory Device" heading; None for an empty slot."""
    raw: Dict[str, str] = {}
    for text_line in section.splitlines():
        stripped = text_line.strip()
        if not stripped:
            # A blank line ends the structure; anything after it belongs to
            # the next DMI record and must not leak into this DIMM.
            if raw:
                break
            continue
        label, colon, value = stripped.partition(':')
        if colon:
            # Exact, per-line labels: "Locator" can no longer be confused with
            # "Bank Locator", and a blank value never swallows the next line.
            raw.setdefault(label.strip().lower(), value.strip())

    # Detect unpopulated slots before placeholder filtering removes the marker.
    if 'No Module' in raw.get('size', ''):
        return None

    dimm: Dict[str, Any] = {}
    for key, labels, digits_only in _DIMM_FIELDS:
        value = next((raw[label] for label in labels if raw.get(label)), '')
        if digits_only:
            leading = re.match(r'\d+', value)
            value = leading.group(0) if leading else ''
        if value and value not in _DIMM_PLACEHOLDERS:
            dimm[key] = value

    # Numeric size, normalised to MB (and GB when dmidecode reported GB).
    size_text = dimm.get('size', '')
    if 'MB' in size_text:
        dimm["size_mb"] = safe_int(size_text.replace('MB', '').strip())
    elif 'GB' in size_text:
        dimm["size_gb"] = safe_float(size_text.replace('GB', '').strip())
        dimm["size_mb"] = int(dimm["size_gb"] * 1024)

    # Numeric speed in whichever unit dmidecode used.
    speed_text = dimm.get('speed', '')
    if 'MT/s' in speed_text:
        dimm["speed_mts"] = safe_int(speed_text.replace('MT/s', '').strip())
    elif 'MHz' in speed_text:
        dimm["speed_mhz"] = safe_int(speed_text.replace('MHz', '').strip())

    return dimm


def get_dimm_info() -> List[Dict[str, Any]]:
    """
    Return details for every populated DIMM slot reported by dmidecode.

    Runs ``dmidecode -t memory`` and parses each "Memory Device" section.
    Slots whose size is "No Module Installed" are skipped, placeholder
    values are dropped, and string fields keep their full value (for
    example a multi-word manufacturer name).

    Returns:
        List[Dict[str, Any]]: One dict per populated slot. The list is empty
        when dmidecode is not installed, and holds whatever was parsed so
        far if the command or the parsing raised.
    """
    dimms: List[Dict[str, Any]] = []

    if not check_command_exists('dmidecode'):
        return dimms

    try:
        _, stdout, _ = execute_command(
            ['dmidecode', '-t', 'memory'],
            check_returncode=False, timeout=15
        )

        # The text before the first "Memory Device" heading is not a DIMM.
        for section in stdout.split('Memory Device')[1:]:
            dimm = _parse_memory_device(section)
            if dimm:
                dimms.append(dimm)

    except Exception:
        # Deliberately best-effort: the DIMM inventory is optional and must
        # not abort the overall memory check.
        pass

    return dimms


def check_ecc_status() -> Dict[str, Any]:
    """
    Report ECC (error-correcting code) memory status.

    Three sources are consulted, each on a best-effort basis:

    1. ``/proc/meminfo`` -- the ``HardwareCorrupted`` figure is copied into
       "errors". Its presence reflects kernel configuration, not ECC
       hardware, so it no longer marks ECC as supported.
    2. ``dmidecode -t memory`` -- "Error Correction Type" gives the mode.
       ECC counts as supported and enabled only when a mode other than
       "None" or "Unknown" is reported.
    3. EDAC sysfs -- per-controller correctable and uncorrectable error
       counters are summed.

    Returns:
        Dict[str, Any]: Always "supported", "enabled", "mode" and "errors";
        plus "edac_available", "correctable_errors" and
        "uncorrectable_errors" when the EDAC directory exists and the
        counter files are readable.
    """
    result: Dict[str, Any] = {
        "supported": False,
        "enabled": False,
        "mode": "unknown",
        "errors": 0
    }

    # Source 1: /proc/meminfo.
    try:
        with open('/proc/meminfo', 'r') as f:
            content = f.read()

        # The value is a size in kB; the key name "errors" is kept so that
        # existing consumers of this dict keep working.
        match = re.search(r'HardwareCorrupted:\s+(\d+)\s+kB', content)
        if match:
            result["errors"] = safe_int(match.group(1))
    except Exception:
        # Best effort: a missing or unreadable file leaves the defaults.
        pass

    # Source 2: the error-correction type reported by dmidecode.
    if check_command_exists('dmidecode'):
        try:
            _, stdout, _ = execute_command(
                ['dmidecode', '-t', 'memory'],
                check_returncode=False, timeout=10
            )

            # [ \t]* keeps the match on one line, so a blank value cannot
            # capture the following line.
            modes = [
                mode.strip()
                for mode in re.findall(r'Error Correction Type:[ \t]*(.+)', stdout)
            ]
            if modes:
                # dmidecode prints this field on non-ECC systems too, so its
                # presence alone proves nothing.
                ecc_modes = [mode for mode in modes if mode not in ('None', 'Unknown')]
                result["mode"] = (ecc_modes or modes)[0]
                if ecc_modes:
                    result["supported"] = True
                    result["enabled"] = True

        except Exception:
            # Best effort: dmidecode usually needs root and may fail.
            pass

    # Source 3: EDAC counters.
    edac_path = '/sys/devices/system/edac/mc'
    if os.path.exists(edac_path):
        result["edac_available"] = True
        try:
            for mc in os.listdir(edac_path):
                if not mc.startswith('mc'):
                    continue

                mc_path = os.path.join(edac_path, mc)
                ce_file = os.path.join(mc_path, 'ce_count')  # correctable errors
                ue_file = os.path.join(mc_path, 'ue_count')  # uncorrectable errors

                if os.path.exists(ce_file):
                    with open(ce_file, 'r') as f:
                        ce_count = safe_int(f.read().strip())
                    result["correctable_errors"] = result.get("correctable_errors", 0) + ce_count

                if os.path.exists(ue_file):
                    with open(ue_file, 'r') as f:
                        ue_count = safe_int(f.read().strip())
                    result["uncorrectable_errors"] = result.get("uncorrectable_errors", 0) + ue_count
        except Exception:
            # Best effort: keep whatever counters were read so far.
            pass

    return result


def check_edac_errors() -> Dict[str, Any]:
    """
    Collect EDAC (Error Detection and Correction) error counters.

    Returns:
        Dict[str, Any]: Summed correctable/uncorrectable counts, their total
        and one entry per "mc*" directory under the EDAC sysfs path. A
        "note" key is set when that path does not exist; an "error" key
        holds the exception text if the scan was interrupted.
    """
    report = {
        "total_errors": 0,
        "correctable_errors": 0,
        "uncorrectable_errors": 0,
        "memory_controllers": []
    }

    mc_root = '/sys/devices/system/edac/mc'
    if not os.path.exists(mc_root):
        report["note"] = "EDAC 不可用"
        return report

    # (sysfs file, result field) for the two error counters.
    counter_files = (
        ('ce_count', "correctable_errors"),
        ('ue_count', "uncorrectable_errors"),
    )
    # Descriptive attributes copied verbatim into the controller entry.
    attribute_files = ('mc_name', 'size_mb', 'mem_type', 'edac_mc_mode')

    try:
        for entry in os.listdir(mc_root):
            if not entry.startswith('mc'):
                continue

            controller_dir = os.path.join(mc_root, entry)
            controller = {"name": entry}

            for file_name, field in counter_files:
                counter_path = os.path.join(controller_dir, file_name)
                if not os.path.exists(counter_path):
                    continue
                with open(counter_path, 'r') as handle:
                    count = safe_int(handle.read().strip())
                controller[field] = count
                report[field] += count

            for attribute in attribute_files:
                attribute_path = os.path.join(controller_dir, attribute)
                if os.path.exists(attribute_path):
                    with open(attribute_path, 'r') as handle:
                        controller[attribute] = handle.read().strip()

            report["memory_controllers"].append(controller)

        report["total_errors"] = report["correctable_errors"] + report["uncorrectable_errors"]

    except Exception as exc:
        report["error"] = str(exc)

    return report


@require_root
def run_memtester(duration: int = 300) -> Dict[str, Any]:
    """
    Run one memtester loop over about 70% of the available memory.

    Args:
        duration: Accepted for signature parity with the other stress
            runners but not used: memtester is driven by memory size and
            loop count, not by time.

    Returns:
        Dict[str, Any]: "passed", "size_mb", "iterations", start/end
        timestamps, "duration_seconds", "errors", "tests_run" and, once the
        command has run, "raw_output" (first 2000 characters).
    """
    result = {
        "passed": False,
        "size_mb": 0,
        "iterations": 1,
        "start_time": None,
        "end_time": None,
        "duration_seconds": 0,
        "errors": [],
        "tests_run": []
    }

    if not check_command_exists('memtester'):
        result["errors"].append("memtester 未安装")
        return result

    try:
        # Size the test from MemAvailable, leaving headroom for the system.
        with open('/proc/meminfo', 'r') as f:
            content = f.read()

        match = re.search(r'MemAvailable:\s+(\d+)', content)
        if match:
            available_mb = safe_int(match.group(1)) // 1024
            # 70% of available memory, but never less than 64 MB.
            test_size_mb = max(64, int(available_mb * 0.7))
        else:
            test_size_mb = 256

        result["size_mb"] = test_size_mb
        result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
        start_ts = time.time()

        cmd = ['memtester', f'{test_size_mb}M', '1']

        # NOTE(review): the first tuple element is presumably the exit
        # status; using it would be more reliable than parsing output --
        # TODO confirm against execute_command.
        _, stdout, stderr = execute_command(
            cmd,
            timeout=max(300, test_size_mb),  # scale the timeout with the test size
            check_returncode=False
        )

        result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
        result["duration_seconds"] = round(time.time() - start_ts, 2)

        output = stdout + stderr
        result["raw_output"] = output[:2000]  # keep part of the raw output

        lowered = output.lower()
        if 'failure' in lowered:
            result["passed"] = False
            # Keep the lines that describe the failure.
            for line in output.split('\n'):
                line_lowered = line.lower()
                if 'failure' in line_lowered or 'error' in line_lowered:
                    result["errors"].append(line.strip())
        elif 'Done' in output or 'finished' in lowered or 'success' in lowered:
            # Require a completion marker. A per-test "ok" is not enough: a
            # run cut short after its first test also prints "ok".
            result["passed"] = True
        else:
            result["passed"] = False
            result["errors"].append("测试可能未完成")

        # Record which memtester sub-tests appear in the output.
        test_names = [
            'Stuck Address', 'Random Value', 'Compare XOR',
            'Compare SUB', 'Compare MUL', 'Compare DIV',
            'Compare OR', 'Compare AND', 'Sequential Increment',
            'Solid Bits', 'Block Sequential', 'Checkerboard',
            'Bit Spread', 'Bit Flip', 'Walking Ones', 'Walking Zeroes'
        ]
        result["tests_run"].extend(test for test in test_names if test in output)

    except Exception as e:
        result["passed"] = False
        result["errors"].append(str(e))

    return result


@require_root
def run_memory_stress_ng(duration: int = 300) -> Dict[str, Any]:
    """
    Run a memory stress test with stress-ng.

    Args:
        duration: Test duration in seconds.

    Returns:
        Dict[str, Any]: "passed", "tool", "duration_seconds", start/end
        timestamps, "errors" (output lines that indicate a problem) and
        "bogo_ops" when the vm metrics row could be parsed.
    """
    result = {
        "passed": False,
        "tool": "stress-ng",
        "duration_seconds": duration,
        "start_time": None,
        "end_time": None,
        "errors": []
    }

    if not check_command_exists('stress-ng'):
        result["errors"].append("stress-ng 未安装")
        return result

    try:
        result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')

        cmd = [
            'stress-ng',
            '--vm', '4',  # four vm workers
            # NOTE(review): whether 80% applies per worker or is shared by
            # all workers depends on the stress-ng release -- TODO confirm.
            '--vm-bytes', '80%',
            '--vm-method', 'all',  # cycle through every vm test method
            '--timeout', str(duration),
            '--metrics-brief'
        ]

        _, stdout, stderr = execute_command(
            cmd,
            timeout=duration + 30,
            check_returncode=False
        )

        result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')

        output = stdout + stderr

        # Collect the lines that signal a problem. Recent stress-ng releases
        # always print a "failed: 0" tally, which must not count as a failure.
        problem_lines = []
        for raw_line in output.splitlines():
            line = raw_line.strip()
            scrubbed = re.sub(r'\bfailed:\s*0\b', '', line.lower())
            if 'error' in scrubbed or 'fail' in scrubbed or 'unsuccessful run' in scrubbed:
                problem_lines.append(line)

        result["passed"] = not problem_lines
        result["errors"].extend(problem_lines)

        # Bogo-ops figure: try the original pattern first, then the
        # --metrics-brief row "stress-ng: <tag>: [pid] vm <bogo ops> ...".
        bogo_ops = (
            re.search(r'stress-ng:\s+vm:\s+(\d+)\s+bogo ops', output)
            or re.search(r'stress-ng:\s+\w+:\s+\[\d+\]\s+vm\s+(\d+)\s', output)
        )
        if bogo_ops:
            result["bogo_ops"] = safe_int(bogo_ops.group(1))

    except Exception as e:
        result["passed"] = False
        result["errors"].append(str(e))

    return result


@require_root
def run_memory_stress(duration: int = 300) -> Dict[str, Any]:
    """
    Run a memory stress test with the classic ``stress`` tool (fallback).

    ``stress`` only accepts absolute ``--vm-bytes`` sizes (B/K/M/G suffixes)
    and rejects percentages, so the per-worker size is computed here: 80% of
    MemAvailable split evenly across the workers.

    Args:
        duration: Test duration in seconds.

    Returns:
        Dict[str, Any]: "passed", "tool", "duration_seconds", start/end
        timestamps, "workers", "errors" (output lines that indicate a
        problem) and "vm_bytes_per_worker_mb".
    """
    result = {
        "passed": False,
        "tool": "stress",
        "duration_seconds": duration,
        "start_time": None,
        "end_time": None,
        "workers": 4,
        "errors": []
    }

    if not check_command_exists('stress'):
        result["errors"].append("stress 未安装")
        return result

    try:
        workers = result["workers"]

        # Fall back to stress's own 256 MB default if MemAvailable is unknown.
        per_worker_mb = 256
        try:
            with open('/proc/meminfo', 'r') as f:
                meminfo = f.read()
            available = re.search(r'MemAvailable:\s+(\d+)', meminfo)
            if available:
                available_mb = int(available.group(1)) // 1024
                # 80% of available memory in total, shared by all workers,
                # so the test itself cannot exhaust memory.
                per_worker_mb = max(1, int(available_mb * 0.8) // workers)
        except OSError:
            pass  # keep the default size
        result["vm_bytes_per_worker_mb"] = per_worker_mb

        result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')

        # --vm: number of memory workers
        # --vm-bytes: memory allocated by each worker
        # --vm-keep: keep the allocation instead of freeing and reallocating
        # --timeout: stop after this many seconds
        cmd = [
            'stress',
            '--vm', str(workers),
            '--vm-bytes', f'{per_worker_mb}M',
            '--vm-keep',
            '--timeout', str(duration)
        ]

        _, stdout, stderr = execute_command(
            cmd,
            timeout=duration + 30,
            check_returncode=False
        )

        result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')

        output = stdout + stderr

        # Any line mentioning an error or failure marks the run as failed;
        # those lines are kept so the caller can see why.
        problem_lines = [
            line.strip()
            for line in output.splitlines()
            if 'error' in line.lower() or 'fail' in line.lower()
        ]
        result["passed"] = not problem_lines
        result["errors"].extend(problem_lines)

    except Exception as e:
        result["passed"] = False
        result["errors"].append(str(e))

    return result


if __name__ == '__main__':
    import json

    # Script entry point: run the passive checks only (no stress test) and
    # print the report as indented JSON, keeping non-ASCII text readable.
    report = run_memory_check(stress_test=False)
    print(json.dumps(report, indent=2, ensure_ascii=False))