# ServerGuard/modules/log_analyzer.py

"""
ServerGuard - 日志分析模块

自动分析系统日志，查找硬件相关错误关键词。
"""

import os
import re
import gzip
from typing import Dict, Any, List, Optional
from datetime import datetime, timedelta

import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from utils import execute_command, check_command_exists, safe_int


# Hardware error keywords, grouped by category.
#
# Each key is an error category and each value is a list of regular
# expressions. analyze_dmesg() and analyze_journalctl() test every log line
# against these with re.search(..., re.IGNORECASE); within one category the
# scan stops at the first pattern that matches, so a line is counted at most
# once per category (but may be counted under several categories).
HARDWARE_ERROR_PATTERNS = {
    # CPU: machine-check, thermal/throttling and generic (un)correctable errors
    "cpu_errors": [
        r'Machine check events? logged',
        r'Hardware Error',
        r'CMCI storm',
        r'machine check',
        r'CPU\s*\d+.*temperature',
        r'thermal.*cpu',
        r'CPU.*throttl',
        r'core.*temp',
        r'CPU.*fault',
        r'uncorrectable',
        r'correctable.*error',
    ],
    # Memory: EDAC/ECC reports, parity errors and out-of-memory conditions
    "memory_errors": [
        r'Hardware error.*memory',
        r'EDAC.*error',
        r'memory.*error',
        r'Memory.*parity',
        r'ECC.*error',
        r'ue\s+count',
        r'ce\s+count',
        r'Out of memory',
        r'oom-kill',
        r'page allocation failure',
    ],
    # Storage: block-layer I/O errors, ATA/SATA/NVMe errors, SMART failures
    "storage_errors": [
        r'I/O error',
        r'Buffer I/O error',
        r'blk_update_request',
        r'ata\d+.*error',
        r'SATA.*error',
        r'NVMe.*error',
        r'critical.*warning',
        r'disk error',
        r'block.*error',
        r'SMART.*failure',
        r'medium error',
        r'uncorrectable error',
    ],
    # PCI / PCIe: bus errors and AER (Advanced Error Reporting) messages
    "pci_errors": [
        r'PCIe.*error',
        r'pcieport.*error',
        r'PCI.*error',
        r'AER:\s*',
        r'Corrected error',
        r'Uncorrected error',
        r'Non-Fatal error',
        r'Fatal error',
        r'Unsupported Request',
    ],
    # USB: device errors, over-current, disconnects and host-controller errors
    "usb_errors": [
        r'usb.*error',
        r'USB.*over-current',
        r'usb.*disconnect',
        r'usb.*timeout',
        r'ehci.*error',
        r'xhci.*error',
    ],
    # Power / thermal: shutdowns, critical temperatures and voltage problems
    "power_errors": [
        r'thermal.*shutdown',
        r'critical.*temperature',
        r'overheat',
        r'power.*fail',
        r'under.*voltage',
        r'over.*voltage',
        r'brownout',
    ],
    # Kernel: panics, lockups, BUG/Oops reports and stack traces
    "kernel_panics": [
        r'Kernel panic',
        r'sysrq.*trigger',
        r'watchdog.*bug',
        r'softlockup',
        r'hardlockup',
        r'BUG:.*spinlock',
        r'BUG:.*scheduling',
        r'Oops:',
        r'Call Trace:',
        r'general protection fault',
        r'double fault',
        r'stack.*corruption',
    ]
}


def analyze_logs() -> Dict[str, Any]:
    """
    Run the full hardware-error analysis over the system logs.

    The dmesg and journalctl scans run first; their results are then reduced
    into per-category totals, per-category detail lists and a list of
    critical events, and finally summarised.

    Returns:
        Dict[str, Any]: The report. ``status`` is ``"success"`` when no error
        was counted, ``"warning"`` when at least one was, and ``"error"``
        (with the exception text under ``error``) when the analysis itself
        raised.
    """
    report: Dict[str, Any] = {
        "status": "success",
        "scan_time": datetime.now().isoformat(),
        "dmesg_analysis": {},
        "journal_analysis": {},
        "hardware_errors": {},
        "error_details": {},
        "critical_events": [],
        "summary": {},
    }

    # Collectors take no input; each one fills its own section of the report.
    collectors = (
        ("dmesg_analysis", analyze_dmesg),
        ("journal_analysis", analyze_journalctl),
    )
    # Reducers read the sections filled in so far, so their order matters.
    reducers = (
        ("hardware_errors", summarize_errors),
        ("error_details", collect_error_details),
        ("critical_events", identify_critical_events),
    )

    try:
        for section, collect in collectors:
            report[section] = collect()
        for section, reduce_step in reducers:
            report[section] = reduce_step(report)

        error_total = sum(report["hardware_errors"].values())
        found_any = error_total > 0
        report["summary"] = {
            "total_errors_found": error_total,
            "critical_events": len(report["critical_events"]),
            "recommend_action": found_any,
        }

        # Any counted error downgrades the overall status to a warning.
        if found_any:
            report["status"] = "warning"
    except Exception as exc:
        report["status"] = "error"
        report["error"] = str(exc)

    return report


def is_false_positive(line: str, error_type: str) -> bool:
    """
    Decide whether a matched log line is a benign message rather than an error.

    All checks are case-insensitive substring tests on the line.

    Args:
        line: The log line that matched an error pattern.
        error_type: The category the line was matched under.

    Returns:
        bool: True when the line should be ignored for that category.
    """
    text = line.lower()

    if error_type == "kernel_panics":
        # A bare "Call Trace:" header is not an error by itself; it only
        # counts when the same line also carries a real error keyword.
        if 'call trace:' not in text:
            return False
        genuine_markers = ('oops', 'panic', 'bug:', 'warning:', 'error:')
        return all(marker not in text for marker in genuine_markers)

    if error_type == "power_errors":
        # Power-button input registration and the ACPI PWRF device are
        # ordinary ACPI events.
        button_registration = 'power button' in text and 'input:' in text
        return button_registration or 'pwrf' in text

    if error_type == "cpu_errors":
        # Plain temperature readings are sensor output; only "above" or
        # "critical" readings are kept as errors.
        return 'temperature' in text and not ('above' in text or 'critical' in text)

    return False


def analyze_dmesg() -> Dict[str, Any]:
    """
    Scan the ``dmesg`` output for hardware-related error messages.

    ``dmesg --time-format=iso`` is tried first, then plain ``dmesg`` if that
    prints nothing. If neither prints anything on stdout but the command
    wrote a diagnostic to stderr, the buffer is reported as unavailable with
    that diagnostic, rather than as "available with zero errors".

    Returns:
        Dict[str, Any]: A dict with
            * ``available`` - True only once dmesg output could be scanned,
            * ``error_counts`` - matching-line count per error category,
            * ``recent_errors`` - up to 50 distinct matching entries,
            * ``boot_errors`` - the result of ``extract_boot_errors``,
            * ``note`` / ``error`` - present only when something went wrong.
    """
    result = {
        "available": False,
        "error_counts": {key: 0 for key in HARDWARE_ERROR_PATTERNS.keys()},
        "recent_errors": [],
        "boot_errors": []
    }

    if not check_command_exists('dmesg'):
        result["note"] = "dmesg 不可用"
        return result

    try:
        # NOTE(review): execute_command presumably returns
        # (returncode, stdout, stderr) - confirm against utils.
        _, stdout, stderr = execute_command(
            ['dmesg', '--time-format=iso'],
            check_returncode=False, timeout=15
        )

        # Fall back to the default format when --time-format yields nothing.
        if not stdout.strip():
            _, stdout, stderr = execute_command(
                ['dmesg'],
                check_returncode=False, timeout=15
            )

        # No output plus a diagnostic means dmesg could not read the buffer
        # (for example when access is restricted to root). Report that
        # instead of a misleading clean scan.
        if not stdout.strip() and stderr and stderr.strip():
            result["note"] = "dmesg 无法读取内核日志"
            result["error"] = stderr.strip()
            return result

        result["available"] = True

        lines = stdout.split('\n')
        counts = result["error_counts"]
        recent = result["recent_errors"]

        for line in lines:
            if not line.strip():
                continue

            for error_type, patterns in HARDWARE_ERROR_PATTERNS.items():
                # Only the first matching pattern of a category matters.
                matched = next(
                    (p for p in patterns if re.search(p, line, re.IGNORECASE)),
                    None
                )
                if matched is None or is_false_positive(line, error_type):
                    continue

                counts[error_type] += 1

                # Keep a bounded, de-duplicated sample of the matching lines.
                if len(recent) < 50:
                    error_entry = {
                        "type": error_type,
                        "message": line.strip(),
                        "pattern": matched,
                        "source": "dmesg"
                    }
                    if error_entry not in recent:
                        recent.append(error_entry)

        result["boot_errors"] = extract_boot_errors(lines)

    except Exception as e:
        result["error"] = str(e)

    return result


def analyze_journalctl() -> Dict[str, Any]:
    """
    Scan recent error-priority journal entries for hardware-related errors.

    Reads the output of ``journalctl -n 1000 --no-pager -p err`` and matches
    every non-blank line against HARDWARE_ERROR_PATTERNS.

    Returns:
        Dict[str, Any]: A dict with ``available``, per-category
        ``error_counts``, up to 50 distinct ``recent_errors``,
        ``boot_events`` from ``get_journal_boot_events``, and a ``note`` or
        ``error`` entry when the journal could not be analysed.
    """
    report = {
        "available": False,
        "error_counts": dict.fromkeys(HARDWARE_ERROR_PATTERNS, 0),
        "recent_errors": [],
        "boot_events": []
    }

    if not check_command_exists('journalctl'):
        report["note"] = "journalctl 不可用"
        return report

    try:
        _, stdout, stderr = execute_command(
            ['journalctl', '-n', '1000', '--no-pager', '-p', 'err'],
            check_returncode=False, timeout=15
        )

        if 'No journal files were found' in stderr:
            report["note"] = "无 journal 文件"
            return report

        report["available"] = True

        tallies = report["error_counts"]
        samples = report["recent_errors"]

        for raw in stdout.split('\n'):
            if not raw.strip():
                continue

            for category, regexes in HARDWARE_ERROR_PATTERNS.items():
                hit = any(re.search(rx, raw, re.IGNORECASE) for rx in regexes)
                # Lines that match but are known to be benign are not counted.
                if not hit or is_false_positive(raw, category):
                    continue

                tallies[category] += 1

                # Keep a bounded, de-duplicated sample of matching lines.
                if len(samples) >= 50:
                    continue
                entry = {
                    "type": category,
                    "message": raw.strip(),
                    "source": "journalctl"
                }
                if entry not in samples:
                    samples.append(entry)

        report["boot_events"] = get_journal_boot_events()

    except Exception as exc:
        report["error"] = str(exc)

    return report


def extract_boot_errors(lines: List[str]) -> List[Dict[str, str]]:
    """
    Pick out error-like messages logged during the boot phase.

    The boot phase starts at a line containing ``Linux version`` or
    ``Command line:`` and ends after a line containing both ``systemd`` and
    ``startup``. Inside it, lines mentioning error/fail/warn are kept unless
    they also mention firmware, efi or acpi.

    Args:
        lines: Log lines in chronological order.

    Returns:
        List[Dict[str, str]]: At most 20 ``{"stage": "boot", "message": ...}``
        entries, in input order.
    """
    problem_words = ('error', 'fail', 'warn')
    benign_words = ('firmware', 'efi', 'acpi')

    collected = []
    booting = False

    for raw in lines:
        if 'Linux version' in raw or 'Command line:' in raw:
            booting = True

        if not booting:
            continue

        lowered = raw.lower()
        looks_bad = any(word in lowered for word in problem_words)
        # Firmware / EFI / ACPI chatter is treated as non-critical noise.
        if looks_bad and not any(word in lowered for word in benign_words):
            collected.append({"stage": "boot", "message": raw.strip()})

        # The systemd startup line marks the end of the boot phase.
        if 'systemd' in raw and 'startup' in raw:
            booting = False

    return collected[:20]


def get_journal_boot_events() -> List[Dict[str, str]]:
    """
    Collect noteworthy journal messages from the current boot.

    Reads ``journalctl -b 0 --no-pager -p warning`` and keeps lines that
    mention "error", "fail" or "hardware" (case-insensitive).

    Returns:
        List[Dict[str, str]]: At most 20 ``{"message": ...}`` entries, or an
        empty list when the journal could not be read.
    """
    keywords = ('error', 'fail', 'hardware')
    events = []

    try:
        _, stdout, _ = execute_command(
            ['journalctl', '-b', '0', '--no-pager', '-p', 'warning'],
            check_returncode=False, timeout=10
        )

        for line in stdout.split('\n'):
            lowered = line.lower()
            if any(word in lowered for word in keywords):
                events.append({"message": line.strip()})

        return events[:20]

    except Exception:
        # Best effort: a failing journal yields no events. Catching Exception
        # rather than using a bare except lets KeyboardInterrupt and
        # SystemExit propagate.
        return []


def summarize_errors(analysis_result: Dict[str, Any]) -> Dict[str, int]:
    """
    Add up the per-category error counts from dmesg and journalctl.

    Args:
        analysis_result: The report; its ``dmesg_analysis`` and
            ``journal_analysis`` sections are read if present.

    Returns:
        Dict[str, int]: One total per category in HARDWARE_ERROR_PATTERNS;
        a missing section or category counts as 0.
    """
    from_dmesg = analysis_result.get("dmesg_analysis", {}).get("error_counts", {})
    from_journal = analysis_result.get("journal_analysis", {}).get("error_counts", {})

    return {
        category: from_dmesg.get(category, 0) + from_journal.get(category, 0)
        for category in HARDWARE_ERROR_PATTERNS
    }


def collect_error_details(analysis_result: Dict[str, Any]) -> Dict[str, List[Dict[str, str]]]:
    """
    Group the sampled error entries by category.

    Args:
        analysis_result: The report; ``recent_errors`` from its
            ``dmesg_analysis`` and ``journal_analysis`` sections are read.

    Returns:
        Dict[str, List[Dict]]: For every category with at least one entry,
        up to 10 entries with distinct messages (dmesg entries first).
        Categories without entries are omitted.
    """
    combined = [
        *analysis_result.get("dmesg_analysis", {}).get("recent_errors", []),
        *analysis_result.get("journal_analysis", {}).get("recent_errors", []),
    ]

    details = {}
    for category in HARDWARE_ERROR_PATTERNS:
        matching = [entry for entry in combined if entry["type"] == category]
        if not matching:
            continue

        kept = []
        known_messages = set()
        for entry in matching:
            text = entry["message"]
            # Skip repeated messages and anything past the per-category cap.
            if text in known_messages or len(kept) >= 10:
                continue
            known_messages.add(text)
            kept.append(entry)
        details[category] = kept

    return details


def identify_critical_events(analysis_result: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Flag sampled errors that need immediate attention.

    Every entry in the dmesg and journalctl ``recent_errors`` lists is
    matched (case-insensitively) against a fixed table of critical
    signatures; one event is produced per matching signature.

    Args:
        analysis_result: The report holding the two analysis sections.

    Returns:
        List[Dict[str, Any]]: At most 20 distinct events, critical ones
        first, otherwise in discovery order. Messages are cut to 300
        characters.
    """
    severity_rank = {'critical': 0, 'warning': 1, 'info': 2}

    # (regex, event type, description, severity)
    signatures = (
        (r'Kernel panic', 'kernel_panic', '内核崩溃', 'critical'),
        (r'hardlockup', 'hard_lockup', 'CPU 硬死锁', 'critical'),
        (r'softlockup', 'soft_lockup', 'CPU 软死锁', 'critical'),
        (r'thermal.*shutdown', 'thermal_shutdown', '过热关机', 'critical'),
        (r'Hardware Error', 'hardware_error', '硬件错误', 'critical'),
        (r'Fatal.*PCIe', 'pcie_fatal', 'PCIe 致命错误', 'critical'),
        (r'I/O error.*sector', 'disk_io_error', '磁盘 I/O 错误', 'warning'),
        (r'Uncorrectable.*error', 'uncorrectable_error', '不可纠正错误', 'warning'),
        (r'out of memory.*kill', 'oom_kill', 'OOM 进程杀死', 'warning'),
        (r'GPU.*fallen.*bus', 'gpu_disconnect', 'GPU 断开连接', 'warning'),
        (r'Machine check', 'mce_error', '机器检查错误(MCE)', 'critical'),
        (r'EDAC.*error', 'edac_error', '内存 EDAC 错误', 'warning'),
    )

    candidates = [
        *analysis_result.get("dmesg_analysis", {}).get("recent_errors", []),
        *analysis_result.get("journal_analysis", {}).get("recent_errors", []),
    ]

    flagged = []
    for entry in candidates:
        text = entry.get("message", "")
        for regex, kind, label, level in signatures:
            if not re.search(regex, text, re.IGNORECASE):
                continue
            record = {
                "type": kind,
                "description": label,
                "severity": level,
                "message": text[:300],
                "source": entry.get("source", "unknown"),
                "error_category": entry.get("type", "unknown")
            }
            # Identical events are reported only once.
            if record not in flagged:
                flagged.append(record)

    # Stable sort: critical first, discovery order kept within a severity.
    ranked = sorted(
        flagged,
        key=lambda ev: severity_rank.get(ev.get('severity', 'info'), 3)
    )
    return ranked[:20]


def get_kernel_panic_logs() -> List[Dict[str, str]]:
    """
    Look specifically for kernel panic messages.

    Both sources are best effort: a source that is missing or fails is
    skipped.

    Returns:
        List[Dict[str, str]]: ``{"source": ..., "message": ...}`` entries.
        dmesg lines containing ``Kernel panic`` or (case-insensitively)
        ``sysrq`` come first, followed by ``journalctl -k -g panic`` lines
        containing ``panic``.
    """
    panics = []

    # dmesg
    if check_command_exists('dmesg'):
        try:
            _, stdout, _ = execute_command(['dmesg'], check_returncode=False, timeout=10)

            for line in stdout.split('\n'):
                if 'Kernel panic' in line or 'sysrq' in line.lower():
                    panics.append({
                        "source": "dmesg",
                        "message": line.strip()
                    })
        except Exception:
            # Best effort; Exception (not a bare except) so that
            # KeyboardInterrupt / SystemExit still propagate.
            pass

    # journalctl, kernel messages only
    if check_command_exists('journalctl'):
        try:
            _, stdout, _ = execute_command(
                ['journalctl', '-k', '--no-pager', '-g', 'panic'],
                check_returncode=False, timeout=10
            )

            for line in stdout.split('\n'):
                if 'panic' in line.lower():
                    panics.append({
                        "source": "journalctl",
                        "message": line.strip()
                    })
        except Exception:
            # Best effort, same reasoning as for dmesg.
            pass

    return panics


def get_hardware_error_logs() -> Dict[str, List[Dict[str, str]]]:
    """
    Collect dmesg lines for a few specific kinds of hardware error.

    Each dmesg line is tested (case-insensitively) against one regex per
    bucket and may land in several buckets.

    Returns:
        Dict[str, List[Dict[str, str]]]: Keys ``mce_errors``, ``ecc_errors``,
        ``io_errors`` and ``thermal_errors``, each holding at most 10
        ``{"message": ..., "type": ...}`` entries. All lists are empty when
        dmesg is missing or fails.
    """
    # (result key, entry type tag, regex)
    buckets = (
        ("mce_errors", "mce", r'Machine check|CMCI|hardware error'),
        ("ecc_errors", "ecc", r'ECC|EDAC|memory error'),
        ("io_errors", "io", r'I/O error|ata.*error|blk_update'),
        ("thermal_errors", "thermal", r'thermal|overheat|critical temp'),
    )
    result = {key: [] for key, _, _ in buckets}

    if check_command_exists('dmesg'):
        try:
            _, stdout, _ = execute_command(['dmesg'], check_returncode=False, timeout=10)

            for line in stdout.split('\n'):
                for key, tag, regex in buckets:
                    if re.search(regex, line, re.IGNORECASE):
                        result[key].append({
                            "message": line.strip(),
                            "type": tag
                        })
        except Exception:
            # Best effort; Exception (not a bare except) so that
            # KeyboardInterrupt / SystemExit still propagate.
            pass

    # Keep only the first 10 entries of each bucket.
    for key in result:
        result[key] = result[key][:10]

    return result


def search_logs_by_keyword(keyword: str, max_lines: int = 100) -> List[str]:
    """
    Search dmesg and the journal for lines containing a keyword.

    The match is a case-insensitive substring test. dmesg is searched first,
    then the last ``max_lines * 2`` journal lines; the search stops as soon
    as ``max_lines`` hits were collected. A source that is missing or fails
    is skipped.

    Args:
        keyword: Text to look for.
        max_lines: Maximum number of lines to return; zero or a negative
            value returns an empty list.

    Returns:
        List[str]: Matching lines, prefixed with ``[dmesg]`` or ``[journal]``.
    """
    results = []

    # The cap is otherwise only checked after appending, which would let one
    # hit through even when no lines were requested.
    if max_lines <= 0:
        return results

    # (command to probe, argv to run, prefix tag)
    sources = (
        ('dmesg', ['dmesg'], 'dmesg'),
        ('journalctl',
         ['journalctl', '-n', str(max_lines * 2), '--no-pager'],
         'journal'),
    )

    for command, argv, tag in sources:
        if not check_command_exists(command):
            continue

        try:
            # Lower-cased once per source instead of once per line.
            needle = keyword.lower()
            _, stdout, _ = execute_command(
                argv,
                check_returncode=False, timeout=10
            )

            for line in stdout.split('\n'):
                if needle in line.lower():
                    results.append(f"[{tag}] {line.strip()}")
                    if len(results) >= max_lines:
                        return results
        except Exception:
            # Best effort; Exception (not a bare except) so that
            # KeyboardInterrupt / SystemExit still propagate.
            continue

    return results


def get_system_logs(since: Optional[str] = None, until: Optional[str] = None) -> Dict[str, Any]:
    """
    Fetch raw system logs from dmesg, journalctl and /var/log/kern.log.

    Every source is best effort: one that is missing or fails leaves its
    entry as an empty string.

    Args:
        since: Start time passed to ``journalctl --since``
            (format: '2024-01-01 00:00:00'). Only affects the journal.
        until: End time passed to ``journalctl --until``. Only affects the
            journal.

    Returns:
        Dict[str, Any]: Keys ``dmesg`` (full output), ``journalctl`` (output
        of ``journalctl --no-pager -n 5000`` with the optional time bounds)
        and ``kern_log`` (last 5000 lines of the file).
    """
    # Local import: only this function needs it.
    from collections import deque

    result = {
        "dmesg": "",
        "journalctl": "",
        "kern_log": ""
    }

    # dmesg
    if check_command_exists('dmesg'):
        try:
            _, stdout, _ = execute_command(['dmesg'], check_returncode=False, timeout=10)
            result["dmesg"] = stdout
        except Exception:
            # Best effort; Exception (not a bare except) so that
            # KeyboardInterrupt / SystemExit still propagate.
            pass

    # journalctl
    if check_command_exists('journalctl'):
        try:
            cmd = ['journalctl', '--no-pager', '-n', '5000']
            if since:
                cmd.extend(['--since', since])
            if until:
                cmd.extend(['--until', until])

            _, stdout, _ = execute_command(cmd, check_returncode=False, timeout=15)
            result["journalctl"] = stdout
        except Exception:
            pass

    # /var/log/kern.log
    kern_log_path = '/var/log/kern.log'
    if os.path.exists(kern_log_path):
        try:
            with open(kern_log_path, 'r', encoding='utf-8', errors='ignore') as f:
                # A bounded deque keeps only the last 5000 lines while
                # streaming, instead of loading the whole file first.
                result["kern_log"] = ''.join(deque(f, maxlen=5000))
        except Exception:
            pass

    return result


# Script entry point: run the full analysis and print the report as JSON.
if __name__ == '__main__':
    import json

    report = analyze_logs()
    # ensure_ascii=False keeps non-ASCII text readable in the output.
    print(json.dumps(report, indent=2, ensure_ascii=False))