"""
ServerGuard - 日志分析模块

自动分析系统日志，查找硬件相关错误关键词。
"""

import os
import re
import gzip
from typing import Dict, Any, List, Optional
from datetime import datetime, timedelta

import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from utils import execute_command, check_command_exists, safe_int


# Hardware error keyword patterns, grouped by category.
#
# Each value is a list of regular expressions. analyze_dmesg() and
# analyze_journalctl() apply them to every log line with
# re.search(pattern, line, re.IGNORECASE), so matching is case-insensitive
# and unanchored. Within one category the scanners stop at the first pattern
# that matches a line, so pattern order decides which pattern gets reported;
# a single line can still be counted under several categories.
HARDWARE_ERROR_PATTERNS = {
    # CPU: machine checks, temperature/throttling, faults, (un)correctable errors.
    "cpu_errors": [
        r'Machine check events? logged',
        r'Hardware Error',
        r'CMCI storm',
        r'machine check',
        r'CPU\s*\d+.*temperature',
        r'thermal.*cpu',
        r'CPU.*throttl',
        r'core.*temp',
        r'CPU.*fault',
        r'uncorrectable',
        r'correctable.*error',
    ],
    # Memory: EDAC/ECC reports, parity errors, OOM kills, allocation failures.
    "memory_errors": [
        r'Hardware error.*memory',
        r'EDAC.*error',
        r'memory.*error',
        r'Memory.*parity',
        r'ECC.*error',
        r'ue\s+count',
        r'ce\s+count',
        r'Out of memory',
        r'oom-kill',
        r'page allocation failure',
    ],
    # Storage: block-layer I/O errors, ATA/SATA/NVMe errors, SMART failures.
    "storage_errors": [
        r'I/O error',
        r'Buffer I/O error',
        r'blk_update_request',
        r'ata\d+.*error',
        r'SATA.*error',
        r'NVMe.*error',
        r'critical.*warning',
        r'disk error',
        r'block.*error',
        r'SMART.*failure',
        r'medium error',
        r'uncorrectable error',
    ],
    # PCI / PCIe: port errors and AER severity lines.
    "pci_errors": [
        r'PCIe.*error',
        r'pcieport.*error',
        r'PCI.*error',
        r'AER:\s*',
        r'Corrected error',
        r'Uncorrected error',
        r'Non-Fatal error',
        r'Fatal error',
        r'Unsupported Request',
    ],
    # USB: generic errors, over-current, disconnects, timeouts, host controllers.
    "usb_errors": [
        r'usb.*error',
        r'USB.*over-current',
        r'usb.*disconnect',
        r'usb.*timeout',
        r'ehci.*error',
        r'xhci.*error',
    ],
    # Power / thermal: shutdowns, critical temperature, voltage problems.
    "power_errors": [
        r'thermal.*shutdown',
        r'critical.*temperature',
        r'overheat',
        r'power.*fail',
        r'under.*voltage',
        r'over.*voltage',
        r'brownout',
        r'power.*button',
    ],
    # Kernel: panics, lockups, BUG/Oops reports and call traces.
    "kernel_panics": [
        r'Kernel panic',
        r'sysrq.*trigger',
        r'watchdog.*bug',
        r'softlockup',
        r'hardlockup',
        r'BUG:.*spinlock',
        r'BUG:.*scheduling',
        r'Oops:',
        r'Call Trace:',
        r'general protection fault',
        r'double fault',
        r'stack.*corruption',
    ]
}


def analyze_logs() -> Dict[str, Any]:
    """
    Run the full log analysis and collect hardware-related findings.

    Returns:
        Dict[str, Any]: Report containing ``status`` ("success", "warning"
        when at least one error was counted, or "error" when an exception
        was raised, in which case ``error`` holds its text), ``scan_time``,
        the dmesg and journal analyses, the per-category ``hardware_errors``
        totals, the ``critical_events`` list and a ``summary``.
    """
    report = {
        "status": "success",
        "scan_time": datetime.now().isoformat(),
        "dmesg_analysis": {},
        "journal_analysis": {},
        "hardware_errors": {},
        "critical_events": [],
        "summary": {}
    }

    try:
        # Gather the raw per-source analyses first ...
        report["dmesg_analysis"] = analyze_dmesg()
        report["journal_analysis"] = analyze_journalctl()

        # ... then derive the aggregated views from them.
        per_category = summarize_errors(report)
        report["hardware_errors"] = per_category

        critical = identify_critical_events(report)
        report["critical_events"] = critical

        error_total = sum(per_category.values())
        found_any = error_total > 0
        report["summary"] = {
            "total_errors_found": error_total,
            "critical_events": len(critical),
            "recommend_action": found_any
        }

        # Any counted error downgrades the overall status to a warning.
        if found_any:
            report["status"] = "warning"

    except Exception as exc:
        # Top-level boundary: report the failure instead of raising.
        report["status"] = "error"
        report["error"] = str(exc)

    return report


def analyze_dmesg() -> Dict[str, Any]:
    """
    Scan the dmesg output for hardware error patterns.

    Returns:
        Dict[str, Any]: ``available`` flag, per-category ``error_counts``,
        up to 50 distinct ``recent_errors`` (type, message, matched pattern)
        and ``boot_errors``. ``note`` is set when dmesg is missing and
        ``error`` when an exception occurred while scanning.
    """
    report = {
        "available": False,
        "error_counts": dict.fromkeys(HARDWARE_ERROR_PATTERNS, 0),
        "recent_errors": [],
        "boot_errors": []
    }

    if not check_command_exists('dmesg'):
        report["note"] = "dmesg 不可用"
        return report

    try:
        # Prefer ISO timestamps.
        _, output, _ = execute_command(
            ['dmesg', '--time-format=iso'],
            check_returncode=False, timeout=15
        )

        # NOTE(review): the flag is raised before checking that any output
        # was produced - confirm this is intended when dmesg is unreadable.
        report["available"] = True

        # Empty output is taken to mean --time-format is unsupported, so
        # retry with the plain format.
        if not output.strip():
            _, output, _ = execute_command(
                ['dmesg'],
                check_returncode=False, timeout=15
            )

        raw_lines = output.split('\n')
        counts = report["error_counts"]
        recent = report["recent_errors"]

        for raw in raw_lines:
            text = raw.strip()
            if not text:
                continue

            for category, regexes in HARDWARE_ERROR_PATTERNS.items():
                # First matching pattern of the category, if any.
                hit = next(
                    (rx for rx in regexes if re.search(rx, raw, re.IGNORECASE)),
                    None
                )
                if hit is None:
                    continue

                counts[category] += 1

                # Keep a bounded, de-duplicated sample of matching lines.
                if len(recent) < 50:
                    entry = {
                        "type": category,
                        "message": text,
                        "pattern": hit
                    }
                    if entry not in recent:
                        recent.append(entry)

        report["boot_errors"] = extract_boot_errors(raw_lines)

    except Exception as exc:
        report["error"] = str(exc)

    return report


def analyze_journalctl() -> Dict[str, Any]:
    """
    Scan recent journalctl entries for hardware error patterns.

    Reads the last 1000 journal lines at priority ``err``.

    Returns:
        Dict[str, Any]: ``available`` flag, per-category ``error_counts``,
        up to 50 distinct ``recent_errors`` (type, message) and
        ``boot_events``. ``note`` is set when journalctl or the journal
        files are missing, ``error`` when an exception occurred.
    """
    report = {
        "available": False,
        "error_counts": dict.fromkeys(HARDWARE_ERROR_PATTERNS, 0),
        "recent_errors": [],
        "boot_events": []
    }

    if not check_command_exists('journalctl'):
        report["note"] = "journalctl 不可用"
        return report

    try:
        _, output, err_output = execute_command(
            ['journalctl', '-n', '1000', '--no-pager', '-p', 'err'],
            check_returncode=False, timeout=15
        )

        # journalctl reports a missing journal on stderr.
        if 'No journal files were found' in err_output:
            report["note"] = "无 journal 文件"
            return report

        report["available"] = True

        counts = report["error_counts"]
        recent = report["recent_errors"]

        for raw in output.split('\n'):
            text = raw.strip()
            if not text:
                continue

            for category, regexes in HARDWARE_ERROR_PATTERNS.items():
                # One hit per category is enough for this line.
                if not any(re.search(rx, raw, re.IGNORECASE) for rx in regexes):
                    continue

                counts[category] += 1

                # Keep a bounded, de-duplicated sample of matching lines.
                if len(recent) < 50:
                    entry = {
                        "type": category,
                        "message": text
                    }
                    if entry not in recent:
                        recent.append(entry)

        report["boot_events"] = get_journal_boot_events()

    except Exception as exc:
        report["error"] = str(exc)

    return report


def extract_boot_errors(lines: List[str]) -> List[Dict[str, str]]:
    """
    Extract error-like messages logged during the boot phase.

    The boot phase starts at a line containing ``Linux version`` or
    ``Command line:`` and ends after a line that mentions both ``systemd``
    and ``startup``. The end marker is matched case-insensitively so that
    systemd's "Startup finished ..." message closes the phase.

    Args:
        lines: Log lines in chronological order.

    Returns:
        List[Dict[str, str]]: At most 20 entries of the form
        ``{"stage": "boot", "message": <stripped line>}`` for boot-phase
        lines containing "error", "fail" or "warn" but none of
        "firmware", "efi" or "acpi".
    """
    max_entries = 20
    alert_words = ('error', 'fail', 'warn')
    # Common non-critical sources that are filtered out.
    noise_words = ('firmware', 'efi', 'acpi')

    boot_errors: List[Dict[str, str]] = []
    in_boot = False

    for line in lines:
        # Detect the start of a boot sequence.
        if 'Linux version' in line or 'Command line:' in line:
            in_boot = True

        if not in_boot:
            continue

        lowered = line.lower()

        if (any(word in lowered for word in alert_words)
                and not any(word in lowered for word in noise_words)):
            boot_errors.append({
                "stage": "boot",
                "message": line.strip()
            })
            # Only the first max_entries are returned; stop scanning early.
            if len(boot_errors) >= max_entries:
                break

        # The end marker itself is still inspected above before the phase closes.
        if 'systemd' in lowered and 'startup' in lowered:
            in_boot = False

    return boot_errors


def get_journal_boot_events() -> List[Dict[str, str]]:
    """
    Collect noteworthy journal entries from the current boot.

    Queries ``journalctl -b 0 -p warning`` and keeps lines mentioning
    "error", "fail" or "hardware" (case-insensitive).

    Returns:
        List[Dict[str, str]]: At most 20 ``{"message": <stripped line>}``
        entries; an empty list when the journal query or parsing fails.
    """
    max_events = 20
    keywords = ('error', 'fail', 'hardware')

    try:
        _, stdout, _ = execute_command(
            ['journalctl', '-b', '0', '--no-pager', '-p', 'warning'],
            check_returncode=False, timeout=10
        )

        events: List[Dict[str, str]] = []
        for line in stdout.split('\n'):
            lowered = line.lower()
            if any(word in lowered for word in keywords):
                events.append({"message": line.strip()})
                if len(events) >= max_events:
                    break

        return events

    except Exception:
        # Best-effort: a failed journal query yields no events. Unlike a bare
        # ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
        return []


def summarize_errors(analysis_result: Dict[str, Any]) -> Dict[str, int]:
    """
    Add up the dmesg and journalctl error counts per category.

    Args:
        analysis_result: Report holding ``dmesg_analysis`` and
            ``journal_analysis`` sections, each with an ``error_counts``
            mapping. Missing sections or categories count as zero.

    Returns:
        Dict[str, int]: One total per key of ``HARDWARE_ERROR_PATTERNS``.
    """
    from_dmesg = analysis_result.get("dmesg_analysis", {}).get("error_counts", {})
    from_journal = analysis_result.get("journal_analysis", {}).get("error_counts", {})

    return {
        category: from_dmesg.get(category, 0) + from_journal.get(category, 0)
        for category in HARDWARE_ERROR_PATTERNS
    }


def identify_critical_events(analysis_result: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Pick out the events that need immediate attention.

    Every entry in the ``recent_errors`` lists of the dmesg and journal
    analyses is tested against a fixed set of critical patterns
    (case-insensitive). A message that matches several patterns yields one
    event per pattern.

    Args:
        analysis_result: Report holding ``dmesg_analysis`` and
            ``journal_analysis`` sections; missing sections are treated as
            having no errors.

    Returns:
        List[Dict[str, Any]]: De-duplicated events with ``type``,
        ``description``, ``message`` (truncated to 200 characters) and
        ``source`` ("dmesg" or "journal"), dmesg events first.
    """
    # (compiled pattern, event type, description)
    critical_patterns = [
        (re.compile(pattern, re.IGNORECASE), event_type, description)
        for pattern, event_type, description in (
            (r'Kernel panic', 'kernel_panic', '内核崩溃'),
            (r'hardlockup', 'hard_lockup', 'CPU 硬死锁'),
            (r'softlockup', 'soft_lockup', 'CPU 软死锁'),
            (r'thermal.*shutdown', 'thermal_shutdown', '过热关机'),
            (r'Hardware Error', 'hardware_error', '硬件错误'),
            (r'Fatal.*PCIe', 'pcie_fatal', 'PCIe 致命错误'),
            (r'I/O error.*sector', 'disk_io_error', '磁盘 I/O 错误'),
            (r'Uncorrectable.*error', 'uncorrectable_error', '不可纠正错误'),
            (r'out of memory.*kill', 'oom_kill', 'OOM 进程杀死'),
            (r'GPU.*fallen.*bus', 'gpu_disconnect', 'GPU 断开连接'),
        )
    ]

    critical_events: List[Dict[str, Any]] = []

    # Walk each source separately so "source" reflects where the entry came
    # from, rather than whether an equal-valued entry exists in the dmesg list.
    for source, section in (("dmesg", "dmesg_analysis"), ("journal", "journal_analysis")):
        errors = analysis_result.get(section, {}).get("recent_errors", [])

        for error in errors:
            message = error.get("message") or ""

            for regex, event_type, description in critical_patterns:
                if not regex.search(message):
                    continue

                event = {
                    "type": event_type,
                    "description": description,
                    "message": message[:200],  # cap the stored message length
                    "source": source
                }

                # Avoid reporting the same event twice.
                if event not in critical_events:
                    critical_events.append(event)

    return critical_events


def get_kernel_panic_logs() -> List[Dict[str, str]]:
    """
    Look specifically for kernel panic traces.

    Scans dmesg for lines containing "Kernel panic" or (case-insensitive)
    "sysrq", then the kernel journal (``journalctl -k -g panic``) for lines
    containing "panic". Each source is best-effort: a failure in one does
    not prevent the other from being scanned.

    Returns:
        List[Dict[str, str]]: ``{"source": "dmesg" | "journalctl",
        "message": <stripped line>}`` entries, dmesg hits first.
    """
    panics: List[Dict[str, str]] = []

    # dmesg
    if check_command_exists('dmesg'):
        try:
            _, stdout, _ = execute_command(['dmesg'], check_returncode=False, timeout=10)

            for line in stdout.split('\n'):
                if 'Kernel panic' in line or 'sysrq' in line.lower():
                    panics.append({
                        "source": "dmesg",
                        "message": line.strip()
                    })
        except Exception:
            # Best-effort source; KeyboardInterrupt/SystemExit still propagate.
            pass

    # journalctl (kernel messages only)
    if check_command_exists('journalctl'):
        try:
            _, stdout, _ = execute_command(
                ['journalctl', '-k', '--no-pager', '-g', 'panic'],
                check_returncode=False, timeout=10
            )

            for line in stdout.split('\n'):
                if 'panic' in line.lower():
                    panics.append({
                        "source": "journalctl",
                        "message": line.strip()
                    })
        except Exception:
            # Best-effort source; KeyboardInterrupt/SystemExit still propagate.
            pass

    return panics


def get_hardware_error_logs() -> Dict[str, List[str]]:
    """
    Collect dmesg lines for specific hardware error classes.

    A line may be filed under several classes when it matches more than one
    pattern. Matching is case-insensitive.

    Returns:
        Dict[str, List[str]]: Keys ``mce_errors``, ``ecc_errors``,
        ``io_errors`` and ``thermal_errors``, each holding at most the first
        20 matching (stripped) lines. All lists are empty when dmesg is
        unavailable.
    """
    max_per_class = 20

    # Compiled once; insertion order defines the key order of the result.
    matchers = {
        "mce_errors": re.compile(r'Machine check|CMCI|hardware error', re.IGNORECASE),
        "ecc_errors": re.compile(r'ECC|EDAC|memory error', re.IGNORECASE),
        "io_errors": re.compile(r'I/O error|ata.*error|blk_update', re.IGNORECASE),
        "thermal_errors": re.compile(r'thermal|overheat|critical temp', re.IGNORECASE),
    }
    result: Dict[str, List[str]] = {key: [] for key in matchers}

    if check_command_exists('dmesg'):
        try:
            _, stdout, _ = execute_command(['dmesg'], check_returncode=False, timeout=10)

            for line in stdout.split('\n'):
                for key, matcher in matchers.items():
                    bucket = result[key]
                    # Cap while collecting instead of truncating afterwards.
                    if len(bucket) < max_per_class and matcher.search(line):
                        bucket.append(line.strip())
        except Exception:
            # Best-effort: keep whatever was collected so far.
            # KeyboardInterrupt/SystemExit still propagate.
            pass

    return result


def search_logs_by_keyword(keyword: str, max_lines: int = 100) -> List[str]:
    """
    Search dmesg and the journal for a keyword.

    The match is a case-insensitive substring test. dmesg is searched first,
    then the last ``max_lines * 2`` journal lines. Each source is
    best-effort: a failure in one does not prevent the other from being
    searched.

    Args:
        keyword: Text to look for.
        max_lines: Maximum number of lines to return. Values <= 0 yield an
            empty list without running any command.

    Returns:
        List[str]: Matching lines prefixed with ``[dmesg]`` or ``[journal]``.
    """
    # Without this guard a non-positive limit would still return one match
    # and pass a non-positive line count to journalctl.
    if max_lines <= 0:
        return []

    needle = keyword.lower()
    results: List[str] = []

    # (command that must exist, tag used in the output, argv to run)
    sources = (
        ('dmesg', 'dmesg', ['dmesg']),
        ('journalctl', 'journal',
         ['journalctl', '-n', str(max_lines * 2), '--no-pager']),
    )

    for command, tag, argv in sources:
        if not check_command_exists(command):
            continue

        try:
            _, stdout, _ = execute_command(
                argv,
                check_returncode=False, timeout=10
            )

            for line in stdout.split('\n'):
                if needle in line.lower():
                    results.append(f"[{tag}] {line.strip()}")
                    if len(results) >= max_lines:
                        return results
        except Exception:
            # Best-effort source: keep matches gathered so far and move on.
            # KeyboardInterrupt/SystemExit still propagate.
            continue

    return results


def get_system_logs(since: Optional[str] = None, until: Optional[str] = None) -> Dict[str, Any]:
    """
    Fetch raw system logs.

    Each source is best-effort; a source that cannot be read is left as an
    empty string.

    Args:
        since: Start time passed to ``journalctl --since``
            (format: '2024-01-01 00:00:00'). Only affects the journal.
        until: End time passed to ``journalctl --until``. Only affects the
            journal.

    Returns:
        Dict[str, Any]: ``dmesg`` (full dmesg output), ``journalctl``
        (up to 5000 journal lines) and ``kern_log`` (last 5000 lines of
        /var/log/kern.log).
    """
    # Local import keeps this block self-contained.
    from collections import deque

    result = {
        "dmesg": "",
        "journalctl": "",
        "kern_log": ""
    }

    # dmesg
    if check_command_exists('dmesg'):
        try:
            _, stdout, _ = execute_command(['dmesg'], check_returncode=False, timeout=10)
            result["dmesg"] = stdout
        except Exception:
            # Best-effort; KeyboardInterrupt/SystemExit still propagate.
            pass

    # journalctl
    if check_command_exists('journalctl'):
        try:
            cmd = ['journalctl', '--no-pager', '-n', '5000']
            if since:
                cmd.extend(['--since', since])
            if until:
                cmd.extend(['--until', until])

            _, stdout, _ = execute_command(cmd, check_returncode=False, timeout=15)
            result["journalctl"] = stdout
        except Exception:
            # Best-effort; KeyboardInterrupt/SystemExit still propagate.
            pass

    # /var/log/kern.log
    kern_log_path = '/var/log/kern.log'
    try:
        with open(kern_log_path, 'r', encoding='utf-8', errors='ignore') as f:
            # A bounded deque keeps only the last 5000 lines in memory
            # instead of loading the whole file.
            result["kern_log"] = ''.join(deque(f, maxlen=5000))
    except OSError:
        # Missing or unreadable file: leave the entry empty.
        pass

    return result


if __name__ == '__main__':
    # Script entry point: run the analysis and print the report as JSON.
    import json

    report = analyze_logs()
    print(json.dumps(report, indent=2, ensure_ascii=False))