""" ServerGuard - 日志分析模块 自动分析系统日志,查找硬件相关错误关键词。 """ import os import re import gzip from typing import Dict, Any, List, Optional from datetime import datetime, timedelta import sys sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from utils import execute_command, check_command_exists, safe_int # 硬件错误关键词分类 HARDWARE_ERROR_PATTERNS = { "cpu_errors": [ r'Machine check events? logged', r'Hardware Error', r'CMCI storm', r'machine check', r'CPU\s*\d+.*temperature', r'thermal.*cpu', r'CPU.*throttl', r'core.*temp', r'CPU.*fault', r'uncorrectable', r'correctable.*error', ], "memory_errors": [ r'Hardware error.*memory', r'EDAC.*error', r'memory.*error', r'Memory.*parity', r'ECC.*error', r'ue\s+count', r'ce\s+count', r'Out of memory', r'oom-kill', r'page allocation failure', ], "storage_errors": [ r'I/O error', r'Buffer I/O error', r'blk_update_request', r'ata\d+.*error', r'SATA.*error', r'NVMe.*error', r'critical.*warning', r'disk error', r'block.*error', r'SMART.*failure', r'medium error', r'uncorrectable error', ], "pci_errors": [ r'PCIe.*error', r'pcieport.*error', r'PCI.*error', r'AER:\s*', r'Corrected error', r'Uncorrected error', r'Non-Fatal error', r'Fatal error', r'Unsupported Request', ], "usb_errors": [ r'usb.*error', r'USB.*over-current', r'usb.*disconnect', r'usb.*timeout', r'ehci.*error', r'xhci.*error', ], "power_errors": [ r'thermal.*shutdown', r'critical.*temperature', r'overheat', r'power.*fail', r'under.*voltage', r'over.*voltage', r'brownout', r'power.*button', ], "kernel_panics": [ r'Kernel panic', r'sysrq.*trigger', r'watchdog.*bug', r'softlockup', r'hardlockup', r'BUG:.*spinlock', r'BUG:.*scheduling', r'Oops:', r'Call Trace:', r'general protection fault', r'double fault', r'stack.*corruption', ] } def analyze_logs() -> Dict[str, Any]: """ 分析系统日志中的硬件错误。 Returns: Dict[str, Any]: 分析结果 """ result = { "status": "success", "scan_time": datetime.now().isoformat(), "dmesg_analysis": {}, "journal_analysis": {}, "hardware_errors": {}, "critical_events": [], "summary": {} } try: # 分析 dmesg result["dmesg_analysis"] = analyze_dmesg() # 分析 journalctl result["journal_analysis"] = analyze_journalctl() # 汇总错误统计 result["hardware_errors"] = summarize_errors(result) # 识别关键事件 result["critical_events"] = identify_critical_events(result) # 生成摘要 total_errors = sum(result["hardware_errors"].values()) result["summary"] = { "total_errors_found": total_errors, "critical_events": len(result["critical_events"]), "recommend_action": total_errors > 0 } # 如果有错误,标记警告状态 if total_errors > 0: result["status"] = "warning" except Exception as e: result["status"] = "error" result["error"] = str(e) return result def analyze_dmesg() -> Dict[str, Any]: """分析 dmesg 输出。""" result = { "available": False, "error_counts": {key: 0 for key in HARDWARE_ERROR_PATTERNS.keys()}, "recent_errors": [], "boot_errors": [] } if not check_command_exists('dmesg'): result["note"] = "dmesg 不可用" return result try: # 获取 dmesg 输出 _, stdout, _ = execute_command( ['dmesg', '--time-format=iso'], check_returncode=False, timeout=15 ) result["available"] = True # 如果没有 --time-format 支持,使用标准格式 if not stdout.strip(): _, stdout, _ = execute_command( ['dmesg'], check_returncode=False, timeout=15 ) lines = stdout.split('\n') # 分析每一行 for line in lines: if not line.strip(): continue # 检查各类错误 for error_type, patterns in HARDWARE_ERROR_PATTERNS.items(): for pattern in patterns: if re.search(pattern, line, re.IGNORECASE): result["error_counts"][error_type] += 1 # 保存最近的一些错误 if len(result["recent_errors"]) < 50: error_entry = { "type": error_type, "message": line.strip(), "pattern": pattern } if error_entry not in result["recent_errors"]: result["recent_errors"].append(error_entry) break # 检查启动错误 result["boot_errors"] = extract_boot_errors(lines) except Exception as e: result["error"] = str(e) return result def analyze_journalctl() -> Dict[str, Any]: """分析 journalctl 日志。""" result = { "available": False, "error_counts": {key: 0 for key in HARDWARE_ERROR_PATTERNS.keys()}, "recent_errors": [], "boot_events": [] } if not check_command_exists('journalctl'): result["note"] = "journalctl 不可用" return result try: # 获取最近 1000 行日志 _, stdout, stderr = execute_command( ['journalctl', '-n', '1000', '--no-pager', '-p', 'err'], check_returncode=False, timeout=15 ) if 'No journal files were found' in stderr: result["note"] = "无 journal 文件" return result result["available"] = True lines = stdout.split('\n') for line in lines: if not line.strip(): continue # 检查各类错误 for error_type, patterns in HARDWARE_ERROR_PATTERNS.items(): for pattern in patterns: if re.search(pattern, line, re.IGNORECASE): result["error_counts"][error_type] += 1 if len(result["recent_errors"]) < 50: error_entry = { "type": error_type, "message": line.strip() } if error_entry not in result["recent_errors"]: result["recent_errors"].append(error_entry) break # 获取启动事件 result["boot_events"] = get_journal_boot_events() except Exception as e: result["error"] = str(e) return result def extract_boot_errors(lines: List[str]) -> List[Dict[str, str]]: """提取启动过程中的错误。""" boot_errors = [] in_boot = False for line in lines: # 检测启动阶段 if 'Linux version' in line or 'Command line:' in line: in_boot = True if in_boot and ('error' in line.lower() or 'fail' in line.lower() or 'warn' in line.lower()): # 排除常见的非关键消息 if not any(x in line.lower() for x in ['firmware', 'efi', 'acpi']): boot_errors.append({ "stage": "boot", "message": line.strip() }) # 启动完成后停止 if in_boot and ('systemd' in line and 'startup' in line): in_boot = False return boot_errors[:20] # 限制数量 def get_journal_boot_events() -> List[Dict[str, str]]: """获取 journalctl 中的启动事件。""" events = [] try: # 获取当前启动的日志 _, stdout, _ = execute_command( ['journalctl', '-b', '0', '--no-pager', '-p', 'warning'], check_returncode=False, timeout=10 ) for line in stdout.split('\n'): if 'error' in line.lower() or 'fail' in line.lower() or 'hardware' in line.lower(): events.append({"message": line.strip()}) return events[:20] except: return [] def summarize_errors(analysis_result: Dict[str, Any]) -> Dict[str, int]: """汇总错误统计。""" summary = {key: 0 for key in HARDWARE_ERROR_PATTERNS.keys()} # 合并 dmesg 和 journalctl 的统计 dmesg_counts = analysis_result.get("dmesg_analysis", {}).get("error_counts", {}) journal_counts = analysis_result.get("journal_analysis", {}).get("error_counts", {}) for error_type in summary.keys(): summary[error_type] = dmesg_counts.get(error_type, 0) + journal_counts.get(error_type, 0) return summary def identify_critical_events(analysis_result: Dict[str, Any]) -> List[Dict[str, Any]]: """识别需要立即关注的关键事件。""" critical_events = [] # 合并所有错误 all_errors = [] all_errors.extend(analysis_result.get("dmesg_analysis", {}).get("recent_errors", [])) all_errors.extend(analysis_result.get("journal_analysis", {}).get("recent_errors", [])) # 定义关键错误模式 critical_patterns = [ (r'Kernel panic', 'kernel_panic', '内核崩溃'), (r'hardlockup', 'hard_lockup', 'CPU 硬死锁'), (r'softlockup', 'soft_lockup', 'CPU 软死锁'), (r'thermal.*shutdown', 'thermal_shutdown', '过热关机'), (r'Hardware Error', 'hardware_error', '硬件错误'), (r'Fatal.*PCIe', 'pcie_fatal', 'PCIe 致命错误'), (r'I/O error.*sector', 'disk_io_error', '磁盘 I/O 错误'), (r'Uncorrectable.*error', 'uncorrectable_error', '不可纠正错误'), (r'out of memory.*kill', 'oom_kill', 'OOM 进程杀死'), (r'GPU.*fallen.*bus', 'gpu_disconnect', 'GPU 断开连接'), ] for error in all_errors: message = error.get("message", "") for pattern, event_type, description in critical_patterns: if re.search(pattern, message, re.IGNORECASE): event = { "type": event_type, "description": description, "message": message[:200], # 限制长度 "source": "dmesg" if error in analysis_result.get("dmesg_analysis", {}).get("recent_errors", []) else "journal" } # 避免重复 if event not in critical_events: critical_events.append(event) return critical_events def get_kernel_panic_logs() -> List[Dict[str, str]]: """专门查找内核崩溃信息。""" panics = [] # 检查 dmesg if check_command_exists('dmesg'): try: _, stdout, _ = execute_command(['dmesg'], check_returncode=False, timeout=10) for line in stdout.split('\n'): if 'Kernel panic' in line or 'sysrq' in line.lower(): panics.append({ "source": "dmesg", "message": line.strip() }) except: pass # 检查 journalctl if check_command_exists('journalctl'): try: _, stdout, _ = execute_command( ['journalctl', '-k', '--no-pager', '-g', 'panic'], check_returncode=False, timeout=10 ) for line in stdout.split('\n'): if 'panic' in line.lower(): panics.append({ "source": "journalctl", "message": line.strip() }) except: pass return panics def get_hardware_error_logs() -> Dict[str, List[str]]: """获取特定类型的硬件错误日志。""" result = { "mce_errors": [], "ecc_errors": [], "io_errors": [], "thermal_errors": [] } if check_command_exists('dmesg'): try: _, stdout, _ = execute_command(['dmesg'], check_returncode=False, timeout=10) for line in stdout.split('\n'): # MCE 错误 if re.search(r'Machine check|CMCI|hardware error', line, re.IGNORECASE): result["mce_errors"].append(line.strip()) # ECC 错误 if re.search(r'ECC|EDAC|memory error', line, re.IGNORECASE): result["ecc_errors"].append(line.strip()) # I/O 错误 if re.search(r'I/O error|ata.*error|blk_update', line, re.IGNORECASE): result["io_errors"].append(line.strip()) # 热错误 if re.search(r'thermal|overheat|critical temp', line, re.IGNORECASE): result["thermal_errors"].append(line.strip()) except: pass # 限制数量 for key in result: result[key] = result[key][:20] return result def search_logs_by_keyword(keyword: str, max_lines: int = 100) -> List[str]: """ 根据关键词搜索日志。 Args: keyword: 搜索关键词 max_lines: 最大返回行数 Returns: List[str]: 匹配的行列表 """ results = [] # 搜索 dmesg if check_command_exists('dmesg'): try: _, stdout, _ = execute_command( ['dmesg'], check_returncode=False, timeout=10 ) for line in stdout.split('\n'): if keyword.lower() in line.lower(): results.append(f"[dmesg] {line.strip()}") if len(results) >= max_lines: return results except: pass # 搜索 journalctl if check_command_exists('journalctl'): try: _, stdout, _ = execute_command( ['journalctl', '-n', str(max_lines * 2), '--no-pager'], check_returncode=False, timeout=10 ) for line in stdout.split('\n'): if keyword.lower() in line.lower(): results.append(f"[journal] {line.strip()}") if len(results) >= max_lines: return results except: pass return results def get_system_logs(since: Optional[str] = None, until: Optional[str] = None) -> Dict[str, Any]: """ 获取系统日志。 Args: since: 开始时间 (格式: '2024-01-01 00:00:00') until: 结束时间 Returns: Dict[str, Any]: 日志数据 """ result = { "dmesg": "", "journalctl": "", "kern_log": "" } # dmesg if check_command_exists('dmesg'): try: _, stdout, _ = execute_command(['dmesg'], check_returncode=False, timeout=10) result["dmesg"] = stdout except: pass # journalctl if check_command_exists('journalctl'): try: cmd = ['journalctl', '--no-pager', '-n', '5000'] if since: cmd.extend(['--since', since]) if until: cmd.extend(['--until', until]) _, stdout, _ = execute_command(cmd, check_returncode=False, timeout=15) result["journalctl"] = stdout except: pass # /var/log/kern.log kern_log_path = '/var/log/kern.log' if os.path.exists(kern_log_path): try: with open(kern_log_path, 'r', encoding='utf-8', errors='ignore') as f: lines = f.readlines()[-5000:] # 最后 5000 行 result["kern_log"] = ''.join(lines) except: pass return result if __name__ == '__main__': import json print(json.dumps(analyze_logs(), indent=2, ensure_ascii=False))