660 lines
21 KiB
Python
660 lines
21 KiB
Python
"""
ServerGuard - log analysis module

Automatically analyzes system logs and looks for hardware-related error keywords.
"""
|
|
|
|
import os
|
|
import re
|
|
import gzip
|
|
from typing import Dict, Any, List, Optional
|
|
from datetime import datetime, timedelta
|
|
|
|
import sys
|
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
|
from utils import execute_command, check_command_exists, safe_int
|
|
|
|
|
|
# Hardware error keywords, grouped by category.
# Each value is a list of regular expressions; the scanners in this module
# apply them to individual log lines with re.search(..., re.IGNORECASE).
HARDWARE_ERROR_PATTERNS = {
    # CPU / machine-check / thermal-throttling messages
    "cpu_errors": [
        r"Machine check events? logged",
        r"Hardware Error",
        r"CMCI storm",
        r"machine check",
        r"CPU\s*\d+.*temperature",
        r"thermal.*cpu",
        r"CPU.*throttl",
        r"core.*temp",
        r"CPU.*fault",
        r"uncorrectable",
        r"correctable.*error",
    ],
    # Memory (EDAC / ECC / OOM) messages
    "memory_errors": [
        r"Hardware error.*memory",
        r"EDAC.*error",
        r"memory.*error",
        r"Memory.*parity",
        r"ECC.*error",
        r"ue\s+count",
        r"ce\s+count",
        r"Out of memory",
        r"oom-kill",
        r"page allocation failure",
    ],
    # Block device / disk messages
    "storage_errors": [
        r"I/O error",
        r"Buffer I/O error",
        r"blk_update_request",
        r"ata\d+.*error",
        r"SATA.*error",
        r"NVMe.*error",
        r"critical.*warning",
        r"disk error",
        r"block.*error",
        r"SMART.*failure",
        r"medium error",
        r"uncorrectable error",
    ],
    # PCI / PCIe / AER messages
    "pci_errors": [
        r"PCIe.*error",
        r"pcieport.*error",
        r"PCI.*error",
        r"AER:\s*",
        r"Corrected error",
        r"Uncorrected error",
        r"Non-Fatal error",
        r"Fatal error",
        r"Unsupported Request",
    ],
    # USB messages
    "usb_errors": [
        r"usb.*error",
        r"USB.*over-current",
        r"usb.*disconnect",
        r"usb.*timeout",
        r"ehci.*error",
        r"xhci.*error",
    ],
    # Power and thermal-shutdown messages
    "power_errors": [
        r"thermal.*shutdown",
        r"critical.*temperature",
        r"overheat",
        r"power.*fail",
        r"under.*voltage",
        r"over.*voltage",
        r"brownout",
    ],
    # Kernel panic / lockup / oops messages
    "kernel_panics": [
        r"Kernel panic",
        r"sysrq.*trigger",
        r"watchdog.*bug",
        r"softlockup",
        r"hardlockup",
        r"BUG:.*spinlock",
        r"BUG:.*scheduling",
        r"Oops:",
        r"Call Trace:",
        r"general protection fault",
        r"double fault",
        r"stack.*corruption",
    ],
}
|
|
|
|
|
|
def analyze_logs() -> Dict[str, Any]:
    """
    Analyze the system logs for hardware errors and build one combined report.

    The dmesg and journalctl analyses are run first; their results are then
    merged into per-category counts, per-category error details and a list of
    critical events. Any exception raised along the way is captured in the
    report instead of being propagated.

    Returns:
        Dict[str, Any]: Report with the keys "status" ("success", "warning"
        or "error"), "scan_time", "dmesg_analysis", "journal_analysis",
        "hardware_errors", "error_details", "critical_events" and "summary"
        (plus "error" when an exception was caught).
    """
    report = {
        "status": "success",
        "scan_time": datetime.now().isoformat(),
        "dmesg_analysis": {},
        "journal_analysis": {},
        "hardware_errors": {},
        "error_details": {},  # detailed error information per category
        "critical_events": [],
        "summary": {}
    }

    try:
        # Raw analyses of the two log sources.
        report["dmesg_analysis"] = analyze_dmesg()
        report["journal_analysis"] = analyze_journalctl()

        # Derived views; each one reads the raw analyses stored above.
        report["hardware_errors"] = summarize_errors(report)
        report["error_details"] = collect_error_details(report)
        report["critical_events"] = identify_critical_events(report)

        error_total = sum(report["hardware_errors"].values())
        has_errors = error_total > 0
        report["summary"] = {
            "total_errors_found": error_total,
            "critical_events": len(report["critical_events"]),
            "recommend_action": has_errors
        }

        # Any matched error downgrades the overall status to a warning.
        if has_errors:
            report["status"] = "warning"

    except Exception as exc:
        report["status"] = "error"
        report["error"] = str(exc)

    return report
|
|
|
|
|
|
def is_false_positive(line: str, error_type: str) -> bool:
    """
    Decide whether a matched log line is a benign message rather than an error.

    Args:
        line: The log line that matched a pattern.
        error_type: The error category the match belongs to.

    Returns:
        bool: True when the line should be ignored as a false positive.
    """
    lowered = line.lower()

    if error_type == "kernel_panics":
        # A bare "Call Trace:" is only a stack-trace header; it is treated as
        # an error only when the same line also carries a real error keyword.
        if 'call trace:' not in lowered:
            return False
        error_markers = ('oops', 'panic', 'bug:', 'warning:', 'error:')
        return not any(marker in lowered for marker in error_markers)

    if error_type == "power_errors":
        # Power-button input registration is a normal ACPI event.
        button_input = 'power button' in lowered and 'input:' in lowered
        # 'pwrf' is the ACPI power button identifier.
        return button_input or 'pwrf' in lowered

    if error_type == "cpu_errors":
        # Temperature lines are ordinary sensor readings unless they say the
        # value is "above" something or "critical".
        alarming = 'above' in lowered or 'critical' in lowered
        return 'temperature' in lowered and not alarming

    return False
|
|
|
|
|
|
def analyze_dmesg() -> Dict[str, Any]:
    """
    Scan the dmesg output for lines matching HARDWARE_ERROR_PATTERNS.

    Returns:
        Dict[str, Any]: "available" (whether dmesg was run), "error_counts"
        (matches per category), "recent_errors" (up to 50 distinct matched
        entries), "boot_errors", plus "note" when dmesg is missing or
        "error" when an exception was caught.
    """
    report = {
        "available": False,
        "error_counts": dict.fromkeys(HARDWARE_ERROR_PATTERNS, 0),
        "recent_errors": [],
        "boot_errors": []
    }

    if not check_command_exists('dmesg'):
        report["note"] = "dmesg 不可用"
        return report

    try:
        # First try with ISO timestamps.
        _, output, _ = execute_command(
            ['dmesg', '--time-format=iso'],
            check_returncode=False, timeout=15
        )

        report["available"] = True

        # Empty output is taken to mean --time-format is unsupported;
        # fall back to the plain format.
        if not output.strip():
            _, output, _ = execute_command(
                ['dmesg'],
                check_returncode=False, timeout=15
            )

        log_lines = output.split('\n')
        tallies = report["error_counts"]
        samples = report["recent_errors"]

        for log_line in log_lines:
            if not log_line.strip():
                continue

            for category, regexes in HARDWARE_ERROR_PATTERNS.items():
                for regex in regexes:
                    if not re.search(regex, log_line, re.IGNORECASE):
                        continue
                    # Skip benign matches and keep trying the other patterns.
                    if is_false_positive(log_line, category):
                        continue

                    tallies[category] += 1

                    # Keep a bounded sample of entries to limit output size.
                    if len(samples) < 50:
                        candidate = {
                            "type": category,
                            "message": log_line.strip(),
                            "pattern": regex,
                            "source": "dmesg"
                        }
                        # De-duplicate identical entries.
                        if candidate not in samples:
                            samples.append(candidate)
                    # A line is counted at most once per category.
                    break

        report["boot_errors"] = extract_boot_errors(log_lines)

    except Exception as exc:
        report["error"] = str(exc)

    return report
|
|
|
|
|
|
def analyze_journalctl() -> Dict[str, Any]:
    """
    Scan recent journalctl entries for lines matching HARDWARE_ERROR_PATTERNS.

    Returns:
        Dict[str, Any]: "available", "error_counts" (matches per category),
        "recent_errors" (up to 50 distinct matched entries), "boot_events",
        plus "note" when journalctl or the journal files are missing, or
        "error" when an exception was caught.
    """
    report = {
        "available": False,
        "error_counts": dict.fromkeys(HARDWARE_ERROR_PATTERNS, 0),
        "recent_errors": [],
        "boot_events": []
    }

    if not check_command_exists('journalctl'):
        report["note"] = "journalctl 不可用"
        return report

    try:
        # Last 1000 entries, filtered with "-p err".
        _, output, err_output = execute_command(
            ['journalctl', '-n', '1000', '--no-pager', '-p', 'err'],
            check_returncode=False, timeout=15
        )

        if 'No journal files were found' in err_output:
            report["note"] = "无 journal 文件"
            return report

        report["available"] = True

        tallies = report["error_counts"]
        samples = report["recent_errors"]

        for log_line in output.split('\n'):
            if not log_line.strip():
                continue

            for category, regexes in HARDWARE_ERROR_PATTERNS.items():
                for regex in regexes:
                    if not re.search(regex, log_line, re.IGNORECASE):
                        continue
                    # Skip benign matches and keep trying the other patterns.
                    if is_false_positive(log_line, category):
                        continue

                    tallies[category] += 1

                    if len(samples) < 50:
                        candidate = {
                            "type": category,
                            "message": log_line.strip(),
                            "source": "journalctl"
                        }
                        # De-duplicate identical entries.
                        if candidate not in samples:
                            samples.append(candidate)
                    # A line is counted at most once per category.
                    break

        report["boot_events"] = get_journal_boot_events()

    except Exception as exc:
        report["error"] = str(exc)

    return report
|
|
|
|
|
|
def extract_boot_errors(lines: List[str]) -> List[Dict[str, str]]:
    """
    Extract error/failure/warning lines that occur during the boot phase.

    The boot phase starts at a line containing 'Linux version' or
    'Command line:' and ends after a line containing both 'systemd' and
    'startup'. Lines mentioning firmware, (U)EFI or ACPI are skipped as
    common non-critical noise.

    Args:
        lines: Log lines in chronological order.

    Returns:
        List[Dict[str, str]]: At most 20 entries of the form
        {"stage": "boot", "message": <stripped line>}.
    """
    max_entries = 20
    # 'efi' must not be preceded by a letter: a bare substring test also hits
    # words such as "undefined" or "prefix" and would wrongly drop real errors.
    noise_re = re.compile(r'firmware|(?<![a-z])u?efi|acpi')

    boot_errors = []
    in_boot = False

    for line in lines:
        lowered = line.lower()

        # Detect the start of the boot phase.
        if 'Linux version' in line or 'Command line:' in line:
            in_boot = True

        if not in_boot:
            continue

        if 'error' in lowered or 'fail' in lowered or 'warn' in lowered:
            if not noise_re.search(lowered):
                boot_errors.append({
                    "stage": "boot",
                    "message": line.strip()
                })
                # Only the first max_entries are returned; stop scanning early.
                if len(boot_errors) >= max_entries:
                    break

        # Stop collecting once boot has finished.
        if 'systemd' in line and 'startup' in line:
            in_boot = False

    return boot_errors
|
|
|
|
|
|
def get_journal_boot_events() -> List[Dict[str, str]]:
    """
    Collect notable journalctl entries from the current boot.

    Runs `journalctl -b 0 --no-pager -p warning` and keeps lines that contain
    'error', 'fail' or 'hardware' (case-insensitive).

    Returns:
        List[Dict[str, str]]: At most 20 entries of the form
        {"message": <stripped line>}; an empty list if anything goes wrong.
    """
    max_events = 20
    events = []

    try:
        # Log entries of the current boot only.
        _, stdout, _ = execute_command(
            ['journalctl', '-b', '0', '--no-pager', '-p', 'warning'],
            check_returncode=False, timeout=10
        )

        for line in stdout.split('\n'):
            lowered = line.lower()
            if 'error' in lowered or 'fail' in lowered or 'hardware' in lowered:
                events.append({"message": line.strip()})
                # Only the first max_events are returned; stop early.
                if len(events) >= max_events:
                    break

        return events

    except Exception:
        # Best effort: a failing journalctl yields no events. Exception (not
        # a bare except) lets KeyboardInterrupt/SystemExit propagate.
        return []
|
|
|
|
|
|
def summarize_errors(analysis_result: Dict[str, Any]) -> Dict[str, int]:
    """
    Add up the dmesg and journalctl error counts per category.

    Args:
        analysis_result: Report dict; the counts are read from
            analysis_result["dmesg_analysis"]["error_counts"] and
            analysis_result["journal_analysis"]["error_counts"].
            Missing keys count as zero.

    Returns:
        Dict[str, int]: Total count for every category in
        HARDWARE_ERROR_PATTERNS.
    """
    from_dmesg = analysis_result.get("dmesg_analysis", {}).get("error_counts", {})
    from_journal = analysis_result.get("journal_analysis", {}).get("error_counts", {})

    return {
        category: from_dmesg.get(category, 0) + from_journal.get(category, 0)
        for category in HARDWARE_ERROR_PATTERNS
    }
|
|
|
|
|
|
def collect_error_details(analysis_result: Dict[str, Any]) -> Dict[str, List[Dict[str, str]]]:
    """
    Group the recorded error entries by category.

    Args:
        analysis_result: Report dict whose "dmesg_analysis" and
            "journal_analysis" sections may each hold a "recent_errors" list.

    Returns:
        Dict[str, List[Dict]]: For every category that has entries, up to 10
        entries with distinct messages (dmesg entries first). Categories
        without entries are omitted.
    """
    combined = [
        *analysis_result.get("dmesg_analysis", {}).get("recent_errors", []),
        *analysis_result.get("journal_analysis", {}).get("recent_errors", []),
    ]

    details = {}
    for category in HARDWARE_ERROR_PATTERNS:
        matching = [entry for entry in combined if entry["type"] == category]
        if not matching:
            continue

        # De-duplicate by message text and cap the number kept per category.
        kept = []
        seen_texts = set()
        for entry in matching:
            text = entry["message"]
            if text in seen_texts or len(kept) >= 10:
                continue
            seen_texts.add(text)
            kept.append(entry)

        details[category] = kept

    return details
|
|
|
|
|
|
def identify_critical_events(analysis_result: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Pick out the recorded errors that need immediate attention.

    Every entry in the "recent_errors" lists of the dmesg and journalctl
    analyses is tested against a fixed table of critical patterns
    (case-insensitive). One entry may produce several events if it matches
    several patterns.

    Args:
        analysis_result: Report dict whose "dmesg_analysis" and
            "journal_analysis" sections may each hold a "recent_errors" list.

    Returns:
        List[Dict[str, Any]]: At most 20 distinct events, 'critical' before
        'warning', each with the keys "type", "description", "severity",
        "message" (truncated to 300 characters), "source" and
        "error_category".
    """
    critical_events = []

    # Merge the errors of both sources.
    all_errors = []
    all_errors.extend(analysis_result.get("dmesg_analysis", {}).get("recent_errors", []))
    all_errors.extend(analysis_result.get("journal_analysis", {}).get("recent_errors", []))

    # (regex, event type, description, severity)
    critical_patterns = [
        (r'Kernel panic', 'kernel_panic', '内核崩溃', 'critical'),
        (r'hardlockup', 'hard_lockup', 'CPU 硬死锁', 'critical'),
        (r'softlockup', 'soft_lockup', 'CPU 软死锁', 'critical'),
        (r'thermal.*shutdown', 'thermal_shutdown', '过热关机', 'critical'),
        (r'Hardware Error', 'hardware_error', '硬件错误', 'critical'),
        # Match "Fatal" on either side of "PCIe": kernel AER lines read
        # "PCIe Bus Error: severity=Uncorrected (Fatal)". The lookbehind keeps
        # "Non-Fatal" errors from being reported as fatal.
        (r'(?<!non-)fatal.*PCIe|PCIe.*(?<!non-)fatal', 'pcie_fatal', 'PCIe 致命错误', 'critical'),
        (r'I/O error.*sector', 'disk_io_error', '磁盘 I/O 错误', 'warning'),
        (r'Uncorrectable.*error', 'uncorrectable_error', '不可纠正错误', 'warning'),
        (r'out of memory.*kill', 'oom_kill', 'OOM 进程杀死', 'warning'),
        (r'GPU.*fallen.*bus', 'gpu_disconnect', 'GPU 断开连接', 'warning'),
        (r'Machine check', 'mce_error', '机器检查错误(MCE)', 'critical'),
        (r'EDAC.*error', 'edac_error', '内存 EDAC 错误', 'warning'),
    ]

    for error in all_errors:
        message = error.get("message", "")
        for pattern, event_type, description, severity in critical_patterns:
            if re.search(pattern, message, re.IGNORECASE):
                event = {
                    "type": event_type,
                    "description": description,
                    "severity": severity,
                    "message": message[:300],  # limit the length
                    "source": error.get("source", "unknown"),
                    "error_category": error.get("type", "unknown")
                }

                # Avoid duplicates.
                if event not in critical_events:
                    critical_events.append(event)

    # Most severe first; the sort is stable, so ties keep discovery order.
    severity_order = {'critical': 0, 'warning': 1, 'info': 2}
    critical_events.sort(key=lambda x: severity_order.get(x.get('severity', 'info'), 3))

    return critical_events[:20]  # limit the number of events
|
|
|
|
|
|
def get_kernel_panic_logs() -> List[Dict[str, str]]:
    """
    Look specifically for kernel panic messages in dmesg and journalctl.

    dmesg lines are kept when they contain 'Kernel panic' or (in any case)
    'sysrq'; journalctl kernel entries are fetched with `-k -g panic` and
    kept when they contain 'panic' (in any case). Each source is best
    effort: a failure in one is ignored and does not affect the other.

    Returns:
        List[Dict[str, str]]: Entries of the form
        {"source": "dmesg" | "journalctl", "message": <stripped line>}.
    """
    panics = []

    # Check dmesg.
    if check_command_exists('dmesg'):
        try:
            _, stdout, _ = execute_command(['dmesg'], check_returncode=False, timeout=10)

            for line in stdout.split('\n'):
                if 'Kernel panic' in line or 'sysrq' in line.lower():
                    panics.append({
                        "source": "dmesg",
                        "message": line.strip()
                    })
        except Exception:
            # Best effort. Exception (not a bare except) lets
            # KeyboardInterrupt/SystemExit propagate.
            pass

    # Check journalctl.
    if check_command_exists('journalctl'):
        try:
            _, stdout, _ = execute_command(
                ['journalctl', '-k', '--no-pager', '-g', 'panic'],
                check_returncode=False, timeout=10
            )

            for line in stdout.split('\n'):
                if 'panic' in line.lower():
                    panics.append({
                        "source": "journalctl",
                        "message": line.strip()
                    })
        except Exception:
            # Best effort, see above.
            pass

    return panics
|
|
|
|
|
|
def get_hardware_error_logs() -> Dict[str, List[Dict[str, str]]]:
    """
    Collect dmesg lines for specific kinds of hardware errors.

    Each dmesg line is tested against four case-insensitive patterns
    (MCE, ECC, I/O, thermal); a line can land in more than one bucket.
    Reading dmesg is best effort: on failure the buckets collected so far
    are returned.

    Returns:
        Dict[str, List[Dict[str, str]]]: Keys "mce_errors", "ecc_errors",
        "io_errors" and "thermal_errors", each holding at most 10 entries of
        the form {"message": <stripped line>, "type": <kind>}.
    """
    result = {
        "mce_errors": [],
        "ecc_errors": [],
        "io_errors": [],
        "thermal_errors": []
    }

    # (result key, entry type, compiled pattern); compiled once, outside the loop.
    matchers = (
        ("mce_errors", "mce", re.compile(r'Machine check|CMCI|hardware error', re.IGNORECASE)),
        ("ecc_errors", "ecc", re.compile(r'ECC|EDAC|memory error', re.IGNORECASE)),
        ("io_errors", "io", re.compile(r'I/O error|ata.*error|blk_update', re.IGNORECASE)),
        ("thermal_errors", "thermal", re.compile(r'thermal|overheat|critical temp', re.IGNORECASE)),
    )

    if check_command_exists('dmesg'):
        try:
            _, stdout, _ = execute_command(['dmesg'], check_returncode=False, timeout=10)

            for line in stdout.split('\n'):
                for key, kind, regex in matchers:
                    if regex.search(line):
                        result[key].append({
                            "message": line.strip(),
                            "type": kind
                        })
        except Exception:
            # Best effort. Exception (not a bare except) lets
            # KeyboardInterrupt/SystemExit propagate.
            pass

    # Limit the number of entries per bucket.
    for key in result:
        result[key] = result[key][:10]

    return result
|
|
|
|
|
|
def search_logs_by_keyword(keyword: str, max_lines: int = 100) -> List[str]:
    """
    Search dmesg and recent journalctl entries for a keyword.

    The match is a case-insensitive substring test. dmesg is searched first;
    journalctl is only consulted for its last `max_lines * 2` entries, so
    older journal matches are not found. Each source is best effort.

    Args:
        keyword: Text to search for.
        max_lines: Maximum number of lines to return; values <= 0 yield an
            empty list.

    Returns:
        List[str]: Matching lines, prefixed with "[dmesg] " or "[journal] ".
    """
    # The cap below is checked after each append, so without this guard a
    # non-positive max_lines would still return one match.
    if max_lines <= 0:
        return []

    results = []
    needle = keyword.lower()  # loop-invariant

    # Search dmesg.
    if check_command_exists('dmesg'):
        try:
            _, stdout, _ = execute_command(
                ['dmesg'],
                check_returncode=False, timeout=10
            )

            for line in stdout.split('\n'):
                if needle in line.lower():
                    results.append(f"[dmesg] {line.strip()}")
                    if len(results) >= max_lines:
                        return results
        except Exception:
            # Best effort. Exception (not a bare except) lets
            # KeyboardInterrupt/SystemExit propagate.
            pass

    # Search journalctl.
    if check_command_exists('journalctl'):
        try:
            _, stdout, _ = execute_command(
                ['journalctl', '-n', str(max_lines * 2), '--no-pager'],
                check_returncode=False, timeout=10
            )

            for line in stdout.split('\n'):
                if needle in line.lower():
                    results.append(f"[journal] {line.strip()}")
                    if len(results) >= max_lines:
                        return results
        except Exception:
            # Best effort, see above.
            pass

    return results
|
|
|
|
|
|
def get_system_logs(since: Optional[str] = None, until: Optional[str] = None) -> Dict[str, Any]:
    """
    Fetch raw system logs from dmesg, journalctl and /var/log/kern.log.

    Every source is best effort: one that is missing or fails is left as an
    empty string.

    Args:
        since: Start time (format: '2024-01-01 00:00:00'); passed to
            journalctl as --since. Not applied to dmesg or kern.log.
        until: End time; passed to journalctl as --until. Not applied to
            dmesg or kern.log.

    Returns:
        Dict[str, Any]: Keys "dmesg" (full output), "journalctl" (at most
        5000 entries) and "kern_log" (last 5000 lines of the file).
    """
    # Function-scope import keeps this block self-contained.
    from collections import deque

    result = {
        "dmesg": "",
        "journalctl": "",
        "kern_log": ""
    }

    # dmesg
    if check_command_exists('dmesg'):
        try:
            _, stdout, _ = execute_command(['dmesg'], check_returncode=False, timeout=10)
            result["dmesg"] = stdout
        except Exception:
            # Best effort. Exception (not a bare except) lets
            # KeyboardInterrupt/SystemExit propagate.
            pass

    # journalctl
    if check_command_exists('journalctl'):
        try:
            cmd = ['journalctl', '--no-pager', '-n', '5000']
            if since:
                cmd.extend(['--since', since])
            if until:
                cmd.extend(['--until', until])

            _, stdout, _ = execute_command(cmd, check_returncode=False, timeout=15)
            result["journalctl"] = stdout
        except Exception:
            # Best effort, see above.
            pass

    # /var/log/kern.log
    kern_log_path = '/var/log/kern.log'
    if os.path.exists(kern_log_path):
        try:
            with open(kern_log_path, 'r', encoding='utf-8', errors='ignore') as f:
                # A bounded deque keeps only the last 5000 lines while
                # streaming, instead of loading the whole file.
                result["kern_log"] = ''.join(deque(f, maxlen=5000))
        except OSError:
            # Unreadable file (e.g. permissions): leave the field empty.
            pass

    return result
|
|
|
|
|
|
# Script entry point: run the full analysis and print the report as JSON.
if __name__ == '__main__':
    import json

    report = analyze_logs()
    # ensure_ascii=False keeps non-ASCII text readable in the output.
    print(json.dumps(report, indent=2, ensure_ascii=False))
|