"""
ServerGuard - CPU inspection and stress-test module.

Collects CPU details, temperature readings, load averages and Machine Check
Exception (MCE) log entries, and can optionally run a CPU stress test.
"""

import logging
import os
import re
import sys
import time
from typing import Dict, Any, List, Optional

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils import (
    execute_command, check_command_exists, parse_key_value_output,
    safe_int, safe_float, require_root
)

logger = logging.getLogger(__name__)

# Temperature (deg C) above which a normal reading is reported as a warning.
_CPU_TEMP_WARNING_C = 85
# Temperature (deg C) above which the post-stress reading adds a warning.
_STRESS_TEMP_WARNING_C = 95
# Maximum number of matching dmesg lines kept verbatim in the MCE report.
_MAX_MCE_ERROR_LINES = 10
# Substrings in dmesg output that indicate a machine-check related event.
_MCE_KEYWORDS = ('Machine check events logged', 'Hardware Error', 'CMCI storm')

# One value line of `sensors -u`, e.g. "temp1_input: 45.000".
_SENSOR_VALUE_RE = re.compile(r'(temp\d+)_(input|max|crit):\s*(-?[\d.]+)')
# Summary lines such as "failed: 0" that mention failure but report none.
_STRESS_ZERO_FAILURES_RE = re.compile(r'\b(?:failed|errors?)\s*:\s*0\b')
# "cpu" row of the stress-ng --metrics-brief table:
#   [pid] cpu <bogo ops> <real s> <usr s> <sys s> <bogo ops/s (real)> ...
_STRESS_NG_CPU_ROW_RE = re.compile(
    r'\]\s+cpu\s+(\d+)\s+[\d.]+\s+[\d.]+\s+[\d.]+\s+([\d.]+)'
)


def run_cpu_check(stress_test: bool = False, stress_duration: int = 300) -> Dict[str, Any]:
    """
    Run the full CPU check.

    Args:
        stress_test: Whether to also run the CPU stress test.
        stress_duration: Stress-test duration in seconds.

    Returns:
        Dict[str, Any]: Report with the keys ``status`` ("success",
        "warning" or "error"), ``cpu_info``, ``temperature``,
        ``mce_errors``, ``load_average`` and ``stress_test``; ``error`` is
        added when an unexpected exception aborts the check.
    """
    result = {
        "status": "success",
        "cpu_info": {},
        "temperature": {},
        "mce_errors": {},
        "load_average": {},
        "stress_test": {}
    }

    try:
        result["cpu_info"] = get_cpu_details()

        result["temperature"] = get_cpu_temperature()
        if result["temperature"].get("status") == "warning":
            result["status"] = "warning"

        result["load_average"] = get_load_average()

        result["mce_errors"] = check_mce_errors()
        if result["mce_errors"].get("count", 0) > 0:
            result["status"] = "warning"

        if stress_test:
            result["stress_test"] = run_cpu_stress_test(stress_duration)
            if not result["stress_test"].get("passed", False):
                result["status"] = "error"

    except Exception as e:
        # Top-level boundary: report the failure instead of propagating it.
        result["status"] = "error"
        result["error"] = str(e)

    return result


def _read_sysfs_value(path: str) -> Optional[str]:
    """Return the stripped text content of *path*, or None if unreadable."""
    try:
        with open(path, 'r') as f:
            return f.read().strip()
    except (OSError, ValueError) as exc:
        logger.debug("Cannot read %s: %s", path, exc)
        return None


def _millidegrees_to_celsius(raw: str) -> Optional[float]:
    """Convert a millidegree-Celsius string to degrees, or None if invalid."""
    try:
        return safe_int(raw) / 1000.0
    except (TypeError, ValueError):
        return None


def _count_physical_cores(cpuinfo: str) -> int:
    """
    Count distinct physical cores described in /proc/cpuinfo text.

    A core is identified by its (physical id, core id) pair; counting
    "physical id" alone would yield the number of sockets. Returns 0 when
    the text carries no such pairs.
    """
    cores = set()
    for section in cpuinfo.split('\n\n'):
        package = re.search(r'^physical id\s*:\s*(\d+)', section, re.MULTILINE)
        core = re.search(r'^core id\s*:\s*(\d+)', section, re.MULTILINE)
        if package and core:
            cores.add((package.group(1), core.group(1)))
    return len(cores)


def get_cpu_details() -> Dict[str, Any]:
    """
    Collect CPU details from /proc/cpuinfo.

    Returns:
        Dict[str, Any]: ``model``, ``vendor``, ``architecture`` (the
        "cpu family" field), ``cores``, ``threads``,
        ``current_frequency_mhz``, ``bogomips``, ``flags`` and, when
        cpufreq data is available, ``frequency_info``. ``error`` holds the
        exception text if collection failed.
    """
    info = {
        "model": "Unknown",
        "architecture": "Unknown",
        "cores": 0,
        "threads": 0,
        "current_frequency_mhz": 0,
        "bogomips": 0,
        "flags": []
    }

    try:
        with open('/proc/cpuinfo', 'r') as f:
            content = f.read()

        # Per-CPU sections are separated by blank lines; describe the first.
        data = {}
        for line in content.split('\n\n')[0].split('\n'):
            if ':' in line:
                key, value = line.split(':', 1)
                data[key.strip()] = value.strip()

        info["model"] = data.get('model name', 'Unknown')
        info["vendor"] = data.get('vendor_id', 'Unknown')
        info["architecture"] = data.get('cpu family', 'Unknown')
        info["bogomips"] = safe_float(data.get('bogomips', 0))
        if 'flags' in data:
            info["flags"] = data['flags'].split()

        # One "processor" entry per logical CPU (hardware thread).
        info["threads"] = len(re.findall(r'^processor\s*:', content, re.MULTILINE))
        info["cores"] = _count_physical_cores(content)
        if info["cores"] == 0:
            # No topology fields available: assume one thread per core.
            info["cores"] = info["threads"]

        # Current frequency: first "cpu MHz" entry in the file.
        mhz = re.search(r'^cpu MHz\s*:\s*(\S+)', content, re.MULTILINE)
        if mhz:
            info["current_frequency_mhz"] = safe_float(mhz.group(1))

        freq_info = get_cpu_frequency_info()
        if freq_info:
            info["frequency_info"] = freq_info

    except Exception as e:
        info["error"] = str(e)

    return info


def get_cpu_frequency_info() -> Dict[str, Any]:
    """
    Read cpufreq scaling information for cpu0 from sysfs.

    Returns:
        Dict[str, Any]: Any of ``min_mhz``, ``max_mhz``, ``current_mhz``
        (converted from the kHz values in sysfs), ``governor`` and
        ``driver`` that could be read; empty when cpufreq is unavailable.
    """
    info = {}

    cpu0_path = '/sys/devices/system/cpu/cpu0/cpufreq'
    if not os.path.exists(cpu0_path):
        return info

    files = {
        "min_mhz": "scaling_min_freq",
        "max_mhz": "scaling_max_freq",
        "current_mhz": "scaling_cur_freq",
        "governor": "scaling_governor",
        "driver": "scaling_driver"
    }

    # Each file is read independently so one unreadable entry does not
    # discard the values that were read successfully.
    for key, filename in files.items():
        value = _read_sysfs_value(os.path.join(cpu0_path, filename))
        if value is None:
            continue
        if 'freq' in filename:
            # Frequencies are stored in kHz.
            try:
                info[key] = round(safe_int(value) / 1000, 2)
            except (TypeError, ValueError):
                continue
        else:
            info[key] = value

    return info


def _read_temperatures_from_sensors(sensors: Dict[str, Any]) -> List[float]:
    """
    Parse `sensors -u` output into *sensors* and return the input readings.

    *sensors* gains one entry per chip, holding the adapter name and a dict
    per ``tempN`` feature with ``current``, ``high`` and ``critical`` values.
    """
    temperatures = []
    try:
        _, stdout, _ = execute_command(
            ['sensors', '-u'],
            check_returncode=False,
            timeout=10
        )
    except Exception as exc:
        # Best effort: the exceptions execute_command may raise are not
        # visible here, so fall through to the other temperature sources.
        logger.debug("sensors command failed: %s", exc)
        return temperatures

    current_chip = None
    for raw_line in (stdout or '').split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        # Chip header lines are the only non-empty lines without a colon.
        if ':' not in line and not line.startswith('temp'):
            current_chip = line
            sensors[current_chip] = {}
            continue

        if line.startswith('Adapter:'):
            if current_chip:
                sensors[current_chip]["adapter"] = line.split(':', 1)[1].strip()
            continue

        if current_chip is None:
            continue

        match = _SENSOR_VALUE_RE.match(line)
        if not match:
            continue

        temp_name, field, raw_value = match.groups()
        value = safe_float(raw_value)
        chip = sensors[current_chip]
        if field == 'input':
            chip.setdefault(temp_name, {})["current"] = value
            temperatures.append(value)
        elif temp_name in chip:
            # Thresholds are only attached to features that have a reading.
            chip[temp_name]["high" if field == 'max' else "critical"] = value

    return temperatures


def _read_temperatures_from_thermal_zones(sensors: Dict[str, Any]) -> List[float]:
    """Read CPU-related zones under /sys/class/thermal into *sensors*."""
    temperatures = []
    thermal_path = '/sys/class/thermal'
    try:
        zones = sorted(os.listdir(thermal_path))
    except OSError:
        return temperatures

    for zone in zones:
        if not zone.startswith('thermal_zone'):
            continue
        zone_path = os.path.join(thermal_path, zone)

        zone_type = _read_sysfs_value(os.path.join(zone_path, 'type'))
        if zone_type is None:
            zone_type = 'unknown'

        raw_temp = _read_sysfs_value(os.path.join(zone_path, 'temp'))
        if raw_temp is None:
            continue
        # The value is in millidegrees Celsius.
        temp_c = _millidegrees_to_celsius(raw_temp)
        if temp_c is None:
            continue

        if 'x86_pkg_temp' in zone_type or 'cpu' in zone_type.lower():
            sensors[zone] = {
                "type": zone_type,
                "current": temp_c
            }
            temperatures.append(temp_c)

    return temperatures


def _read_temperatures_from_hwmon(sensors: Dict[str, Any]) -> List[float]:
    """Read every temp*_input under /sys/class/hwmon into *sensors*."""
    temperatures = []
    hwmon_path = '/sys/class/hwmon'
    try:
        devices = sorted(os.listdir(hwmon_path))
    except OSError:
        return temperatures

    for hwmon in devices:
        hwmon_dir = os.path.join(hwmon_path, hwmon)

        name = _read_sysfs_value(os.path.join(hwmon_dir, 'name'))
        if name is None:
            name = hwmon

        try:
            entries = sorted(os.listdir(hwmon_dir))
        except OSError:
            continue

        for entry in entries:
            if not (entry.startswith('temp') and entry.endswith('_input')):
                continue
            raw_temp = _read_sysfs_value(os.path.join(hwmon_dir, entry))
            if raw_temp is None:
                continue
            temp_c = _millidegrees_to_celsius(raw_temp)
            if temp_c is None:
                continue
            sensor_name = entry.replace('_input', '')
            sensors[f"{name}_{sensor_name}"] = {
                "current": temp_c
            }
            temperatures.append(temp_c)

    return temperatures


def get_cpu_temperature() -> Dict[str, Any]:
    """
    Collect CPU temperature readings.

    Sources are tried in order until one yields readings: the `sensors`
    command, /sys/class/thermal, then /sys/class/hwmon.

    Returns:
        Dict[str, Any]: ``status`` ("success", "warning" or "unknown"),
        ``sensors`` (per-sensor details), ``current_c`` (average of all
        readings), ``max_c`` (highest reading, only present when readings
        exist), ``warning`` (message, when applicable), plus
        ``high_threshold_c`` and ``critical_threshold_c`` which this
        function leaves as None.
    """
    result = {
        "status": "success",
        "sensors": {},
        "current_c": None,
        "high_threshold_c": None,
        "critical_threshold_c": None
    }

    temperatures = []

    # Method 1: lm-sensors.
    if check_command_exists('sensors'):
        temperatures = _read_temperatures_from_sensors(result["sensors"])

    # Method 2: kernel thermal zones.
    if not temperatures:
        temperatures = _read_temperatures_from_thermal_zones(result["sensors"])

    # Method 3: raw hwmon inputs.
    if not temperatures:
        temperatures = _read_temperatures_from_hwmon(result["sensors"])

    if temperatures:
        result["current_c"] = round(sum(temperatures) / len(temperatures), 1)
        result["max_c"] = round(max(temperatures), 1)

        if result["max_c"] > _CPU_TEMP_WARNING_C:
            result["status"] = "warning"
            result["warning"] = f"CPU 温度过高: {result['max_c']}°C"
    else:
        result["status"] = "unknown"
        result["warning"] = "无法获取 CPU 温度信息"

    return result


def get_load_average() -> Dict[str, Any]:
    """
    Read the system load averages from /proc/loadavg.

    Returns:
        Dict[str, Any]: ``1min``, ``5min``, ``15min``, ``cores`` and the
        per-core ``relative_1min``/``relative_5min``/``relative_15min``
        values; empty when the file cannot be read or parsed.
    """
    result = {}

    try:
        with open('/proc/loadavg', 'r') as f:
            load_data = f.read().strip().split()
    except OSError as exc:
        logger.debug("Cannot read /proc/loadavg: %s", exc)
        return result

    if len(load_data) < 3:
        return result

    labels = ("1min", "5min", "15min")
    num_cores = os.cpu_count() or 1
    try:
        for label, raw in zip(labels, load_data):
            result[label] = safe_float(raw)

        # Relative load: load average divided by the logical CPU count.
        result["cores"] = num_cores
        for label in labels:
            result[f"relative_{label}"] = round(result[label] / num_cores, 2)
    except (TypeError, ValueError) as exc:
        logger.debug("Cannot compute relative load: %s", exc)

    return result


def check_mce_errors() -> Dict[str, Any]:
    """
    Look for Machine Check Exception (MCE) events.

    Returns:
        Dict[str, Any]: ``count`` (matching dmesg lines plus "MCE"
        occurrences in mcelog output), ``errors`` (up to 10 matching dmesg
        lines), ``status`` ("ok" or "warning"), and the optional flags
        ``mcelog_available`` and ``mcelog_device``.
    """
    result = {
        "count": 0,
        "errors": [],
        "status": "ok"
    }

    # Method 1: scan the kernel log.
    if check_command_exists('dmesg'):
        try:
            _, stdout, _ = execute_command(
                ['dmesg'],
                check_returncode=False,
                timeout=10
            )
        except Exception as exc:
            # Best effort: a failing dmesg must not abort the whole check.
            logger.debug("dmesg failed: %s", exc)
        else:
            for line in (stdout or '').split('\n'):
                if any(keyword in line for keyword in _MCE_KEYWORDS):
                    result["count"] += 1
                    if len(result["errors"]) < _MAX_MCE_ERROR_LINES:
                        result["errors"].append(line.strip())
                    result["status"] = "warning"

    # Method 2: query the mcelog daemon.
    if check_command_exists('mcelog'):
        try:
            _, stdout, _ = execute_command(
                ['mcelog', '--client'],
                check_returncode=False,
                timeout=5
            )
        except Exception as exc:
            logger.debug("mcelog failed: %s", exc)
        else:
            stdout = stdout or ''
            if stdout.strip() and 'no machine check' not in stdout.lower():
                result["count"] += stdout.count('MCE')
                result["status"] = "warning"
            result["mcelog_available"] = True

    # Method 3: note whether the mcelog device node exists.
    if os.path.exists('/dev/mcelog'):
        result["mcelog_device"] = True

    return result


def _stress_output_has_errors(output: str) -> bool:
    """
    Return True when stress tool output reports a failure.

    Lines mentioning "error" or "fail" count as failures unless they are
    zero-count summary lines such as "failed: 0", which recent stress-ng
    versions print after a clean run. "unsuccessful run" is also treated as
    a failure.
    """
    for line in output.lower().splitlines():
        if 'unsuccessful' in line:
            return True
        if 'error' not in line and 'fail' not in line:
            continue
        if _STRESS_ZERO_FAILURES_RE.search(line):
            continue
        return True
    return False


def _parse_stress_ng_metrics(output: str) -> Dict[str, Any]:
    """Extract ``bogo_ops`` and ``bogo_ops_per_second`` from stress-ng output."""
    metrics = {}
    total = re.search(r'stress-ng:\s+cpu:\s+(\d+)\s+bogo ops', output)
    rate = re.search(r'(\d+\.\d+)\s+bogo ops per second', output)
    # Fallback: the tabular row printed by --metrics-brief.
    table_row = _STRESS_NG_CPU_ROW_RE.search(output)

    if total:
        metrics["bogo_ops"] = safe_int(total.group(1))
    elif table_row:
        metrics["bogo_ops"] = safe_int(table_row.group(1))

    if rate:
        metrics["bogo_ops_per_second"] = safe_float(rate.group(1))
    elif table_row:
        metrics["bogo_ops_per_second"] = safe_float(table_row.group(2))

    return metrics


@require_root
def run_cpu_stress_test(duration: int = 300) -> Dict[str, Any]:
    """
    Run a CPU stress test with stress-ng (preferred) or stress.

    Args:
        duration: Test duration in seconds.

    Returns:
        Dict[str, Any]: ``passed``, ``duration_seconds``, ``cpu_cores``,
        ``start_time``, ``end_time``, ``max_temperature`` (reading taken
        after the run), ``tool_used``, ``errors`` and, when available,
        ``bogo_ops``, ``bogo_ops_per_second``, ``warnings``,
        ``temperature_before`` and ``temperature_after``. ``note`` carries
        install instructions when no stress tool is found.
    """
    result = {
        "passed": False,
        "duration_seconds": duration,
        "cpu_cores": os.cpu_count() or 1,
        "start_time": None,
        "end_time": None,
        "max_temperature": None,
        "tool_used": None,
        "errors": []
    }

    if check_command_exists('stress-ng'):
        tool = "stress-ng"
        # --cpu 0 uses every CPU; --metrics-brief prints a short summary.
        cmd = [
            'stress-ng',
            '--cpu', '0',
            '--timeout', str(duration),
            '--metrics-brief'
        ]
        pre_run_message = f"[CPU STRESS TEST] 执行命令: {' '.join(cmd)}"
    elif check_command_exists('stress'):
        tool = "stress"
        num_cores = os.cpu_count() or 1
        cmd = ['stress', '--cpu', str(num_cores), '--timeout', str(duration)]
        pre_run_message = f"[CPU STRESS TEST] 使用 {num_cores} 个 CPU 核心"
    else:
        result["errors"].append("未找到压力测试工具 (stress-ng 或 stress)")
        result["note"] = "请安装 stress-ng 或 stress: yum install stress / apt install stress-ng"
        logger.error("[CPU STRESS TEST] 未找到压力测试工具 (stress-ng 或 stress)")
        return result

    result["tool_used"] = tool

    try:
        logger.info(f"[CPU STRESS TEST] 开始使用 {tool} 进行压力测试,持续时间: {duration}秒")
        result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')

        temp_before = get_cpu_temperature()
        logger.info(f"[CPU STRESS TEST] 测试前温度: {temp_before.get('max_c', 'N/A')}°C")

        logger.info(pre_run_message)
        _, stdout, stderr = execute_command(
            cmd,
            timeout=duration + 30,  # leave headroom beyond the tool's own timeout
            check_returncode=False
        )

        result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
        logger.info(f"[CPU STRESS TEST] {tool} 执行完成")

        temp_after = get_cpu_temperature()
        logger.info(f"[CPU STRESS TEST] 测试后温度: {temp_after.get('max_c', 'N/A')}°C")

        output = (stdout or '') + (stderr or '')

        if _stress_output_has_errors(output):
            result["passed"] = False
            result["errors"].append("压力测试过程中发现错误")
            logger.error("[CPU STRESS TEST] 压力测试执行过程中发现错误")
        else:
            result["passed"] = True
            logger.info("[CPU STRESS TEST] 压力测试通过")

        result.update(_parse_stress_ng_metrics(output))
        if "bogo_ops" in result:
            logger.info(f"[CPU STRESS TEST] Bogo ops: {result['bogo_ops']}")

        max_after = temp_after.get("max_c")
        if max_after is not None:
            result["max_temperature"] = max_after
            if max_after > _STRESS_TEMP_WARNING_C:
                result["warnings"] = [f"测试期间温度过高: {max_after}°C"]

        result["temperature_before"] = temp_before
        result["temperature_after"] = temp_after

    except Exception as e:
        result["passed"] = False
        result["errors"].append(str(e))
        logger.exception(f"[CPU STRESS TEST] {tool} 执行异常: {e}")

    return result


if __name__ == '__main__':
    import json
    print(json.dumps(run_cpu_check(stress_test=False), indent=2, ensure_ascii=False))