""" ServerGuard - 电源与主板传感器监控模块 监控电源、主板传感器数据,包括温度、电压、风扇转速等。 """ import os import re from typing import Dict, Any, List, Optional import sys sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from utils import ( execute_command, check_command_exists, parse_key_value_output, safe_int, safe_float, require_root ) def run_sensors_check() -> Dict[str, Any]: """ 执行传感器检测。 Returns: Dict[str, Any]: 检测结果 """ result = { "status": "success", "lm_sensors": {}, "ipmi_sensors": {}, "thermal_zones": {}, "power_supplies": {}, "ipmi_sel": {} } try: # 获取 lm-sensors 数据 result["lm_sensors"] = get_lm_sensors_data() # 获取 IPMI 传感器数据 result["ipmi_sensors"] = get_ipmi_sensors_data() # 获取 thermal zone 数据 result["thermal_zones"] = get_thermal_zones() # 获取电源信息 result["power_supplies"] = get_power_supply_info() # 获取 IPMI SEL 日志 result["ipmi_sel"] = get_ipmi_sel_logs() # 检查警告条件 warnings = check_sensor_warnings(result) if warnings: result["warnings"] = warnings result["status"] = "warning" except Exception as e: result["status"] = "error" result["error"] = str(e) return result def get_lm_sensors_data() -> Dict[str, Any]: """获取 lm-sensors 传感器数据。""" result = { "available": False, "chips": {} } if not check_command_exists('sensors'): result["error"] = "lm-sensors 未安装" return result try: # 检测传感器芯片 _, stdout, _ = execute_command( ['sensors', '-u'], check_returncode=False, timeout=15 ) if not stdout.strip(): result["error"] = "无传感器数据,可能需要运行 sensors-detect" return result result["available"] = True # 解析 sensors -u 输出 current_chip = None current_adapter = None current_feature = None for line in stdout.split('\n'): line = line.rstrip() if not line: continue # 检测芯片名称行(以冒号结尾的非缩进行) if not line.startswith(' ') and line.endswith(':'): current_chip = line.rstrip(':') result["chips"][current_chip] = { "features": {} } current_feature = None continue # 检测 Adapter 行 if line.strip().startswith('Adapter:'): current_adapter = line.split(':', 1)[1].strip() if current_chip: result["chips"][current_chip]["adapter"] = current_adapter continue # 检测功能名称行(缩进的非冒号结尾行) if line.startswith(' ') and not line.startswith(' ') and not line.endswith(':'): current_feature = line.strip().rstrip(':') if current_chip: result["chips"][current_chip]["features"][current_feature] = {} continue # 检测属性行(四个空格缩进) if line.startswith(' ') and ':' in line and current_chip and current_feature: key_value = line.strip().split(':', 1) if len(key_value) == 2: key = key_value[0].strip() value_str = key_value[1].strip() # 提取数值 value_match = re.search(r'([\d.]+)', value_str) if value_match: value = safe_float(value_match.group(1)) feature_data = result["chips"][current_chip]["features"][current_feature] # 分类存储 if '_input' in key: feature_data["value"] = value elif '_max' in key: feature_data["max"] = value elif '_min' in key: feature_data["min"] = value elif '_crit' in key: feature_data["critical"] = value elif '_alarm' in key: feature_data["alarm"] = value > 0 else: feature_data[key] = value # 提取常用传感器的汇总数据 result["summary"] = extract_sensor_summary(result["chips"]) except Exception as e: result["error"] = str(e) return result def extract_sensor_summary(chips: Dict[str, Any]) -> Dict[str, Any]: """从传感器数据中提取常用指标的汇总。""" summary = { "temperatures": {}, "voltages": {}, "fans": {}, "powers": {}, "currents": {} } for chip_name, chip_data in chips.items(): for feature_name, feature_data in chip_data.get("features", {}).items(): value = feature_data.get("value") if value is None: continue feature_lower = feature_name.lower() # 温度传感器 if 'temp' in feature_lower or 'thermal' in feature_lower: # 提取传感器编号 temp_match = re.search(r'temp(\d+)', feature_lower) if temp_match: temp_id = temp_match.group(1) summary["temperatures"][f"{chip_name}_temp{temp_id}"] = { "value": value, "max": feature_data.get("max"), "critical": feature_data.get("critical"), "alarm": feature_data.get("alarm", False) } # 电压传感器 elif 'in' in feature_lower or 'voltage' in feature_lower or 'vcc' in feature_lower: summary["voltages"][f"{chip_name}_{feature_name}"] = { "value": value, "min": feature_data.get("min"), "max": feature_data.get("max"), "alarm": feature_data.get("alarm", False) } # 风扇转速 elif 'fan' in feature_lower: fan_match = re.search(r'fan(\d+)', feature_lower) if fan_match: fan_id = fan_match.group(1) summary["fans"][f"{chip_name}_fan{fan_id}"] = { "rpm": value, "min": feature_data.get("min"), "alarm": feature_data.get("alarm", False) } # 功率传感器 elif 'power' in feature_lower or 'watt' in feature_lower: summary["powers"][f"{chip_name}_{feature_name}"] = { "value": value, "max": feature_data.get("max") } # 电流传感器 elif 'curr' in feature_lower or 'amp' in feature_lower: summary["currents"][f"{chip_name}_{feature_name}"] = { "value": value, "max": feature_data.get("max") } return summary def get_ipmi_sensors_data() -> Dict[str, Any]: """获取 IPMI 传感器数据。""" result = { "available": False, "sensors": {} } if not check_command_exists('ipmitool'): result["note"] = "ipmitool 未安装" return result try: # 检查 IPMI 是否可用 _, stdout, stderr = execute_command( ['ipmitool', 'sensor'], check_returncode=False, timeout=10 ) if 'Could not open device' in stderr or 'Driver not found' in stderr: result["note"] = "IPMI 设备不可用" return result result["available"] = True # 解析传感器列表 for line in stdout.split('\n'): if not line.strip() or '|' not in line: continue parts = [p.strip() for p in line.split('|')] if len(parts) >= 4: sensor_name = parts[0] sensor_value = parts[1] sensor_unit = parts[2] sensor_status = parts[3] result["sensors"][sensor_name] = { "value": sensor_value, "unit": sensor_unit, "status": sensor_status } # 分类传感器 result["categories"] = categorize_ipmi_sensors(result["sensors"]) except Exception as e: result["error"] = str(e) return result def categorize_ipmi_sensors(sensors: Dict[str, Any]) -> Dict[str, Dict[str, Any]]: """将 IPMI 传感器分类。""" categories = { "temperatures": {}, "voltages": {}, "fans": {}, "power": {}, "currents": {}, "other": {} } for name, data in sensors.items(): name_lower = name.lower() unit = data.get("unit", "").lower() if 'temp' in name_lower or unit == 'degrees c': categories["temperatures"][name] = data elif 'volt' in name_lower or unit == 'volts' or 'vcc' in name_lower or '3.3v' in name_lower or '5v' in name_lower or '12v' in name_lower: categories["voltages"][name] = data elif 'fan' in name_lower or 'rpm' in unit: categories["fans"][name] = data elif 'power' in name_lower or 'watt' in unit: categories["power"][name] = data elif 'current' in name_lower or 'amp' in unit: categories["currents"][name] = data else: categories["other"][name] = data return categories def get_thermal_zones() -> Dict[str, Any]: """从 thermal zone 获取温度信息。""" result = { "zones": {}, "policies": {} } thermal_path = '/sys/class/thermal' if not os.path.exists(thermal_path): return result try: for zone_name in os.listdir(thermal_path): if not zone_name.startswith('thermal_zone'): continue zone_path = os.path.join(thermal_path, zone_name) zone_info = {} # 读取类型 type_file = os.path.join(zone_path, 'type') if os.path.exists(type_file): with open(type_file, 'r') as f: zone_info["type"] = f.read().strip() # 读取温度 (毫摄氏度转换为摄氏度) temp_file = os.path.join(zone_path, 'temp') if os.path.exists(temp_file): with open(temp_file, 'r') as f: temp_mc = safe_int(f.read().strip()) zone_info["temperature_c"] = temp_mc / 1000.0 # 读取策略 policy_file = os.path.join(zone_path, 'policy') if os.path.exists(policy_file): with open(policy_file, 'r') as f: zone_info["policy"] = f.read().strip() # 读取临界温度 trip_point_file = os.path.join(zone_path, 'trip_point_0_temp') if os.path.exists(trip_point_file): with open(trip_point_file, 'r') as f: zone_info["critical_temp_c"] = safe_int(f.read().strip()) / 1000.0 result["zones"][zone_name] = zone_info # 读取 thermal 策略 for policy_file in os.listdir('/sys/class/thermal'): if policy_file.startswith('cooling_device'): policy_path = os.path.join('/sys/class/thermal', policy_file) policy_info = {} type_file = os.path.join(policy_path, 'type') if os.path.exists(type_file): with open(type_file, 'r') as f: policy_info["type"] = f.read().strip() cur_state_file = os.path.join(policy_path, 'cur_state') if os.path.exists(cur_state_file): with open(cur_state_file, 'r') as f: policy_info["current_state"] = safe_int(f.read().strip()) max_state_file = os.path.join(policy_path, 'max_state') if os.path.exists(max_state_file): with open(max_state_file, 'r') as f: policy_info["max_state"] = safe_int(f.read().strip()) result["policies"][policy_file] = policy_info except Exception as e: result["error"] = str(e) return result def get_power_supply_info() -> Dict[str, Any]: """获取电源信息。""" result = { "supplies": [] } power_supply_path = '/sys/class/power_supply' if not os.path.exists(power_supply_path): return result try: for supply_name in os.listdir(power_supply_path): supply_path = os.path.join(power_supply_path, supply_name) supply_info = {"name": supply_name} # 读取所有属性文件 for attr in os.listdir(supply_path): attr_path = os.path.join(supply_path, attr) if os.path.isfile(attr_path): try: with open(attr_path, 'r') as f: value = f.read().strip() # 尝试转换为数字 if value.isdigit(): supply_info[attr] = safe_int(value) else: try: supply_info[attr] = safe_float(value) except: supply_info[attr] = value except: pass result["supplies"].append(supply_info) except Exception as e: result["error"] = str(e) return result def get_ipmi_sel_logs() -> Dict[str, Any]: """获取 IPMI SEL(System Event Log)日志。""" result = { "available": False, "entries": [], "hardware_errors": [], "critical_events": [] } if not check_command_exists('ipmitool'): result["note"] = "ipmitool 未安装" return result try: # 获取 SEL 列表 _, stdout, stderr = execute_command( ['ipmitool', 'sel', 'elist'], check_returncode=False, timeout=15 ) if 'Could not open device' in stderr or 'Driver not found' in stderr: result["note"] = "IPMI 设备不可用" return result result["available"] = True # 解析 SEL 条目 critical_keywords = ['critical', 'failure', 'error', 'thermal', 'voltage', 'power'] hardware_keywords = ['memory', 'processor', 'hard drive', 'fan', 'power supply', 'temperature'] for line in stdout.split('\n'): if not line.strip(): continue # SEL 格式: ID | Date/Time | Source | Event parts = [p.strip() for p in line.split('|')] if len(parts) >= 4: entry = { "id": parts[0], "datetime": parts[1], "source": parts[2], "event": parts[3] } result["entries"].append(entry) # 检查是否为关键事件 event_lower = entry["event"].lower() if any(kw in event_lower for kw in critical_keywords): result["critical_events"].append(entry) # 检查是否为硬件错误 if any(kw in event_lower for kw in hardware_keywords): result["hardware_errors"].append(entry) result["total_entries"] = len(result["entries"]) result["critical_count"] = len(result["critical_events"]) result["hardware_error_count"] = len(result["hardware_errors"]) except Exception as e: result["error"] = str(e) return result def check_sensor_warnings(sensor_data: Dict[str, Any]) -> List[str]: """检查传感器警告条件。""" warnings = [] # 检查 lm-sensors 告警 lm_sensors = sensor_data.get("lm_sensors", {}) summary = lm_sensors.get("summary", {}) # 温度告警 for name, temp_data in summary.get("temperatures", {}).items(): if temp_data.get("alarm"): warnings.append(f"温度传感器 {name} 告警: {temp_data.get('value')}°C") elif temp_data.get("value", 0) > 90: warnings.append(f"温度传感器 {name} 温度过高: {temp_data.get('value')}°C") # 电压告警 for name, volt_data in summary.get("voltages", {}).items(): if volt_data.get("alarm"): warnings.append(f"电压传感器 {name} 告警: {volt_data.get('value')}V") # 风扇告警 for name, fan_data in summary.get("fans", {}).items(): if fan_data.get("alarm"): warnings.append(f"风扇 {name} 告警: {fan_data.get('rpm')} RPM") elif fan_data.get("rpm", 0) == 0 and fan_data.get("min", 0) > 0: warnings.append(f"风扇 {name} 可能已停止: {fan_data.get('rpm')} RPM") # 检查 IPMI 告警 ipmi_sensors = sensor_data.get("ipmi_sensors", {}) for name, data in ipmi_sensors.get("sensors", {}).items(): status = data.get("status", "").lower() if status in ['critical', 'non-recoverable', 'warning']: warnings.append(f"IPMI 传感器 {name} 状态异常: {data.get('status')}") # 检查 IPMI SEL 关键事件 ipmi_sel = sensor_data.get("ipmi_sel", {}) if ipmi_sel.get("critical_count", 0) > 0: warnings.append(f"IPMI SEL 中有 {ipmi_sel['critical_count']} 个关键事件") # 检查 thermal zone 温度 thermal_zones = sensor_data.get("thermal_zones", {}) for zone_name, zone_data in thermal_zones.get("zones", {}).items(): temp = zone_data.get("temperature_c", 0) critical = zone_data.get("critical_temp_c", 100) if temp > critical * 0.9: # 超过临界温度的 90% warnings.append(f"Thermal zone {zone_name} 温度接近临界值: {temp}°C (临界: {critical}°C)") return warnings if __name__ == '__main__': import json print(json.dumps(run_sensors_check(), indent=2, ensure_ascii=False))