Files
ServerGuard/modules/sensors.py
2026-03-02 14:14:40 +08:00

546 lines
19 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
ServerGuard - 电源与主板传感器监控模块
监控电源、主板传感器数据,包括温度、电压、风扇转速等。
"""
import os
import re
from typing import Dict, Any, List, Optional
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils import (
execute_command, check_command_exists, parse_key_value_output,
safe_int, safe_float, require_root
)
def run_sensors_check() -> Dict[str, Any]:
    """
    Run every sensor probe and merge the results into a single report.

    Returns:
        Dict[str, Any]: report containing "status" ("success", "warning" or
        "error"), one section per probe, and "warnings" / "error" when
        there is something to report.
    """
    # Probes run in this order; each fills the report section of the same name.
    probes = (
        ("lm_sensors", get_lm_sensors_data),
        ("ipmi_sensors", get_ipmi_sensors_data),
        ("thermal_zones", get_thermal_zones),
        ("power_supplies", get_power_supply_info),
        ("ipmi_sel", get_ipmi_sel_logs),
    )

    report: Dict[str, Any] = {"status": "success"}
    for section, _ in probes:
        report[section] = {}

    try:
        for section, probe in probes:
            report[section] = probe()

        # Evaluate the collected data against the warning rules.
        found = check_sensor_warnings(report)
        if found:
            report["warnings"] = found
            report["status"] = "warning"
    except Exception as exc:
        report["status"] = "error"
        report["error"] = str(exc)

    return report
def _parse_sensors_raw_output(output: str) -> Dict[str, Any]:
    """Parse the text printed by ``sensors -u`` into a per-chip dictionary.

    Expected layout (one block per chip, blocks separated by blank lines)::

        coretemp-isa-0000
        Adapter: ISA adapter
        Core 0:
          temp2_input: 41.000
          temp2_crit: 100.000
          temp2_crit_alarm: 0.000

    Args:
        output: raw stdout of ``sensors -u``.

    Returns:
        ``{chip: {"adapter": str, "features": {feature: {...}}}}``. Features
        are keyed by the raw feature name taken from the sub-feature prefix
        (e.g. "temp2") and keep the printed label under "label". Known
        sub-features are stored as "value", "max", "min", "critical" and
        "alarm"; any other sub-feature is stored under its full name.
    """
    # Signed decimal number; the sign matters for sub-zero temperatures.
    number_re = re.compile(r'-?\d+(?:\.\d+)?')

    chips: Dict[str, Any] = {}
    chip: Optional[Dict[str, Any]] = None   # chip block currently being read
    label: Optional[str] = None             # label of the feature being read

    for raw_line in output.splitlines():
        line = raw_line.rstrip()
        if not line:
            # A blank line ends the current chip block.
            chip = None
            label = None
            continue

        if not line[0].isspace():
            if line.startswith('Adapter:'):
                if chip is not None:
                    chip["adapter"] = line.split(':', 1)[1].strip()
            elif chip is None:
                # First unindented line of a block is the chip name.
                chip = chips.setdefault(line.rstrip(':'), {"features": {}})
            elif line.endswith(':'):
                # Unindented "Label:" line opens a new feature.
                label = line[:-1].strip()
            continue

        # Indented "name: value" sub-feature line.
        if chip is None or label is None or ':' not in line:
            continue
        key, value_text = (part.strip() for part in line.split(':', 1))
        number = number_re.search(value_text)
        if not number:
            continue
        value = safe_float(number.group(0))

        base, _, suffix = key.partition('_')
        feature = chip["features"].setdefault(base, {"label": label})

        # Match on the exact suffix: substring tests would let
        # "temp1_crit_alarm" or "temp1_max_hyst" overwrite the real limits.
        if suffix.endswith('alarm'):
            # Any raised alarm flag (plain, min, max, crit) marks the feature.
            feature["alarm"] = bool(feature.get("alarm", False) or value > 0)
        elif suffix == 'input':
            feature["value"] = value
        elif suffix == 'max':
            feature["max"] = value
        elif suffix == 'min':
            feature["min"] = value
        elif suffix == 'crit':
            feature["critical"] = value
        else:
            feature[key] = value

    return chips


def get_lm_sensors_data() -> Dict[str, Any]:
    """Collect sensor readings through the lm-sensors ``sensors`` command.

    Returns:
        Dict[str, Any]: "available" tells whether readings were obtained,
        "chips" holds the parsed per-chip data, "summary" the output of
        extract_sensor_summary(), and "error" a message when the tool is
        missing, prints nothing, or parsing fails.
    """
    result: Dict[str, Any] = {
        "available": False,
        "chips": {}
    }

    if not check_command_exists('sensors'):
        result["error"] = "lm-sensors 未安装"
        return result

    try:
        _, stdout, _ = execute_command(
            ['sensors', '-u'],
            check_returncode=False, timeout=15
        )
        if not stdout.strip():
            result["error"] = "无传感器数据,可能需要运行 sensors-detect"
            return result

        result["available"] = True
        result["chips"] = _parse_sensors_raw_output(stdout)
        # Condense the per-chip data into the commonly used metrics.
        result["summary"] = extract_sensor_summary(result["chips"])
    except Exception as e:
        result["error"] = str(e)

    return result
def extract_sensor_summary(chips: Dict[str, Any]) -> Dict[str, Any]:
    """Condense parsed chip data into per-category metric tables.

    Args:
        chips: mapping ``{chip: {"features": {feature: {...}}}}``; only
            features that carry a "value" entry are summarised.

    Returns:
        Dict with the categories "temperatures", "voltages", "fans",
        "powers" and "currents". Numbered temperature and fan features are
        keyed ``<chip>_temp<N>`` / ``<chip>_fan<N>``; everything else is
        keyed ``<chip>_<feature>``.
    """
    summary: Dict[str, Any] = {
        "temperatures": {},
        "voltages": {},
        "fans": {},
        "powers": {},
        "currents": {}
    }

    # Voltage inputs are named "in0", "in1", ...; the digit is required so
    # that names merely containing "in" ("Main Fan", "intrusion0") are not
    # filed under voltages.
    voltage_input_re = re.compile(r'(?<![a-z])in\d+')

    for chip_name, chip_data in chips.items():
        for feature_name, feature_data in chip_data.get("features", {}).items():
            value = feature_data.get("value")
            if value is None:
                continue

            lowered = feature_name.lower()
            generic_key = f"{chip_name}_{feature_name}"
            alarm = feature_data.get("alarm", False)

            if 'temp' in lowered or 'thermal' in lowered:
                numbered = re.search(r'temp(\d+)', lowered)
                # Unnumbered names ("CPU Temp") fall back to the generic key
                # instead of being dropped.
                key = f"{chip_name}_temp{numbered.group(1)}" if numbered else generic_key
                summary["temperatures"][key] = {
                    "value": value,
                    "max": feature_data.get("max"),
                    "critical": feature_data.get("critical"),
                    "alarm": alarm
                }
            elif (voltage_input_re.search(lowered)
                  or 'voltage' in lowered or 'vcc' in lowered):
                summary["voltages"][generic_key] = {
                    "value": value,
                    "min": feature_data.get("min"),
                    "max": feature_data.get("max"),
                    "alarm": alarm
                }
            elif 'fan' in lowered:
                numbered = re.search(r'fan(\d+)', lowered)
                key = f"{chip_name}_fan{numbered.group(1)}" if numbered else generic_key
                summary["fans"][key] = {
                    "rpm": value,
                    "min": feature_data.get("min"),
                    "alarm": alarm
                }
            elif 'power' in lowered or 'watt' in lowered:
                summary["powers"][generic_key] = {
                    "value": value,
                    "max": feature_data.get("max")
                }
            elif 'curr' in lowered or 'amp' in lowered:
                summary["currents"][generic_key] = {
                    "value": value,
                    "max": feature_data.get("max")
                }

    return summary
def get_ipmi_sensors_data() -> Dict[str, Any]:
    """Collect sensor readings by running ``ipmitool sensor``.

    Returns:
        Dict[str, Any]: "available" flag, "sensors" keyed by sensor name
        (each with "value", "unit" and "status" as printed), "categories"
        from categorize_ipmi_sensors(), plus "note" or "error" when the
        tool or device cannot be used.
    """
    report: Dict[str, Any] = {
        "available": False,
        "sensors": {}
    }

    if not check_command_exists('ipmitool'):
        report["note"] = "ipmitool 未安装"
        return report

    try:
        _, stdout, stderr = execute_command(
            ['ipmitool', 'sensor'],
            check_returncode=False, timeout=10
        )

        # These messages mean there is no usable BMC interface.
        device_errors = ('Could not open device', 'Driver not found')
        if any(message in stderr for message in device_errors):
            report["note"] = "IPMI 设备不可用"
            return report

        report["available"] = True

        # Each row is a pipe-separated record: name | value | unit | status | ...
        for row in stdout.split('\n'):
            if '|' not in row:
                continue
            columns = [column.strip() for column in row.split('|')]
            if len(columns) < 4:
                continue
            name, reading, unit, status = columns[:4]
            report["sensors"][name] = {
                "value": reading,
                "unit": unit,
                "status": status
            }

        # Group the sensors by kind.
        report["categories"] = categorize_ipmi_sensors(report["sensors"])
    except Exception as exc:
        report["error"] = str(exc)

    return report
def categorize_ipmi_sensors(sensors: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
    """Group IPMI sensors by kind using their name and unit.

    Args:
        sensors: mapping of sensor name to its data dict (the "unit" entry
            is consulted when present).

    Returns:
        Dict with the buckets "temperatures", "voltages", "fans", "power",
        "currents" and "other"; each maps sensor name to the same data
        object that was passed in.
    """
    bucket_names = ("temperatures", "voltages", "fans", "power", "currents", "other")
    buckets: Dict[str, Dict[str, Any]] = {bucket: {} for bucket in bucket_names}

    # Name fragments that identify a voltage rail.
    voltage_tags = ('volt', 'vcc', '3.3v', '5v', '12v')

    for sensor_name, reading in sensors.items():
        lowered = sensor_name.lower()
        unit_text = reading.get("unit", "").lower()

        # The first matching rule wins, so the order below matters.
        if 'temp' in lowered or unit_text == 'degrees c':
            target = "temperatures"
        elif unit_text == 'volts' or any(tag in lowered for tag in voltage_tags):
            target = "voltages"
        elif 'fan' in lowered or 'rpm' in unit_text:
            target = "fans"
        elif 'power' in lowered or 'watt' in unit_text:
            target = "power"
        elif 'current' in lowered or 'amp' in unit_text:
            target = "currents"
        else:
            target = "other"

        buckets[target][sensor_name] = reading

    return buckets
def _read_thermal_text(path: str) -> Optional[str]:
    """Return the stripped content of a sysfs attribute, or None if unreadable."""
    try:
        with open(path, 'r') as f:
            return f.read().strip()
    except (OSError, ValueError):
        # Missing file, or the driver refused the read; treat as "no data".
        return None


def _read_thermal_int(path: str) -> Optional[int]:
    """Return a sysfs attribute as an int, or None if missing or not numeric."""
    text = _read_thermal_text(path)
    if text is None:
        return None
    try:
        return int(text)
    except ValueError:
        return None


def _find_critical_trip_c(zone_path: str) -> Optional[float]:
    """Return the zone's critical trip temperature in Celsius, if it has one.

    Trip points are numbered attributes; the one whose ``trip_point_N_type``
    reads "critical" is used rather than assuming index 0.
    """
    try:
        entries = sorted(os.listdir(zone_path))
    except OSError:
        return None

    for entry in entries:
        match = re.fullmatch(r'trip_point_(\d+)_type', entry)
        if not match:
            continue
        if _read_thermal_text(os.path.join(zone_path, entry)) != 'critical':
            continue
        temp_file = os.path.join(zone_path, f'trip_point_{match.group(1)}_temp')
        millidegrees = _read_thermal_int(temp_file)
        # Non-positive values are treated as "no usable limit".
        if millidegrees is not None and millidegrees > 0:
            return millidegrees / 1000.0
    return None


def _read_thermal_zone(zone_path: str) -> Dict[str, Any]:
    """Collect type, temperature, policy and critical trip of one thermal zone."""
    zone_info: Dict[str, Any] = {}

    zone_type = _read_thermal_text(os.path.join(zone_path, 'type'))
    if zone_type is not None:
        zone_info["type"] = zone_type

    # The kernel reports millidegrees Celsius.
    temp_mc = _read_thermal_int(os.path.join(zone_path, 'temp'))
    if temp_mc is not None:
        zone_info["temperature_c"] = temp_mc / 1000.0

    policy = _read_thermal_text(os.path.join(zone_path, 'policy'))
    if policy is not None:
        zone_info["policy"] = policy

    critical_c = _find_critical_trip_c(zone_path)
    if critical_c is not None:
        zone_info["critical_temp_c"] = critical_c

    return zone_info


def _read_cooling_device(device_path: str) -> Dict[str, Any]:
    """Collect type and current/maximum state of one cooling device."""
    device_info: Dict[str, Any] = {}

    device_type = _read_thermal_text(os.path.join(device_path, 'type'))
    if device_type is not None:
        device_info["type"] = device_type

    for attribute, key in (('cur_state', "current_state"), ('max_state', "max_state")):
        state = _read_thermal_int(os.path.join(device_path, attribute))
        if state is not None:
            device_info[key] = state

    return device_info


def get_thermal_zones() -> Dict[str, Any]:
    """Read thermal zone and cooling device data from /sys/class/thermal.

    Attributes that are missing, unreadable or not numeric are left out of
    the entry instead of aborting the scan or producing a made-up reading.

    Returns:
        Dict[str, Any]: "zones" maps ``thermal_zoneN`` to a dict that may
        contain "type", "temperature_c", "policy" and "critical_temp_c";
        "policies" maps ``cooling_deviceN`` to a dict that may contain
        "type", "current_state" and "max_state"; "error" is set if the
        directory itself cannot be listed.
    """
    result: Dict[str, Any] = {
        "zones": {},
        "policies": {}
    }

    thermal_path = '/sys/class/thermal'
    if not os.path.exists(thermal_path):
        return result

    try:
        for entry in sorted(os.listdir(thermal_path)):
            entry_path = os.path.join(thermal_path, entry)
            if entry.startswith('thermal_zone'):
                result["zones"][entry] = _read_thermal_zone(entry_path)
            elif entry.startswith('cooling_device'):
                result["policies"][entry] = _read_cooling_device(entry_path)
    except Exception as e:
        result["error"] = str(e)

    return result
def _coerce_power_supply_value(text: str) -> Any:
    """Convert a sysfs attribute string to int or float when purely numeric.

    Anything else (e.g. "Battery", "Charging", multi-line uevent data) is
    returned unchanged so that textual attributes are preserved.
    """
    if re.fullmatch(r'-?[0-9]+', text):
        return int(text)
    if re.fullmatch(r'-?[0-9]+\.[0-9]+', text):
        return float(text)
    return text


def get_power_supply_info() -> Dict[str, Any]:
    """Read every attribute of every device under /sys/class/power_supply.

    Returns:
        Dict[str, Any]: "supplies" is a list with one dict per device,
        holding "name" plus each readable attribute file (numeric values
        converted to int/float, everything else kept as text); "error" is
        set if the directory cannot be listed.
    """
    result: Dict[str, Any] = {
        "supplies": []
    }

    power_supply_path = '/sys/class/power_supply'
    if not os.path.exists(power_supply_path):
        return result

    try:
        for supply_name in sorted(os.listdir(power_supply_path)):
            supply_path = os.path.join(power_supply_path, supply_name)
            supply_info: Dict[str, Any] = {"name": supply_name}

            try:
                attributes = sorted(os.listdir(supply_path))
            except OSError:
                # Not a directory or not accessible: keep the bare entry.
                attributes = []

            for attr in attributes:
                attr_path = os.path.join(supply_path, attr)
                if not os.path.isfile(attr_path):
                    continue
                try:
                    with open(attr_path, 'r') as f:
                        value = f.read().strip()
                except (OSError, UnicodeDecodeError):
                    # Best effort: some attributes cannot be read; skip them.
                    continue
                supply_info[attr] = _coerce_power_supply_value(value)

            result["supplies"].append(supply_info)
    except Exception as e:
        result["error"] = str(e)

    return result
def _parse_sel_entry(line: str) -> Optional[Dict[str, str]]:
    """Split one ``ipmitool sel elist`` line into a SEL entry dict.

    Standard records are printed as
    ``ID | Date | Time | Sensor | Event | Direction [| Details]``
    (NOTE(review): confirm against the ipmitool version in use). Date and
    time are joined into "datetime", the sensor becomes "source" and the
    description becomes "event". Four-field lines (e.g. OEM records) keep
    the positional mapping id / datetime / source / event.

    Returns:
        The entry dict, or None for blank lines and lines with fewer than
        four fields.
    """
    if not line.strip():
        return None

    fields = [field.strip() for field in line.split('|')]

    if len(fields) >= 5:
        entry = {
            "id": fields[0],
            "datetime": f"{fields[1]} {fields[2]}".strip(),
            "source": fields[3],
            "event": fields[4],
        }
        if len(fields) > 5 and fields[5]:
            entry["direction"] = fields[5]
        details = ' | '.join(field for field in fields[6:] if field)
        if details:
            entry["details"] = details
        return entry

    if len(fields) == 4:
        return {
            "id": fields[0],
            "datetime": fields[1],
            "source": fields[2],
            "event": fields[3],
        }

    return None


def get_ipmi_sel_logs() -> Dict[str, Any]:
    """Read the IPMI SEL (System Event Log) via ``ipmitool sel elist``.

    Returns:
        Dict[str, Any]: "available" flag, "entries" (all parsed records),
        "critical_events" and "hardware_errors" (records whose sensor or
        description contains one of the respective keywords), the counters
        "total_entries", "critical_count" and "hardware_error_count", plus
        "note" or "error" when the tool or device cannot be used.
    """
    result: Dict[str, Any] = {
        "available": False,
        "entries": [],
        "hardware_errors": [],
        "critical_events": []
    }

    if not check_command_exists('ipmitool'):
        result["note"] = "ipmitool 未安装"
        return result

    try:
        _, stdout, stderr = execute_command(
            ['ipmitool', 'sel', 'elist'],
            check_returncode=False, timeout=15
        )
        if 'Could not open device' in stderr or 'Driver not found' in stderr:
            result["note"] = "IPMI 设备不可用"
            return result

        result["available"] = True

        critical_keywords = ['critical', 'failure', 'error', 'thermal', 'voltage', 'power']
        hardware_keywords = ['memory', 'processor', 'hard drive', 'fan', 'power supply', 'temperature']

        for line in stdout.split('\n'):
            entry = _parse_sel_entry(line)
            if entry is None:
                continue
            result["entries"].append(entry)

            # Search sensor and description together: severity words such as
            # "critical" sit in the description, component names in the sensor.
            searchable = f'{entry["source"]} {entry["event"]}'.lower()
            if any(kw in searchable for kw in critical_keywords):
                result["critical_events"].append(entry)
            if any(kw in searchable for kw in hardware_keywords):
                result["hardware_errors"].append(entry)

        result["total_entries"] = len(result["entries"])
        result["critical_count"] = len(result["critical_events"])
        result["hardware_error_count"] = len(result["hardware_errors"])
    except Exception as e:
        result["error"] = str(e)

    return result
def check_sensor_warnings(sensor_data: Dict[str, Any]) -> List[str]:
    """Derive human-readable warnings from the collected sensor report.

    Args:
        sensor_data: report with the optional sections "lm_sensors",
            "ipmi_sensors", "ipmi_sel" and "thermal_zones".

    Returns:
        List[str]: one message per detected problem, empty when none.
    """
    warnings: List[str] = []

    # --- lm-sensors summary ---
    lm_sensors = sensor_data.get("lm_sensors") or {}
    summary = lm_sensors.get("summary") or {}

    for name, temp_data in summary.get("temperatures", {}).items():
        value = temp_data.get("value")
        if temp_data.get("alarm"):
            warnings.append(f"温度传感器 {name} 告警: {value}°C")
        elif value is not None and value > 90:
            warnings.append(f"温度传感器 {name} 温度过高: {value}°C")

    for name, volt_data in summary.get("voltages", {}).items():
        if volt_data.get("alarm"):
            warnings.append(f"电压传感器 {name} 告警: {volt_data.get('value')}V")

    for name, fan_data in summary.get("fans", {}).items():
        # "min" may be present with value None when the chip reports no
        # lower limit; that must not be compared with a number.
        minimum = fan_data.get("min")
        if fan_data.get("alarm"):
            warnings.append(f"风扇 {name} 告警: {fan_data.get('rpm')} RPM")
        elif fan_data.get("rpm", 0) == 0 and minimum is not None and minimum > 0:
            warnings.append(f"风扇 {name} 可能已停止: {fan_data.get('rpm')} RPM")

    # --- IPMI sensors ---
    # Accept both the spelled-out words and the abbreviated threshold
    # states (nc = non-critical, cr = critical, nr = non-recoverable, with
    # optional l/u prefix for lower/upper).
    abnormal_states = {
        'critical', 'non-recoverable', 'warning',
        'nc', 'cr', 'nr',
        'lnc', 'lcr', 'lnr', 'unc', 'ucr', 'unr',
    }
    ipmi_sensors = sensor_data.get("ipmi_sensors") or {}
    for name, data in ipmi_sensors.get("sensors", {}).items():
        status = str(data.get("status") or "").strip().lower()
        if status in abnormal_states:
            warnings.append(f"IPMI 传感器 {name} 状态异常: {data.get('status')}")

    # --- IPMI SEL ---
    ipmi_sel = sensor_data.get("ipmi_sel") or {}
    if ipmi_sel.get("critical_count", 0) > 0:
        warnings.append(f"IPMI SEL 中有 {ipmi_sel['critical_count']} 个关键事件")

    # --- thermal zones ---
    thermal_zones = sensor_data.get("thermal_zones") or {}
    for zone_name, zone_data in thermal_zones.get("zones", {}).items():
        temp = zone_data.get("temperature_c")
        if temp is None:
            continue
        critical = zone_data.get("critical_temp_c")
        # A missing or non-positive limit would flag every zone; use the
        # default of 100 instead.
        if critical is None or critical <= 0:
            critical = 100
        if temp > critical * 0.9:  # above 90% of the critical temperature
            warnings.append(f"Thermal zone {zone_name} 温度接近临界值: {temp}°C (临界: {critical}°C)")

    return warnings
if __name__ == '__main__':
    # Stand-alone run: print the full sensor report as readable JSON.
    import json

    report = run_sensors_check()
    print(json.dumps(report, indent=2, ensure_ascii=False))