546 lines
19 KiB
Python
546 lines
19 KiB
Python
"""
|
||
ServerGuard - 电源与主板传感器监控模块
|
||
|
||
监控电源、主板传感器数据,包括温度、电压、风扇转速等。
|
||
"""
|
||
|
||
import os
|
||
import re
|
||
from typing import Dict, Any, List, Optional
|
||
|
||
import sys
|
||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||
|
||
from utils import (
|
||
execute_command, check_command_exists, parse_key_value_output,
|
||
safe_int, safe_float, require_root
|
||
)
|
||
|
||
|
||
def run_sensors_check() -> Dict[str, Any]:
    """
    Run every sensor collector and merge the results into one report.

    Returns:
        Dict[str, Any]: A report with one entry per data source
        ("lm_sensors", "ipmi_sensors", "thermal_zones", "power_supplies",
        "ipmi_sel") and a "status" field. "status" is "success" by default,
        "warning" when check_sensor_warnings() returns messages (stored
        under "warnings"), and "error" when a collector raises (the
        exception text is stored under "error").
    """
    section_names = (
        "lm_sensors",
        "ipmi_sensors",
        "thermal_zones",
        "power_supplies",
        "ipmi_sel",
    )
    report: Dict[str, Any] = {"status": "success"}
    for section in section_names:
        # Every section starts empty so the report keeps its shape even if
        # a later collector fails.
        report[section] = {}

    try:
        collectors = (
            ("lm_sensors", get_lm_sensors_data),
            ("ipmi_sensors", get_ipmi_sensors_data),
            ("thermal_zones", get_thermal_zones),
            ("power_supplies", get_power_supply_info),
            ("ipmi_sel", get_ipmi_sel_logs),
        )
        for section, collect in collectors:
            report[section] = collect()

        # Evaluate warning conditions over everything gathered above.
        found = check_sensor_warnings(report)
        if found:
            report["warnings"] = found
            report["status"] = "warning"
    except Exception as exc:
        report["status"] = "error"
        report["error"] = str(exc)

    return report
|
||
|
||
|
||
# Signed decimal number as printed in the value column of `sensors -u`.
_SENSOR_VALUE_RE = re.compile(r'[-+]?\d+(?:\.\d+)?')


def _store_subfeature(feature: Dict[str, Any], key: str, value: Any) -> None:
    """Record one `<feature>_<suffix>` reading on its feature dict.

    The suffix (text after the first underscore) is matched exactly so that
    e.g. `temp1_crit_alarm` or `temp1_max_hyst` can no longer overwrite the
    "critical" / "max" limits.
    """
    _, _, suffix = key.partition('_')

    if suffix == 'input':
        feature["value"] = value
    elif suffix in ('max', 'min'):
        feature[suffix] = value
    elif suffix == 'crit':
        feature["critical"] = value
    elif suffix == 'alarm' or suffix.endswith('_alarm'):
        # Any raised alarm flag (generic or limit-specific) marks the
        # feature as alarming; a later cleared flag must not reset it.
        feature["alarm"] = bool(feature.get("alarm")) or value > 0
    else:
        feature[key] = value


def _parse_sensors_raw_output(raw: str) -> Dict[str, Any]:
    """Parse the text printed by `sensors -u` into a chip -> features mapping.

    Expected layout (lm-sensors raw mode): a chip name on an unindented line
    without a trailing colon, an `Adapter: ...` line, then for each feature
    an unindented label ending in ':' followed by indented
    `name: value` subfeature lines.
    """
    chips: Dict[str, Any] = {}
    chip = None
    feature = None

    for raw_line in raw.split('\n'):
        line = raw_line.rstrip()

        if not line:
            # Blank lines separate chips.
            chip = None
            feature = None
            continue

        if line[0] in ' \t':
            # Indented line: a subfeature of the current feature.
            if feature is None:
                continue
            key, separator, value_text = line.strip().partition(':')
            if not separator:
                continue
            number = _SENSOR_VALUE_RE.search(value_text)
            if number:
                # safe_float is the project helper the original code used
                # here; it is assumed to return a float.
                _store_subfeature(
                    feature, key.strip(), safe_float(number.group(0))
                )
            continue

        if line.startswith('Adapter:'):
            if chip is not None:
                chip["adapter"] = line.split(':', 1)[1].strip()
            continue

        if line.endswith(':'):
            # Unindented label ending in ':' opens a new feature.
            if chip is not None:
                feature = chip["features"].setdefault(line[:-1].strip(), {})
            continue

        # Any other unindented line is a chip name.
        chip = chips.setdefault(line, {"features": {}})
        feature = None

    return chips


def get_lm_sensors_data() -> Dict[str, Any]:
    """Collect lm-sensors readings by running and parsing `sensors -u`.

    Returns:
        Dict[str, Any]: Always contains "available" (bool) and "chips".
        When data was read, "chips" maps each chip name to
        {"adapter": str, "features": {label: {...}}} and "summary" holds the
        output of extract_sensor_summary(). "error" is set when the
        `sensors` command is missing, prints nothing, or processing raises.
    """
    result: Dict[str, Any] = {
        "available": False,
        "chips": {}
    }

    if not check_command_exists('sensors'):
        result["error"] = "lm-sensors 未安装"
        return result

    try:
        _, stdout, _ = execute_command(
            ['sensors', '-u'],
            check_returncode=False, timeout=15
        )

        if not stdout.strip():
            result["error"] = "无传感器数据,可能需要运行 sensors-detect"
            return result

        result["available"] = True
        result["chips"] = _parse_sensors_raw_output(stdout)

        # Condensed view of the commonly used readings.
        result["summary"] = extract_sensor_summary(result["chips"])

    except Exception as e:
        # Best-effort collector: report the failure instead of raising.
        result["error"] = str(e)

    return result
|
||
|
||
|
||
# Voltage feature names: hwmon-style "in<N>" or "vin"/"vin<N>", not preceded
# or followed by another letter, so words such as "inlet" or "pin" no longer
# count as voltages.
_VOLTAGE_LABEL_RE = re.compile(r'(?<![a-z])(?:in\d+|vin\d*)(?![a-z])')
_TEMP_INDEX_RE = re.compile(r'temp(\d+)')
_FAN_INDEX_RE = re.compile(r'fan(\d+)')


def extract_sensor_summary(chips: Dict[str, Any]) -> Dict[str, Any]:
    """Condense parsed chip data into per-category readings.

    Args:
        chips: Mapping of chip name to a dict whose "features" entry maps a
            feature name to its readings ("value", "max", "min",
            "critical", "alarm").

    Returns:
        Dict[str, Any]: Keys "temperatures", "voltages", "fans", "powers"
        and "currents", each mapping "<chip>_<sensor>" to the selected
        readings. Features without a "value" are skipped. Temperatures and
        fans are keyed "<chip>_temp<N>" / "<chip>_fan<N>" when the feature
        name carries a number, otherwise "<chip>_<feature name>".
    """
    summary: Dict[str, Any] = {
        "temperatures": {},
        "voltages": {},
        "fans": {},
        "powers": {},
        "currents": {}
    }

    for chip_name, chip_data in chips.items():
        for feature_name, feature_data in chip_data.get("features", {}).items():
            value = feature_data.get("value")
            if value is None:
                continue

            label = feature_name.lower()
            default_key = f"{chip_name}_{feature_name}"
            alarm = feature_data.get("alarm", False)

            if 'temp' in label or 'thermal' in label:
                numbered = _TEMP_INDEX_RE.search(label)
                # Labelled sensors ("CPU Temp") have no index; keep them
                # under their own name instead of dropping the reading.
                key = (
                    f"{chip_name}_temp{numbered.group(1)}"
                    if numbered else default_key
                )
                summary["temperatures"][key] = {
                    "value": value,
                    "max": feature_data.get("max"),
                    "critical": feature_data.get("critical"),
                    "alarm": alarm
                }

            elif (_VOLTAGE_LABEL_RE.search(label)
                  or 'voltage' in label or 'vcc' in label):
                summary["voltages"][default_key] = {
                    "value": value,
                    "min": feature_data.get("min"),
                    "max": feature_data.get("max"),
                    "alarm": alarm
                }

            elif 'fan' in label:
                numbered = _FAN_INDEX_RE.search(label)
                key = (
                    f"{chip_name}_fan{numbered.group(1)}"
                    if numbered else default_key
                )
                summary["fans"][key] = {
                    "rpm": value,
                    "min": feature_data.get("min"),
                    "alarm": alarm
                }

            elif 'power' in label or 'watt' in label:
                summary["powers"][default_key] = {
                    "value": value,
                    "max": feature_data.get("max")
                }

            elif 'curr' in label or 'amp' in label:
                summary["currents"][default_key] = {
                    "value": value,
                    "max": feature_data.get("max")
                }

    return summary
|
||
|
||
|
||
def get_ipmi_sensors_data() -> Dict[str, Any]:
    """Read IPMI sensor rows by running `ipmitool sensor`.

    Returns:
        Dict[str, Any]: "available" (bool) and "sensors", a mapping of
        sensor name to {"value", "unit", "status"} taken from the first
        four '|'-separated columns of each output row (all kept as text).
        "categories" holds the result of categorize_ipmi_sensors(). "note"
        is set when ipmitool or the IPMI device is missing, and "error"
        when processing raises.
    """
    report: Dict[str, Any] = {
        "available": False,
        "sensors": {}
    }

    if not check_command_exists('ipmitool'):
        report["note"] = "ipmitool 未安装"
        return report

    try:
        _, stdout, stderr = execute_command(
            ['ipmitool', 'sensor'],
            check_returncode=False, timeout=10
        )

        # ipmitool reports a missing device/driver on stderr.
        device_markers = ('Could not open device', 'Driver not found')
        if any(marker in stderr for marker in device_markers):
            report["note"] = "IPMI 设备不可用"
            return report

        report["available"] = True

        for row in stdout.split('\n'):
            # Rows without a column separator carry no sensor data.
            if '|' not in row:
                continue

            columns = [column.strip() for column in row.split('|')]
            if len(columns) < 4:
                continue

            name, reading, unit, status = columns[:4]
            report["sensors"][name] = {
                "value": reading,
                "unit": unit,
                "status": status
            }

        # Group the sensors by kind.
        report["categories"] = categorize_ipmi_sensors(report["sensors"])

    except Exception as exc:
        report["error"] = str(exc)

    return report
|
||
|
||
|
||
def categorize_ipmi_sensors(sensors: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
    """Group IPMI sensors by kind using their name and unit.

    Args:
        sensors: Mapping of sensor name to its data dict; the optional
            "unit" entry is compared case-insensitively.

    Returns:
        Dict[str, Dict[str, Any]]: Buckets "temperatures", "voltages",
        "fans", "power", "currents" and "other". Each sensor lands in the
        first bucket whose rule matches, checked in that order, and the
        original data dict is stored unchanged.
    """
    bucket_names = (
        "temperatures", "voltages", "fans", "power", "currents", "other"
    )
    buckets: Dict[str, Dict[str, Any]] = {name: {} for name in bucket_names}

    # Name fragments that identify a voltage rail.
    voltage_markers = ('volt', 'vcc', '3.3v', '5v', '12v')

    for sensor_name, reading in sensors.items():
        lowered = sensor_name.lower()
        unit = reading.get("unit", "").lower()

        if unit == 'degrees c' or 'temp' in lowered:
            target = "temperatures"
        elif unit == 'volts' or any(marker in lowered for marker in voltage_markers):
            target = "voltages"
        elif 'rpm' in unit or 'fan' in lowered:
            target = "fans"
        elif 'watt' in unit or 'power' in lowered:
            target = "power"
        elif 'amp' in unit or 'current' in lowered:
            target = "currents"
        else:
            target = "other"

        buckets[target][sensor_name] = reading

    return buckets
|
||
|
||
|
||
def _read_sysfs_text(path: str) -> Optional[str]:
    """Return the stripped contents of *path*, or None if it cannot be read."""
    try:
        with open(path, 'r') as handle:
            return handle.read().strip()
    except (OSError, UnicodeDecodeError):
        # A missing or unreadable attribute must not abort the whole scan.
        return None


def _critical_trip_text(zone_path: str) -> Optional[str]:
    """Return the raw temperature text of the zone's critical trip point.

    The trip whose `trip_point_<N>_type` reads "critical" is used. Only when
    the zone exposes no trip type files at all does this fall back to
    `trip_point_0_temp`, which is what the previous implementation always
    read.
    """
    try:
        entries = os.listdir(zone_path)
    except OSError:
        return None

    matches = (re.fullmatch(r'trip_point_(\d+)_type', entry) for entry in entries)
    indices = sorted(int(match.group(1)) for match in matches if match)

    if not indices:
        return _read_sysfs_text(os.path.join(zone_path, 'trip_point_0_temp'))

    for index in indices:
        prefix = os.path.join(zone_path, f'trip_point_{index}')
        if _read_sysfs_text(prefix + '_type') == 'critical':
            return _read_sysfs_text(prefix + '_temp')
    return None


def _read_thermal_zone(zone_path: str) -> Dict[str, Any]:
    """Collect type, temperature, policy and critical trip of one zone."""
    zone_info: Dict[str, Any] = {}

    zone_type = _read_sysfs_text(os.path.join(zone_path, 'type'))
    if zone_type is not None:
        zone_info["type"] = zone_type

    # sysfs reports millidegrees Celsius; convert to degrees.
    temp_text = _read_sysfs_text(os.path.join(zone_path, 'temp'))
    if temp_text is not None:
        zone_info["temperature_c"] = safe_int(temp_text) / 1000.0

    policy = _read_sysfs_text(os.path.join(zone_path, 'policy'))
    if policy is not None:
        zone_info["policy"] = policy

    critical_text = _critical_trip_text(zone_path)
    if critical_text is not None:
        zone_info["critical_temp_c"] = safe_int(critical_text) / 1000.0

    return zone_info


def _read_cooling_device(device_path: str) -> Dict[str, Any]:
    """Collect type, current state and maximum state of one cooling device."""
    device_info: Dict[str, Any] = {}

    device_type = _read_sysfs_text(os.path.join(device_path, 'type'))
    if device_type is not None:
        device_info["type"] = device_type

    cur_state = _read_sysfs_text(os.path.join(device_path, 'cur_state'))
    if cur_state is not None:
        device_info["current_state"] = safe_int(cur_state)

    max_state = _read_sysfs_text(os.path.join(device_path, 'max_state'))
    if max_state is not None:
        device_info["max_state"] = safe_int(max_state)

    return device_info


def get_thermal_zones() -> Dict[str, Any]:
    """Read thermal zones and cooling devices from /sys/class/thermal.

    Returns:
        Dict[str, Any]: "zones" maps each `thermal_zone*` entry to its
        "type", "temperature_c", "policy" and "critical_temp_c" (each key
        present only when the attribute could be read). "policies" maps
        each `cooling_device*` entry to its "type", "current_state" and
        "max_state". "error" is set when the scan itself raises. Both
        mappings are empty when the directory does not exist.
    """
    result: Dict[str, Any] = {
        "zones": {},
        "policies": {}
    }

    thermal_path = '/sys/class/thermal'

    if not os.path.exists(thermal_path):
        return result

    try:
        for entry in os.listdir(thermal_path):
            entry_path = os.path.join(thermal_path, entry)
            if entry.startswith('thermal_zone'):
                result["zones"][entry] = _read_thermal_zone(entry_path)
            elif entry.startswith('cooling_device'):
                result["policies"][entry] = _read_cooling_device(entry_path)

    except Exception as e:
        # Best-effort collector: report the failure instead of raising.
        result["error"] = str(e)

    return result
|
||
|
||
|
||
# Strict numeric shapes for sysfs attribute text.
_SYSFS_INT_RE = re.compile(r'[+-]?\d+')
_SYSFS_FLOAT_RE = re.compile(r'[+-]?(?:\d+\.\d*|\.\d+)')


def _coerce_sysfs_value(text: str) -> Any:
    """Convert attribute text to int or float when it is purely numeric.

    Anything else (status words, model names, multi-line `uevent` content)
    is returned unchanged, so textual attributes are never replaced by a
    numeric placeholder.
    """
    if _SYSFS_INT_RE.fullmatch(text):
        return int(text)
    if _SYSFS_FLOAT_RE.fullmatch(text):
        return float(text)
    return text


def get_power_supply_info() -> Dict[str, Any]:
    """Read every power supply exposed under /sys/class/power_supply.

    Returns:
        Dict[str, Any]: "supplies" is a list with one dict per entry,
        holding "name" plus one key per readable attribute file (numeric
        text converted to int/float, everything else kept as a string).
        "error" is set when listing a directory raises. The list is empty
        when the directory does not exist.
    """
    result: Dict[str, Any] = {
        "supplies": []
    }

    power_supply_path = '/sys/class/power_supply'

    if not os.path.exists(power_supply_path):
        return result

    try:
        for supply_name in os.listdir(power_supply_path):
            supply_path = os.path.join(power_supply_path, supply_name)
            supply_info: Dict[str, Any] = {"name": supply_name}

            for attr in os.listdir(supply_path):
                attr_path = os.path.join(supply_path, attr)
                if not os.path.isfile(attr_path):
                    continue

                try:
                    with open(attr_path, 'r') as f:
                        text = f.read().strip()
                except (OSError, UnicodeDecodeError):
                    # Unreadable or non-text attribute: skip it (best
                    # effort) without hiding unrelated errors.
                    continue

                supply_info[attr] = _coerce_sysfs_value(text)

            result["supplies"].append(supply_info)

    except Exception as e:
        # Best-effort collector: report the failure instead of raising.
        result["error"] = str(e)

    return result
|
||
|
||
|
||
def _parse_sel_line(line: str) -> Optional[Dict[str, Any]]:
    """Turn one `ipmitool sel elist` row into an entry dict, or None.

    Rows are expected as `id | date | time | sensor | event [| direction
    [| details...]]`. Rows with exactly four fields keep the previous
    mapping (id, datetime, source, event); shorter rows are ignored.
    """
    parts = [part.strip() for part in line.split('|')]

    if len(parts) >= 5:
        entry: Dict[str, Any] = {
            "id": parts[0],
            # Date and time are separate columns; join them.
            "datetime": f"{parts[1]} {parts[2]}".strip(),
            "source": parts[3],
            "event": parts[4]
        }
        if len(parts) >= 6 and parts[5]:
            entry["direction"] = parts[5]
        if len(parts) > 6:
            entry["details"] = " | ".join(parts[6:])
        return entry

    if len(parts) == 4:
        return {
            "id": parts[0],
            "datetime": parts[1],
            "source": parts[2],
            "event": parts[3]
        }

    return None


def get_ipmi_sel_logs() -> Dict[str, Any]:
    """Read the IPMI System Event Log by running `ipmitool sel elist`.

    Returns:
        Dict[str, Any]: "available" (bool); "entries" (every parsed row);
        "critical_events" and "hardware_errors" (rows whose sensor or event
        text contains one of the respective keywords); and, after a
        successful read, "total_entries", "critical_count" and
        "hardware_error_count". "note" is set when ipmitool or the IPMI
        device is missing, and "error" when processing raises.
    """
    result: Dict[str, Any] = {
        "available": False,
        "entries": [],
        "hardware_errors": [],
        "critical_events": []
    }

    if not check_command_exists('ipmitool'):
        result["note"] = "ipmitool 未安装"
        return result

    try:
        _, stdout, stderr = execute_command(
            ['ipmitool', 'sel', 'elist'],
            check_returncode=False, timeout=15
        )

        if 'Could not open device' in stderr or 'Driver not found' in stderr:
            result["note"] = "IPMI 设备不可用"
            return result

        result["available"] = True

        critical_keywords = ('critical', 'failure', 'error', 'thermal', 'voltage', 'power')
        hardware_keywords = ('memory', 'processor', 'hard drive', 'fan', 'power supply', 'temperature')

        for line in stdout.split('\n'):
            entry = _parse_sel_line(line)
            if entry is None:
                continue

            result["entries"].append(entry)

            # Match on both the sensor and the event description: the
            # keyword lists mix sensor kinds ("fan", "memory") with
            # description words ("critical", "failure").
            searchable = f'{entry["source"]} {entry["event"]}'.lower()

            if any(keyword in searchable for keyword in critical_keywords):
                result["critical_events"].append(entry)

            if any(keyword in searchable for keyword in hardware_keywords):
                result["hardware_errors"].append(entry)

        result["total_entries"] = len(result["entries"])
        result["critical_count"] = len(result["critical_events"])
        result["hardware_error_count"] = len(result["hardware_errors"])

    except Exception as e:
        # Best-effort collector: report the failure instead of raising.
        result["error"] = str(e)

    return result
|
||
|
||
|
||
def check_sensor_warnings(sensor_data: Dict[str, Any]) -> List[str]:
    """Derive human-readable warnings from the collected sensor report.

    Args:
        sensor_data: Report with optional "lm_sensors", "ipmi_sensors",
            "ipmi_sel" and "thermal_zones" sections; missing sections and
            missing or None readings are tolerated.

    Returns:
        List[str]: One message per problem found: lm-sensors alarms,
        temperatures above 90, fans at 0 RPM with a positive minimum, IPMI
        sensors in a non-ok threshold state, a non-zero SEL critical count,
        and thermal zones above 90% of their critical temperature.
    """
    warnings: List[str] = []

    lm_sensors = sensor_data.get("lm_sensors", {})
    summary = lm_sensors.get("summary", {})

    # Temperatures: chip alarm flag first, then a fixed 90 degree ceiling.
    for name, temp_data in summary.get("temperatures", {}).items():
        value = temp_data.get("value")
        if temp_data.get("alarm"):
            warnings.append(f"温度传感器 {name} 告警: {temp_data.get('value')}°C")
        elif value is not None and value > 90:
            warnings.append(f"温度传感器 {name} 温度过高: {temp_data.get('value')}°C")

    # Voltages: only the chip alarm flag is evaluated.
    for name, volt_data in summary.get("voltages", {}).items():
        if volt_data.get("alarm"):
            warnings.append(f"电压传感器 {name} 告警: {volt_data.get('value')}V")

    # Fans: alarm flag, or stopped although a minimum speed is configured.
    for name, fan_data in summary.get("fans", {}).items():
        # "min" may be present but None; treat that as "no minimum"
        # instead of comparing None with 0.
        fan_min = fan_data.get("min") or 0
        if fan_data.get("alarm"):
            warnings.append(f"风扇 {name} 告警: {fan_data.get('rpm')} RPM")
        elif fan_data.get("rpm", 0) == 0 and fan_min > 0:
            warnings.append(f"风扇 {name} 可能已停止: {fan_data.get('rpm')} RPM")

    # IPMI threshold states: the long words checked previously plus the
    # short codes ipmitool prints in its status column.
    abnormal_states = {
        'critical', 'non-recoverable', 'warning',
        'cr', 'nr', 'nc',
        'lcr', 'lnr', 'lnc',
        'ucr', 'unr', 'unc',
    }
    ipmi_sensors = sensor_data.get("ipmi_sensors", {})
    for name, data in ipmi_sensors.get("sensors", {}).items():
        status = (data.get("status") or "").lower()
        if status in abnormal_states:
            warnings.append(f"IPMI 传感器 {name} 状态异常: {data.get('status')}")

    # IPMI SEL critical events.
    ipmi_sel = sensor_data.get("ipmi_sel", {})
    if ipmi_sel.get("critical_count", 0) > 0:
        warnings.append(f"IPMI SEL 中有 {ipmi_sel['critical_count']} 个关键事件")

    # Thermal zones: warn above 90% of the critical temperature.
    thermal_zones = sensor_data.get("thermal_zones", {})
    for zone_name, zone_data in thermal_zones.get("zones", {}).items():
        temp = zone_data.get("temperature_c", 0)
        critical = zone_data.get("critical_temp_c", 100)
        if critical is None or critical <= 0:
            # A zero or negative trip temperature would make every reading
            # exceed the threshold; use the default limit instead.
            critical = 100
        if temp is not None and temp > critical * 0.9:
            warnings.append(f"Thermal zone {zone_name} 温度接近临界值: {temp}°C (临界: {critical}°C)")

    return warnings
|
||
|
||
|
||
if __name__ == '__main__':
    # Script entry point: run all checks and print the report as
    # pretty-printed JSON, keeping non-ASCII text readable.
    import json

    report = run_sensors_check()
    rendered = json.dumps(report, indent=2, ensure_ascii=False)
    print(rendered)
|