first commit

This commit is contained in:
zj
2026-03-02 14:14:40 +08:00
commit c4f4fefa0a
20 changed files with 6037 additions and 0 deletions

545
modules/sensors.py Normal file
View File

@@ -0,0 +1,545 @@
"""
ServerGuard - 电源与主板传感器监控模块
监控电源、主板传感器数据,包括温度、电压、风扇转速等。
"""
import os
import re
from typing import Dict, Any, List, Optional
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils import (
execute_command, check_command_exists, parse_key_value_output,
safe_int, safe_float, require_root
)
def run_sensors_check() -> Dict[str, Any]:
    """
    Run every sensor probe and combine the results into one report.

    Returns:
        Dict[str, Any]: Report containing "status" ("success", "warning" or
        "error"), one section per probe, and additionally "warnings" when
        any were raised or "error" when a probe threw an exception.
    """
    report = {
        "status": "success",
        "lm_sensors": {},
        "ipmi_sensors": {},
        "thermal_zones": {},
        "power_supplies": {},
        "ipmi_sel": {}
    }

    try:
        # Probes run in this fixed order; if one raises, the sections that
        # follow keep their empty placeholder.
        probes = (
            ("lm_sensors", get_lm_sensors_data),
            ("ipmi_sensors", get_ipmi_sensors_data),
            ("thermal_zones", get_thermal_zones),
            ("power_supplies", get_power_supply_info),
            ("ipmi_sel", get_ipmi_sel_logs),
        )
        for section, probe in probes:
            report[section] = probe()

        # Derive warnings from everything collected above.
        found = check_sensor_warnings(report)
        if found:
            report.update(warnings=found, status="warning")
    except Exception as exc:
        report.update(status="error", error=str(exc))

    return report
def _parse_sensors_raw_output(raw_text: str) -> Dict[str, Any]:
    """
    Parse the raw text printed by ``sensors -u`` into a chip dictionary.

    Expected layout (chips are separated by a blank line)::

        coretemp-isa-0000
        Adapter: ISA adapter
        Package id 0:
          temp1_input: 45.000
          temp1_crit_alarm: 0.000

    Args:
        raw_text: Standard output of ``sensors -u``.

    Returns:
        Dict[str, Any]: ``{chip: {"features": {label: {...}}, "adapter": str}}``.
        Per feature, ``*_input`` is stored as "value", ``*_max`` as "max",
        ``*_min`` as "min", ``*_crit`` as "critical"; any ``*_alarm``
        sub-feature sets the boolean "alarm"; every other sub-feature is kept
        under its raw name.
    """
    field_names = {"input": "value", "max": "max", "min": "min", "crit": "critical"}
    chips: Dict[str, Any] = {}
    chip_name = None
    feature = None  # dict of the feature currently being filled

    for raw_line in raw_text.split('\n'):
        line = raw_line.rstrip()
        if not line:
            # A blank line ends the current chip section.
            chip_name = None
            feature = None
            continue

        if line[0] in ' \t':
            # Indented "name: value" sub-feature line.
            if feature is None or ':' not in line:
                continue
            key, _, value_text = line.strip().partition(':')
            key = key.strip()
            # Keep the sign: negative temperatures and voltages are legitimate.
            number = re.search(r'-?\d+(?:\.\d+)?', value_text)
            if not number:
                continue
            value = safe_float(number.group(0))
            # "temp1_crit_alarm" -> "crit_alarm"; exact suffix matching keeps
            # *_crit_alarm / *_max_hyst / *_lcrit from overwriting thresholds.
            suffix = key.partition('_')[2]
            if suffix == 'alarm' or suffix.endswith('_alarm'):
                feature["alarm"] = feature.get("alarm", False) or value > 0
            elif suffix in field_names:
                feature[field_names[suffix]] = value
            else:
                feature[key] = value
            continue

        if line.startswith('Adapter:'):
            if chip_name is not None:
                chips[chip_name]["adapter"] = line.split(':', 1)[1].strip()
            continue

        if chip_name is None:
            # First non-indented line of a section is the chip name.
            chip_name = line.rstrip(':')
            chips.setdefault(chip_name, {"features": {}})
            feature = None
            continue

        if line.endswith(':'):
            # Non-indented line ending with a colon is a feature label.
            label = line[:-1].strip()
            feature = chips[chip_name]["features"].setdefault(label, {})
        # Anything else (e.g. "(label)" placeholders) carries no reading.

    return chips


def get_lm_sensors_data() -> Dict[str, Any]:
    """
    Collect lm-sensors readings by running ``sensors -u``.

    Returns:
        Dict[str, Any]: "available" flag, parsed "chips", a "summary" built by
        extract_sensor_summary, and "error" when the tool is missing, prints
        nothing, or an exception occurs.
    """
    result = {
        "available": False,
        "chips": {}
    }

    if not check_command_exists('sensors'):
        result["error"] = "lm-sensors 未安装"
        return result

    try:
        _, stdout, _ = execute_command(
            ['sensors', '-u'],
            check_returncode=False, timeout=15
        )
        if not stdout.strip():
            result["error"] = "无传感器数据,可能需要运行 sensors-detect"
            return result

        result["available"] = True
        result["chips"] = _parse_sensors_raw_output(stdout)
        # Condensed view of the most commonly used readings.
        result["summary"] = extract_sensor_summary(result["chips"])
    except Exception as e:
        result["error"] = str(e)

    return result
def extract_sensor_summary(chips: Dict[str, Any]) -> Dict[str, Any]:
    """
    Condense parsed chip data into per-category dictionaries.

    Features without a "value" are skipped. Classification is based on the
    feature name, checked in this order: temperature, explicitly named
    voltage, fan, power, current, and finally any remaining name containing
    "in" (treated as a voltage input).

    Args:
        chips: Mapping of chip name to ``{"features": {name: {...}}}``.

    Returns:
        Dict[str, Any]: Dictionaries "temperatures", "voltages", "fans",
        "powers" and "currents". Numbered temperature and fan sensors are
        keyed ``<chip>_temp<N>`` / ``<chip>_fan<N>``; everything else is keyed
        ``<chip>_<feature name>``.
    """
    summary = {
        "temperatures": {},
        "voltages": {},
        "fans": {},
        "powers": {},
        "currents": {}
    }

    for chip_name, chip_data in chips.items():
        for feature_name, feature_data in chip_data.get("features", {}).items():
            value = feature_data.get("value")
            if value is None:
                continue

            feature_lower = feature_name.lower()
            alarm = feature_data.get("alarm", False)
            default_key = f"{chip_name}_{feature_name}"
            voltage_entry = {
                "value": value,
                "min": feature_data.get("min"),
                "max": feature_data.get("max"),
                "alarm": alarm
            }

            if 'temp' in feature_lower or 'thermal' in feature_lower:
                numbered = re.search(r'temp(\d+)', feature_lower)
                # Unnumbered labels (e.g. "CPU Temp") are kept under their own
                # name instead of being dropped.
                key = f"{chip_name}_temp{numbered.group(1)}" if numbered else default_key
                summary["temperatures"][key] = {
                    "value": value,
                    "max": feature_data.get("max"),
                    "critical": feature_data.get("critical"),
                    "alarm": alarm
                }
            elif ('voltage' in feature_lower or 'vcc' in feature_lower
                    or re.fullmatch(r'in\d+', feature_lower)):
                summary["voltages"][default_key] = voltage_entry
            elif 'fan' in feature_lower:
                numbered = re.search(r'fan(\d+)', feature_lower)
                key = f"{chip_name}_fan{numbered.group(1)}" if numbered else default_key
                summary["fans"][key] = {
                    "rpm": value,
                    "min": feature_data.get("min"),
                    "alarm": alarm
                }
            elif 'power' in feature_lower or 'watt' in feature_lower:
                summary["powers"][default_key] = {
                    "value": value,
                    "max": feature_data.get("max")
                }
            elif 'curr' in feature_lower or 'amp' in feature_lower:
                summary["currents"][default_key] = {
                    "value": value,
                    "max": feature_data.get("max")
                }
            elif 'in' in feature_lower:
                # Broad legacy match, tried last so it cannot swallow fan,
                # power or current labels that merely contain "in".
                summary["voltages"][default_key] = voltage_entry

    return summary
def get_ipmi_sensors_data() -> Dict[str, Any]:
    """
    Collect sensor readings by running ``ipmitool sensor``.

    Returns:
        Dict[str, Any]: "available" flag, "sensors" keyed by sensor name
        (each with the raw "value", "unit" and "status" column text),
        "categories" from categorize_ipmi_sensors, plus "note" when the
        tool or device is missing and "error" when an exception occurs.
    """
    report = {
        "available": False,
        "sensors": {}
    }

    if not check_command_exists('ipmitool'):
        report["note"] = "ipmitool 未安装"
        return report

    try:
        _, stdout, stderr = execute_command(
            ['ipmitool', 'sensor'],
            check_returncode=False, timeout=10
        )

        # These stderr messages mean there is no usable IPMI device.
        no_device_markers = ('Could not open device', 'Driver not found')
        if any(marker in stderr for marker in no_device_markers):
            report["note"] = "IPMI 设备不可用"
            return report

        report["available"] = True
        readings = report["sensors"]

        # Each useful row is pipe-separated: name | value | unit | status | ...
        for row in stdout.split('\n'):
            if row.strip() and '|' in row:
                columns = [column.strip() for column in row.split('|')]
                if len(columns) < 4:
                    continue
                name, reading, unit, state = columns[:4]
                readings[name] = {
                    "value": reading,
                    "unit": unit,
                    "status": state
                }

        report["categories"] = categorize_ipmi_sensors(readings)
    except Exception as exc:
        report["error"] = str(exc)

    return report
def categorize_ipmi_sensors(sensors: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
    """
    Group IPMI sensors by kind, using the sensor name and its unit.

    Rules are tried in order (temperature, voltage, fan, power, current);
    the first match wins and anything unmatched goes to "other".

    Args:
        sensors: Mapping of sensor name to its data dict (reads "unit").

    Returns:
        Dict[str, Dict[str, Any]]: Category name to ``{sensor name: data}``.
    """
    buckets = {
        label: {}
        for label in ("temperatures", "voltages", "fans", "power", "currents", "other")
    }
    voltage_markers = ('volt', 'vcc', '3.3v', '5v', '12v')

    for sensor_name, reading in sensors.items():
        lowered = sensor_name.lower()
        unit_text = reading.get("unit", "").lower()

        if unit_text == 'degrees c' or 'temp' in lowered:
            target = "temperatures"
        elif unit_text == 'volts' or any(marker in lowered for marker in voltage_markers):
            target = "voltages"
        elif 'rpm' in unit_text or 'fan' in lowered:
            target = "fans"
        elif 'watt' in unit_text or 'power' in lowered:
            target = "power"
        elif 'amp' in unit_text or 'current' in lowered:
            target = "currents"
        else:
            target = "other"

        buckets[target][sensor_name] = reading

    return buckets
def _read_thermal_attr(path: str) -> Optional[str]:
    """Return the stripped text of a sysfs attribute, or None if it is missing or unreadable."""
    try:
        with open(path, 'r') as handle:
            return handle.read().strip()
    except (OSError, ValueError):
        # Some zones expose attributes whose read fails; that must not abort
        # the scan of the remaining zones.
        return None


def _critical_trip_temp_c(zone_path: str) -> Optional[float]:
    """
    Return the zone's critical trip temperature in degrees Celsius, or None.

    The trip point whose ``trip_point_N_type`` reads "critical" is used. When
    the zone exposes no type files at all, ``trip_point_0_temp`` is used as a
    fallback.
    """
    try:
        entries = os.listdir(zone_path)
    except OSError:
        return None

    matches = (re.fullmatch(r'trip_point_(\d+)_type', entry) for entry in entries)
    trip_indexes = sorted(int(match.group(1)) for match in matches if match)

    for index in trip_indexes:
        trip_type = _read_thermal_attr(os.path.join(zone_path, f'trip_point_{index}_type'))
        if trip_type != 'critical':
            continue
        raw = _read_thermal_attr(os.path.join(zone_path, f'trip_point_{index}_temp'))
        if raw:
            return safe_int(raw) / 1000.0

    if not trip_indexes:
        raw = _read_thermal_attr(os.path.join(zone_path, 'trip_point_0_temp'))
        if raw:
            return safe_int(raw) / 1000.0
    return None


def get_thermal_zones() -> Dict[str, Any]:
    """
    Read thermal zones and cooling devices from ``/sys/class/thermal``.

    Returns:
        Dict[str, Any]: "zones" keyed by ``thermal_zoneN`` (optional "type",
        "temperature_c", "policy", "critical_temp_c"), "policies" keyed by
        ``cooling_deviceN`` (optional "type", "current_state", "max_state"),
        and "error" when the directory scan itself fails. Temperatures are
        converted from millidegrees to degrees Celsius.
    """
    result = {
        "zones": {},
        "policies": {}
    }
    thermal_path = '/sys/class/thermal'

    if not os.path.exists(thermal_path):
        return result

    try:
        for entry in sorted(os.listdir(thermal_path)):
            entry_path = os.path.join(thermal_path, entry)

            if entry.startswith('thermal_zone'):
                zone_info = {}

                zone_type = _read_thermal_attr(os.path.join(entry_path, 'type'))
                if zone_type is not None:
                    zone_info["type"] = zone_type

                raw_temp = _read_thermal_attr(os.path.join(entry_path, 'temp'))
                if raw_temp:
                    zone_info["temperature_c"] = safe_int(raw_temp) / 1000.0

                policy = _read_thermal_attr(os.path.join(entry_path, 'policy'))
                if policy is not None:
                    zone_info["policy"] = policy

                critical = _critical_trip_temp_c(entry_path)
                if critical is not None:
                    zone_info["critical_temp_c"] = critical

                result["zones"][entry] = zone_info

            elif entry.startswith('cooling_device'):
                device_info = {}

                device_type = _read_thermal_attr(os.path.join(entry_path, 'type'))
                if device_type is not None:
                    device_info["type"] = device_type

                cur_state = _read_thermal_attr(os.path.join(entry_path, 'cur_state'))
                if cur_state:
                    device_info["current_state"] = safe_int(cur_state)

                max_state = _read_thermal_attr(os.path.join(entry_path, 'max_state'))
                if max_state:
                    device_info["max_state"] = safe_int(max_state)

                result["policies"][entry] = device_info
    except Exception as e:
        result["error"] = str(e)

    return result
def _coerce_supply_value(text: str) -> Any:
    """Convert attribute text to int or float when it is numeric; otherwise return it unchanged."""
    if text.isdigit():
        try:
            return int(text)
        except ValueError:
            # isdigit() also accepts characters int() rejects (e.g. superscripts).
            return text
    try:
        return float(text)
    except ValueError:
        return text


def get_power_supply_info() -> Dict[str, Any]:
    """
    Read every attribute of every device under ``/sys/class/power_supply``.

    Returns:
        Dict[str, Any]: "supplies", a list with one dict per device holding
        its "name" plus each readable attribute file (numeric text becomes
        int/float, anything else stays a string), and "error" when the scan
        fails.
    """
    result = {
        "supplies": []
    }
    power_supply_path = '/sys/class/power_supply'

    if not os.path.exists(power_supply_path):
        return result

    try:
        for supply_name in os.listdir(power_supply_path):
            supply_path = os.path.join(power_supply_path, supply_name)
            supply_info = {"name": supply_name}

            try:
                attrs = os.listdir(supply_path)
            except OSError:
                # Not a directory or not accessible: report the name only.
                attrs = []

            for attr in attrs:
                attr_path = os.path.join(supply_path, attr)
                if not os.path.isfile(attr_path):
                    continue
                try:
                    with open(attr_path, 'r') as f:
                        value = f.read().strip()
                except (OSError, ValueError):
                    # Unreadable or non-text attribute: best effort, skip it.
                    continue
                # NOTE(review): conversion is done locally so that non-numeric
                # text is always kept verbatim; how safe_float reports a
                # failed conversion is not visible from this file.
                supply_info[attr] = _coerce_supply_value(value)

            result["supplies"].append(supply_info)
    except Exception as e:
        result["error"] = str(e)

    return result
def _parse_sel_line(line: str) -> Optional[Dict[str, str]]:
    """
    Parse one row of ``ipmitool sel elist`` output.

    Rows normally look like ``ID | date | time | sensor | event | state``.
    Rows with only four columns are mapped positionally to
    id / datetime / source / event. Rows with fewer columns yield None.
    """
    parts = [part.strip() for part in line.split('|')]
    if len(parts) < 4:
        return None

    if len(parts) == 4:
        return {
            "id": parts[0],
            "datetime": parts[1],
            "source": parts[2],
            "event": parts[3]
        }

    entry = {
        "id": parts[0],
        "datetime": f"{parts[1]} {parts[2]}".strip(),
        "source": parts[3],
        "event": parts[4]
    }
    if len(parts) > 5:
        # Assertion state plus any trailing detail columns.
        entry["state"] = " | ".join(parts[5:])
    return entry


def get_ipmi_sel_logs() -> Dict[str, Any]:
    """
    Collect the IPMI SEL (System Event Log) via ``ipmitool sel elist``.

    Returns:
        Dict[str, Any]: "available" flag, all parsed "entries", the subsets
        "critical_events" and "hardware_errors" (keyword match on sensor name
        plus event text), their counts ("total_entries", "critical_count",
        "hardware_error_count"), plus "note" when the tool or device is
        missing and "error" when an exception occurs.
    """
    result = {
        "available": False,
        "entries": [],
        "hardware_errors": [],
        "critical_events": []
    }

    if not check_command_exists('ipmitool'):
        result["note"] = "ipmitool 未安装"
        return result

    try:
        _, stdout, stderr = execute_command(
            ['ipmitool', 'sel', 'elist'],
            check_returncode=False, timeout=15
        )
        if 'Could not open device' in stderr or 'Driver not found' in stderr:
            result["note"] = "IPMI 设备不可用"
            return result

        result["available"] = True

        critical_keywords = ['critical', 'failure', 'error', 'thermal', 'voltage', 'power']
        hardware_keywords = ['memory', 'processor', 'hard drive', 'fan', 'power supply', 'temperature']

        for line in stdout.split('\n'):
            if not line.strip():
                continue
            entry = _parse_sel_line(line)
            if entry is None:
                continue
            result["entries"].append(entry)

            # The sensor column names the component and the event column
            # describes what happened; keywords may appear in either.
            searchable = f"{entry['source']} {entry['event']}".lower()
            if any(kw in searchable for kw in critical_keywords):
                result["critical_events"].append(entry)
            if any(kw in searchable for kw in hardware_keywords):
                result["hardware_errors"].append(entry)

        result["total_entries"] = len(result["entries"])
        result["critical_count"] = len(result["critical_events"])
        result["hardware_error_count"] = len(result["hardware_errors"])
    except Exception as e:
        result["error"] = str(e)

    return result
def check_sensor_warnings(sensor_data: Dict[str, Any]) -> List[str]:
    """
    Derive human-readable warnings from the collected sensor report.

    Checks, in order: lm-sensors temperature alarms or readings above 90,
    voltage alarms, fan alarms or fans at 0 RPM with a positive minimum,
    IPMI sensors in a non-ok threshold state, critical IPMI SEL events, and
    thermal zones above 90% of their critical temperature (100 is assumed
    when no positive critical temperature is known).

    Args:
        sensor_data: Report containing the "lm_sensors", "ipmi_sensors",
            "ipmi_sel" and "thermal_zones" sections; missing sections are
            treated as empty.

    Returns:
        List[str]: Warning messages; empty when nothing is abnormal.
    """
    warnings = []

    summary = sensor_data.get("lm_sensors", {}).get("summary", {})

    for name, temp_data in summary.get("temperatures", {}).items():
        if temp_data.get("alarm"):
            warnings.append(f"温度传感器 {name} 告警: {temp_data.get('value')}°C")
        elif (temp_data.get("value") or 0) > 90:
            warnings.append(f"温度传感器 {name} 温度过高: {temp_data.get('value')}°C")

    for name, volt_data in summary.get("voltages", {}).items():
        if volt_data.get("alarm"):
            warnings.append(f"电压传感器 {name} 告警: {volt_data.get('value')}V")

    for name, fan_data in summary.get("fans", {}).items():
        # "min" may be present but None when the chip reports no minimum.
        min_rpm = fan_data.get("min") or 0
        if fan_data.get("alarm"):
            warnings.append(f"风扇 {name} 告警: {fan_data.get('rpm')} RPM")
        elif fan_data.get("rpm", 0) == 0 and min_rpm > 0:
            warnings.append(f"风扇 {name} 可能已停止: {fan_data.get('rpm')} RPM")

    # ipmitool prints threshold states as short codes (nc/cr/nr, or the
    # lnc/unc/lcr/ucr/lnr/unr variants) and as long names in verbose mode.
    abnormal_states = {
        'critical', 'non-recoverable', 'warning',
        'nc', 'cr', 'nr',
        'lnc', 'unc', 'lcr', 'ucr', 'lnr', 'unr',
        'lower non-critical', 'upper non-critical',
        'lower critical', 'upper critical',
        'lower non-recoverable', 'upper non-recoverable',
    }
    ipmi_sensors = sensor_data.get("ipmi_sensors", {})
    for name, data in ipmi_sensors.get("sensors", {}).items():
        status = (data.get("status") or "").lower()
        if status in abnormal_states:
            warnings.append(f"IPMI 传感器 {name} 状态异常: {data.get('status')}")

    ipmi_sel = sensor_data.get("ipmi_sel", {})
    if ipmi_sel.get("critical_count", 0) > 0:
        warnings.append(f"IPMI SEL 中有 {ipmi_sel['critical_count']} 个关键事件")

    thermal_zones = sensor_data.get("thermal_zones", {})
    for zone_name, zone_data in thermal_zones.get("zones", {}).items():
        temp = zone_data.get("temperature_c") or 0
        critical = zone_data.get("critical_temp_c")
        if critical is None or critical <= 0:
            # A zero or negative trip temperature is not a usable threshold.
            critical = 100
        if temp > critical * 0.9:  # above 90% of the critical temperature
            warnings.append(f"Thermal zone {zone_name} 温度接近临界值: {temp}°C (临界: {critical}°C)")

    return warnings
if __name__ == '__main__':
    # Manual run: print the complete sensor report as readable JSON,
    # keeping non-ASCII text as-is.
    import json

    report = run_sensors_check()
    print(json.dumps(report, indent=2, ensure_ascii=False))