Files
ServerGuard/modules/storage.py
2026-03-02 14:14:40 +08:00

603 lines
19 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
ServerGuard - 存储设备检测模块
检查硬盘/SSD 的健康状况、SMART 数据、RAID 状态。
"""
import os
import re
import json
from typing import Dict, Any, List, Optional
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils import (
execute_command, check_command_exists, parse_key_value_output,
safe_int, safe_float, format_bytes, require_root
)
def run_storage_check() -> Dict[str, Any]:
    """
    Run the complete storage inspection and gather everything in one report.

    Every discovered device is inspected in turn; afterwards the RAID state
    and the per-disk I/O counters are attached.

    Returns:
        Dict[str, Any]: report with the keys ``status`` ("success",
        "warning" or "error"), ``devices``, ``raid_status`` and ``io_stats``.
        When any step raises, ``status`` becomes "error" and the message is
        stored under ``error``.
    """
    report: Dict[str, Any] = {
        "status": "success",
        "devices": [],
        "raid_status": {},
        "io_stats": {}
    }
    unhealthy_states = ('FAILED', 'WARNING')

    try:
        for entry in get_storage_devices():
            inspected = check_device(entry)
            report["devices"].append(inspected)
            # A single unhealthy device is enough to flag the whole report.
            if inspected.get("health") in unhealthy_states:
                report["status"] = "warning"

        report["raid_status"] = check_raid_status()
        report["io_stats"] = get_io_statistics()
    except Exception as exc:
        report["status"] = "error"
        report["error"] = str(exc)

    return report
def get_storage_devices() -> List[Dict[str, str]]:
    """
    Enumerate the whole-disk block devices of this machine.

    ``lsblk`` is tried first; when it is missing, fails, or reports no disks,
    ``/sys/block`` is scanned instead.

    Returns:
        List[Dict[str, str]]: one dict per disk with the keys ``name``,
        ``path`` (``/dev/<name>``) and ``type`` ("hdd", "ssd" or "unknown").
    """
    devices: List[Dict[str, str]] = []

    # Method 1: ask lsblk for a JSON listing.
    if check_command_exists('lsblk'):
        try:
            _, stdout, _ = execute_command(
                ['lsblk', '-d', '-n', '-o', 'NAME,TYPE,ROTA', '-J'],
                check_returncode=False, timeout=10
            )
            data = json.loads(stdout)
            for dev in data.get('blockdevices', []):
                name = dev.get('name')
                if not name or dev.get('type') != 'disk':
                    continue
                # Depending on the util-linux version ROTA is a JSON boolean
                # or the string "0"/"1"; the string "0" is truthy, so it has
                # to be interpreted explicitly.
                rota = dev.get('rota')
                if isinstance(rota, str):
                    rotational = rota.strip().lower() in ('1', 'true')
                else:
                    rotational = bool(rota)
                devices.append({
                    "name": name,
                    "path": f"/dev/{name}",
                    "type": "hdd" if rotational else "ssd"
                })
        except Exception:
            # Best effort: any lsblk/JSON problem falls through to the
            # /sys/block scan below (unless some disks were already found).
            pass

    # Method 2: scan /sys/block.
    if not devices:
        disk_prefixes = ('sd', 'hd', 'nvme', 'vd', 'xvd', 'mmcblk')
        try:
            names = os.listdir('/sys/block')
        except OSError:
            names = []
        for name in names:
            if not name.startswith(disk_prefixes):
                continue
            dev_type = "unknown"
            try:
                with open(f'/sys/block/{name}/queue/rotational', 'r') as f:
                    dev_type = "hdd" if f.read().strip() == '1' else "ssd"
            except OSError:
                # Attribute missing or unreadable: leave the type unknown.
                pass
            devices.append({
                "name": name,
                "path": f"/dev/{name}",
                "type": dev_type
            })

    return devices
def check_device(device: Dict[str, str]) -> Dict[str, Any]:
    """
    Inspect one storage device and summarise its identity and health.

    Args:
        device: entry with the keys ``name`` and ``path`` and optionally
            ``type``, as produced by ``get_storage_devices``.

    Returns:
        Dict[str, Any]: identity fields, the raw SMART data under
        ``smart_status``, the derived ``health`` and a few key counters.
        NVMe devices additionally carry ``nvme_data``.
    """
    result: Dict[str, Any] = {
        "name": device["name"],
        "path": device["path"],
        "type": device.get("type", "unknown"),
        "model": "Unknown",
        "serial": "Unknown",
        "firmware": "Unknown",
        "size_bytes": 0,
        "size_human": "Unknown",
        "health": "UNKNOWN",
        "smart_status": {},
        "temperature_c": None,
        "power_on_hours": None,
        "start_stop_count": None,
        "reallocated_sectors": None,
        "pending_sectors": None,
        "test_result": None
    }

    # Basic identity information.
    result.update(get_device_info(device["path"]))
    # Use the SSD flag reported by the identity lookup when the device
    # enumeration could not tell the media type.
    if result["type"] == "unknown" and "is_ssd" in result:
        result["type"] = "ssd" if result["is_ssd"] else "hdd"

    # SMART data and the health verdict derived from it.
    smart_data = get_smart_data(device["path"])
    result["smart_status"] = smart_data
    result["health"] = analyze_health(smart_data)

    attrs = smart_data.get("attributes") or {}

    def first_token(raw: Any) -> Optional[str]:
        """Return the leading whitespace-separated token of a raw value."""
        tokens = str(raw).split() if raw is not None else []
        return tokens[0] if tokens else None

    # Temperature: raw values may look like "36 (Min/Max 20/45)".
    for temp_attr in ('194 Temperature_Celsius', '190 Airflow_Temperature_Cel', 'Temperature'):
        if temp_attr in attrs:
            token = first_token(attrs[temp_attr].get('raw_value'))
            if token is not None:
                result["temperature_c"] = safe_int(token)
                break

    # Plain counters, parsed the same way as in analyze_health.
    counter_fields = {
        "power_on_hours": '9 Power_On_Hours',
        "start_stop_count": '4 Start_Stop_Count',
        "reallocated_sectors": '5 Reallocated_Sector_Ct',
        "pending_sectors": '197 Current_Pending_Sector'
    }
    for field, attr_key in counter_fields.items():
        if attr_key in attrs:
            token = first_token(attrs[attr_key].get('raw_value', 0))
            result[field] = safe_int(token if token is not None else 0)

    # NVMe devices: merge the data of the NVMe-specific query.
    if device["name"].startswith('nvme'):
        nvme_data = get_nvme_data(device["path"])
        result["nvme_data"] = nvme_data
        if nvme_data.get("temperature") is not None:
            result["temperature_c"] = nvme_data["temperature"]
        # Keep the worse of both verdicts: an inconclusive NVMe result must
        # not erase a conclusive SMART verdict, and a milder NVMe result
        # must not mask a SMART failure.
        severity = {"UNKNOWN": 0, "PASSED": 1, "WARNING": 2, "FAILED": 3}
        nvme_health = nvme_data.get("health")
        if severity.get(nvme_health, 0) > severity.get(result["health"], 0):
            result["health"] = nvme_health

    return result
def get_device_info(device_path: str) -> Dict[str, Any]:
    """
    Collect identity information (model, serial, capacity, ...) of a device.

    The information section of ``smartctl -i`` is parsed when smartctl is
    available; the capacity falls back to ``/sys/block/<name>/size``.

    Args:
        device_path: device node, e.g. ``/dev/sda``.

    Returns:
        Dict[str, Any]: only the fields that could be determined. Possible
        keys: ``model``, ``serial``, ``firmware``, ``sector_size``,
        ``rotation_rate``, ``form_factor``, ``transport``, ``size_bytes``,
        ``size_human`` and ``is_ssd``.
    """
    info: Dict[str, Any] = {}

    if check_command_exists('smartctl'):
        try:
            _, stdout, _ = execute_command(
                ['smartctl', '-i', device_path],
                check_returncode=False, timeout=10
            )
            stdout = stdout or ''
            patterns = {
                # ATA prints "Device Model", NVMe "Model Number", SCSI "Product".
                "model": r'(?m)^(?:Device Model|Model Number|Product):\s*(.+)',
                "serial": r'Serial Number:\s*(\S+)',
                "firmware": r'Firmware Version:\s*(\S+)',
                # smartctl prints "Sector Sizes:" when logical != physical.
                "sector_size": r'Sector Sizes?:\s*(.+)',
                "rotation_rate": r'Rotation Rate:\s*(.+)',
                "form_factor": r'Form Factor:\s*(.+)',
                "transport": r'Transport protocol:\s*(.+)'
            }
            for key, pattern in patterns.items():
                match = re.search(pattern, stdout)
                if match:
                    info[key] = match.group(1).strip()

            # Capacity line looks like
            # "User Capacity:    500,107,862,016 bytes [500 GB]":
            # the byte count is the comma-grouped number, while the bracket
            # holds a human-readable size rather than digits.
            size_match = re.search(r'User Capacity:\s*([\d,]+)\s*bytes', stdout)
            if size_match:
                size_bytes = safe_int(size_match.group(1).replace(',', ''))
                if size_bytes:
                    info["size_bytes"] = size_bytes
                    info["size_human"] = format_bytes(size_bytes)

            # Media type as reported on the "Rotation Rate" line.
            if 'Solid State Device' in stdout:
                info["is_ssd"] = True
            elif 'Rotation Rate' in stdout:
                info["is_ssd"] = False
        except Exception:
            # Best effort: keep whatever was parsed so far and let the
            # /sys fallback below fill in the capacity.
            pass

    # Fallback: capacity from /sys (the size file counts 512-byte units).
    if not info.get("size_bytes"):
        try:
            dev_name = os.path.basename(device_path)
            with open(f'/sys/block/{dev_name}/size', 'r') as f:
                sectors = safe_int(f.read().strip())
            info["size_bytes"] = sectors * 512
            info["size_human"] = format_bytes(info["size_bytes"])
        except (OSError, TypeError, ValueError):
            pass

    return info
def get_smart_data(device_path: str) -> Dict[str, Any]:
    """
    Query and parse the SMART report of a device via ``smartctl -a``.

    Args:
        device_path: device node, e.g. ``/dev/sda``.

    Returns:
        Dict[str, Any]: ``supported``/``enabled`` flags, the ``overall``
        verdict ("PASSED", "FAILED" or "UNKNOWN"), the ATA attribute table
        under ``attributes`` (keyed by "<id> <name>"), the self-test log
        lines under ``self_tests``, optionally ``error_count``, and ``error``
        with a message when smartctl is missing or the query failed.
    """
    result: Dict[str, Any] = {
        "supported": False,
        "enabled": False,
        "overall": "UNKNOWN",
        "attributes": {},
        "self_tests": []
    }
    if not check_command_exists('smartctl'):
        result["error"] = "smartctl 未安装"
        return result

    try:
        # "-a" already contains the information section printed by "-i",
        # so a single invocation is enough.
        _, stdout, _ = execute_command(
            ['smartctl', '-a', device_path],
            check_returncode=False, timeout=15
        )
        stdout = stdout or ''
        lines = stdout.splitlines()

        # The amount of padding after "is:" differs between ATA and SCSI.
        if re.search(r'SMART support is:\s*Available', stdout):
            result["supported"] = True
        if re.search(r'SMART support is:\s*Enabled', stdout):
            result["enabled"] = True

        # Overall verdict: read it from the dedicated result line only.
        # Searching the whole output for "OK"/"PASSED" would also hit model
        # names or log text and could hide a failure.
        verdict_match = re.search(
            r'(?:SMART overall-health self-assessment test result'
            r'|SMART Health Status):\s*(.+)',
            stdout
        )
        if verdict_match:
            verdict = verdict_match.group(1).strip().upper()
            # A device that reports a verdict evidently supports SMART, even
            # when no "SMART support is:" line is printed (e.g. NVMe).
            result["supported"] = True
            if verdict.startswith('PASSED') or verdict.startswith('OK'):
                result["overall"] = "PASSED"
            elif 'FAIL' in verdict:
                result["overall"] = "FAILED"

        # ATA attribute table. Columns:
        # ID# ATTRIBUTE_NAME FLAG VALUE WORST THRESH TYPE UPDATED WHEN_FAILED RAW_VALUE
        in_attributes = False
        for line in lines:
            if 'ID#' in line and 'ATTRIBUTE_NAME' in line:
                in_attributes = True
                continue
            if not in_attributes:
                continue
            # The table ends at the first blank line or next section header.
            if not line.strip() or line.startswith('SMART'):
                break
            parts = line.split()
            if len(parts) >= 10:
                result["attributes"][f"{parts[0]} {parts[1]}"] = {
                    "flag": parts[2],
                    "value": safe_int(parts[3]),
                    "worst": safe_int(parts[4]),
                    "thresh": safe_int(parts[5]),
                    "type": parts[6],
                    "updated": parts[7],
                    "when_failed": parts[8] if parts[8] != '-' else None,
                    # The raw value may contain spaces, e.g. "36 (Min/Max 20/45)".
                    "raw_value": ' '.join(parts[9:])
                }

        # Self-test log: entries are the lines carrying a "#" marker.
        in_self_tests = False
        for line in lines:
            if 'SMART Self-test log' in line:
                in_self_tests = True
                continue
            if in_self_tests and '#' in line and not line.startswith('SMART'):
                result["self_tests"].append(line.strip())

        # Error log: the first "Error N occurred at" entry carries the
        # number of the most recent error.
        if 'SMART Error Log' in stdout:
            error_match = re.search(r'Error (\d+)\s+occurred at', stdout)
            if error_match:
                result["error_count"] = safe_int(error_match.group(1))
    except Exception as e:
        result["error"] = str(e)

    return result
def get_nvme_data(device_path: str) -> Dict[str, Any]:
    """
    Read NVMe-specific health indicators via ``nvme smart-log``.

    Args:
        device_path: NVMe device node, e.g. ``/dev/nvme0n1``.

    Returns:
        Dict[str, Any]: ``health`` ("PASSED", "WARNING" or "UNKNOWN"),
        ``temperature`` in degrees Celsius, ``available_spare`` and
        ``percentage_used`` in percent; the remaining counters are kept as
        ``None`` placeholders. All values stay at their defaults when the
        ``nvme`` tool is missing or its output cannot be parsed.
    """
    result: Dict[str, Any] = {
        "health": "UNKNOWN",
        "temperature": None,
        "available_spare": None,
        "percentage_used": None,
        "data_units_read": None,
        "data_units_written": None,
        "host_reads": None,
        "host_writes": None
    }
    if not check_command_exists('nvme'):
        return result

    try:
        _, stdout, _ = execute_command(
            ['nvme', 'smart-log', device_path],
            check_returncode=False, timeout=10
        )
        stdout = stdout or ''
        flags = re.IGNORECASE | re.MULTILINE

        # Composite temperature line, e.g. "temperature : 38 C" or
        # "temperature : 38 °C (311 K)". The human-readable value is already
        # in Celsius, so it must not be converted from Kelvin.
        temp_match = re.search(
            r'^[ \t]*temperature\s*:\s*(-?\d+)\s*°?\s*([CK])?', stdout, flags
        )
        if temp_match:
            temperature = int(temp_match.group(1))
            unit = (temp_match.group(2) or '').upper()
            # NOTE(review): a unit-less reading above 200 cannot be Celsius
            # for a working drive and is presumably Kelvin - confirm against
            # the nvme-cli versions in use.
            if unit == 'K' or (not unit and temperature > 200):
                temperature -= 273
            result["temperature"] = temperature

        # nvme-cli prints "available_spare" / "percentage_used" with an
        # underscore; a space is accepted too for other output styles.
        spare_match = re.search(
            r'^[ \t]*available[_ ]spare\s*:\s*(\d+)\s*%', stdout, flags
        )
        if spare_match:
            result["available_spare"] = int(spare_match.group(1))

        used_match = re.search(
            r'^[ \t]*percentage[_ ]used\s*:\s*(\d+)\s*%', stdout, flags
        )
        if used_match:
            result["percentage_used"] = int(used_match.group(1))

        # Health verdict: wear of 90% or more, or less than 10% spare
        # capacity, is reported as a warning.
        if result["percentage_used"] is not None:
            result["health"] = "PASSED" if result["percentage_used"] < 90 else "WARNING"
        if result["available_spare"] is not None and result["available_spare"] < 10:
            result["health"] = "WARNING"
    except Exception:
        # Best effort: the caller treats default values as "no NVMe data".
        pass

    return result
def analyze_health(smart_data: Dict[str, Any]) -> str:
    """
    Derive a health verdict from parsed SMART data.

    Args:
        smart_data: dict with ``supported``, ``overall`` and ``attributes``
            as produced by ``get_smart_data``.

    Returns:
        str: "UNKNOWN" when SMART is not supported, "FAILED" when the overall
        self-assessment failed, "WARNING" when a critical sector counter is
        non-zero or the drive is hotter than 60 degrees Celsius, otherwise
        "PASSED".
    """
    if not smart_data.get("supported"):
        return "UNKNOWN"
    if smart_data.get("overall") == "FAILED":
        return "FAILED"

    attrs = smart_data.get("attributes") or {}

    # Any non-zero raw count of these attributes points at failing media.
    critical_attrs = (
        '5 Reallocated_Sector_Ct',
        '197 Current_Pending_Sector',
        '198 Offline_Uncorrectable',
        '196 Reallocation_Event_Count',
    )
    for attr_name in critical_attrs:
        if attr_name in attrs:
            # Raw values may carry trailing details; only the first token is
            # the count. An empty raw value is treated as "no finding".
            tokens = str(attrs[attr_name].get('raw_value', '0')).split()
            if tokens and safe_int(tokens[0]) > 0:
                return "WARNING"

    # Temperature: the Celsius reading is the RAW value, e.g.
    # "36 (Min/Max 20/45)". The normalized ``value`` column is a
    # vendor-specific score and must not be compared with a Celsius threshold.
    max_temperature_c = 60
    for temp_attr in ('194 Temperature_Celsius', '190 Airflow_Temperature_Cel'):
        if temp_attr in attrs:
            raw_text = str(attrs[temp_attr].get('raw_value', ''))
            temp_match = re.match(r'\s*(\d+)', raw_text)
            if temp_match and int(temp_match.group(1)) > max_temperature_c:
                return "WARNING"

    return "PASSED"
def _parse_mdstat(mdstat_text: str) -> List[Dict[str, Any]]:
    """Parse the text of /proc/mdstat into one dict per md array."""
    arrays: List[Dict[str, Any]] = []
    current: Optional[Dict[str, Any]] = None
    for line in mdstat_text.splitlines():
        if line.startswith('md'):
            # Header line, e.g. "md0 : active raid1 sdb1[1] sda1[0]".
            parts = line.split()
            # Compare the state token itself: the word "inactive" contains
            # "active", so a substring test cannot tell them apart.
            is_active = len(parts) > 2 and parts[2] == 'active'
            current = {
                "name": parts[0],
                "status": "active" if is_active else "inactive"
            }
            arrays.append(current)
        elif not line.strip():
            # A blank line ends the description of the current array.
            current = None
        elif current is not None:
            # Continuation lines carry the member bitmap (e.g. "[2/1] [U_]")
            # and the rebuild progress ("recovery = 12.6% ...").
            if re.search(r'\[[U_]*_[U_]*\]', line) or 'recovery' in line:
                current["degraded"] = True
    return arrays


def check_raid_status() -> Dict[str, Any]:
    """
    Report software (mdadm) and hardware (storcli / MegaCli) RAID state.

    Returns:
        Dict[str, Any]: always contains ``raid_available``, ``controllers``
        and ``arrays``. Depending on what is found it also carries
        ``software_raid``, ``mdadm_config``, ``mdstat``, ``hardware_raid``,
        ``controller_type``, a truncated ``storcli_output`` or
        ``megacli_output``, and ``status`` = "warning" when an md array is
        degraded or rebuilding.
    """
    result: Dict[str, Any] = {
        "raid_available": False,
        "controllers": [],
        "arrays": []
    }

    # Software RAID (mdadm).
    if check_command_exists('mdadm'):
        try:
            _, stdout, _ = execute_command(
                ['mdadm', '--detail', '--scan'],
                check_returncode=False, timeout=10
            )
            if stdout and stdout.strip():
                result["software_raid"] = True
                result["raid_available"] = True
                result["mdadm_config"] = stdout.strip()

                try:
                    with open('/proc/mdstat', 'r') as f:
                        detail = f.read()
                except OSError:
                    detail = ''
                result["mdstat"] = detail

                for array_info in _parse_mdstat(detail):
                    if array_info.get("degraded"):
                        result["status"] = "warning"
                    result["arrays"].append(array_info)
        except Exception:
            # Best effort: a failing mdadm query must not abort the check.
            pass

    # Hardware RAID (storcli preferred, MegaCli as fallback).
    if check_command_exists('storcli'):
        try:
            _, stdout, _ = execute_command(
                ['storcli', '/c0', 'show'],
                check_returncode=False, timeout=10
            )
            result["hardware_raid"] = True
            result["raid_available"] = True
            result["controller_type"] = "LSI/Broadcom"
            result["storcli_output"] = stdout[:500]  # keep only an excerpt
        except Exception:
            pass
    elif check_command_exists('MegaCli'):
        try:
            _, stdout, _ = execute_command(
                ['MegaCli', '-AdpAllInfo', '-aALL'],
                check_returncode=False, timeout=10
            )
            result["hardware_raid"] = True
            result["raid_available"] = True
            result["controller_type"] = "LSI"
            result["megacli_output"] = stdout[:500]  # keep only an excerpt
        except Exception:
            pass

    return result
def get_io_statistics() -> Dict[str, Any]:
    """
    Read the cumulative per-disk I/O counters from ``/proc/diskstats``.

    Only whole disks are reported; partitions are skipped.

    Returns:
        Dict[str, Any]: mapping of device name to its counters
        (``reads_completed``, ``sectors_read``, ``time_reading_ms``, ...).
        Empty when ``/proc/diskstats`` cannot be read.
    """
    result: Dict[str, Any] = {}

    # Whole-disk names. NVMe namespaces (nvme0n1) and MMC devices (mmcblk0)
    # end in a digit, so "last character is not a digit" cannot be used to
    # tell disks from partitions.
    whole_disk = re.compile(
        r'^(?:(?:sd|hd|vd|xvd)[a-z]+|nvme\d+n\d+|mmcblk\d+)$'
    )
    counter_names = (
        "reads_completed", "reads_merged", "sectors_read", "time_reading_ms",
        "writes_completed", "writes_merged", "sectors_written",
        "time_writing_ms", "ios_in_progress", "time_doing_ios_ms",
        "weighted_time_ios_ms"
    )

    try:
        with open('/proc/diskstats', 'r') as f:
            for line in f:
                parts = line.split()
                # Fields: major, minor, name, then the counters.
                if len(parts) < 14 or not whole_disk.match(parts[2]):
                    continue
                result[parts[2]] = {
                    name: safe_int(raw)
                    for name, raw in zip(counter_names, parts[3:14])
                }
    except OSError:
        # /proc not available (or not readable): report no statistics.
        pass

    return result
@require_root
def run_io_test(device_path: str, test_size_mb: int = 100) -> Dict[str, Any]:
    """
    Run a simple sequential read/write throughput test with fio.

    The test file is created inside a freshly created private temporary
    directory instead of a fixed, predictable path, because this function
    runs as root and a fixed name in a world-writable directory could be
    pre-created or symlinked by another user.

    NOTE(review): ``device_path`` is only echoed back in the result; the
    benchmark exercises the filesystem holding the temporary directory, not
    the given device - confirm that this is intended.

    Args:
        device_path: device path recorded in the result.
        test_size_mb: size of the test file in MB.

    Returns:
        Dict[str, Any]: ``passed``, ``device``, ``test_size_mb``,
        ``read_speed_mbps``, ``write_speed_mbps`` and ``errors`` (messages of
        anything that went wrong).
    """
    result: Dict[str, Any] = {
        "passed": False,
        "device": device_path,
        "test_size_mb": test_size_mb,
        "read_speed_mbps": None,
        "write_speed_mbps": None,
        "errors": []
    }

    if not check_command_exists('fio'):
        result["errors"].append("fio 未安装")
        return result

    import tempfile

    try:
        # The directory and everything in it are removed on exit, including
        # when fio or the JSON parsing fails.
        with tempfile.TemporaryDirectory(prefix='serverguard_fio_') as workdir:
            fio_config = "\n".join([
                "[global]",
                f"directory={workdir}",
                "filename=serverguard_test",
                "direct=1",
                f"size={test_size_mb}M",
                "unlink=1",
                "[seq_read]",
                "stonewall",
                "rw=read",
                "bs=1M",
                "[seq_write]",
                "stonewall",
                "rw=write",
                "bs=1M",
                ""
            ])
            fio_file = os.path.join(workdir, 'job.fio')
            with open(fio_file, 'w') as f:
                f.write(fio_config)

            _, stdout, _ = execute_command(
                ['fio', fio_file, '--output-format=json'],
                timeout=120,
                check_returncode=False
            )
            data = json.loads(stdout)
            for job in data.get('jobs', []):
                job_name = job.get('jobname', '').lower()
                # fio reports bandwidth in KiB/s; convert to MB/s.
                read_bw = job.get('read', {}).get('bw', 0) / 1024
                write_bw = job.get('write', {}).get('bw', 0) / 1024
                if 'read' in job_name and read_bw > 0:
                    result["read_speed_mbps"] = round(read_bw, 2)
                if 'write' in job_name and write_bw > 0:
                    result["write_speed_mbps"] = round(write_bw, 2)
            result["passed"] = True
    except Exception as e:
        result["errors"].append(str(e))

    return result
if __name__ == '__main__':
    # Script entry point: run the check and print the report as
    # pretty-printed JSON, keeping non-ASCII characters readable.
    storage_report = run_storage_check()
    print(json.dumps(storage_report, indent=2, ensure_ascii=False))