first commit
This commit is contained in:
602
modules/storage.py
Normal file
602
modules/storage.py
Normal file
@@ -0,0 +1,602 @@
|
||||
"""
ServerGuard - storage device check module

Checks the health, SMART data, and RAID status of hard disks/SSDs.
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
from typing import Dict, Any, List, Optional
|
||||
|
||||
import sys
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from utils import (
|
||||
execute_command, check_command_exists, parse_key_value_output,
|
||||
safe_int, safe_float, format_bytes, require_root
|
||||
)
|
||||
|
||||
|
||||
def run_storage_check() -> Dict[str, Any]:
    """
    Run the full storage check.

    Enumerates the storage devices, inspects each one, then collects the
    RAID status and the I/O statistics.

    Returns:
        Dict[str, Any]: Report with the keys ``status`` ("success",
        "warning" or "error"), ``devices``, ``raid_status`` and
        ``io_stats``. If an exception escapes one of the steps, ``status``
        becomes "error" and an ``error`` key holds the exception message;
        whatever was gathered before the failure is kept.
    """
    result: Dict[str, Any] = {
        "status": "success",
        "devices": [],
        "raid_status": {},
        "io_stats": {},
    }

    try:
        for device in get_storage_devices():
            device_info = check_device(device)
            result["devices"].append(device_info)

            # A failing or degraded device downgrades the whole report.
            if device_info.get("health") in ('FAILED', 'WARNING'):
                result["status"] = "warning"

        result["raid_status"] = check_raid_status()
        # check_raid_status() marks a degraded array with its own "status"
        # key; surface it here instead of silently dropping it.
        if result["raid_status"].get("status") == "warning":
            result["status"] = "warning"

        result["io_stats"] = get_io_statistics()

    except Exception as e:
        result["status"] = "error"
        result["error"] = str(e)

    return result
|
||||
|
||||
|
||||
def get_storage_devices() -> List[Dict[str, str]]:
    """
    List the storage devices present on the system.

    Tries ``lsblk`` (JSON output) first and falls back to scanning
    ``/sys/block`` when lsblk is missing or yields nothing.

    Returns:
        List[Dict[str, str]]: One dict per disk with the keys ``name``,
        ``path`` (``/dev/<name>``) and ``type`` ("hdd", "ssd", or "unknown"
        when the rotational flag cannot be read). Empty when neither method
        finds a device.
    """
    devices: List[Dict[str, str]] = []

    # Method 1: lsblk
    if check_command_exists('lsblk'):
        try:
            _, stdout, _ = execute_command(
                ['lsblk', '-d', '-n', '-o', 'NAME,TYPE,ROTA', '-J'],
                check_returncode=False, timeout=10
            )

            data = json.loads(stdout)
            for dev in data.get('blockdevices', []):
                if dev.get('type') != 'disk' or not dev.get('name'):
                    continue
                # lsblk may report ROTA as a JSON boolean or as a "0"/"1"
                # string; a plain truthiness test would read "0" as
                # rotational, so normalise the value first.
                rotational = str(dev.get('rota')).strip().lower() in ('1', 'true')
                devices.append({
                    "name": dev['name'],
                    "path": f"/dev/{dev['name']}",
                    "type": "hdd" if rotational else "ssd"
                })
        except Exception:
            # Best effort: unusable lsblk output just means we rely on the
            # /sys/block scan below (if nothing was collected yet).
            pass

    # Method 2: scan /sys/block
    if not devices:
        try:
            for name in os.listdir('/sys/block'):
                if not name.startswith(('sd', 'hd', 'nvme', 'vd', 'xvd', 'mmcblk')):
                    continue

                dev_type = "unknown"
                try:
                    with open(f'/sys/block/{name}/queue/rotational', 'r') as f:
                        dev_type = "hdd" if f.read().strip() == '1' else "ssd"
                except OSError:
                    # Flag not readable: keep the device, type stays unknown.
                    pass

                devices.append({
                    "name": name,
                    "path": f"/dev/{name}",
                    "type": dev_type
                })
        except OSError:
            # /sys/block is not available: report no devices.
            pass

    return devices
|
||||
|
||||
|
||||
def _attr_raw_int(attrs: Dict[str, Any], key: str) -> Optional[int]:
    """Return the first token of attribute *key*'s raw value as an int, or None if the attribute is absent."""
    if key not in attrs:
        return None
    tokens = str(attrs[key].get('raw_value', 0)).split()
    # Only the first token is numeric when the raw value carries a suffix
    # such as "33 (Min/Max 20/45)"; analyze_health() parses it the same way.
    return safe_int(tokens[0] if tokens else 0)


def check_device(device: Dict[str, str]) -> Dict[str, Any]:
    """
    Inspect a single storage device.

    Args:
        device: Device descriptor with at least the keys ``name`` and
            ``path``; ``type`` is optional.

    Returns:
        Dict[str, Any]: Device report combining the basic information from
        get_device_info(), the SMART data from get_smart_data(), the health
        verdict from analyze_health() and a few key counters extracted from
        the SMART attribute table. For devices whose name starts with
        "nvme" the result additionally carries ``nvme_data``.
    """
    result: Dict[str, Any] = {
        "name": device["name"],
        "path": device["path"],
        "type": device.get("type", "unknown"),
        "model": "Unknown",
        "serial": "Unknown",
        "firmware": "Unknown",
        "size_bytes": 0,
        "size_human": "Unknown",
        "health": "UNKNOWN",
        "smart_status": {},
        "temperature_c": None,
        "power_on_hours": None,
        "start_stop_count": None,
        "reallocated_sectors": None,
        "pending_sectors": None,
        "test_result": None
    }

    # Basic device information
    result.update(get_device_info(device["path"]))

    # SMART data and the health verdict derived from it
    smart_data = get_smart_data(device["path"])
    result["smart_status"] = smart_data
    result["health"] = analyze_health(smart_data)

    # Key attributes
    attrs = smart_data.get("attributes") or {}
    if attrs:
        # Temperature: first attribute that is present with a raw value wins.
        for temp_attr in ('194 Temperature_Celsius', '190 Airflow_Temperature_Cel', 'Temperature'):
            temp_val = attrs.get(temp_attr, {}).get('raw_value')
            if temp_val:
                result["temperature_c"] = safe_int(str(temp_val).split()[0])
                break

        counters = {
            "power_on_hours": '9 Power_On_Hours',
            "start_stop_count": '4 Start_Stop_Count',
            "reallocated_sectors": '5 Reallocated_Sector_Ct',
            "pending_sectors": '197 Current_Pending_Sector',
        }
        for field, attr_key in counters.items():
            value = _attr_raw_int(attrs, attr_key)
            if value is not None:
                result[field] = value

    # NVMe-specific handling
    if device["name"].startswith('nvme'):
        nvme_data = get_nvme_data(device["path"])
        result["nvme_data"] = nvme_data

        # "is not None" so that a legitimate reading of 0 is not discarded.
        if nvme_data.get("temperature") is not None:
            result["temperature_c"] = nvme_data["temperature"]

        # get_nvme_data() returns the placeholder "UNKNOWN" when it could not
        # assess the drive; that must not overwrite the SMART-based verdict.
        if nvme_data.get("health") not in (None, "", "UNKNOWN"):
            result["health"] = nvme_data["health"]

    return result
|
||||
|
||||
|
||||
def get_device_info(device_path: str) -> Dict[str, Any]:
    """
    Collect basic identity information for a device.

    Parses the output of ``smartctl -i`` when smartctl is installed and
    falls back to ``/sys/block/<name>/size`` for the capacity.

    Args:
        device_path: Device node, e.g. ``/dev/sda``.

    Returns:
        Dict[str, Any]: Only the fields that could be determined, among
        ``model``, ``serial``, ``firmware``, ``size_human``, ``sector_size``,
        ``rotation_rate``, ``form_factor``, ``transport``, ``size_bytes``
        and ``is_ssd``. May be empty.
    """
    info: Dict[str, Any] = {}

    # Identity information from smartctl -i
    if check_command_exists('smartctl'):
        try:
            _, stdout, _ = execute_command(
                ['smartctl', '-i', device_path],
                check_returncode=False, timeout=10
            )

            patterns = {
                # ATA prints "Device Model:"; "Model Number:" and "Product:"
                # are the labels smartctl uses for NVMe and SCSI devices.
                "model": r'^(?:Device Model|Model Number|Product):\s*(.+)',
                "serial": r'Serial Number:\s*(\S+)',
                "firmware": r'Firmware Version:\s*(\S+)',
                "size_human": r'User Capacity:\s*(.+)',
                "sector_size": r'Sector Size:\s*(.+)',
                "rotation_rate": r'Rotation Rate:\s*(.+)',
                "form_factor": r'Form Factor:\s*(.+)',
                "transport": r'Transport protocol:\s*(.+)'
            }

            for key, pattern in patterns.items():
                match = re.search(pattern, stdout, re.MULTILINE)
                if match:
                    info[key] = match.group(1).strip()

            # Capacity in bytes. The line looks like
            # "User Capacity:    500,107,862,016 bytes [500 GB]", so the byte
            # count is the comma-grouped number in front of "bytes".
            size_match = re.search(r'User Capacity:\s*([\d,]+)\s*bytes', stdout)
            if size_match:
                info["size_bytes"] = safe_int(size_match.group(1).replace(',', ''))

            # SSD or not, judged from the "Rotation Rate" line when present.
            rotation = info.get("rotation_rate")
            if rotation is not None:
                info["is_ssd"] = 'Solid State' in rotation
            elif 'Solid State Device' in stdout:
                info["is_ssd"] = True

        except Exception:
            # Best effort: keep whatever was parsed and use the fallback.
            pass

    # Fallback: capacity from /sys
    if not info.get("size_bytes"):
        try:
            dev_name = os.path.basename(device_path)
            with open(f'/sys/block/{dev_name}/size', 'r') as f:
                sectors = safe_int(f.read().strip())
            # The code treats the sysfs size as a count of 512-byte units.
            info["size_bytes"] = sectors * 512
            info["size_human"] = format_bytes(info["size_bytes"])
        except OSError:
            pass

    return info
|
||||
|
||||
|
||||
def get_smart_data(device_path: str) -> Dict[str, Any]:
    """
    Read and parse the SMART data of a device with ``smartctl -a``.

    Args:
        device_path: Device node, e.g. ``/dev/sda``.

    Returns:
        Dict[str, Any]: Dict with the keys ``supported``, ``enabled``,
        ``overall`` ("PASSED", "FAILED" or "UNKNOWN"), ``attributes``
        (keyed by "<id> <name>", ATA attribute table only) and
        ``self_tests`` (raw self-test log lines). ``error_count`` is added
        when the error log lists errors; ``error`` is added when smartctl is
        missing or running/parsing it raised.
    """
    result: Dict[str, Any] = {
        "supported": False,
        "enabled": False,
        "overall": "UNKNOWN",
        "attributes": {},
        "self_tests": []
    }

    if not check_command_exists('smartctl'):
        result["error"] = "smartctl 未安装"
        return result

    try:
        # One call is enough: -a output already contains the identity
        # section that a separate "smartctl -i" would print.
        _, stdout, _ = execute_command(
            ['smartctl', '-a', device_path],
            check_returncode=False, timeout=15
        )

        # SMART support flags
        if 'SMART support is: Available' in stdout:
            result["supported"] = True
        if 'SMART support is: Enabled' in stdout:
            result["enabled"] = True

        # Overall health: read the verdict from the dedicated health line
        # only. A bare substring test for "OK"/"PASSED" anywhere in the
        # output can hide a FAILED verdict.
        health_match = re.search(
            r'(?:SMART overall-health self-assessment test result|SMART Health Status):\s*(\S+)',
            stdout
        )
        if health_match:
            verdict = health_match.group(1).upper()
            result["overall"] = "PASSED" if verdict in ("PASSED", "OK") else "FAILED"

        lines = stdout.split('\n')

        # SMART attribute table (ATA devices)
        in_attributes = False
        for line in lines:
            if 'ID#' in line and 'ATTRIBUTE_NAME' in line:
                in_attributes = True
                continue
            if not in_attributes:
                continue
            # The table ends at the first blank line or next section header.
            if not line.strip() or line.startswith('SMART'):
                break

            # Row format:
            # ID# ATTRIBUTE_NAME FLAG VALUE WORST THRESH TYPE UPDATED WHEN_FAILED RAW_VALUE
            parts = line.split()
            if len(parts) < 10:
                continue
            result["attributes"][f"{parts[0]} {parts[1]}"] = {
                "flag": parts[2],
                "value": safe_int(parts[3]),
                "worst": safe_int(parts[4]),
                "thresh": safe_int(parts[5]),
                "type": parts[6],
                "updated": parts[7],
                "when_failed": parts[8] if parts[8] != '-' else None,
                # The raw value may contain spaces, e.g. "33 (Min/Max 20/45)".
                "raw_value": ' '.join(parts[9:])
            }

        # Self-test log: entries are the lines starting with "#"; stop at
        # the next section so unrelated "#" lines are not collected.
        in_self_tests = False
        for line in lines:
            if 'SMART Self-test log' in line:
                in_self_tests = True
                continue
            if not in_self_tests:
                continue
            stripped = line.strip()
            if stripped.startswith('#'):
                result["self_tests"].append(stripped)
            elif line.startswith('SMART') or (not stripped and result["self_tests"]):
                break

        # Error log: prefer the explicit total, fall back to the number of
        # the first listed error entry.
        if 'SMART Error Log' in stdout:
            error_match = (
                re.search(r'ATA Error Count:\s*(\d+)', stdout)
                or re.search(r'Error (\d+)\s+occurred at', stdout)
            )
            if error_match:
                result["error_count"] = safe_int(error_match.group(1))

    except Exception as e:
        result["error"] = str(e)

    return result
|
||||
|
||||
|
||||
def _nvme_temperature_c(smart_log: str) -> Optional[int]:
    """Extract the composite temperature in degrees Celsius from `nvme smart-log` text, or None."""
    line = re.search(r'^\s*temperature\s*:\s*(.+)$', smart_log,
                     re.IGNORECASE | re.MULTILINE)
    if not line:
        return None
    text = line.group(1)

    # An explicit Kelvin figure, e.g. "38 C (311 Kelvin)", is unambiguous.
    kelvin = re.search(r'(\d+)\s*K(?:elvin)?\b', text, re.IGNORECASE)
    if kelvin:
        return int(kelvin.group(1)) - 273

    reading = re.match(r'(\d+)\s*\u00b0?\s*([CF])?', text, re.IGNORECASE)
    if not reading:
        return None
    value = int(reading.group(1))
    unit = (reading.group(2) or '').upper()
    if unit == 'C':
        return value
    if unit == 'F':
        return round((value - 32) * 5 / 9)
    # No unit printed: assume Kelvin only when the number is implausible as
    # Celsius. NOTE(review): heuristic - confirm against the nvme-cli
    # versions deployed.
    return value - 273 if value > 200 else value


def get_nvme_data(device_path: str) -> Dict[str, Any]:
    """
    Collect NVMe-specific health data via ``nvme smart-log``.

    Args:
        device_path: NVMe device node, e.g. ``/dev/nvme0n1``.

    Returns:
        Dict[str, Any]: Dict with the keys ``health`` ("PASSED", "WARNING"
        or "UNKNOWN"), ``temperature`` (degrees Celsius),
        ``available_spare`` and ``percentage_used`` (percent),
        ``data_units_read``, ``data_units_written``, ``host_reads`` and
        ``host_writes``. Fields that could not be read stay ``None``; if
        the ``nvme`` tool is missing the defaults are returned unchanged.
    """
    result: Dict[str, Any] = {
        "health": "UNKNOWN",
        "temperature": None,
        "available_spare": None,
        "percentage_used": None,
        "data_units_read": None,
        "data_units_written": None,
        "host_reads": None,
        "host_writes": None
    }

    if not check_command_exists('nvme'):
        return result

    try:
        # SMART log
        _, stdout, _ = execute_command(
            ['nvme', 'smart-log', device_path],
            check_returncode=False, timeout=10
        )

        # The human-readable log already prints Celsius, so subtracting 273
        # unconditionally would give a large negative temperature.
        result["temperature"] = _nvme_temperature_c(stdout)

        # Labels are matched with either "_" or " " as separator and
        # case-insensitively ("available_spare" / "Available Spare").
        percent_fields = {
            "available_spare": r'^\s*available[_ ]spare\s*:\s*(\d+)\s*%',
            "percentage_used": r'^\s*percentage[_ ]used\s*:\s*(\d+)\s*%',
        }
        counter_fields = {
            "data_units_read": r'^\s*data[_ ]units[_ ]read\s*:\s*([\d,]+)',
            "data_units_written": r'^\s*data[_ ]units[_ ]written\s*:\s*([\d,]+)',
            "host_reads": r'^\s*host[_ ]read[_ ]commands\s*:\s*([\d,]+)',
            "host_writes": r'^\s*host[_ ]write[_ ]commands\s*:\s*([\d,]+)',
        }
        for field, pattern in {**percent_fields, **counter_fields}.items():
            match = re.search(pattern, stdout, re.IGNORECASE | re.MULTILINE)
            if match:
                # Counters may be printed with thousands separators.
                result[field] = safe_int(match.group(1).replace(',', ''))

        # Health verdict: wear of 90% or more, or less than 10% spare
        # capacity, is reported as WARNING.
        if result["percentage_used"] is not None:
            result["health"] = "PASSED" if result["percentage_used"] < 90 else "WARNING"

        if result["available_spare"] is not None and result["available_spare"] < 10:
            result["health"] = "WARNING"

    except Exception:
        # Best effort: return whatever was parsed before the failure.
        pass

    return result
|
||||
|
||||
|
||||
def _leading_int(raw: Any, default: int = 0) -> int:
    """Return the integer at the start of *raw* (e.g. "33 (Min/Max 20/45)" -> 33), or *default*."""
    match = re.match(r'\s*(-?\d+)', str(raw))
    return int(match.group(1)) if match else default


def analyze_health(smart_data: Dict[str, Any], max_temp_c: int = 60) -> str:
    """
    Derive a health verdict from parsed SMART data.

    Args:
        smart_data: Dict as produced by get_smart_data(); the keys read are
            ``supported``, ``overall`` and ``attributes``.
        max_temp_c: Raw temperature (degrees Celsius) above which the
            device is reported as "WARNING".

    Returns:
        str: "UNKNOWN" when SMART is not reported as supported, "FAILED"
        when the overall assessment is "FAILED", "WARNING" when a critical
        sector counter is non-zero or the temperature exceeds
        ``max_temp_c``, otherwise "PASSED".
    """
    if not smart_data.get("supported"):
        return "UNKNOWN"

    if smart_data.get("overall") == "FAILED":
        return "FAILED"

    attrs = smart_data.get("attributes") or {}

    # Critical attributes: any non-zero raw count is a warning sign.
    critical_attrs = (
        '5 Reallocated_Sector_Ct',
        '197 Current_Pending_Sector',
        '198 Offline_Uncorrectable',
        '196 Reallocation_Event_Count',
    )
    for attr_name in critical_attrs:
        if attr_name in attrs and _leading_int(attrs[attr_name].get('raw_value', 0)) > 0:
            return "WARNING"

    # Temperature: compare the RAW value (degrees Celsius). The normalized
    # "value" column is a vendor-scaled score, not a temperature, so testing
    # it against the threshold flags healthy drives.
    for temp_attr in ('194 Temperature_Celsius', '190 Airflow_Temperature_Cel'):
        if temp_attr in attrs:
            raw_temp = attrs[temp_attr].get('raw_value')
            if raw_temp is not None and _leading_int(raw_temp) > max_temp_c:
                return "WARNING"

    return "PASSED"
|
||||
|
||||
|
||||
def _parse_mdstat(mdstat: str) -> List[Dict[str, Any]]:
    """Parse /proc/mdstat text into one dict per md array (name, status, optional degraded flag)."""
    arrays: List[Dict[str, Any]] = []
    current: Optional[Dict[str, Any]] = None

    for line in mdstat.split('\n'):
        if line.startswith('md'):
            parts = line.split()
            # "md0 : active raid1 ..." - the state is the third token. A
            # substring test for "active" would also match "inactive".
            state = parts[2] if len(parts) > 2 else ""
            current = {
                "name": parts[0],
                "status": "active" if state.startswith('active') else "inactive"
            }
            arrays.append(current)
        elif current is not None:
            if not line.strip():
                # A blank line ends the current array's stanza.
                current = None
                continue
            # The member map ("[U_]") and the recovery progress are printed
            # on the lines FOLLOWING the "mdX :" line.
            if re.search(r'\[[U_]*_[U_]*\]', line) or 'recovery' in line:
                current["degraded"] = True

    return arrays


def check_raid_status() -> Dict[str, Any]:
    """
    Check software (mdadm) and hardware (storcli / MegaCli) RAID status.

    Returns:
        Dict[str, Any]: Always contains ``raid_available``, ``controllers``
        and ``arrays``. When ``mdadm --detail --scan`` reports arrays,
        ``software_raid``, ``mdadm_config`` and ``mdstat`` are added and
        ``arrays`` is filled from /proc/mdstat; ``status`` is set to
        "warning" if any array is degraded or recovering. When a hardware
        RAID CLI could be run, ``hardware_raid``, ``controller_type`` and
        the first 500 characters of its output are added.
        ``raid_available`` is True when ``software_raid`` or
        ``hardware_raid`` was set.
    """
    result: Dict[str, Any] = {
        "raid_available": False,
        "controllers": [],
        "arrays": []
    }

    # Software RAID (mdadm)
    if check_command_exists('mdadm'):
        try:
            _, stdout, _ = execute_command(
                ['mdadm', '--detail', '--scan'],
                check_returncode=False, timeout=10
            )

            if stdout.strip():
                result["software_raid"] = True
                result["raid_available"] = True
                result["mdadm_config"] = stdout.strip()

                # Detailed state of every array
                with open('/proc/mdstat', 'r') as f:
                    detail = f.read()
                result["mdstat"] = detail

                for array_info in _parse_mdstat(detail):
                    if array_info.get("degraded"):
                        result["status"] = "warning"
                    result["arrays"].append(array_info)

        except Exception:
            # Best effort: software RAID details are simply left out.
            pass

    # Hardware RAID (storcli preferred, MegaCli as fallback)
    if check_command_exists('storcli'):
        try:
            _, stdout, _ = execute_command(
                ['storcli', '/c0', 'show'],
                check_returncode=False, timeout=10
            )
            result["hardware_raid"] = True
            result["raid_available"] = True
            result["controller_type"] = "LSI/Broadcom"
            result["storcli_output"] = stdout[:500]  # keep only an excerpt
        except Exception:
            pass
    elif check_command_exists('MegaCli'):
        try:
            _, stdout, _ = execute_command(
                ['MegaCli', '-AdpAllInfo', '-aALL'],
                check_returncode=False, timeout=10
            )
            result["hardware_raid"] = True
            result["raid_available"] = True
            result["controller_type"] = "LSI"
            result["megacli_output"] = stdout[:500]
        except Exception:
            pass

    return result
|
||||
|
||||
|
||||
def get_io_statistics() -> Dict[str, Any]:
    """
    Read per-disk I/O counters from ``/proc/diskstats``.

    Returns:
        Dict[str, Any]: Mapping of whole-disk device name to a dict of
        counters (``reads_completed``, ``reads_merged``, ``sectors_read``,
        ``time_reading_ms``, ``writes_completed``, ``writes_merged``,
        ``sectors_written``, ``time_writing_ms``, ``ios_in_progress``,
        ``time_doing_ios_ms``, ``weighted_time_ios_ms``). Empty when the
        file cannot be read.
    """
    result: Dict[str, Any] = {}

    # Whole disks only, no partitions. A "name does not end in a digit" test
    # would also throw away every NVMe disk (nvme0n1), so match the naming
    # schemes explicitly; partitions (sda1, nvme0n1p1, mmcblk0p1) do not match.
    whole_disk = re.compile(r'^(?:[shv]d[a-z]+|xvd[a-z]+|nvme\d+n\d+|mmcblk\d+)$')

    # Names for columns 4-14 of each line, in file order.
    field_names = (
        "reads_completed", "reads_merged", "sectors_read", "time_reading_ms",
        "writes_completed", "writes_merged", "sectors_written", "time_writing_ms",
        "ios_in_progress", "time_doing_ios_ms", "weighted_time_ios_ms",
    )

    try:
        with open('/proc/diskstats', 'r') as f:
            for line in f:
                parts = line.split()
                if len(parts) < 14 or not whole_disk.match(parts[2]):
                    continue
                result[parts[2]] = {
                    name: safe_int(raw)
                    for name, raw in zip(field_names, parts[3:14])
                }
    except (OSError, ValueError):
        # Best effort: statistics are optional, keep what was read so far.
        pass

    return result
|
||||
|
||||
|
||||
@require_root
def run_io_test(device_path: str, test_size_mb: int = 100) -> Dict[str, Any]:
    """
    Run a simple sequential I/O benchmark with fio.

    NOTE(review): ``device_path`` is only recorded in the result. The
    benchmark file is created under /tmp, so the figures describe whichever
    filesystem backs /tmp, not necessarily the given device - confirm this
    is intended.

    Args:
        device_path: Device path, stored in the result as ``device``.
        test_size_mb: Size of the test file in MB.

    Returns:
        Dict[str, Any]: Dict with the keys ``passed``, ``device``,
        ``test_size_mb``, ``read_speed_mbps``, ``write_speed_mbps`` and
        ``errors``. ``passed`` is True only when both speeds were measured
        and no fio job reported an error.
    """
    result: Dict[str, Any] = {
        "passed": False,
        "device": device_path,
        "test_size_mb": test_size_mb,
        "read_speed_mbps": None,
        "write_speed_mbps": None,
        "errors": []
    }

    if not check_command_exists('fio'):
        result["errors"].append("fio 未安装")
        return result

    try:
        import tempfile

        # This runs as root: work inside a freshly created private directory
        # rather than at a fixed, predictable path directly in /tmp. The
        # directory (job file and any leftover test file) is removed on exit.
        with tempfile.TemporaryDirectory(prefix='serverguard_', dir='/tmp') as work_dir:
            fio_config = "\n".join([
                "[global]",
                f"directory={work_dir}",
                "filename=serverguard_test",
                "direct=1",
                f"size={test_size_mb}M",
                "unlink=1",
                "",
                "[seq_read]",
                "stonewall",
                "rw=read",
                "bs=1M",
                "",
                "[seq_write]",
                "stonewall",
                "rw=write",
                "bs=1M",
                "",
            ])
            fio_file = os.path.join(work_dir, 'job.fio')
            with open(fio_file, 'w') as f:
                f.write(fio_config)

            _, stdout, stderr = execute_command(
                ['fio', fio_file, '--output-format=json'],
                timeout=120,
                check_returncode=False
            )

            data = json.loads(stdout)

            job_failed = False
            for job in data.get('jobs', []):
                job_name = job.get('jobname', '')
                # Divide by 1024 to get MB/s (assumes fio reports "bw" in
                # KiB/s - TODO confirm against the fio version in use).
                read_bw = job.get('read', {}).get('bw', 0) / 1024
                write_bw = job.get('write', {}).get('bw', 0) / 1024

                if 'read' in job_name.lower() and read_bw > 0:
                    result["read_speed_mbps"] = round(read_bw, 2)
                if 'write' in job_name.lower() and write_bw > 0:
                    result["write_speed_mbps"] = round(write_bw, 2)

                # A non-zero per-job "error" field is treated as a failed job
                # (presumably fio's error code - verify with the fio docs).
                if job.get('error'):
                    job_failed = True
                    result["errors"].append(
                        f"fio job '{job_name}' error {job.get('error')}"
                    )

            # Parsable JSON alone does not prove the benchmark worked.
            result["passed"] = (
                not job_failed
                and result["read_speed_mbps"] is not None
                and result["write_speed_mbps"] is not None
            )

    except Exception as e:
        result["errors"].append(str(e))

    return result
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: run the full check and print the report as indented
    # JSON, keeping non-ASCII characters readable. json is already imported at
    # module level, so no local re-import is needed.
    print(json.dumps(run_storage_check(), indent=2, ensure_ascii=False))
|
||||
Reference in New Issue
Block a user