"""
ServerGuard - storage device inspection module.

Checks the health, SMART data and RAID status of hard disks / SSDs.
"""

import json
import logging
import os
import re
import sys
import tempfile
from typing import Dict, Any, List, Optional

# Put the parent of this module's directory first on sys.path
# (presumably where the project's ``utils`` module lives).
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from utils import (
    execute_command, check_command_exists, parse_key_value_output,
    safe_int, safe_float, format_bytes, require_root
)

logger = logging.getLogger(__name__)

# Kernel name prefixes treated as storage devices when scanning /sys/block.
_DISK_NAME_PREFIXES = ('sd', 'hd', 'nvme', 'vd', 'xvd', 'mmcblk')

# Whole-disk kernel names (no partitions). NVMe and MMC whole disks end in a
# digit, so a plain "last character is not a digit" test would drop them.
_WHOLE_DISK_RE = re.compile(
    r'^(?:(?:sd|hd|vd|xvd)[a-z]+|nvme\d+n\d+|mmcblk\d+)$'
)

# Member bitmap in /proc/mdstat, e.g. "[U_]"; "_" marks a missing member.
_MD_MEMBER_GAP_RE = re.compile(r'\[[U_]*_[U_]*\]')

# Ranking used to keep the worse of two health verdicts.
_HEALTH_SEVERITY = {"UNKNOWN": 0, "PASSED": 1, "WARNING": 2, "FAILED": 3}

# Raw drive temperature (degrees Celsius) above which a warning is raised.
_TEMP_WARNING_C = 60


def _leading_int(raw: Any) -> Optional[int]:
    """Return the integer that *raw* starts with, or None if there is none.

    SMART raw values often carry trailing text, e.g. ``"36 (Min/Max 20/45)"``
    or ``"12345h+30m+12.345s"``; only the leading number is of interest.
    """
    if raw is None:
        return None
    match = re.match(r'\s*(\d+)', str(raw))
    return int(match.group(1)) if match else None


def _is_rotational(flag: Any) -> bool:
    """Interpret the lsblk ``rota`` field, which may be a bool or "0"/"1"."""
    if isinstance(flag, str):
        # Older lsblk versions emit strings; "0" must not count as truthy.
        return flag.strip() == '1'
    return bool(flag)


def run_storage_check() -> Dict[str, Any]:
    """
    Run the full storage inspection.

    Returns:
        Dict[str, Any]: ``status`` ("success", "warning" or "error"),
        ``devices`` (one entry per disk), ``raid_status`` and ``io_stats``;
        ``error`` holds the exception text when status is "error".
    """
    result = {
        "status": "success",
        "devices": [],
        "raid_status": {},
        "io_stats": {}
    }

    try:
        for device in get_storage_devices():
            device_info = check_device(device)
            result["devices"].append(device_info)

            # Any unhealthy device downgrades the overall status.
            if device_info.get("health") in ('FAILED', 'WARNING'):
                result["status"] = "warning"

        result["raid_status"] = check_raid_status()
        # A degraded array is a storage problem as well.
        if result["raid_status"].get("status") == "warning":
            result["status"] = "warning"

        result["io_stats"] = get_io_statistics()

    except Exception as e:
        result["status"] = "error"
        result["error"] = str(e)

    return result


def get_storage_devices() -> List[Dict[str, str]]:
    """Return the list of disks as dicts with ``name``, ``path`` and ``type``.

    ``lsblk`` is tried first; if it yields nothing, /sys/block is scanned.
    """
    devices = []

    # Method 1: lsblk with JSON output
    if check_command_exists('lsblk'):
        try:
            _, stdout, _ = execute_command(
                ['lsblk', '-d', '-n', '-o', 'NAME,TYPE,ROTA', '-J'],
                check_returncode=False, timeout=10
            )
            data = json.loads(stdout)
            for dev in data.get('blockdevices', []):
                if dev.get('type') == 'disk':
                    devices.append({
                        "name": dev['name'],
                        "path": f"/dev/{dev['name']}",
                        "type": "hdd" if _is_rotational(dev.get('rota')) else "ssd"
                    })
        except Exception:
            # Best effort: fall through to the /sys/block scan.
            logger.debug("lsblk device discovery failed", exc_info=True)

    # Method 2: scan /sys/block
    if not devices:
        try:
            for name in os.listdir('/sys/block'):
                if not name.startswith(_DISK_NAME_PREFIXES):
                    continue
                dev_type = "unknown"
                try:
                    with open(f'/sys/block/{name}/queue/rotational', 'r') as f:
                        dev_type = "hdd" if f.read().strip() == '1' else "ssd"
                except Exception:
                    logger.debug("cannot read rotational flag for %s", name,
                                 exc_info=True)

                devices.append({
                    "name": name,
                    "path": f"/dev/{name}",
                    "type": dev_type
                })
        except Exception:
            logger.debug("/sys/block scan failed", exc_info=True)

    return devices


def check_device(device: Dict[str, str]) -> Dict[str, Any]:
    """Inspect a single storage device.

    Args:
        device: dict with at least ``name`` and ``path`` (``type`` optional),
            as produced by get_storage_devices().

    Returns:
        Dict[str, Any]: identity fields, SMART status, derived health and
        selected SMART attributes; ``nvme_data`` is added for NVMe devices.
    """
    result = {
        "name": device["name"],
        "path": device["path"],
        "type": device.get("type", "unknown"),
        "model": "Unknown",
        "serial": "Unknown",
        "firmware": "Unknown",
        "size_bytes": 0,
        "size_human": "Unknown",
        "health": "UNKNOWN",
        "smart_status": {},
        "temperature_c": None,
        "power_on_hours": None,
        "start_stop_count": None,
        "reallocated_sectors": None,
        "pending_sectors": None,
        "test_result": None
    }

    # Basic identity information
    result.update(get_device_info(device["path"]))

    # SMART data and the health verdict derived from it
    smart_data = get_smart_data(device["path"])
    result["smart_status"] = smart_data
    result["health"] = analyze_health(smart_data)

    # Extract key attributes
    if "attributes" in smart_data:
        attrs = smart_data["attributes"]

        # Temperature: take the first attribute that yields a number
        for temp_attr in ('194 Temperature_Celsius',
                          '190 Airflow_Temperature_Cel',
                          'Temperature'):
            if temp_attr in attrs:
                temperature = _leading_int(attrs[temp_attr].get('raw_value'))
                if temperature is not None:
                    result["temperature_c"] = temperature
                    break

        # Counters; the raw value may carry trailing text, so only the
        # leading integer is used.
        counters = (
            ("power_on_hours", '9 Power_On_Hours'),
            ("start_stop_count", '4 Start_Stop_Count'),
            ("reallocated_sectors", '5 Reallocated_Sector_Ct'),
            ("pending_sectors", '197 Current_Pending_Sector'),
        )
        for field, attr_key in counters:
            if attr_key in attrs:
                result[field] = _leading_int(attrs[attr_key].get('raw_value', 0))

    # NVMe-specific handling
    if device["name"].startswith('nvme'):
        nvme_data = get_nvme_data(device["path"])
        result["nvme_data"] = nvme_data
        if nvme_data.get("temperature") is not None:
            result["temperature_c"] = nvme_data["temperature"]
        # get_nvme_data() defaults to "UNKNOWN"; never let that (or a milder
        # verdict) overwrite a more severe verdict already derived from SMART.
        nvme_health = nvme_data.get("health") or "UNKNOWN"
        if (_HEALTH_SEVERITY.get(nvme_health, 0)
                > _HEALTH_SEVERITY.get(result["health"], 0)):
            result["health"] = nvme_health

    return result


def get_device_info(device_path: str) -> Dict[str, Any]:
    """Collect basic device information (model, serial, capacity, ...).

    Uses ``smartctl -i`` when available and falls back to
    /sys/block/<name>/size for the capacity. Only keys that could be
    determined are present in the returned dict.
    """
    info = {}

    if check_command_exists('smartctl'):
        try:
            _, stdout, _ = execute_command(
                ['smartctl', '-i', device_path],
                check_returncode=False, timeout=10
            )

            patterns = {
                # ATA prints "Device Model:", NVMe prints "Model Number:"
                "model": r'(?:Device Model|Model Number):\s*(.+)',
                "serial": r'Serial Number:\s*(\S+)',
                "firmware": r'Firmware Version:\s*(\S+)',
                "size_human": r'User Capacity:\s*(.+)',
                "sector_size": r'Sector Size:\s*(.+)',
                "rotation_rate": r'Rotation Rate:\s*(.+)',
                "form_factor": r'Form Factor:\s*(.+)',
                "transport": r'Transport protocol:\s*(.+)'
            }

            for key, pattern in patterns.items():
                match = re.search(pattern, stdout)
                if match:
                    info[key] = match.group(1).strip()

            # Capacity in bytes, e.g.
            # "User Capacity:    500,107,862,016 bytes [500 GB]".
            # The byte count is the separator-grouped number before "bytes";
            # the bracket only holds the human-readable form.
            size_match = re.search(r'User Capacity:\s*([\d,.]+)\s*bytes', stdout)
            if size_match:
                digits = re.sub(r'\D', '', size_match.group(1))
                if digits:
                    info["size_bytes"] = int(digits)

            # SSD or rotating disk?
            if 'Solid State Device' in stdout:
                info["is_ssd"] = True
            elif 'Rotation Rate' in stdout and 'Solid State' not in stdout:
                info["is_ssd"] = False

        except Exception:
            logger.debug("smartctl -i failed for %s", device_path, exc_info=True)

    # Fallback: read the size from /sys (the file counts 512-byte sectors)
    if not info.get("size_bytes"):
        try:
            dev_name = os.path.basename(device_path)
            with open(f'/sys/block/{dev_name}/size', 'r') as f:
                sectors = safe_int(f.read().strip())
                info["size_bytes"] = sectors * 512
                info["size_human"] = format_bytes(info["size_bytes"])
        except Exception:
            logger.debug("cannot read /sys size for %s", device_path,
                         exc_info=True)

    return info


def get_smart_data(device_path: str) -> Dict[str, Any]:
    """Read and parse SMART data for a device via ``smartctl -a``.

    Returns:
        Dict[str, Any]: ``supported``, ``enabled``, ``overall`` ("PASSED",
        "FAILED" or "UNKNOWN"), ``attributes`` keyed by "<id> <name>",
        ``self_tests`` (raw log lines) and optionally ``error_count`` /
        ``error``.
    """
    result = {
        "supported": False,
        "enabled": False,
        "overall": "UNKNOWN",
        "attributes": {},
        "self_tests": []
    }

    if not check_command_exists('smartctl'):
        result["error"] = "smartctl 未安装"
        return result

    try:
        # "-a" already contains the "-i" information section, so a single
        # invocation is enough for both the support flags and the data.
        _, stdout, _ = execute_command(
            ['smartctl', '-a', device_path],
            check_returncode=False, timeout=15
        )

        if 'SMART support is: Available' in stdout:
            result["supported"] = True
        if 'SMART support is: Enabled' in stdout:
            result["enabled"] = True

        # Overall verdict. Parse the dedicated lines only: a substring search
        # over the whole output is fooled by e.g. the "WHEN_FAILED" column.
        ata_verdict = re.search(
            r'SMART overall-health self-assessment test result:\s*(\S+)', stdout
        )
        scsi_verdict = re.search(r'SMART Health Status:\s*(.+)', stdout)
        if ata_verdict:
            verdict = ata_verdict.group(1).rstrip('!').upper()
            result["overall"] = "PASSED" if verdict == "PASSED" else "FAILED"
        elif scsi_verdict:
            status_text = scsi_verdict.group(1).strip().upper()
            result["overall"] = "PASSED" if status_text.startswith("OK") else "FAILED"

        # A verdict can only be reported when SMART is usable; some device
        # types (e.g. NVMe) do not print the "SMART support is:" lines.
        if result["overall"] != "UNKNOWN":
            result["supported"] = True
            result["enabled"] = True

        lines = stdout.split('\n')

        # SMART attribute table (ATA devices). Row format:
        # ID# ATTRIBUTE_NAME FLAG VALUE WORST THRESH TYPE UPDATED WHEN_FAILED RAW_VALUE
        in_attributes = False
        for line in lines:
            if 'ID#' in line and 'ATTRIBUTE_NAME' in line:
                in_attributes = True
                continue

            if not in_attributes:
                continue

            # The table ends at the first blank line or next section header.
            if not line.strip() or line.startswith('SMART'):
                break

            parts = line.split()
            if len(parts) >= 10:
                attr_key = f"{parts[0]} {parts[1]}"
                result["attributes"][attr_key] = {
                    "flag": parts[2],
                    "value": safe_int(parts[3]),
                    "worst": safe_int(parts[4]),
                    "thresh": safe_int(parts[5]),
                    "type": parts[6],
                    "updated": parts[7],
                    "when_failed": parts[8] if parts[8] != '-' else None,
                    # The raw value may itself contain spaces
                    "raw_value": ' '.join(parts[9:])
                }

        # Self-test log: entries are the lines beginning with "#"
        if 'SMART Self-test log' in stdout:
            self_test_section = False
            for line in lines:
                if 'SMART Self-test log' in line:
                    self_test_section = True
                    continue
                if self_test_section and line.strip().startswith('#'):
                    result["self_tests"].append(line.strip())

        # Error log: prefer the explicit total, else the newest error number
        if 'SMART Error Log' in stdout:
            error_match = (
                re.search(r'ATA Error Count:\s*(\d+)', stdout)
                or re.search(r'Error (\d+)\s+occurred at', stdout)
            )
            if error_match:
                result["error_count"] = safe_int(error_match.group(1))

    except Exception as e:
        result["error"] = str(e)

    return result


def get_nvme_data(device_path: str) -> Dict[str, Any]:
    """Collect NVMe-specific data from ``nvme smart-log``.

    Returns:
        Dict[str, Any]: ``health`` ("PASSED", "WARNING" or "UNKNOWN"),
        ``temperature`` in degrees Celsius, ``available_spare`` and
        ``percentage_used`` in percent; fields that could not be read stay
        None (the remaining keys are placeholders and are not filled here).
    """
    result = {
        "health": "UNKNOWN",
        "temperature": None,
        "available_spare": None,
        "percentage_used": None,
        "data_units_read": None,
        "data_units_written": None,
        "host_reads": None,
        "host_writes": None
    }

    if not check_command_exists('nvme'):
        return result

    try:
        _, stdout, _ = execute_command(
            ['nvme', 'smart-log', device_path],
            check_returncode=False, timeout=10
        )

        # The text output reports Celsius ("36 C" / "36 °C (309 K)"), so the
        # value must not be shifted unless it is explicitly (or evidently)
        # given in Kelvin.
        temp_match = re.search(
            r'^\s*temperature\s*:\s*(\d+)\s*(?:°\s*)?([CK])?',
            stdout, re.IGNORECASE | re.MULTILINE
        )
        if temp_match:
            temperature = int(temp_match.group(1))
            unit = (temp_match.group(2) or '').upper()
            if unit == 'K' or (not unit and temperature >= 200):
                temperature -= 273  # Kelvin -> Celsius
            result["temperature"] = temperature

        # nvme-cli prints "available_spare" / "percentage_used" (underscores);
        # a space is accepted too for other output styles.
        spare_match = re.search(
            r'available[_ ]spare\s*:\s*(\d+)\s*%', stdout, re.IGNORECASE
        )
        if spare_match:
            result["available_spare"] = int(spare_match.group(1))

        used_match = re.search(
            r'percentage[_ ]used\s*:\s*(\d+)\s*%', stdout, re.IGNORECASE
        )
        if used_match:
            result["percentage_used"] = int(used_match.group(1))

        # Health assessment
        if result["percentage_used"] is not None:
            result["health"] = "PASSED" if result["percentage_used"] < 90 else "WARNING"

        if result["available_spare"] is not None and result["available_spare"] < 10:
            result["health"] = "WARNING"

    except Exception:
        logger.debug("nvme smart-log failed for %s", device_path, exc_info=True)

    return result


def analyze_health(smart_data: Dict[str, Any]) -> str:
    """Derive a health verdict from parsed SMART data.

    Returns:
        str: "UNKNOWN" when SMART is unsupported, "FAILED" when the drive
        reports failure, "WARNING" when a critical sector counter is
        non-zero or the drive runs hot, otherwise "PASSED".
    """
    if not smart_data.get("supported"):
        return "UNKNOWN"

    if smart_data.get("overall") == "FAILED":
        return "FAILED"

    attrs = smart_data.get("attributes", {})

    # Any non-zero raw count on these attributes indicates media trouble.
    critical_attrs = (
        '5 Reallocated_Sector_Ct',
        '197 Current_Pending_Sector',
        '198 Offline_Uncorrectable',
        '196 Reallocation_Event_Count',
    )
    for attr_name in critical_attrs:
        if attr_name in attrs:
            count = _leading_int(attrs[attr_name].get('raw_value', '0'))
            if count is not None and count > 0:
                return "WARNING"

    # Temperature: the raw value holds degrees Celsius; the normalised
    # "value" column is a vendor-specific score and must not be compared
    # against a Celsius threshold.
    for temp_attr in ('194 Temperature_Celsius', '190 Airflow_Temperature_Cel'):
        if temp_attr in attrs:
            temp = _leading_int(attrs[temp_attr].get('raw_value'))
            if temp is not None and temp > _TEMP_WARNING_C:
                return "WARNING"

    return "PASSED"


def _parse_mdstat(mdstat_text: str) -> List[Dict[str, Any]]:
    """Parse /proc/mdstat text into one dict per md array."""
    arrays = []
    current = None

    for line in mdstat_text.split('\n'):
        if line.startswith('md'):
            parts = line.split()
            # Line shape: "md0 : active raid1 sdb1[1] sda1[0]". Compare the
            # state token exactly: "inactive" contains "active".
            state = parts[2] if len(parts) > 2 else ""
            current = {
                "name": parts[0],
                "status": "active" if state == "active" else "inactive"
            }
            if '(F)' in line:  # a member is flagged faulty
                current["degraded"] = True
            arrays.append(current)
        elif current is not None:
            # Indented lines belong to the current array; anything else
            # (blank line, next header) ends it.
            if not line.strip() or not line[0].isspace():
                current = None
                continue
            # The member bitmap ("[U_]") and rebuild progress appear on the
            # lines following the "mdX :" line.
            if _MD_MEMBER_GAP_RE.search(line) or 'recovery' in line:
                current["degraded"] = True

    return arrays


def check_raid_status() -> Dict[str, Any]:
    """Check software (mdadm) and hardware (storcli / MegaCli) RAID status.

    Returns:
        Dict[str, Any]: ``raid_available``, ``controllers``, ``arrays`` plus
        tool-specific keys; ``status`` is set to "warning" when an md array
        is degraded or rebuilding.
    """
    result = {
        "raid_available": False,
        "controllers": [],
        "arrays": []
    }

    # Software RAID (mdadm)
    if check_command_exists('mdadm'):
        try:
            _, stdout, _ = execute_command(
                ['mdadm', '--detail', '--scan'],
                check_returncode=False, timeout=10
            )

            if stdout.strip():
                result["software_raid"] = True
                result["raid_available"] = True
                result["mdadm_config"] = stdout.strip()

                # Detailed state comes from /proc/mdstat
                _, detail, _ = execute_command(
                    ['cat', '/proc/mdstat'],
                    check_returncode=False, timeout=5
                )
                result["mdstat"] = detail

                for array_info in _parse_mdstat(detail):
                    if array_info.get("degraded"):
                        result["status"] = "warning"
                    result["arrays"].append(array_info)

        except Exception:
            logger.debug("software RAID check failed", exc_info=True)

    # Hardware RAID (storcli preferred, MegaCli as fallback)
    if check_command_exists('storcli'):
        try:
            _, stdout, _ = execute_command(
                ['storcli', '/c0', 'show'],
                check_returncode=False, timeout=10
            )
            result["hardware_raid"] = True
            result["raid_available"] = True
            result["controller_type"] = "LSI/Broadcom"
            result["storcli_output"] = stdout[:500]  # keep only an excerpt
        except Exception:
            logger.debug("storcli query failed", exc_info=True)
    elif check_command_exists('MegaCli'):
        try:
            _, stdout, _ = execute_command(
                ['MegaCli', '-AdpAllInfo', '-aALL'],
                check_returncode=False, timeout=10
            )
            result["hardware_raid"] = True
            result["raid_available"] = True
            result["controller_type"] = "LSI"
            result["megacli_output"] = stdout[:500]
        except Exception:
            logger.debug("MegaCli query failed", exc_info=True)

    return result


def get_io_statistics() -> Dict[str, Any]:
    """Read per-disk I/O counters from /proc/diskstats.

    Returns:
        Dict[str, Any]: maps each whole-disk name to its counters; empty if
        the file cannot be read.
    """
    result = {}

    try:
        with open('/proc/diskstats', 'r') as f:
            for line in f:
                parts = line.split()
                if len(parts) < 14:
                    continue

                device = parts[2]
                # Whole disks only, no partitions
                if not _WHOLE_DISK_RE.match(device):
                    continue

                result[device] = {
                    "reads_completed": safe_int(parts[3]),
                    "reads_merged": safe_int(parts[4]),
                    "sectors_read": safe_int(parts[5]),
                    "time_reading_ms": safe_int(parts[6]),
                    "writes_completed": safe_int(parts[7]),
                    "writes_merged": safe_int(parts[8]),
                    "sectors_written": safe_int(parts[9]),
                    "time_writing_ms": safe_int(parts[10]),
                    "ios_in_progress": safe_int(parts[11]),
                    "time_doing_ios_ms": safe_int(parts[12]),
                    "weighted_time_ios_ms": safe_int(parts[13])
                }
    except Exception:
        logger.debug("cannot read /proc/diskstats", exc_info=True)

    return result


@require_root
def run_io_test(device_path: str, test_size_mb: int = 100) -> Dict[str, Any]:
    """
    Run a simple sequential I/O performance test with fio.

    NOTE: the test file is created under /tmp, so the figures describe the
    filesystem backing /tmp; ``device_path`` is only echoed in the result.

    Args:
        device_path: device path (reported back in the result)
        test_size_mb: test file size in MB

    Returns:
        Dict[str, Any]: ``passed``, ``read_speed_mbps``, ``write_speed_mbps``
        and ``errors`` (list of messages).
    """
    result = {
        "passed": False,
        "device": device_path,
        "test_size_mb": test_size_mb,
        "read_speed_mbps": None,
        "write_speed_mbps": None,
        "errors": []
    }

    if not check_command_exists('fio'):
        result["errors"].append("fio 未安装")
        return result

    try:
        # A private directory avoids writing, as root, to a predictable path
        # in world-writable /tmp, and guarantees cleanup of both the job
        # file and the test file.
        with tempfile.TemporaryDirectory(prefix='serverguard_', dir='/tmp') as work_dir:
            fio_file = os.path.join(work_dir, 'job.fio')
            fio_config = "\n".join([
                "",
                "[global]",
                f"directory={work_dir}",
                "filename=serverguard_test",
                "direct=1",
                f"size={test_size_mb}M",
                "unlink=1",
                "",
                "[seq_read]",
                "stonewall",
                "rw=read",
                "bs=1M",
                "",
                "[seq_write]",
                "stonewall",
                "rw=write",
                "bs=1M",
                "",
            ])
            with open(fio_file, 'w') as f:
                f.write(fio_config)

            _, stdout, stderr = execute_command(
                ['fio', fio_file, '--output-format=json'],
                timeout=120,
                check_returncode=False
            )

            data = json.loads(stdout)
            jobs = data.get('jobs', [])

            for job in jobs:
                job_name = job.get('jobname', '').lower()
                # fio reports bandwidth in KiB/s; convert to MB/s
                read_bw = job.get('read', {}).get('bw', 0) / 1024
                write_bw = job.get('write', {}).get('bw', 0) / 1024

                if 'read' in job_name and read_bw > 0:
                    result["read_speed_mbps"] = round(read_bw, 2)
                if 'write' in job_name and write_bw > 0:
                    result["write_speed_mbps"] = round(write_bw, 2)

            # Passed only if fio ran jobs and none of them reported an error
            result["passed"] = bool(jobs) and all(
                job.get('error', 0) == 0 for job in jobs
            )
            if not result["passed"] and stderr and stderr.strip():
                result["errors"].append(stderr.strip())

    except Exception as e:
        result["errors"].append(str(e))

    return result


if __name__ == '__main__':
    print(json.dumps(run_storage_check(), indent=2, ensure_ascii=False))