增强日志
This commit is contained in:
57
README.md
57
README.md
@@ -19,6 +19,7 @@ ServerGuard 是一款基于 Python 的 Linux 命令行工具,用于诊断服
|
|||||||
- **Python**: 3.6 或更高版本
|
- **Python**: 3.6 或更高版本
|
||||||
- **权限**: root 权限(大多数硬件诊断功能需要)
|
- **权限**: root 权限(大多数硬件诊断功能需要)
|
||||||
- **架构**: x86_64 (AMD64)
|
- **架构**: x86_64 (AMD64)
|
||||||
|
- **磁盘空间**: 至少 100MB 可用空间(用于日志和报告)
|
||||||
|
|
||||||
## 克隆及安装方法
|
## 克隆及安装方法
|
||||||
|
|
||||||
@@ -313,6 +314,62 @@ python3 quick_test.py
|
|||||||
python3 -m unittest discover tests/ -v
|
python3 -m unittest discover tests/ -v
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## 日志记录
|
||||||
|
|
||||||
|
ServerGuard 会实时记录详细的测试日志,方便排查问题:
|
||||||
|
|
||||||
|
### 日志文件位置
|
||||||
|
|
||||||
|
默认日志文件路径:`/var/log/serverguard.log`
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 查看实时日志
|
||||||
|
tail -f /var/log/serverguard.log
|
||||||
|
|
||||||
|
# 查看最近 100 行日志
|
||||||
|
tail -n 100 /var/log/serverguard.log
|
||||||
|
```
|
||||||
|
|
||||||
|
### 日志内容说明
|
||||||
|
|
||||||
|
日志包含以下关键信息:
|
||||||
|
- **启动信息**: 程序启动时间、命令行参数、Python版本等
|
||||||
|
- **进度记录**: 每个模块的检测开始和结束时间
|
||||||
|
- **详细步骤**: 压力测试前后的温度、执行的命令等
|
||||||
|
- **错误信息**: 详细的异常信息和堆栈跟踪
|
||||||
|
|
||||||
|
### 日志示例
|
||||||
|
|
||||||
|
```
|
||||||
|
2026-03-02 15:41:28 - ServerGuard 启动
|
||||||
|
2026-03-02 15:41:28 - [DIAGNOSTIC START] 全面硬件诊断
|
||||||
|
2026-03-02 15:41:28 - [PROGRESS] 模块 1/7: system
|
||||||
|
2026-03-02 15:41:28 - [MODULE START] cpu - stress_test=True, duration=300
|
||||||
|
2026-03-02 15:41:28 - [CPU STRESS TEST] 测试前温度: 45°C
|
||||||
|
2026-03-02 15:46:28 - [CPU STRESS TEST] 测试后温度: 78°C
|
||||||
|
2026-03-02 15:46:28 - [CPU STRESS TEST] 压力测试通过
|
||||||
|
```
|
||||||
|
|
||||||
|
### 排查机器重启/关机问题
|
||||||
|
|
||||||
|
如果测试过程中机器意外关机或重启,查看日志文件:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 查找最后的日志记录
|
||||||
|
tail -n 50 /var/log/serverguard.log
|
||||||
|
|
||||||
|
# 查找压力测试相关的日志
|
||||||
|
grep "STRESS TEST" /var/log/serverguard.log
|
||||||
|
|
||||||
|
# 查找错误信息
|
||||||
|
grep -i "error\|exception\|failed" /var/log/serverguard.log
|
||||||
|
```
|
||||||
|
|
||||||
|
**常见情况分析:**
|
||||||
|
- 如果在 `[CPU STRESS TEST]` 或 `[MEMORY STRESS TEST]` 后没有 `[END]` 记录,说明机器在压力测试中出现问题
|
||||||
|
- 查看压力测试前后的温度记录,判断是否因过热导致关机
|
||||||
|
- 查看系统日志 `dmesg` 或 `/var/log/messages` 确认硬件错误
|
||||||
|
|
||||||
## 故障排除
|
## 故障排除
|
||||||
|
|
||||||
### 1. 提示 "未找到压力测试工具"
|
### 1. 提示 "未找到压力测试工具"
|
||||||
|
|||||||
220
main.py
220
main.py
@@ -167,7 +167,7 @@ def confirm_stress_test(duration: int, auto_confirm: bool = False) -> bool:
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def run_module(module_name: str, stress_test: bool = False, stress_duration: int = 300) -> Dict[str, Any]:
|
def run_module(module_name: str, stress_test: bool = False, stress_duration: int = 300, progress_logger=None) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
运行指定的检测模块。
|
运行指定的检测模块。
|
||||||
|
|
||||||
@@ -175,6 +175,7 @@ def run_module(module_name: str, stress_test: bool = False, stress_duration: int
|
|||||||
module_name: 模块名称
|
module_name: 模块名称
|
||||||
stress_test: 是否执行压力测试
|
stress_test: 是否执行压力测试
|
||||||
stress_duration: 压力测试持续时间
|
stress_duration: 压力测试持续时间
|
||||||
|
progress_logger: 进度日志记录器
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dict[str, Any]: 模块检测结果
|
Dict[str, Any]: 模块检测结果
|
||||||
@@ -196,39 +197,89 @@ def run_module(module_name: str, stress_test: bool = False, stress_duration: int
|
|||||||
logger.error(f"未知模块: {module_name}")
|
logger.error(f"未知模块: {module_name}")
|
||||||
return {"status": "error", "error": f"未知模块: {module_name}"}
|
return {"status": "error", "error": f"未知模块: {module_name}"}
|
||||||
|
|
||||||
|
# 记录开始
|
||||||
|
if progress_logger:
|
||||||
|
progress_logger.start(f"模块检测: {module_name}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
logger.info(f"[MODULE START] {module_name} - stress_test={stress_test}, duration={stress_duration}")
|
||||||
|
|
||||||
module = __import__(module_map[module_name], fromlist=[''])
|
module = __import__(module_map[module_name], fromlist=[''])
|
||||||
|
|
||||||
|
result = None
|
||||||
if module_name == 'system':
|
if module_name == 'system':
|
||||||
return module.get_system_info()
|
if progress_logger:
|
||||||
|
progress_logger.log("开始收集系统信息")
|
||||||
|
result = module.get_system_info()
|
||||||
elif module_name == 'cpu':
|
elif module_name == 'cpu':
|
||||||
return module.run_cpu_check(stress_test, stress_duration)
|
if progress_logger:
|
||||||
|
progress_logger.log(f"开始CPU检测 (stress_test={stress_test})")
|
||||||
|
result = module.run_cpu_check(stress_test, stress_duration)
|
||||||
elif module_name == 'memory':
|
elif module_name == 'memory':
|
||||||
return module.run_memory_check(stress_test, stress_duration)
|
if progress_logger:
|
||||||
|
progress_logger.log(f"开始内存检测 (stress_test={stress_test})")
|
||||||
|
result = module.run_memory_check(stress_test, stress_duration)
|
||||||
elif module_name == 'storage':
|
elif module_name == 'storage':
|
||||||
return module.run_storage_check()
|
if progress_logger:
|
||||||
|
progress_logger.log("开始存储设备检测")
|
||||||
|
result = module.run_storage_check()
|
||||||
elif module_name == 'sensors':
|
elif module_name == 'sensors':
|
||||||
return module.run_sensors_check()
|
if progress_logger:
|
||||||
|
progress_logger.log("开始传感器监控")
|
||||||
|
result = module.run_sensors_check()
|
||||||
elif module_name == 'gpu':
|
elif module_name == 'gpu':
|
||||||
return module.run_gpu_check()
|
if progress_logger:
|
||||||
|
progress_logger.log("开始显卡检测")
|
||||||
|
result = module.run_gpu_check()
|
||||||
elif module_name == 'logs':
|
elif module_name == 'logs':
|
||||||
return module.analyze_logs()
|
if progress_logger:
|
||||||
|
progress_logger.log("开始日志分析")
|
||||||
|
result = module.analyze_logs()
|
||||||
|
|
||||||
|
# 记录结果
|
||||||
|
status = result.get("status", "unknown") if result else "unknown"
|
||||||
|
logger.info(f"[MODULE END] {module_name} - Status: {status}")
|
||||||
|
|
||||||
|
# 如果结果中有错误信息,记录到日志
|
||||||
|
if result and result.get("error"):
|
||||||
|
logger.error(f"[MODULE ERROR] {module_name}: {result['error']}")
|
||||||
|
|
||||||
|
if progress_logger:
|
||||||
|
progress_logger.end(status=status)
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"运行模块 {module_name} 时出错: {e}")
|
error_msg = f"运行模块 {module_name} 时出错: {e}"
|
||||||
|
logger.exception(error_msg)
|
||||||
|
|
||||||
|
if progress_logger:
|
||||||
|
progress_logger.end(status="error", message=str(e))
|
||||||
|
|
||||||
return {"status": "error", "error": str(e)}
|
return {"status": "error", "error": str(e)}
|
||||||
|
|
||||||
|
|
||||||
def run_quick_check() -> Dict[str, Any]:
|
def run_quick_check(progress_logger=None) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
执行快速检测(非侵入性)。
|
执行快速检测(非侵入性)。
|
||||||
|
|
||||||
|
Args:
|
||||||
|
progress_logger: 进度日志记录器
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dict[str, Any]: 检测结果
|
Dict[str, Any]: 检测结果
|
||||||
"""
|
"""
|
||||||
import logging
|
import logging
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# 记录检测开始
|
||||||
|
logger.info("=" * 70)
|
||||||
|
logger.info("[DIAGNOSTIC START] 快速硬件检测")
|
||||||
|
logger.info("=" * 70)
|
||||||
|
|
||||||
|
if progress_logger:
|
||||||
|
progress_logger.start("快速硬件检测")
|
||||||
|
|
||||||
print("正在执行快速硬件检测...")
|
print("正在执行快速硬件检测...")
|
||||||
print("-" * 60)
|
print("-" * 60)
|
||||||
|
|
||||||
@@ -239,37 +290,54 @@ def run_quick_check() -> Dict[str, Any]:
|
|||||||
}
|
}
|
||||||
|
|
||||||
modules_to_run = ['system', 'cpu', 'memory', 'storage', 'sensors', 'gpu', 'logs']
|
modules_to_run = ['system', 'cpu', 'memory', 'storage', 'sensors', 'gpu', 'logs']
|
||||||
|
total_modules = len(modules_to_run)
|
||||||
|
|
||||||
for module_name in modules_to_run:
|
for idx, module_name in enumerate(modules_to_run, 1):
|
||||||
|
logger.info(f"[PROGRESS] 模块 {idx}/{total_modules}: {module_name}")
|
||||||
print(f"正在检测: {module_name}...", end=' ', flush=True)
|
print(f"正在检测: {module_name}...", end=' ', flush=True)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
result = run_module(module_name, stress_test=False)
|
result = run_module(module_name, stress_test=False, progress_logger=progress_logger)
|
||||||
results["modules"][module_name] = result
|
results["modules"][module_name] = result
|
||||||
status = result.get("status", "unknown")
|
status = result.get("status", "unknown")
|
||||||
|
|
||||||
if status == "success":
|
if status == "success":
|
||||||
print("[完成]")
|
print("[完成]")
|
||||||
|
logger.info(f"[MODULE SUCCESS] {module_name}")
|
||||||
elif status == "warning":
|
elif status == "warning":
|
||||||
print("[警告]")
|
print("[警告]")
|
||||||
|
logger.warning(f"[MODULE WARNING] {module_name}")
|
||||||
elif status == "error":
|
elif status == "error":
|
||||||
print("[错误]")
|
print("[错误]")
|
||||||
|
logger.error(f"[MODULE ERROR] {module_name}: {result.get('error', 'Unknown error')}")
|
||||||
else:
|
else:
|
||||||
print(f"[{status}]")
|
print(f"[{status}]")
|
||||||
|
logger.info(f"[MODULE {status.upper()}] {module_name}")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"模块 {module_name} 执行失败: {e}")
|
error_msg = f"模块 {module_name} 执行失败: {e}"
|
||||||
|
logger.exception(error_msg)
|
||||||
results["modules"][module_name] = {"status": "error", "error": str(e)}
|
results["modules"][module_name] = {"status": "error", "error": str(e)}
|
||||||
print("[失败]")
|
print("[失败]")
|
||||||
|
|
||||||
print("-" * 60)
|
print("-" * 60)
|
||||||
|
logger.info("[DIAGNOSTIC END] 快速硬件检测完成")
|
||||||
|
logger.info("=" * 70)
|
||||||
|
|
||||||
|
if progress_logger:
|
||||||
|
progress_logger.end(status="success")
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
def run_full_diagnostic(stress_duration: int, auto_confirm: bool = False) -> Dict[str, Any]:
|
def run_full_diagnostic(stress_duration: int, auto_confirm: bool = False, progress_logger=None) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
执行全面诊断(包含压力测试)。
|
执行全面诊断(包含压力测试)。
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
stress_duration: 压力测试持续时间
|
stress_duration: 压力测试持续时间
|
||||||
auto_confirm: 是否自动确认
|
auto_confirm: 是否自动确认
|
||||||
|
progress_logger: 进度日志记录器
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dict[str, Any]: 检测结果
|
Dict[str, Any]: 检测结果
|
||||||
@@ -281,6 +349,14 @@ def run_full_diagnostic(stress_duration: int, auto_confirm: bool = False) -> Dic
|
|||||||
print("诊断已取消")
|
print("诊断已取消")
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
|
|
||||||
|
# 记录诊断开始
|
||||||
|
logger.info("=" * 70)
|
||||||
|
logger.info(f"[DIAGNOSTIC START] 全面硬件诊断 (stress_duration={stress_duration}s)")
|
||||||
|
logger.info("=" * 70)
|
||||||
|
|
||||||
|
if progress_logger:
|
||||||
|
progress_logger.start("全面硬件诊断")
|
||||||
|
|
||||||
print("\n正在执行全面硬件诊断...")
|
print("\n正在执行全面硬件诊断...")
|
||||||
print("=" * 60)
|
print("=" * 60)
|
||||||
|
|
||||||
@@ -293,38 +369,73 @@ def run_full_diagnostic(stress_duration: int, auto_confirm: bool = False) -> Dic
|
|||||||
|
|
||||||
# 先执行快速检测
|
# 先执行快速检测
|
||||||
modules_to_run = ['system', 'cpu', 'memory', 'storage', 'sensors', 'gpu', 'logs']
|
modules_to_run = ['system', 'cpu', 'memory', 'storage', 'sensors', 'gpu', 'logs']
|
||||||
|
total_modules = len(modules_to_run)
|
||||||
|
|
||||||
for module_name in modules_to_run:
|
for idx, module_name in enumerate(modules_to_run, 1):
|
||||||
|
logger.info(f"[PROGRESS] 模块 {idx}/{total_modules}: {module_name}")
|
||||||
print(f"\n正在检测: {module_name}...")
|
print(f"\n正在检测: {module_name}...")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# CPU 和内存执行压力测试
|
# CPU 和内存执行压力测试
|
||||||
do_stress = module_name in ['cpu', 'memory']
|
do_stress = module_name in ['cpu', 'memory']
|
||||||
result = run_module(module_name, stress_test=do_stress, stress_duration=stress_duration)
|
|
||||||
|
if do_stress:
|
||||||
|
logger.warning(f"[STRESS TEST] {module_name} 压力测试即将开始 (duration={stress_duration}s)")
|
||||||
|
print(f" ⚠ 即将执行 {module_name} 压力测试,持续时间 {stress_duration} 秒")
|
||||||
|
|
||||||
|
result = run_module(module_name, stress_test=do_stress, stress_duration=stress_duration, progress_logger=progress_logger)
|
||||||
results["modules"][module_name] = result
|
results["modules"][module_name] = result
|
||||||
status = result.get("status", "unknown")
|
status = result.get("status", "unknown")
|
||||||
|
|
||||||
print(f" 状态: {status}")
|
print(f" 状态: {status}")
|
||||||
|
|
||||||
|
if status == "success":
|
||||||
|
logger.info(f"[MODULE SUCCESS] {module_name}")
|
||||||
|
elif status == "warning":
|
||||||
|
logger.warning(f"[MODULE WARNING] {module_name}")
|
||||||
|
elif status == "error":
|
||||||
|
logger.error(f"[MODULE ERROR] {module_name}: {result.get('error', 'Unknown error')}")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"模块 {module_name} 执行失败: {e}")
|
error_msg = f"模块 {module_name} 执行失败: {e}"
|
||||||
|
logger.exception(error_msg)
|
||||||
results["modules"][module_name] = {"status": "error", "error": str(e)}
|
results["modules"][module_name] = {"status": "error", "error": str(e)}
|
||||||
print(f" 状态: 失败 - {e}")
|
print(f" 状态: 失败 - {e}")
|
||||||
|
|
||||||
print("\n" + "=" * 60)
|
print("\n" + "=" * 60)
|
||||||
|
logger.info("[DIAGNOSTIC END] 全面硬件诊断完成")
|
||||||
|
logger.info("=" * 70)
|
||||||
|
|
||||||
|
if progress_logger:
|
||||||
|
progress_logger.end(status="success")
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
def run_specific_modules(module_list: str, stress_duration: int) -> Dict[str, Any]:
|
def run_specific_modules(module_list: str, stress_duration: int, progress_logger=None) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
运行指定的模块列表。
|
运行指定的模块列表。
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
module_list: 逗号分隔的模块名称
|
module_list: 逗号分隔的模块名称
|
||||||
stress_duration: 压力测试持续时间
|
stress_duration: 压力测试持续时间
|
||||||
|
progress_logger: 进度日志记录器
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dict[str, Any]: 检测结果
|
Dict[str, Any]: 检测结果
|
||||||
"""
|
"""
|
||||||
|
import logging
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
modules = [m.strip() for m in module_list.split(',')]
|
modules = [m.strip() for m in module_list.split(',')]
|
||||||
|
|
||||||
|
logger.info("=" * 70)
|
||||||
|
logger.info(f"[DIAGNOSTIC START] 自定义模块检测: {', '.join(modules)}")
|
||||||
|
logger.info("=" * 70)
|
||||||
|
|
||||||
|
if progress_logger:
|
||||||
|
progress_logger.start(f"自定义模块检测: {', '.join(modules)}")
|
||||||
|
|
||||||
results = {
|
results = {
|
||||||
"scan_type": "custom",
|
"scan_type": "custom",
|
||||||
"timestamp": get_file_timestamp(),
|
"timestamp": get_file_timestamp(),
|
||||||
@@ -334,18 +445,38 @@ def run_specific_modules(module_list: str, stress_duration: int) -> Dict[str, An
|
|||||||
print(f"正在执行自定义模块检测: {', '.join(modules)}")
|
print(f"正在执行自定义模块检测: {', '.join(modules)}")
|
||||||
print("-" * 60)
|
print("-" * 60)
|
||||||
|
|
||||||
for module_name in modules:
|
total_modules = len(modules)
|
||||||
|
|
||||||
|
for idx, module_name in enumerate(modules, 1):
|
||||||
|
logger.info(f"[PROGRESS] 模块 {idx}/{total_modules}: {module_name}")
|
||||||
print(f"正在检测: {module_name}...", end=' ', flush=True)
|
print(f"正在检测: {module_name}...", end=' ', flush=True)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
result = run_module(module_name, stress_test=False)
|
result = run_module(module_name, stress_test=False, progress_logger=progress_logger)
|
||||||
results["modules"][module_name] = result
|
results["modules"][module_name] = result
|
||||||
status = result.get("status", "unknown")
|
status = result.get("status", "unknown")
|
||||||
print(f"[{status}]")
|
print(f"[{status}]")
|
||||||
|
|
||||||
|
if status == "error":
|
||||||
|
logger.error(f"[MODULE ERROR] {module_name}: {result.get('error', 'Unknown error')}")
|
||||||
|
elif status == "warning":
|
||||||
|
logger.warning(f"[MODULE WARNING] {module_name}")
|
||||||
|
else:
|
||||||
|
logger.info(f"[MODULE SUCCESS] {module_name}")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
error_msg = f"模块 {module_name} 执行失败: {e}"
|
||||||
|
logger.exception(error_msg)
|
||||||
results["modules"][module_name] = {"status": "error", "error": str(e)}
|
results["modules"][module_name] = {"status": "error", "error": str(e)}
|
||||||
print(f"[失败: {e}]")
|
print(f"[失败: {e}]")
|
||||||
|
|
||||||
print("-" * 60)
|
print("-" * 60)
|
||||||
|
logger.info("[DIAGNOSTIC END] 自定义模块检测完成")
|
||||||
|
logger.info("=" * 70)
|
||||||
|
|
||||||
|
if progress_logger:
|
||||||
|
progress_logger.end(status="success")
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
@@ -362,6 +493,19 @@ def main():
|
|||||||
)
|
)
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# 创建进度日志记录器
|
||||||
|
from utils import ProgressLogger
|
||||||
|
progress_logger = ProgressLogger(log_file=args.log)
|
||||||
|
|
||||||
|
# 记录程序启动信息
|
||||||
|
logger.info("=" * 70)
|
||||||
|
logger.info("ServerGuard 启动")
|
||||||
|
logger.info(f"命令行参数: {' '.join(sys.argv)}")
|
||||||
|
logger.info(f"工作目录: {os.getcwd()}")
|
||||||
|
logger.info(f"Python版本: {sys.version}")
|
||||||
|
logger.info(f"用户ID: {os.getuid()}, 是否为root: {check_root_privileges()}")
|
||||||
|
logger.info("=" * 70)
|
||||||
|
|
||||||
# 列出模块
|
# 列出模块
|
||||||
if args.list_modules:
|
if args.list_modules:
|
||||||
list_available_modules()
|
list_available_modules()
|
||||||
@@ -375,21 +519,31 @@ def main():
|
|||||||
|
|
||||||
# 执行诊断
|
# 执行诊断
|
||||||
try:
|
try:
|
||||||
|
progress_logger.start("ServerGuard 诊断任务")
|
||||||
|
|
||||||
if args.quick:
|
if args.quick:
|
||||||
results = run_quick_check()
|
logger.info("执行模式: 快速检测")
|
||||||
|
results = run_quick_check(progress_logger=progress_logger)
|
||||||
elif args.full:
|
elif args.full:
|
||||||
results = run_full_diagnostic(args.stress_duration, args.yes)
|
logger.info("执行模式: 全面诊断")
|
||||||
|
results = run_full_diagnostic(args.stress_duration, args.yes, progress_logger=progress_logger)
|
||||||
elif args.module:
|
elif args.module:
|
||||||
results = run_specific_modules(args.module, args.stress_duration)
|
logger.info(f"执行模式: 自定义模块 - {args.module}")
|
||||||
|
results = run_specific_modules(args.module, args.stress_duration, progress_logger=progress_logger)
|
||||||
else:
|
else:
|
||||||
print("请指定操作模式: --quick, --full, --module 或 --list-modules")
|
print("请指定操作模式: --quick, --full, --module 或 --list-modules")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
|
# 记录诊断完成
|
||||||
|
progress_logger.end(status="success")
|
||||||
|
logger.info("诊断执行完成,正在生成报告...")
|
||||||
|
|
||||||
# 生成报告
|
# 生成报告
|
||||||
generator = ReportGenerator()
|
generator = ReportGenerator()
|
||||||
|
|
||||||
if args.output:
|
if args.output:
|
||||||
generator.save_report(results, args.format, args.output)
|
generator.save_report(results, args.format, args.output)
|
||||||
|
logger.info(f"报告已保存至: {args.output}")
|
||||||
print(f"\n报告已保存至: {args.output}")
|
print(f"\n报告已保存至: {args.output}")
|
||||||
else:
|
else:
|
||||||
report = generator.generate_report(results, args.format)
|
report = generator.generate_report(results, args.format)
|
||||||
@@ -403,14 +557,32 @@ def main():
|
|||||||
m.get("status") == "error"
|
m.get("status") == "error"
|
||||||
for m in results.get("modules", {}).values()
|
for m in results.get("modules", {}).values()
|
||||||
)
|
)
|
||||||
|
|
||||||
|
logger.info("=" * 70)
|
||||||
|
logger.info(f"ServerGuard 正常结束 - 退出码: {1 if has_errors else 0}")
|
||||||
|
logger.info("=" * 70)
|
||||||
|
|
||||||
sys.exit(1 if has_errors else 0)
|
sys.exit(1 if has_errors else 0)
|
||||||
|
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
|
logger.warning("操作已被用户中断 (KeyboardInterrupt)")
|
||||||
print("\n\n操作已被用户中断")
|
print("\n\n操作已被用户中断")
|
||||||
sys.exit(130)
|
sys.exit(130)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.exception("程序执行过程中发生错误")
|
error_msg = f"程序执行过程中发生严重错误: {e}"
|
||||||
|
logger.exception(error_msg)
|
||||||
|
logger.error("=" * 70)
|
||||||
|
logger.error(f"异常类型: {type(e).__name__}")
|
||||||
|
logger.error(f"异常信息: {e}")
|
||||||
|
logger.error("=" * 70)
|
||||||
|
|
||||||
|
# 尝试记录当前进度
|
||||||
|
if progress_logger and progress_logger.current_step:
|
||||||
|
logger.error(f"错误发生时正在执行的步骤: {progress_logger.current_step}")
|
||||||
|
|
||||||
print(f"\n错误: {e}")
|
print(f"\n错误: {e}")
|
||||||
|
print(f"\n详细错误信息已记录到日志: {args.log}")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -402,6 +402,9 @@ def run_cpu_stress_test(duration: int = 300) -> Dict[str, Any]:
|
|||||||
Returns:
|
Returns:
|
||||||
Dict[str, Any]: 测试结果
|
Dict[str, Any]: 测试结果
|
||||||
"""
|
"""
|
||||||
|
import logging
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
result = {
|
result = {
|
||||||
"passed": False,
|
"passed": False,
|
||||||
"duration_seconds": duration,
|
"duration_seconds": duration,
|
||||||
@@ -417,10 +420,13 @@ def run_cpu_stress_test(duration: int = 300) -> Dict[str, Any]:
|
|||||||
if check_command_exists('stress-ng'):
|
if check_command_exists('stress-ng'):
|
||||||
result["tool_used"] = "stress-ng"
|
result["tool_used"] = "stress-ng"
|
||||||
try:
|
try:
|
||||||
|
logger.info(f"[CPU STRESS TEST] 开始使用 stress-ng 进行压力测试,持续时间: {duration}秒")
|
||||||
result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
|
result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
|
||||||
|
|
||||||
# 获取测试前温度
|
# 获取测试前温度
|
||||||
temp_before = get_cpu_temperature()
|
temp_before = get_cpu_temperature()
|
||||||
|
temp_before_val = temp_before.get("max_c", "N/A")
|
||||||
|
logger.info(f"[CPU STRESS TEST] 测试前温度: {temp_before_val}°C")
|
||||||
|
|
||||||
# 运行 stress-ng
|
# 运行 stress-ng
|
||||||
# --cpu 0 使用所有 CPU 核心
|
# --cpu 0 使用所有 CPU 核心
|
||||||
@@ -433,6 +439,8 @@ def run_cpu_stress_test(duration: int = 300) -> Dict[str, Any]:
|
|||||||
'--metrics-brief'
|
'--metrics-brief'
|
||||||
]
|
]
|
||||||
|
|
||||||
|
logger.info(f"[CPU STRESS TEST] 执行命令: {' '.join(cmd)}")
|
||||||
|
|
||||||
_, stdout, stderr = execute_command(
|
_, stdout, stderr = execute_command(
|
||||||
cmd,
|
cmd,
|
||||||
timeout=duration + 30, # 给一些额外时间
|
timeout=duration + 30, # 给一些额外时间
|
||||||
@@ -440,9 +448,12 @@ def run_cpu_stress_test(duration: int = 300) -> Dict[str, Any]:
|
|||||||
)
|
)
|
||||||
|
|
||||||
result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
|
result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
|
||||||
|
logger.info("[CPU STRESS TEST] stress-ng 执行完成")
|
||||||
|
|
||||||
# 获取测试后温度
|
# 获取测试后温度
|
||||||
temp_after = get_cpu_temperature()
|
temp_after = get_cpu_temperature()
|
||||||
|
temp_after_val = temp_after.get("max_c", "N/A")
|
||||||
|
logger.info(f"[CPU STRESS TEST] 测试后温度: {temp_after_val}°C")
|
||||||
|
|
||||||
# 分析输出
|
# 分析输出
|
||||||
output = stdout + stderr
|
output = stdout + stderr
|
||||||
@@ -451,13 +462,16 @@ def run_cpu_stress_test(duration: int = 300) -> Dict[str, Any]:
|
|||||||
if 'error' in output.lower() or 'fail' in output.lower():
|
if 'error' in output.lower() or 'fail' in output.lower():
|
||||||
result["passed"] = False
|
result["passed"] = False
|
||||||
result["errors"].append("压力测试过程中发现错误")
|
result["errors"].append("压力测试过程中发现错误")
|
||||||
|
logger.error("[CPU STRESS TEST] 压力测试执行过程中发现错误")
|
||||||
else:
|
else:
|
||||||
result["passed"] = True
|
result["passed"] = True
|
||||||
|
logger.info("[CPU STRESS TEST] 压力测试通过")
|
||||||
|
|
||||||
# 提取性能指标
|
# 提取性能指标
|
||||||
bogo_ops = re.search(r'stress-ng:\s+cpu:\s+(\d+)\s+bogo ops', output)
|
bogo_ops = re.search(r'stress-ng:\s+cpu:\s+(\d+)\s+bogo ops', output)
|
||||||
if bogo_ops:
|
if bogo_ops:
|
||||||
result["bogo_ops"] = safe_int(bogo_ops.group(1))
|
result["bogo_ops"] = safe_int(bogo_ops.group(1))
|
||||||
|
logger.info(f"[CPU STRESS TEST] Bogo ops: {result['bogo_ops']}")
|
||||||
|
|
||||||
bogo_ops_per_sec = re.search(r'(\d+\.\d+)\s+bogo ops per second', output)
|
bogo_ops_per_sec = re.search(r'(\d+\.\d+)\s+bogo ops per second', output)
|
||||||
if bogo_ops_per_sec:
|
if bogo_ops_per_sec:
|
||||||
@@ -475,16 +489,22 @@ def run_cpu_stress_test(duration: int = 300) -> Dict[str, Any]:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
result["passed"] = False
|
result["passed"] = False
|
||||||
result["errors"].append(str(e))
|
result["errors"].append(str(e))
|
||||||
|
logger.exception(f"[CPU STRESS TEST] stress-ng 执行异常: {e}")
|
||||||
|
|
||||||
# 备选: 使用 stress
|
# 备选: 使用 stress
|
||||||
elif check_command_exists('stress'):
|
elif check_command_exists('stress'):
|
||||||
result["tool_used"] = "stress"
|
result["tool_used"] = "stress"
|
||||||
try:
|
try:
|
||||||
|
logger.info(f"[CPU STRESS TEST] 开始使用 stress 进行压力测试,持续时间: {duration}秒")
|
||||||
result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
|
result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
|
||||||
|
|
||||||
temp_before = get_cpu_temperature()
|
temp_before = get_cpu_temperature()
|
||||||
|
temp_before_val = temp_before.get("max_c", "N/A")
|
||||||
|
logger.info(f"[CPU STRESS TEST] 测试前温度: {temp_before_val}°C")
|
||||||
|
|
||||||
num_cores = os.cpu_count() or 1
|
num_cores = os.cpu_count() or 1
|
||||||
|
logger.info(f"[CPU STRESS TEST] 使用 {num_cores} 个 CPU 核心")
|
||||||
|
|
||||||
_, stdout, stderr = execute_command(
|
_, stdout, stderr = execute_command(
|
||||||
['stress', '--cpu', str(num_cores), '--timeout', str(duration)],
|
['stress', '--cpu', str(num_cores), '--timeout', str(duration)],
|
||||||
timeout=duration + 30,
|
timeout=duration + 30,
|
||||||
@@ -492,7 +512,11 @@ def run_cpu_stress_test(duration: int = 300) -> Dict[str, Any]:
|
|||||||
)
|
)
|
||||||
|
|
||||||
result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
|
result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
|
||||||
|
logger.info("[CPU STRESS TEST] stress 执行完成")
|
||||||
|
|
||||||
temp_after = get_cpu_temperature()
|
temp_after = get_cpu_temperature()
|
||||||
|
temp_after_val = temp_after.get("max_c", "N/A")
|
||||||
|
logger.info(f"[CPU STRESS TEST] 测试后温度: {temp_after_val}°C")
|
||||||
|
|
||||||
result["passed"] = True
|
result["passed"] = True
|
||||||
result["temperature_before"] = temp_before
|
result["temperature_before"] = temp_before
|
||||||
@@ -504,11 +528,13 @@ def run_cpu_stress_test(duration: int = 300) -> Dict[str, Any]:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
result["passed"] = False
|
result["passed"] = False
|
||||||
result["errors"].append(str(e))
|
result["errors"].append(str(e))
|
||||||
|
logger.exception(f"[CPU STRESS TEST] stress 执行异常: {e}")
|
||||||
|
|
||||||
else:
|
else:
|
||||||
result["passed"] = False
|
result["passed"] = False
|
||||||
result["errors"].append("未找到压力测试工具 (stress-ng 或 stress)")
|
result["errors"].append("未找到压力测试工具 (stress-ng 或 stress)")
|
||||||
result["note"] = "请安装 stress-ng 或 stress: yum install stress / apt install stress-ng"
|
result["note"] = "请安装 stress-ng 或 stress: yum install stress / apt install stress-ng"
|
||||||
|
logger.error("[CPU STRESS TEST] 未找到压力测试工具 (stress-ng 或 stress)")
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|||||||
@@ -358,6 +358,9 @@ def run_memtester(duration: int = 300) -> Dict[str, Any]:
|
|||||||
Returns:
|
Returns:
|
||||||
Dict[str, Any]: 测试结果
|
Dict[str, Any]: 测试结果
|
||||||
"""
|
"""
|
||||||
|
import logging
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
result = {
|
result = {
|
||||||
"passed": False,
|
"passed": False,
|
||||||
"size_mb": 0,
|
"size_mb": 0,
|
||||||
@@ -371,9 +374,12 @@ def run_memtester(duration: int = 300) -> Dict[str, Any]:
|
|||||||
|
|
||||||
if not check_command_exists('memtester'):
|
if not check_command_exists('memtester'):
|
||||||
result["errors"].append("memtester 未安装")
|
result["errors"].append("memtester 未安装")
|
||||||
|
logger.warning("[MEMORY STRESS TEST] memtester 未安装")
|
||||||
return result
|
return result
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
logger.info("[MEMORY STRESS TEST] 开始使用 memtester 进行内存测试")
|
||||||
|
|
||||||
# 计算测试内存大小
|
# 计算测试内存大小
|
||||||
# 留出一些内存给系统和 stress-ng 使用
|
# 留出一些内存给系统和 stress-ng 使用
|
||||||
with open('/proc/meminfo', 'r') as f:
|
with open('/proc/meminfo', 'r') as f:
|
||||||
@@ -391,8 +397,11 @@ def run_memtester(duration: int = 300) -> Dict[str, Any]:
|
|||||||
result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
|
result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
|
||||||
start_ts = time.time()
|
start_ts = time.time()
|
||||||
|
|
||||||
|
logger.info(f"[MEMORY STRESS TEST] 测试内存大小: {test_size_mb}MB")
|
||||||
|
|
||||||
# 运行 memtester
|
# 运行 memtester
|
||||||
cmd = ['memtester', f'{test_size_mb}M', '1']
|
cmd = ['memtester', f'{test_size_mb}M', '1']
|
||||||
|
logger.info(f"[MEMORY STRESS TEST] 执行命令: {' '.join(cmd)}")
|
||||||
|
|
||||||
_, stdout, stderr = execute_command(
|
_, stdout, stderr = execute_command(
|
||||||
cmd,
|
cmd,
|
||||||
@@ -403,25 +412,32 @@ def run_memtester(duration: int = 300) -> Dict[str, Any]:
|
|||||||
result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
|
result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
|
||||||
result["duration_seconds"] = round(time.time() - start_ts, 2)
|
result["duration_seconds"] = round(time.time() - start_ts, 2)
|
||||||
|
|
||||||
|
logger.info(f"[MEMORY STRESS TEST] memtester 执行完成,耗时: {result['duration_seconds']}秒")
|
||||||
|
|
||||||
output = stdout + stderr
|
output = stdout + stderr
|
||||||
result["raw_output"] = output[:2000] # 保存部分原始输出
|
result["raw_output"] = output[:2000] # 保存部分原始输出
|
||||||
|
|
||||||
# 分析结果
|
# 分析结果
|
||||||
if 'FAILURE' in output.upper():
|
if 'FAILURE' in output.upper():
|
||||||
result["passed"] = False
|
result["passed"] = False
|
||||||
|
logger.error("[MEMORY STRESS TEST] 测试失败: 发现 FAILURE")
|
||||||
# 提取错误信息
|
# 提取错误信息
|
||||||
for line in output.split('\n'):
|
for line in output.split('\n'):
|
||||||
if 'FAILURE' in line.upper() or 'error' in line.lower():
|
if 'FAILURE' in line.upper() or 'error' in line.lower():
|
||||||
result["errors"].append(line.strip())
|
result["errors"].append(line.strip())
|
||||||
|
logger.error(f"[MEMORY STRESS TEST] 错误详情: {line.strip()}")
|
||||||
elif 'SUCCESS' in output.upper() or 'ok' in output.lower() or 'finished' in output.lower():
|
elif 'SUCCESS' in output.upper() or 'ok' in output.lower() or 'finished' in output.lower():
|
||||||
result["passed"] = True
|
result["passed"] = True
|
||||||
|
logger.info("[MEMORY STRESS TEST] 测试通过")
|
||||||
else:
|
else:
|
||||||
# 检查是否完成所有测试
|
# 检查是否完成所有测试
|
||||||
if 'Done' in output or 'finished' in output.lower():
|
if 'Done' in output or 'finished' in output.lower():
|
||||||
result["passed"] = True
|
result["passed"] = True
|
||||||
|
logger.info("[MEMORY STRESS TEST] 测试完成")
|
||||||
else:
|
else:
|
||||||
result["passed"] = False
|
result["passed"] = False
|
||||||
result["errors"].append("测试可能未完成")
|
result["errors"].append("测试可能未完成")
|
||||||
|
logger.warning("[MEMORY STRESS TEST] 测试可能未完成")
|
||||||
|
|
||||||
# 提取运行的测试
|
# 提取运行的测试
|
||||||
test_names = [
|
test_names = [
|
||||||
@@ -436,9 +452,12 @@ def run_memtester(duration: int = 300) -> Dict[str, Any]:
|
|||||||
if test in output:
|
if test in output:
|
||||||
result["tests_run"].append(test)
|
result["tests_run"].append(test)
|
||||||
|
|
||||||
|
logger.info(f"[MEMORY STRESS TEST] 执行的测试项: {', '.join(result['tests_run'])}")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
result["passed"] = False
|
result["passed"] = False
|
||||||
result["errors"].append(str(e))
|
result["errors"].append(str(e))
|
||||||
|
logger.exception(f"[MEMORY STRESS TEST] memtester 执行异常: {e}")
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|||||||
87
utils.py
87
utils.py
@@ -161,13 +161,98 @@ def setup_logging(
|
|||||||
|
|
||||||
if log_file:
|
if log_file:
|
||||||
os.makedirs(os.path.dirname(log_file) or '.', exist_ok=True)
|
os.makedirs(os.path.dirname(log_file) or '.', exist_ok=True)
|
||||||
file_handler = logging.FileHandler(log_file)
|
# 使用 FileHandler 并设置立即刷新
|
||||||
|
file_handler = logging.FileHandler(log_file, mode='a')
|
||||||
file_handler.setFormatter(formatter)
|
file_handler.setFormatter(formatter)
|
||||||
|
# 确保每次日志写入后立即刷新到磁盘
|
||||||
|
file_handler.flush = lambda: file_handler.stream.flush()
|
||||||
logger.addHandler(file_handler)
|
logger.addHandler(file_handler)
|
||||||
|
|
||||||
return logger
|
return logger
|
||||||
|
|
||||||
|
|
||||||
|
class ProgressLogger:
    """
    Record the progress of diagnostic steps so an interrupted run can be traced.

    Steps may be nested: calling ``start()`` while another step is running
    suspends the running step and makes the new one current; the matching
    ``end()`` finishes the innermost step and resumes the enclosing one.
    Every step therefore gets both a ``[START]`` and an ``[END]`` log record.

    Attributes:
        logger: Logger that receives the progress records.
        log_file: Path passed in by the caller; stored but not read here.
        steps: Finished steps as dicts, in the order they finished
            (inner steps appear before the steps that enclose them).
        current_step: Dict describing the innermost running step, or None.
        start_time: ``datetime`` at which ``current_step`` was started.
    """

    def __init__(self, log_file: Optional[str] = None):
        self.logger = logging.getLogger(__name__)
        self.log_file = log_file
        self.steps = []
        self.current_step = None
        self.start_time = None
        # Enclosing steps that are waiting for a nested step to finish,
        # stored as (step_dict, start_datetime) pairs, outermost first.
        self._suspended = []

    def start(self, operation: str) -> None:
        """Begin a step named *operation* and log a ``[START]`` record."""
        from datetime import datetime

        # One timestamp for both the record and the duration base, so the
        # reported start time and the computed duration agree.
        now = datetime.now()

        # Keep the enclosing step instead of overwriting it, otherwise its
        # later end() call would find nothing to finish.
        if self.current_step is not None:
            self._suspended.append((self.current_step, self.start_time))

        self.current_step = {
            "operation": operation,
            "start_time": now.isoformat(),
            "status": "running"
        }
        self.start_time = now

        self.logger.info(f"[START] {operation}")
        self._flush_log()

    def end(self, status: str = "success", message: str = "") -> None:
        """
        Finish the innermost running step and log an ``[END]`` record.

        Args:
            status: Final status; "error" and "warning" select the matching
                log level, anything else is logged at info level.
            message: Optional detail appended to the log record.

        Does nothing when no step is running.
        """
        from datetime import datetime

        step = self.current_step
        if not step:
            return

        end_time = datetime.now()
        duration = (end_time - self.start_time).total_seconds() if self.start_time else 0

        step["end_time"] = end_time.isoformat()
        step["status"] = status
        step["duration_seconds"] = duration
        step["message"] = message
        self.steps.append(step)

        msg = f"[END] {step['operation']} - Status: {status}"
        if message:
            msg += f" - {message}"
        msg += f" (Duration: {duration:.2f}s)"

        if status == "error":
            self.logger.error(msg)
        elif status == "warning":
            self.logger.warning(msg)
        else:
            self.logger.info(msg)

        self._flush_log()

        # Resume the enclosing step, if any, so that its own end() and any
        # log() calls in between are attributed correctly.
        if self._suspended:
            self.current_step, self.start_time = self._suspended.pop()
        else:
            self.current_step = None

    def log(self, message: str, level: str = "info") -> None:
        """
        Log an intermediate ``[PROGRESS]`` record for the current step.

        Args:
            message: Text to record.
            level: "error", "warning" or "debug"; any other value logs at
                info level.
        """
        operation = self.current_step['operation'] if self.current_step else 'UNKNOWN'
        msg = f"[PROGRESS] {operation} - {message}"
        if level == "error":
            self.logger.error(msg)
        elif level == "warning":
            self.logger.warning(msg)
        elif level == "debug":
            self.logger.debug(msg)
        else:
            self.logger.info(msg)
        self._flush_log()

    def _flush_log(self) -> None:
        """Flush every handler that records from ``self.logger`` can reach."""
        # Handlers are often attached to an ancestor logger rather than to
        # this module's logger, so follow the same propagation chain that
        # the logging module uses when dispatching a record.
        node = self.logger
        while node is not None:
            for handler in node.handlers:
                if hasattr(handler, 'flush'):
                    try:
                        handler.flush()
                    except (OSError, ValueError):
                        # Flushing is best-effort: a closed or broken stream
                        # must not abort the diagnostic being recorded.
                        pass
            if not node.propagate:
                break
            node = node.parent

    def get_summary(self) -> Dict[str, Any]:
        """Return the finished steps plus the step that is still running."""
        return {
            "total_steps": len(self.steps),
            "steps": self.steps,
            "current_running": self.current_step
        }
|
||||||
|
|
||||||
|
|
||||||
def parse_key_value_output(text: str, delimiter: str = ':') -> Dict[str, str]:
|
def parse_key_value_output(text: str, delimiter: str = ':') -> Dict[str, str]:
|
||||||
"""
|
"""
|
||||||
解析 key: value 格式的文本输出。
|
解析 key: value 格式的文本输出。
|
||||||
|
|||||||
Reference in New Issue
Block a user