增强日志

This commit is contained in:
zj
2026-03-02 15:50:51 +08:00
parent e16578a304
commit deb4fa0e79
5 changed files with 384 additions and 25 deletions

View File

@@ -19,6 +19,7 @@ ServerGuard 是一款基于 Python 的 Linux 命令行工具,用于诊断服
- **Python**: 3.6 或更高版本 - **Python**: 3.6 或更高版本
- **权限**: root 权限(大多数硬件诊断功能需要) - **权限**: root 权限(大多数硬件诊断功能需要)
- **架构**: x86_64 (AMD64) - **架构**: x86_64 (AMD64)
- **磁盘空间**: 至少 100MB 可用空间(用于日志和报告)
## 克隆及安装方法 ## 克隆及安装方法
@@ -313,6 +314,62 @@ python3 quick_test.py
python3 -m unittest discover tests/ -v python3 -m unittest discover tests/ -v
``` ```
## 日志记录
ServerGuard 会实时记录详细的测试日志,方便排查问题:
### 日志文件位置
默认日志文件路径:`/var/log/serverguard.log`
```bash
# 查看实时日志
tail -f /var/log/serverguard.log
# 查看最近 100 行日志
tail -n 100 /var/log/serverguard.log
```
### 日志内容说明
日志包含以下关键信息:
- **启动信息**: 程序启动时间、命令行参数、Python版本等
- **进度记录**: 每个模块的检测开始和结束时间
- **详细步骤**: 压力测试前后的温度、执行的命令等
- **错误信息**: 详细的异常信息和堆栈跟踪
### 日志示例
```
2026-03-02 15:41:28 - ServerGuard 启动
2026-03-02 15:41:28 - [DIAGNOSTIC START] 全面硬件诊断
2026-03-02 15:41:28 - [PROGRESS] 模块 1/7: system
2026-03-02 15:41:28 - [MODULE START] cpu - stress_test=True, duration=300
2026-03-02 15:41:28 - [CPU STRESS TEST] 测试前温度: 45°C
2026-03-02 15:46:28 - [CPU STRESS TEST] 测试后温度: 78°C
2026-03-02 15:46:28 - [CPU STRESS TEST] 压力测试通过
```
### 排查机器重启/关机问题
如果测试过程中机器意外关机或重启,查看日志文件:
```bash
# 查找最后的日志记录
tail -n 50 /var/log/serverguard.log
# 查找压力测试相关的日志
grep "STRESS TEST" /var/log/serverguard.log
# 查找错误信息
grep -i "error\|exception\|failed" /var/log/serverguard.log
```
**常见情况分析:**
- 如果在 `[CPU STRESS TEST]` 或 `[MEMORY STRESS TEST]` 后没有 `[END]` 记录,说明机器在压力测试中出现问题
- 查看压力测试前后的温度记录,判断是否因过热导致关机
- 查看系统日志 `dmesg` 或 `/var/log/messages` 确认硬件错误
## 故障排除 ## 故障排除
### 1. 提示 "未找到压力测试工具" ### 1. 提示 "未找到压力测试工具"

220
main.py
View File

@@ -167,7 +167,7 @@ def confirm_stress_test(duration: int, auto_confirm: bool = False) -> bool:
return False return False
def run_module(module_name: str, stress_test: bool = False, stress_duration: int = 300) -> Dict[str, Any]: def run_module(module_name: str, stress_test: bool = False, stress_duration: int = 300, progress_logger=None) -> Dict[str, Any]:
""" """
运行指定的检测模块。 运行指定的检测模块。
@@ -175,6 +175,7 @@ def run_module(module_name: str, stress_test: bool = False, stress_duration: int
module_name: 模块名称 module_name: 模块名称
stress_test: 是否执行压力测试 stress_test: 是否执行压力测试
stress_duration: 压力测试持续时间 stress_duration: 压力测试持续时间
progress_logger: 进度日志记录器
Returns: Returns:
Dict[str, Any]: 模块检测结果 Dict[str, Any]: 模块检测结果
@@ -196,39 +197,89 @@ def run_module(module_name: str, stress_test: bool = False, stress_duration: int
logger.error(f"未知模块: {module_name}") logger.error(f"未知模块: {module_name}")
return {"status": "error", "error": f"未知模块: {module_name}"} return {"status": "error", "error": f"未知模块: {module_name}"}
# 记录开始
if progress_logger:
progress_logger.start(f"模块检测: {module_name}")
try: try:
logger.info(f"[MODULE START] {module_name} - stress_test={stress_test}, duration={stress_duration}")
module = __import__(module_map[module_name], fromlist=['']) module = __import__(module_map[module_name], fromlist=[''])
result = None
if module_name == 'system': if module_name == 'system':
return module.get_system_info() if progress_logger:
progress_logger.log("开始收集系统信息")
result = module.get_system_info()
elif module_name == 'cpu': elif module_name == 'cpu':
return module.run_cpu_check(stress_test, stress_duration) if progress_logger:
progress_logger.log(f"开始CPU检测 (stress_test={stress_test})")
result = module.run_cpu_check(stress_test, stress_duration)
elif module_name == 'memory': elif module_name == 'memory':
return module.run_memory_check(stress_test, stress_duration) if progress_logger:
progress_logger.log(f"开始内存检测 (stress_test={stress_test})")
result = module.run_memory_check(stress_test, stress_duration)
elif module_name == 'storage': elif module_name == 'storage':
return module.run_storage_check() if progress_logger:
progress_logger.log("开始存储设备检测")
result = module.run_storage_check()
elif module_name == 'sensors': elif module_name == 'sensors':
return module.run_sensors_check() if progress_logger:
progress_logger.log("开始传感器监控")
result = module.run_sensors_check()
elif module_name == 'gpu': elif module_name == 'gpu':
return module.run_gpu_check() if progress_logger:
progress_logger.log("开始显卡检测")
result = module.run_gpu_check()
elif module_name == 'logs': elif module_name == 'logs':
return module.analyze_logs() if progress_logger:
progress_logger.log("开始日志分析")
result = module.analyze_logs()
# 记录结果
status = result.get("status", "unknown") if result else "unknown"
logger.info(f"[MODULE END] {module_name} - Status: {status}")
# 如果结果中有错误信息,记录到日志
if result and result.get("error"):
logger.error(f"[MODULE ERROR] {module_name}: {result['error']}")
if progress_logger:
progress_logger.end(status=status)
return result
except Exception as e: except Exception as e:
logger.error(f"运行模块 {module_name} 时出错: {e}") error_msg = f"运行模块 {module_name} 时出错: {e}"
logger.exception(error_msg)
if progress_logger:
progress_logger.end(status="error", message=str(e))
return {"status": "error", "error": str(e)} return {"status": "error", "error": str(e)}
def run_quick_check() -> Dict[str, Any]: def run_quick_check(progress_logger=None) -> Dict[str, Any]:
""" """
执行快速检测(非侵入性)。 执行快速检测(非侵入性)。
Args:
progress_logger: 进度日志记录器
Returns: Returns:
Dict[str, Any]: 检测结果 Dict[str, Any]: 检测结果
""" """
import logging import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# 记录检测开始
logger.info("=" * 70)
logger.info("[DIAGNOSTIC START] 快速硬件检测")
logger.info("=" * 70)
if progress_logger:
progress_logger.start("快速硬件检测")
print("正在执行快速硬件检测...") print("正在执行快速硬件检测...")
print("-" * 60) print("-" * 60)
@@ -239,37 +290,54 @@ def run_quick_check() -> Dict[str, Any]:
} }
modules_to_run = ['system', 'cpu', 'memory', 'storage', 'sensors', 'gpu', 'logs'] modules_to_run = ['system', 'cpu', 'memory', 'storage', 'sensors', 'gpu', 'logs']
total_modules = len(modules_to_run)
for module_name in modules_to_run: for idx, module_name in enumerate(modules_to_run, 1):
logger.info(f"[PROGRESS] 模块 {idx}/{total_modules}: {module_name}")
print(f"正在检测: {module_name}...", end=' ', flush=True) print(f"正在检测: {module_name}...", end=' ', flush=True)
try: try:
result = run_module(module_name, stress_test=False) result = run_module(module_name, stress_test=False, progress_logger=progress_logger)
results["modules"][module_name] = result results["modules"][module_name] = result
status = result.get("status", "unknown") status = result.get("status", "unknown")
if status == "success": if status == "success":
print("[完成]") print("[完成]")
logger.info(f"[MODULE SUCCESS] {module_name}")
elif status == "warning": elif status == "warning":
print("[警告]") print("[警告]")
logger.warning(f"[MODULE WARNING] {module_name}")
elif status == "error": elif status == "error":
print("[错误]") print("[错误]")
logger.error(f"[MODULE ERROR] {module_name}: {result.get('error', 'Unknown error')}")
else: else:
print(f"[{status}]") print(f"[{status}]")
logger.info(f"[MODULE {status.upper()}] {module_name}")
except Exception as e: except Exception as e:
logger.error(f"模块 {module_name} 执行失败: {e}") error_msg = f"模块 {module_name} 执行失败: {e}"
logger.exception(error_msg)
results["modules"][module_name] = {"status": "error", "error": str(e)} results["modules"][module_name] = {"status": "error", "error": str(e)}
print("[失败]") print("[失败]")
print("-" * 60) print("-" * 60)
logger.info("[DIAGNOSTIC END] 快速硬件检测完成")
logger.info("=" * 70)
if progress_logger:
progress_logger.end(status="success")
return results return results
def run_full_diagnostic(stress_duration: int, auto_confirm: bool = False) -> Dict[str, Any]: def run_full_diagnostic(stress_duration: int, auto_confirm: bool = False, progress_logger=None) -> Dict[str, Any]:
""" """
执行全面诊断(包含压力测试)。 执行全面诊断(包含压力测试)。
Args: Args:
stress_duration: 压力测试持续时间 stress_duration: 压力测试持续时间
auto_confirm: 是否自动确认 auto_confirm: 是否自动确认
progress_logger: 进度日志记录器
Returns: Returns:
Dict[str, Any]: 检测结果 Dict[str, Any]: 检测结果
@@ -281,6 +349,14 @@ def run_full_diagnostic(stress_duration: int, auto_confirm: bool = False) -> Dic
print("诊断已取消") print("诊断已取消")
sys.exit(0) sys.exit(0)
# 记录诊断开始
logger.info("=" * 70)
logger.info(f"[DIAGNOSTIC START] 全面硬件诊断 (stress_duration={stress_duration}s)")
logger.info("=" * 70)
if progress_logger:
progress_logger.start("全面硬件诊断")
print("\n正在执行全面硬件诊断...") print("\n正在执行全面硬件诊断...")
print("=" * 60) print("=" * 60)
@@ -293,38 +369,73 @@ def run_full_diagnostic(stress_duration: int, auto_confirm: bool = False) -> Dic
# 先执行快速检测 # 先执行快速检测
modules_to_run = ['system', 'cpu', 'memory', 'storage', 'sensors', 'gpu', 'logs'] modules_to_run = ['system', 'cpu', 'memory', 'storage', 'sensors', 'gpu', 'logs']
total_modules = len(modules_to_run)
for module_name in modules_to_run: for idx, module_name in enumerate(modules_to_run, 1):
logger.info(f"[PROGRESS] 模块 {idx}/{total_modules}: {module_name}")
print(f"\n正在检测: {module_name}...") print(f"\n正在检测: {module_name}...")
try: try:
# CPU 和内存执行压力测试 # CPU 和内存执行压力测试
do_stress = module_name in ['cpu', 'memory'] do_stress = module_name in ['cpu', 'memory']
result = run_module(module_name, stress_test=do_stress, stress_duration=stress_duration)
if do_stress:
logger.warning(f"[STRESS TEST] {module_name} 压力测试即将开始 (duration={stress_duration}s)")
print(f" ⚠ 即将执行 {module_name} 压力测试,持续时间 {stress_duration}")
result = run_module(module_name, stress_test=do_stress, stress_duration=stress_duration, progress_logger=progress_logger)
results["modules"][module_name] = result results["modules"][module_name] = result
status = result.get("status", "unknown") status = result.get("status", "unknown")
print(f" 状态: {status}") print(f" 状态: {status}")
if status == "success":
logger.info(f"[MODULE SUCCESS] {module_name}")
elif status == "warning":
logger.warning(f"[MODULE WARNING] {module_name}")
elif status == "error":
logger.error(f"[MODULE ERROR] {module_name}: {result.get('error', 'Unknown error')}")
except Exception as e: except Exception as e:
logger.error(f"模块 {module_name} 执行失败: {e}") error_msg = f"模块 {module_name} 执行失败: {e}"
logger.exception(error_msg)
results["modules"][module_name] = {"status": "error", "error": str(e)} results["modules"][module_name] = {"status": "error", "error": str(e)}
print(f" 状态: 失败 - {e}") print(f" 状态: 失败 - {e}")
print("\n" + "=" * 60) print("\n" + "=" * 60)
logger.info("[DIAGNOSTIC END] 全面硬件诊断完成")
logger.info("=" * 70)
if progress_logger:
progress_logger.end(status="success")
return results return results
def run_specific_modules(module_list: str, stress_duration: int) -> Dict[str, Any]: def run_specific_modules(module_list: str, stress_duration: int, progress_logger=None) -> Dict[str, Any]:
""" """
运行指定的模块列表。 运行指定的模块列表。
Args: Args:
module_list: 逗号分隔的模块名称 module_list: 逗号分隔的模块名称
stress_duration: 压力测试持续时间 stress_duration: 压力测试持续时间
progress_logger: 进度日志记录器
Returns: Returns:
Dict[str, Any]: 检测结果 Dict[str, Any]: 检测结果
""" """
import logging
logger = logging.getLogger(__name__)
modules = [m.strip() for m in module_list.split(',')] modules = [m.strip() for m in module_list.split(',')]
logger.info("=" * 70)
logger.info(f"[DIAGNOSTIC START] 自定义模块检测: {', '.join(modules)}")
logger.info("=" * 70)
if progress_logger:
progress_logger.start(f"自定义模块检测: {', '.join(modules)}")
results = { results = {
"scan_type": "custom", "scan_type": "custom",
"timestamp": get_file_timestamp(), "timestamp": get_file_timestamp(),
@@ -334,18 +445,38 @@ def run_specific_modules(module_list: str, stress_duration: int) -> Dict[str, An
print(f"正在执行自定义模块检测: {', '.join(modules)}") print(f"正在执行自定义模块检测: {', '.join(modules)}")
print("-" * 60) print("-" * 60)
for module_name in modules: total_modules = len(modules)
for idx, module_name in enumerate(modules, 1):
logger.info(f"[PROGRESS] 模块 {idx}/{total_modules}: {module_name}")
print(f"正在检测: {module_name}...", end=' ', flush=True) print(f"正在检测: {module_name}...", end=' ', flush=True)
try: try:
result = run_module(module_name, stress_test=False) result = run_module(module_name, stress_test=False, progress_logger=progress_logger)
results["modules"][module_name] = result results["modules"][module_name] = result
status = result.get("status", "unknown") status = result.get("status", "unknown")
print(f"[{status}]") print(f"[{status}]")
if status == "error":
logger.error(f"[MODULE ERROR] {module_name}: {result.get('error', 'Unknown error')}")
elif status == "warning":
logger.warning(f"[MODULE WARNING] {module_name}")
else:
logger.info(f"[MODULE SUCCESS] {module_name}")
except Exception as e: except Exception as e:
error_msg = f"模块 {module_name} 执行失败: {e}"
logger.exception(error_msg)
results["modules"][module_name] = {"status": "error", "error": str(e)} results["modules"][module_name] = {"status": "error", "error": str(e)}
print(f"[失败: {e}]") print(f"[失败: {e}]")
print("-" * 60) print("-" * 60)
logger.info("[DIAGNOSTIC END] 自定义模块检测完成")
logger.info("=" * 70)
if progress_logger:
progress_logger.end(status="success")
return results return results
@@ -362,6 +493,19 @@ def main():
) )
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# 创建进度日志记录器
from utils import ProgressLogger
progress_logger = ProgressLogger(log_file=args.log)
# 记录程序启动信息
logger.info("=" * 70)
logger.info("ServerGuard 启动")
logger.info(f"命令行参数: {' '.join(sys.argv)}")
logger.info(f"工作目录: {os.getcwd()}")
logger.info(f"Python版本: {sys.version}")
logger.info(f"用户ID: {os.getuid()}, 是否为root: {check_root_privileges()}")
logger.info("=" * 70)
# 列出模块 # 列出模块
if args.list_modules: if args.list_modules:
list_available_modules() list_available_modules()
@@ -375,21 +519,31 @@ def main():
# 执行诊断 # 执行诊断
try: try:
progress_logger.start("ServerGuard 诊断任务")
if args.quick: if args.quick:
results = run_quick_check() logger.info("执行模式: 快速检测")
results = run_quick_check(progress_logger=progress_logger)
elif args.full: elif args.full:
results = run_full_diagnostic(args.stress_duration, args.yes) logger.info("执行模式: 全面诊断")
results = run_full_diagnostic(args.stress_duration, args.yes, progress_logger=progress_logger)
elif args.module: elif args.module:
results = run_specific_modules(args.module, args.stress_duration) logger.info(f"执行模式: 自定义模块 - {args.module}")
results = run_specific_modules(args.module, args.stress_duration, progress_logger=progress_logger)
else: else:
print("请指定操作模式: --quick, --full, --module 或 --list-modules") print("请指定操作模式: --quick, --full, --module 或 --list-modules")
sys.exit(1) sys.exit(1)
# 记录诊断完成
progress_logger.end(status="success")
logger.info("诊断执行完成,正在生成报告...")
# 生成报告 # 生成报告
generator = ReportGenerator() generator = ReportGenerator()
if args.output: if args.output:
generator.save_report(results, args.format, args.output) generator.save_report(results, args.format, args.output)
logger.info(f"报告已保存至: {args.output}")
print(f"\n报告已保存至: {args.output}") print(f"\n报告已保存至: {args.output}")
else: else:
report = generator.generate_report(results, args.format) report = generator.generate_report(results, args.format)
@@ -403,14 +557,32 @@ def main():
m.get("status") == "error" m.get("status") == "error"
for m in results.get("modules", {}).values() for m in results.get("modules", {}).values()
) )
logger.info("=" * 70)
logger.info(f"ServerGuard 正常结束 - 退出码: {1 if has_errors else 0}")
logger.info("=" * 70)
sys.exit(1 if has_errors else 0) sys.exit(1 if has_errors else 0)
except KeyboardInterrupt: except KeyboardInterrupt:
logger.warning("操作已被用户中断 (KeyboardInterrupt)")
print("\n\n操作已被用户中断") print("\n\n操作已被用户中断")
sys.exit(130) sys.exit(130)
except Exception as e: except Exception as e:
logger.exception("程序执行过程中发生错误") error_msg = f"程序执行过程中发生严重错误: {e}"
logger.exception(error_msg)
logger.error("=" * 70)
logger.error(f"异常类型: {type(e).__name__}")
logger.error(f"异常信息: {e}")
logger.error("=" * 70)
# 尝试记录当前进度
if progress_logger and progress_logger.current_step:
logger.error(f"错误发生时正在执行的步骤: {progress_logger.current_step}")
print(f"\n错误: {e}") print(f"\n错误: {e}")
print(f"\n详细错误信息已记录到日志: {args.log}")
sys.exit(1) sys.exit(1)

View File

@@ -402,6 +402,9 @@ def run_cpu_stress_test(duration: int = 300) -> Dict[str, Any]:
Returns: Returns:
Dict[str, Any]: 测试结果 Dict[str, Any]: 测试结果
""" """
import logging
logger = logging.getLogger(__name__)
result = { result = {
"passed": False, "passed": False,
"duration_seconds": duration, "duration_seconds": duration,
@@ -417,10 +420,13 @@ def run_cpu_stress_test(duration: int = 300) -> Dict[str, Any]:
if check_command_exists('stress-ng'): if check_command_exists('stress-ng'):
result["tool_used"] = "stress-ng" result["tool_used"] = "stress-ng"
try: try:
logger.info(f"[CPU STRESS TEST] 开始使用 stress-ng 进行压力测试,持续时间: {duration}")
result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S') result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
# 获取测试前温度 # 获取测试前温度
temp_before = get_cpu_temperature() temp_before = get_cpu_temperature()
temp_before_val = temp_before.get("max_c", "N/A")
logger.info(f"[CPU STRESS TEST] 测试前温度: {temp_before_val}°C")
# 运行 stress-ng # 运行 stress-ng
# --cpu 0 使用所有 CPU 核心 # --cpu 0 使用所有 CPU 核心
@@ -433,6 +439,8 @@ def run_cpu_stress_test(duration: int = 300) -> Dict[str, Any]:
'--metrics-brief' '--metrics-brief'
] ]
logger.info(f"[CPU STRESS TEST] 执行命令: {' '.join(cmd)}")
_, stdout, stderr = execute_command( _, stdout, stderr = execute_command(
cmd, cmd,
timeout=duration + 30, # 给一些额外时间 timeout=duration + 30, # 给一些额外时间
@@ -440,9 +448,12 @@ def run_cpu_stress_test(duration: int = 300) -> Dict[str, Any]:
) )
result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S') result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
logger.info("[CPU STRESS TEST] stress-ng 执行完成")
# 获取测试后温度 # 获取测试后温度
temp_after = get_cpu_temperature() temp_after = get_cpu_temperature()
temp_after_val = temp_after.get("max_c", "N/A")
logger.info(f"[CPU STRESS TEST] 测试后温度: {temp_after_val}°C")
# 分析输出 # 分析输出
output = stdout + stderr output = stdout + stderr
@@ -451,13 +462,16 @@ def run_cpu_stress_test(duration: int = 300) -> Dict[str, Any]:
if 'error' in output.lower() or 'fail' in output.lower(): if 'error' in output.lower() or 'fail' in output.lower():
result["passed"] = False result["passed"] = False
result["errors"].append("压力测试过程中发现错误") result["errors"].append("压力测试过程中发现错误")
logger.error("[CPU STRESS TEST] 压力测试执行过程中发现错误")
else: else:
result["passed"] = True result["passed"] = True
logger.info("[CPU STRESS TEST] 压力测试通过")
# 提取性能指标 # 提取性能指标
bogo_ops = re.search(r'stress-ng:\s+cpu:\s+(\d+)\s+bogo ops', output) bogo_ops = re.search(r'stress-ng:\s+cpu:\s+(\d+)\s+bogo ops', output)
if bogo_ops: if bogo_ops:
result["bogo_ops"] = safe_int(bogo_ops.group(1)) result["bogo_ops"] = safe_int(bogo_ops.group(1))
logger.info(f"[CPU STRESS TEST] Bogo ops: {result['bogo_ops']}")
bogo_ops_per_sec = re.search(r'(\d+\.\d+)\s+bogo ops per second', output) bogo_ops_per_sec = re.search(r'(\d+\.\d+)\s+bogo ops per second', output)
if bogo_ops_per_sec: if bogo_ops_per_sec:
@@ -475,16 +489,22 @@ def run_cpu_stress_test(duration: int = 300) -> Dict[str, Any]:
except Exception as e: except Exception as e:
result["passed"] = False result["passed"] = False
result["errors"].append(str(e)) result["errors"].append(str(e))
logger.exception(f"[CPU STRESS TEST] stress-ng 执行异常: {e}")
# 备选: 使用 stress # 备选: 使用 stress
elif check_command_exists('stress'): elif check_command_exists('stress'):
result["tool_used"] = "stress" result["tool_used"] = "stress"
try: try:
logger.info(f"[CPU STRESS TEST] 开始使用 stress 进行压力测试,持续时间: {duration}")
result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S') result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
temp_before = get_cpu_temperature() temp_before = get_cpu_temperature()
temp_before_val = temp_before.get("max_c", "N/A")
logger.info(f"[CPU STRESS TEST] 测试前温度: {temp_before_val}°C")
num_cores = os.cpu_count() or 1 num_cores = os.cpu_count() or 1
logger.info(f"[CPU STRESS TEST] 使用 {num_cores} 个 CPU 核心")
_, stdout, stderr = execute_command( _, stdout, stderr = execute_command(
['stress', '--cpu', str(num_cores), '--timeout', str(duration)], ['stress', '--cpu', str(num_cores), '--timeout', str(duration)],
timeout=duration + 30, timeout=duration + 30,
@@ -492,7 +512,11 @@ def run_cpu_stress_test(duration: int = 300) -> Dict[str, Any]:
) )
result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S') result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
logger.info("[CPU STRESS TEST] stress 执行完成")
temp_after = get_cpu_temperature() temp_after = get_cpu_temperature()
temp_after_val = temp_after.get("max_c", "N/A")
logger.info(f"[CPU STRESS TEST] 测试后温度: {temp_after_val}°C")
result["passed"] = True result["passed"] = True
result["temperature_before"] = temp_before result["temperature_before"] = temp_before
@@ -504,11 +528,13 @@ def run_cpu_stress_test(duration: int = 300) -> Dict[str, Any]:
except Exception as e: except Exception as e:
result["passed"] = False result["passed"] = False
result["errors"].append(str(e)) result["errors"].append(str(e))
logger.exception(f"[CPU STRESS TEST] stress 执行异常: {e}")
else: else:
result["passed"] = False result["passed"] = False
result["errors"].append("未找到压力测试工具 (stress-ng 或 stress)") result["errors"].append("未找到压力测试工具 (stress-ng 或 stress)")
result["note"] = "请安装 stress-ng 或 stress: yum install stress / apt install stress-ng" result["note"] = "请安装 stress-ng 或 stress: yum install stress / apt install stress-ng"
logger.error("[CPU STRESS TEST] 未找到压力测试工具 (stress-ng 或 stress)")
return result return result

View File

@@ -358,6 +358,9 @@ def run_memtester(duration: int = 300) -> Dict[str, Any]:
Returns: Returns:
Dict[str, Any]: 测试结果 Dict[str, Any]: 测试结果
""" """
import logging
logger = logging.getLogger(__name__)
result = { result = {
"passed": False, "passed": False,
"size_mb": 0, "size_mb": 0,
@@ -371,9 +374,12 @@ def run_memtester(duration: int = 300) -> Dict[str, Any]:
if not check_command_exists('memtester'): if not check_command_exists('memtester'):
result["errors"].append("memtester 未安装") result["errors"].append("memtester 未安装")
logger.warning("[MEMORY STRESS TEST] memtester 未安装")
return result return result
try: try:
logger.info("[MEMORY STRESS TEST] 开始使用 memtester 进行内存测试")
# 计算测试内存大小 # 计算测试内存大小
# 留出一些内存给系统和 stress-ng 使用 # 留出一些内存给系统和 stress-ng 使用
with open('/proc/meminfo', 'r') as f: with open('/proc/meminfo', 'r') as f:
@@ -391,8 +397,11 @@ def run_memtester(duration: int = 300) -> Dict[str, Any]:
result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S') result["start_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
start_ts = time.time() start_ts = time.time()
logger.info(f"[MEMORY STRESS TEST] 测试内存大小: {test_size_mb}MB")
# 运行 memtester # 运行 memtester
cmd = ['memtester', f'{test_size_mb}M', '1'] cmd = ['memtester', f'{test_size_mb}M', '1']
logger.info(f"[MEMORY STRESS TEST] 执行命令: {' '.join(cmd)}")
_, stdout, stderr = execute_command( _, stdout, stderr = execute_command(
cmd, cmd,
@@ -403,25 +412,32 @@ def run_memtester(duration: int = 300) -> Dict[str, Any]:
result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S') result["end_time"] = time.strftime('%Y-%m-%d %H:%M:%S')
result["duration_seconds"] = round(time.time() - start_ts, 2) result["duration_seconds"] = round(time.time() - start_ts, 2)
logger.info(f"[MEMORY STRESS TEST] memtester 执行完成,耗时: {result['duration_seconds']}")
output = stdout + stderr output = stdout + stderr
result["raw_output"] = output[:2000] # 保存部分原始输出 result["raw_output"] = output[:2000] # 保存部分原始输出
# 分析结果 # 分析结果
if 'FAILURE' in output.upper(): if 'FAILURE' in output.upper():
result["passed"] = False result["passed"] = False
logger.error("[MEMORY STRESS TEST] 测试失败: 发现 FAILURE")
# 提取错误信息 # 提取错误信息
for line in output.split('\n'): for line in output.split('\n'):
if 'FAILURE' in line.upper() or 'error' in line.lower(): if 'FAILURE' in line.upper() or 'error' in line.lower():
result["errors"].append(line.strip()) result["errors"].append(line.strip())
logger.error(f"[MEMORY STRESS TEST] 错误详情: {line.strip()}")
elif 'SUCCESS' in output.upper() or 'ok' in output.lower() or 'finished' in output.lower(): elif 'SUCCESS' in output.upper() or 'ok' in output.lower() or 'finished' in output.lower():
result["passed"] = True result["passed"] = True
logger.info("[MEMORY STRESS TEST] 测试通过")
else: else:
# 检查是否完成所有测试 # 检查是否完成所有测试
if 'Done' in output or 'finished' in output.lower(): if 'Done' in output or 'finished' in output.lower():
result["passed"] = True result["passed"] = True
logger.info("[MEMORY STRESS TEST] 测试完成")
else: else:
result["passed"] = False result["passed"] = False
result["errors"].append("测试可能未完成") result["errors"].append("测试可能未完成")
logger.warning("[MEMORY STRESS TEST] 测试可能未完成")
# 提取运行的测试 # 提取运行的测试
test_names = [ test_names = [
@@ -436,9 +452,12 @@ def run_memtester(duration: int = 300) -> Dict[str, Any]:
if test in output: if test in output:
result["tests_run"].append(test) result["tests_run"].append(test)
logger.info(f"[MEMORY STRESS TEST] 执行的测试项: {', '.join(result['tests_run'])}")
except Exception as e: except Exception as e:
result["passed"] = False result["passed"] = False
result["errors"].append(str(e)) result["errors"].append(str(e))
logger.exception(f"[MEMORY STRESS TEST] memtester 执行异常: {e}")
return result return result

View File

@@ -161,13 +161,98 @@ def setup_logging(
if log_file: if log_file:
os.makedirs(os.path.dirname(log_file) or '.', exist_ok=True) os.makedirs(os.path.dirname(log_file) or '.', exist_ok=True)
file_handler = logging.FileHandler(log_file) # 使用 FileHandler 并设置立即刷新
file_handler = logging.FileHandler(log_file, mode='a')
file_handler.setFormatter(formatter) file_handler.setFormatter(formatter)
# 确保每次日志写入后立即刷新到磁盘
file_handler.flush = lambda: file_handler.stream.flush()
logger.addHandler(file_handler) logger.addHandler(file_handler)
return logger return logger
class ProgressLogger:
    """
    Progress recorder that logs the start and end of named operations.

    Steps may be nested: calling ``start()`` while another step is still
    running suspends the outer step, and the matching ``end()`` resumes it.
    This keeps ``current_step`` pointing at the innermost running operation,
    so a caller can tell what was in flight when a failure occurred.

    Attributes:
        logger: Logger obtained via ``logging.getLogger(__name__)``.
        log_file: Path passed to the constructor; stored but not opened here.
        steps: Finished steps (dicts), in the order they ended.
        current_step: Dict describing the innermost running step, or None.
        start_time: ``datetime`` at which ``current_step`` was started.
    """

    def __init__(self, log_file: Optional[str] = None):
        self.logger = logging.getLogger(__name__)
        self.log_file = log_file
        self.steps = []
        self.current_step = None
        self.start_time = None
        # Suspended outer steps as (step_dict, start_time) pairs; the last
        # entry is the direct parent of ``current_step``.
        self._suspended = []

    def start(self, operation: str):
        """Begin a step named *operation* and log a ``[START]`` line."""
        from datetime import datetime

        now = datetime.now()
        if self.current_step is not None:
            # Keep the enclosing step so that its own end() is not lost.
            self._suspended.append((self.current_step, self.start_time))

        self.current_step = {
            "operation": operation,
            "start_time": now.isoformat(),
            "status": "running"
        }
        self.start_time = now

        msg = f"[START] {operation}"
        self.logger.info(msg)
        self._flush_log()

    def end(self, status: str = "success", message: str = ""):
        """
        Finish the innermost running step and log an ``[END]`` line.

        The log level follows *status*: ``"error"`` -> error, ``"warning"``
        -> warning, anything else -> info. Does nothing when no step is
        running.
        """
        from datetime import datetime

        if not self.current_step:
            return

        end_time = datetime.now()
        duration = (end_time - self.start_time).total_seconds() if self.start_time else 0

        step = self.current_step
        step["end_time"] = end_time.isoformat()
        step["status"] = status
        step["duration_seconds"] = duration
        step["message"] = message
        self.steps.append(step)

        msg = f"[END] {step['operation']} - Status: {status}"
        if message:
            msg += f" - {message}"
        msg += f" (Duration: {duration:.2f}s)"

        if status == "error":
            self.logger.error(msg)
        elif status == "warning":
            self.logger.warning(msg)
        else:
            self.logger.info(msg)

        self._flush_log()

        # Resume the enclosing step, if any, together with its own start
        # time so that its duration is measured correctly.
        if self._suspended:
            self.current_step, self.start_time = self._suspended.pop()
        else:
            self.current_step = None

    def log(self, message: str, level: str = "info"):
        """
        Log an intermediate ``[PROGRESS]`` line for the current step.

        *level* may be ``"error"``, ``"warning"`` or ``"debug"``; any other
        value is logged at info level. When no step is running the operation
        name is reported as ``UNKNOWN``.
        """
        msg = f"[PROGRESS] {self.current_step['operation'] if self.current_step else 'UNKNOWN'} - {message}"

        emitters = {
            "error": self.logger.error,
            "warning": self.logger.warning,
            "debug": self.logger.debug,
        }
        emitters.get(level, self.logger.info)(msg)

        self._flush_log()

    def _flush_log(self):
        """
        Flush every handler that can receive this logger's records.

        Records propagate to ancestor loggers, so the handlers that actually
        write the log file may be attached to a parent (e.g. the root logger)
        rather than to ``self.logger`` itself. Walk up the hierarchy, stopping
        at the first logger with ``propagate`` disabled.
        """
        current = self.logger
        while current is not None:
            for handler in current.handlers:
                if hasattr(handler, 'flush'):
                    handler.flush()
            if not current.propagate:
                break
            current = current.parent

    def get_summary(self) -> Dict[str, Any]:
        """Return the finished steps, their count, and the running step."""
        return {
            "total_steps": len(self.steps),
            "steps": self.steps,
            "current_running": self.current_step
        }
def parse_key_value_output(text: str, delimiter: str = ':') -> Dict[str, str]: def parse_key_value_output(text: str, delimiter: str = ':') -> Dict[str, str]:
""" """
解析 key: value 格式的文本输出。 解析 key: value 格式的文本输出。