avans06 committed
Commit 8c93973 · 1 Parent(s): 9f78a38

init commit
.gitignore ADDED
@@ -0,0 +1,6 @@
.vs
venv
tmp
*.pyc
models
images
README.md CHANGED
@@ -1,10 +1,10 @@
 ---
 title: SeedVR2 Image Upscaler
-emoji: 😻
+emoji: 🖼️
 colorFrom: gray
 colorTo: blue
 sdk: gradio
-sdk_version: 6.1.0
+sdk_version: 5.50.0
 app_file: app.py
 pinned: false
 license: apache-2.0
app.py ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,39 @@
--extra-index-url https://download.pytorch.org/whl/cu130

# Web UI
gradio==5.50.0

# Core numeric / vision
numpy
opencv-python

# PyTorch
torch
torchvision

# Hugging Face helper for downloading weights
huggingface-hub==0.36.0

# Utilities
tqdm

# SeedVR2
psutil
einops
diffusers
rotary-embedding-torch
omegaconf

gguf

triton; sys_platform != 'win32'
triton-windows; sys_platform == 'win32'

#
# flash-attn; sys_platform != 'win32'
# sageattention; sys_platform != 'win32'
#
# if sys_platform == 'win32'
# https://huggingface.co/lldacing/flash-attention-windows-wheel
# https://huggingface.co/ussoewwin/Flash-Attention-2_for_Windows
# https://github.com/woct0rdho/SageAttention
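Because PyTorch resolves from the cu130 extra index above, it is worth confirming after installation that the CUDA build was actually selected. A minimal sanity check; the expected version strings are assumptions inferred from the index URL, not something this commit pins:

    import torch

    print(torch.__version__)          # a "+cu130" suffix means the wheel came from the extra index
    print(torch.version.cuda)         # expected "13.0" for wheels from that index
    print(torch.cuda.is_available())  # True only with a compatible NVIDIA driver installed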
src/optimization/blockswap.py ADDED
@@ -0,0 +1,1032 @@
"""
BlockSwap Module for SeedVR2

This module implements dynamic block swapping between GPU and CPU memory
to enable running large models on limited VRAM systems.

Key Features:
- Dynamic transformer block offloading during inference
- Non-blocking GPU transfers for optimal performance
- RoPE computation fallback to CPU on OOM
- Minimal performance overhead with intelligent caching
- I/O component offloading for maximum memory savings
"""

import time
import types
import torch
import weakref

from typing import Dict, Any, List, Optional
from .memory_manager import clear_memory
from .compatibility import call_rope_with_stability
from ..common.distributed import get_device


def is_blockswap_enabled(config: Optional[Dict[str, Any]]) -> bool:
    """
    Check if BlockSwap configuration indicates BlockSwap should be enabled.

    BlockSwap is enabled if either blocks_to_swap > 0 OR swap_io_components is True.
    This is the authoritative function for determining BlockSwap status from configuration.

    Args:
        config: BlockSwap configuration dictionary with optional keys:
            - blocks_to_swap: Number of blocks to offload (0 = disabled)
            - swap_io_components: Whether to offload I/O components

    Returns:
        True if BlockSwap should be active, False otherwise
    """
    if not config:
        return False

    blocks_to_swap = config.get("blocks_to_swap", 0)
    swap_io_components = config.get("swap_io_components", False)

    return blocks_to_swap > 0 or swap_io_components


def validate_blockswap_config(
    block_swap_config: Optional[Dict[str, Any]],
    dit_device: 'torch.device',
    dit_offload_device: Optional['torch.device'],
    debug: 'Debug'
) -> Optional[Dict[str, Any]]:
    """
    Validate and potentially modify BlockSwap configuration.

    Performs platform-specific validation and configuration adjustment:
    - On macOS (MPS): Auto-disables BlockSwap since unified memory makes it meaningless
    - On other platforms: Validates that offload_device is properly configured

    This is the single authoritative validation point for BlockSwap configuration,
    called early in configure_runner() before any model loading.

    Args:
        block_swap_config: BlockSwap configuration dictionary (may be None)
        dit_device: Target device for DiT model inference
        dit_offload_device: Device for offloading DiT blocks (may be None)
        debug: Debug instance for logging warnings/errors

    Returns:
        Validated/modified block_swap_config (may be None or modified copy)

    Raises:
        ValueError: If BlockSwap is enabled but offload_device is invalid (non-MPS only)
    """
    if not is_blockswap_enabled(block_swap_config):
        return block_swap_config

    blocks_to_swap = block_swap_config.get("blocks_to_swap", 0)
    swap_io_components = block_swap_config.get("swap_io_components", False)

    # Check for macOS unified memory - BlockSwap is meaningless there
    if dit_device.type == "mps":
        debug.log(
            f"BlockSwap disabled: macOS uses unified memory (no separate VRAM/RAM). "
            f"Ignoring blocks_to_swap={blocks_to_swap}, swap_io_components={swap_io_components}",
            level="WARNING", category="blockswap", force=True
        )
        # Return disabled config
        return {
            **block_swap_config,
            "blocks_to_swap": 0,
            "swap_io_components": False
        }

    # Validate offload_device is set and different from dit_device
    offload_device_valid = (
        dit_offload_device is not None and
        str(dit_offload_device) != str(dit_device)
    )

    if not offload_device_valid:
        config_details = []
        if blocks_to_swap > 0:
            config_details.append(f"blocks_to_swap={blocks_to_swap}")
        if swap_io_components:
            config_details.append("swap_io_components=True")

        offload_str = str(dit_offload_device) if dit_offload_device else "none"
        raise ValueError(
            f"BlockSwap enabled ({', '.join(config_details)}) but dit_offload_device is invalid. "
            f"Current: device='{dit_device}', dit_offload_device='{offload_str}'. "
            f"BlockSwap requires offload_device on the DiT Model to be set and different from device. "
            f"Set --dit_offload_device cpu or disable BlockSwap."
        )

    return block_swap_config


# Timing helpers marked to skip torch.compile tracing
# These functions are excluded from Dynamo's graph tracing to avoid warnings
# about non-traceable builtins like time.time(), but they still execute normally
@torch._dynamo.disable
def _get_swap_start_time(debug, enabled: bool) -> Optional[float]:
    """Get start time for swap operation if debug is enabled."""
    return time.time() if debug and enabled else None


@torch._dynamo.disable
def _log_swap_timing(debug, t_start: Optional[float], component_id, component_type: str) -> None:
    """Log swap timing if start time was captured."""
    if debug and t_start is not None:
        debug.log_swap_time(
            component_id=component_id,
            duration=time.time() - t_start,
            component_type=component_type
        )


def get_module_memory_mb(module: torch.nn.Module) -> float:
    """
    Calculate memory usage of a module in MB.

    Args:
        module: PyTorch module to measure

    Returns:
        Memory usage in megabytes
    """
    total_bytes = sum(
        param.nelement() * param.element_size()
        for param in module.parameters()
        if param.data is not None
    )
    return total_bytes / (1024 * 1024)


def apply_block_swap_to_dit(
    runner: 'VideoDiffusionInfer',
    block_swap_config: Dict[str, Any],
    debug: 'Debug'
) -> None:
    """
    Apply block swapping configuration to a DiT model with OOM protection.

    This is the main entry point for configuring block swapping on a model.
    Handles block selection, I/O component offloading, device placement, and
    forward method wrapping for dynamic memory management.

    Args:
        runner: VideoDiffusionInfer instance containing the model
        block_swap_config: Configuration dictionary with keys:
            - blocks_to_swap: Number of blocks to swap (from the start)
            - swap_io_components: Whether to offload I/O components
            - enable_debug: Whether to enable debug logging
            - offload_device: Device to offload to (default: 'cpu')
        debug: Debug instance for logging (required)
    """
    # Early return if BlockSwap not enabled
    if not is_blockswap_enabled(block_swap_config):
        return

    blocks_to_swap = block_swap_config.get("blocks_to_swap", 0)
    swap_io_components = block_swap_config.get("swap_io_components", False)

    # Early return only if both block swap and I/O swap are disabled
    if blocks_to_swap <= 0 and not swap_io_components:
        return

    if debug is None:
        if hasattr(runner, 'debug') and runner.debug is not None:
            debug = runner.debug
        else:
            raise ValueError("Debug instance must be provided to apply_block_swap_to_dit")

    debug.start_timer("apply_blockswap")

    # Get the actual model (handle CompatibleDiT wrapper)
    model = runner.dit
    if hasattr(model, "dit_model"):
        model = model.dit_model

    # Determine devices
    if hasattr(runner, '_dit_device'):
        device = runner._dit_device
    else:
        device = get_device()
    offload_device = block_swap_config.get("offload_device", torch.device('cpu'))

    # Validate model structure
    if not hasattr(model, "blocks"):
        debug.log("Model doesn't have 'blocks' attribute for BlockSwap", level="ERROR", category="blockswap", force=True)
        return

    total_blocks = len(model.blocks)

    # Clamp blocks_to_swap to available blocks BEFORE logging
    effective_blocks = min(blocks_to_swap, total_blocks) if blocks_to_swap > 0 else 0

    # Log configuration clearly based on what's enabled
    block_text = "block" if effective_blocks <= 1 else "blocks"
    if effective_blocks > 0 and swap_io_components:
        debug.log(f"BlockSwap: {effective_blocks}/{total_blocks} transformer {block_text} + I/O components offloaded to {str(offload_device).upper()}", category="blockswap", force=True)
    elif effective_blocks > 0:
        debug.log(f"BlockSwap: {effective_blocks}/{total_blocks} transformer {block_text} offloaded to {str(offload_device).upper()}", category="blockswap", force=True)
    elif swap_io_components:
        debug.log(f"BlockSwap: I/O components offloaded to {str(offload_device).upper()} (0/{total_blocks} blocks swapped)", category="blockswap", force=True)

    # Configure model with blockswap attributes
    if blocks_to_swap > 0:
        model.blocks_to_swap = effective_blocks - 1  # Convert to 0-indexed
    else:
        # No block swapping, set to -1 so no blocks match the swap condition
        model.blocks_to_swap = -1

    model.main_device = device
    model.offload_device = offload_device

    # Configure I/O components
    io_config = _configure_io_components(model, device, offload_device,
                                         swap_io_components, debug)
    memory_stats = _configure_blocks(model, device, offload_device, debug)
    memory_stats['io_components'] = io_config['components']
    memory_stats['io_memory_mb'] = io_config['memory_mb']
    memory_stats['gpu_components'] = io_config['gpu_components']
    memory_stats['io_gpu_memory_mb'] = io_config['gpu_memory_mb']

    # Log memory summary
    _log_memory_summary(memory_stats, offload_device, device, swap_io_components,
                        debug)

    # Initialize Nunchaku-style async management object
    if blocks_to_swap > 0:
        # normalize device objects
        if isinstance(device, str):
            device = torch.device(device)
        model._swap_stream = torch.cuda.Stream(device=device)
        model._block_ready_events = {}

        # Preload first swapped block to seed pipeline (non-blocking on swap_stream)
        try:
            first_idx = 0
            if first_idx <= model.blocks_to_swap:
                with torch.cuda.stream(model._swap_stream):
                    model.blocks[first_idx].to(device, non_blocking=True)
                    ev = torch.cuda.Event(blocking=False)
                    ev.record(model._swap_stream)  # record on swap_stream -> event gets device-bound here
                    model._block_ready_events[first_idx] = ev
        except Exception as e:
            debug.log(f"Failed to initialize swap-stream prefetch: {e}", level="WARNING", category="blockswap", force=True)

    # Wrap block forward methods for dynamic swapping (only if blocks_to_swap > 0)
    if blocks_to_swap > 0:
        for b, block in enumerate(model.blocks):
            if b <= model.blocks_to_swap:
                _wrap_block_forward(block, b, model, debug)

    # Patch RoPE modules for robust error handling
    _patch_rope_for_blockswap(model, debug)

    # Mark BlockSwap as active
    runner._blockswap_active = True

    # Store configuration for debugging and cleanup
    model._block_swap_config = {
        "blocks_swapped": blocks_to_swap,
        "swap_io_components": swap_io_components,
        "total_blocks": total_blocks,
        "offload_device": offload_device,
        "main_device": device,
        "offload_memory": memory_stats['offload_memory'],
        "main_memory": memory_stats['main_memory']
    }

    # Protect model from being moved entirely
    _protect_model_from_move(model, runner, debug)

    debug.log("BlockSwap configuration complete", category="success")
    debug.end_timer("apply_blockswap", "BlockSwap configuration application")
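The block_swap_config consumed by apply_block_swap_to_dit() is plain data. A minimal sketch of a call site, assuming a runner and Debug instance from the surrounding SeedVR2 code; the values are illustrative and the key names follow the docstring above:

    import torch

    block_swap_config = {
        "blocks_to_swap": 16,                   # hypothetical: offload the first 16 transformer blocks
        "swap_io_components": True,             # also offload embeddings / norm layers
        "offload_device": torch.device("cpu"),
    }

    block_swap_config = validate_blockswap_config(
        block_swap_config,
        dit_device=torch.device("cuda"),
        dit_offload_device=torch.device("cpu"),
        debug=debug,                            # assumed Debug instance from the app
    )
    apply_block_swap_to_dit(runner, block_swap_config, debug=debug)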
def _configure_io_components(
    model: torch.nn.Module,
    device: torch.device,
    offload_device: torch.device,
    swap_io_components: bool,
    debug: 'Debug'
) -> Dict[str, Any]:
    """
    Configure I/O component placement and wrapping with memory tracking.

    Handles all non-block modules (embeddings, normalization layers, etc.) by
    either keeping them on GPU or offloading them with dynamic swapping wrappers.

    Args:
        model: DiT model containing named children to configure
        device: Main computation device (typically GPU)
        offload_device: Device for offloaded components (typically CPU)
        swap_io_components: If True, offload I/O components with dynamic swapping
        debug: Debug instance for logging (required)

    Returns:
        Dictionary containing:
        - components: List of offloaded component names
        - memory_mb: Total memory of offloaded components in MB
        - gpu_components: List of components remaining on GPU
        - gpu_memory_mb: Total memory of GPU components in MB
    """
    io_components_offloaded = []
    io_components_on_gpu = []
    io_memory_mb = 0.0
    io_gpu_memory_mb = 0.0

    # Check for pin memory condition
    use_pin_memory = (offload_device == "cpu") if isinstance(offload_device, str) else (offload_device.type == "cpu")

    # Handle I/O modules with dynamic swapping
    for name, module in model.named_children():
        if name != "blocks":
            module_memory = get_module_memory_mb(module)

            if swap_io_components:
                module.to(offload_device)

                # Enable Pin Memory for I/O components
                if use_pin_memory:
                    for p in module.parameters():
                        if not p.is_pinned():
                            p.data = p.data.pin_memory()
                    for buf in module.buffers():
                        if not buf.is_pinned():
                            buf.data = buf.data.pin_memory()

                _wrap_io_forward(module, name, model, debug)
                io_components_offloaded.append(name)
                io_memory_mb += module_memory
                debug.log(f"{name} → {str(offload_device).upper()} ({module_memory:.2f}MB, dynamic swapping)", category="blockswap", indent_level=1)
            else:
                module.to(device)
                io_components_on_gpu.append(name)
                io_gpu_memory_mb += module_memory
                debug.log(f"{name} → {str(device).upper()} ({module_memory:.2f}MB)", category="blockswap", indent_level=1)

    return {
        'components': io_components_offloaded,
        'memory_mb': io_memory_mb,
        'gpu_components': io_components_on_gpu,
        'gpu_memory_mb': io_gpu_memory_mb
    }


def _configure_blocks(
    model: torch.nn.Module,
    device: torch.device,
    offload_device: torch.device,
    debug: 'Debug'
) -> Dict[str, float]:
    """
    Configure transformer block placement and calculate memory statistics.

    Moves blocks to their designated devices based on model.blocks_to_swap
    attribute. Blocks with index <= blocks_to_swap go to offload device,
    others stay on main device.

    Args:
        model: DiT model with blocks attribute and blocks_to_swap configured
        device: Main computation device for non-swapped blocks
        offload_device: Device for swapped blocks
        debug: Debug instance for logging (required)

    Returns:
        Dictionary containing:
        - offload_memory: Total memory of offloaded blocks in MB
        - main_memory: Total memory of blocks on main device in MB
        - io_components: Empty list (populated by caller)
    """
    total_offload_memory = 0.0
    total_main_memory = 0.0

    # Check if we should pin memory (if offloading to CPU)
    # Nunchaku uses pinned memory for faster async transfers
    use_pin_memory = (offload_device == "cpu") if isinstance(offload_device, str) else (offload_device.type == "cpu")

    # Move blocks based on swap configuration
    for b, block in enumerate(model.blocks):
        block_memory = get_module_memory_mb(block)

        if b > model.blocks_to_swap:
            block.to(device)
            total_main_memory += block_memory
        else:
            block.to(offload_device, non_blocking=False)
            total_offload_memory += block_memory

            # Enable Pin Memory optimization for CPU Offload transfer speed
            if use_pin_memory:
                for p in block.parameters():
                    if not p.is_pinned():
                        p.data = p.data.pin_memory()
                for buf in block.buffers():
                    if not buf.is_pinned():
                        buf.data = buf.data.pin_memory()

    # Ensure all buffers match their containing module's device
    for b, block in enumerate(model.blocks):
        target_device = device if b > model.blocks_to_swap else offload_device
        for name, buffer in block.named_buffers():
            if buffer.device != torch.device(target_device):
                # Apply pinning if needed
                if use_pin_memory and target_device.type == "cpu" and not buffer.is_pinned():
                    buffer.data = buffer.data.pin_memory()
                buffer.data = buffer.data.to(target_device, non_blocking=False)

    return {
        "offload_memory": total_offload_memory,
        "main_memory": total_main_memory,
        "io_components": []  # Will be populated by caller
    }


def _log_memory_summary(
    memory_stats: Dict[str, float],
    offload_device: torch.device,
    device: torch.device,
    swap_io_components: bool,
    debug: 'Debug'
) -> None:
    """
    Log comprehensive memory usage summary for BlockSwap configuration.

    Displays detailed breakdown of memory distribution across devices,
    including transformer blocks and I/O components.

    Args:
        memory_stats: Dictionary containing:
            - offload_memory: Memory offloaded from blocks (MB)
            - main_memory: Memory remaining on main device (MB)
            - io_memory_mb: Memory from offloaded I/O components (MB)
            - io_gpu_memory_mb: Memory from I/O components on GPU (MB)
        offload_device: Device used for offloading
        device: Main computation device
        swap_io_components: Whether I/O components are being swapped
        debug: Debug instance for logging (required)
    """
    debug.log("BlockSwap memory configuration:", category="blockswap")

    # Log transformer blocks memory
    blocks_offloaded = memory_stats['offload_memory']
    blocks_on_gpu = memory_stats['main_memory']

    offload_str = str(offload_device)
    device_str = str(device)

    if blocks_on_gpu == 0:
        debug.log(f"Transformer blocks: {blocks_offloaded:.2f}MB on {offload_str} (dynamic swapping)", category="blockswap", indent_level=1)
    else:
        debug.log(f"Transformer blocks: {blocks_on_gpu:.2f}MB on {device_str}, {blocks_offloaded:.2f}MB on {offload_str}", category="blockswap", indent_level=1)

    # Always log I/O components (whether swapping or not)
    io_memory = memory_stats.get('io_memory_mb', 0.0)
    io_gpu_memory = memory_stats.get('io_gpu_memory_mb', 0.0)

    if swap_io_components and io_memory > 0:
        io_components = memory_stats.get('io_components', [])
        debug.log(f"I/O components: {io_memory:.2f}MB on {offload_str} (dynamic swapping)", category="blockswap", indent_level=1)
        debug.log(f"{', '.join(io_components)}", category="blockswap", indent_level=2)
    elif io_gpu_memory > 0:
        io_gpu_components = memory_stats.get('gpu_components', [])
        debug.log(f"I/O components: {io_gpu_memory:.2f}MB on {device_str}", category="blockswap", indent_level=1)
        debug.log(f"{', '.join(io_gpu_components)}", category="blockswap", indent_level=2)

    # Log total VRAM savings
    total_offloaded = blocks_offloaded + (io_memory if swap_io_components else 0)
    if total_offloaded > 0:
        debug.log(f"Total VRAM saved: {total_offloaded:.2f}MB (~{total_offloaded/1024:.2f}GB)", category="blockswap", indent_level=1)


def _wrap_block_forward(
    block: torch.nn.Module,
    block_idx: int,
    model: torch.nn.Module,
    debug: 'Debug'
) -> None:
    """
    Wrap individual transformer block forward for dynamic device swapping.

    Implements Nunchaku-style pipelining: Prefetch Next -> Compute Current -> Offload Current.
    https://github.com/nunchaku-tech/nunchaku/blob/main/nunchaku/models/utils.py

    Creates a wrapped forward method that automatically:
    1. Moves block to GPU before computation
    2. Executes original forward pass
    3. Moves block back to offload device after computation
    4. Logs timing and manages memory pressure

    Uses weak references to prevent memory leaks from closure retention.

    Args:
        block: Individual transformer block to wrap
        block_idx: Index of this block in model.blocks
        model: Parent DiT model (used for device references)
        debug: Debug instance for logging (required)
    """
    if hasattr(block, '_original_forward'):
        return  # Already wrapped

    # Store original forward method
    original_forward = block.forward

    # Create weak references
    model_ref = weakref.ref(model)
    debug_ref = weakref.ref(debug) if debug is not None else (lambda: None)

    # Store block_idx on the block itself to avoid closure issues
    block._block_idx = block_idx

    def wrapped_forward(self, *args, **kwargs):
        # Retrieve weak references
        model = model_ref()
        debug = debug_ref()

        if not model:
            # Model has been garbage collected, fall back to original
            return original_forward(*args, **kwargs)

        # Check if block swap is active for this block
        if hasattr(model, 'blocks_to_swap') and self._block_idx <= model.blocks_to_swap:
            # Use dynamo-disabled helper to get start time (avoids compilation warnings)
            t_start = _get_swap_start_time(debug, debug.enabled if debug else False)

            # Only move to GPU if necessary
            current_device = next(self.parameters()).device
            target_device = torch.device(model.main_device)

            # 1. Ensure CURRENT block is ready on GPU
            # Check if we have a prefetch event waiting
            if hasattr(model, '_block_ready_events') and self._block_idx in model._block_ready_events:
                # Wait for the swap stream to finish moving this block
                torch.cuda.current_stream().wait_event(model._block_ready_events[self._block_idx])
                # Cleanup event
                del model._block_ready_events[self._block_idx]
            elif current_device != target_device:
                # Fallback: First block or missed prefetch, move synchronously (but non-blocking)
                if debug:  # guard: the weakref may have returned None
                    debug.log(f"[blockswap] Block {self._block_idx} missing prefetch event, moving synchronously", level="WARNING", category="blockswap", force=True)
                self.to(model.main_device, non_blocking=True)

            # 2. Trigger Prefetch for NEXT block (Pipelining)
            # Nunchaku logic: Start moving i+1 while i is computing
            next_idx = self._block_idx + 1
            if next_idx <= model.blocks_to_swap:
                next_block = model.blocks[next_idx]
                # Use the dedicated swap stream
                with torch.cuda.stream(model._swap_stream):
                    next_block.to(model.main_device, non_blocking=True)
                    # Record event so next iteration knows when to wait
                    event = torch.cuda.Event(blocking=False)
                    event.record(model._swap_stream)
                    model._block_ready_events[next_idx] = event

            # 3. Execute forward pass (Compute)
            # This runs on the default stream, overlapping with the prefetch above
            output = original_forward(*args, **kwargs)

            # 4. Offload CURRENT block (Async)
            # We record an event on compute stream to ensure we don't move data while it's being used
            compute_done_event = torch.cuda.Event(blocking=False)
            compute_done_event.record(torch.cuda.current_stream())

            with torch.cuda.stream(model._swap_stream):
                # Wait for compute to finish before moving memory out
                model._swap_stream.wait_event(compute_done_event)
                # Move back to offload device
                self.to(model.offload_device, non_blocking=True)

            # Use dynamo-disabled helper to log timing (avoids compilation warnings)
            _log_swap_timing(debug, t_start, self._block_idx, "block (pipelined)")

            # Only clear cache under memory pressure
            clear_memory(debug=debug, deep=False, force=False, timer_name="wrap_block_forward")
        else:
            output = original_forward(*args, **kwargs)

        return output

    # Bind the wrapped function as a method to the block
    block.forward = types.MethodType(wrapped_forward, block)

    # Store reference to original forward for cleanup
    block._original_forward = original_forward


def _wrap_io_forward(
    module: torch.nn.Module,
    module_name: str,
    model: torch.nn.Module,
    debug: 'Debug'
) -> None:
    """
    Wrap I/O component forward for dynamic device swapping.

    Similar to _wrap_block_forward but for I/O components (embeddings,
    normalization layers, etc.). Handles swapping between GPU and CPU
    during forward passes.

    Uses weak references to prevent circular dependencies and memory leaks.

    Args:
        module: I/O component module to wrap
        module_name: Name identifier for logging (e.g., 'x_embedder')
        model: Parent DiT model (used for device references)
        debug: Debug instance for logging (required)
    """
    if hasattr(module, '_is_io_wrapped') and module._is_io_wrapped:
        debug.log(f"Reusing existing I/O wrapper for {module_name}", category="reuse")
        return  # Already wrapped

    # Store original forward method
    original_forward = module.forward

    # Create weak references
    model_ref = weakref.ref(model)
    debug_ref = weakref.ref(debug) if debug else lambda: None

    # Store module name on the module itself
    module._module_name = module_name
    module._original_forward = original_forward

    def wrapped_io_forward(self, *args, **kwargs):
        # Retrieve weak references
        model = model_ref()
        debug = debug_ref()

        if not model:
            # Model has been garbage collected, fall back to original
            return self._original_forward(*args, **kwargs)

        # Use dynamo-disabled helper to get start time (avoids compilation warnings)
        t_start = _get_swap_start_time(debug, debug.enabled if debug else False)

        # Check current device to avoid unnecessary moves
        current_device = next(self.parameters()).device
        target_device = torch.device(model.main_device)

        # Move to GPU for computation if needed
        if current_device != target_device:
            self.to(model.main_device, non_blocking=False)

        # Execute forward pass
        output = self._original_forward(*args, **kwargs)

        # Move back to offload device
        self.to(model.offload_device, non_blocking=False)

        # Use dynamo-disabled helper to log timing (avoids compilation warnings)
        _log_swap_timing(debug, t_start, self._module_name, "I/O")

        # Only clear cache under memory pressure
        clear_memory(debug=debug, deep=False, force=False, timer_name="wrap_block_forward")

        return output

    # Bind as a method
    module.forward = types.MethodType(wrapped_io_forward, module)
    module._is_io_wrapped = True

    # Store module reference for restoration
    if not hasattr(model, '_io_swappers'):
        model._io_swappers = []
    model._io_swappers.append((module, module_name))


def _patch_rope_for_blockswap(
    model: torch.nn.Module,
    debug: 'Debug'
) -> None:
    """
    Patch RoPE (Rotary Position Embedding) modules for device-aware fallback.

    Adds CPU fallback logic to RoPE modules to handle device mismatch errors
    that can occur during BlockSwap operations. Complements the stability
    wrapper from compatibility.py with device-specific error handling.

    Args:
        model: DiT model containing RoPE modules to patch
        debug: Debug instance for logging (required)
    """
    rope_patches = []

    for name, module in model.named_modules():
        if "rope" in name.lower() and hasattr(module, "get_axial_freqs"):
            # Skip if already wrapped by blockswap
            if hasattr(module, '_blockswap_wrapped') and module._blockswap_wrapped:
                continue

            # Get current method (might be stability-wrapped)
            current_method = module.get_axial_freqs

            # Create device-aware wrapper with proper closure handling
            def make_device_aware_wrapper(module_name, current_fn):
                def device_aware_rope_wrapper(self, *args, **kwargs):
                    try:
                        # Try current method (original or stability-wrapped)
                        return current_fn(*args, **kwargs)
                    except (RuntimeError, torch.cuda.OutOfMemoryError) as e:
                        error_msg = str(e).lower()
                        # Only handle device/memory specific errors
                        if any(x in error_msg for x in ["device", "memory", "allocation"]):
                            debug.log(f"RoPE OOM for {module_name}", level="WARNING", category="rope", force=True)
                            debug.log(f"Clearing RoPE cache and retrying", category="info", force=True)

                            # Get current device from parameters
                            try:
                                current_device = next(self.parameters()).device
                            except StopIteration:
                                # Fallback: use model's main_device if BlockSwap has set it, else use offload_device
                                if hasattr(model, 'main_device'):
                                    current_device = torch.device(model.main_device)
                                elif hasattr(model, 'offload_device'):
                                    current_device = torch.device(model.offload_device)
                                else:
                                    # Last resort so current_device is always defined
                                    current_device = torch.device("cpu")

                            # Try clearing cache first (non-invasive fix)
                            if hasattr(current_fn, 'cache_clear'):
                                current_fn.cache_clear()
                            try:
                                # Retry on same device after clearing cache
                                return current_fn(*args, **kwargs)
                            except Exception as retry_error:
                                # Cache clear wasn't enough, need more drastic measures
                                debug.log(f"Cache clear insufficient for {module_name}, falling back to CPU", level="WARNING", category="rope", force=True)

                            # Fallback to CPU computation with stability
                            self.cpu()

                            try:
                                # Use call_rope_with_stability for CPU computation
                                # This ensures cache is cleared and autocast disabled
                                original_fn = getattr(self, '_original_get_axial_freqs', current_fn)
                                result = call_rope_with_stability(original_fn, *args, **kwargs)

                                # Move module back to original device
                                self.to(current_device)

                                # Move result to appropriate device if it's a tensor
                                if hasattr(result, 'to'):
                                    target_device = args[0].device if len(args) > 0 and hasattr(args[0], 'device') else current_device
                                    return result.to(target_device)
                                return result

                            except Exception as cpu_error:
                                # Always restore device even on error
                                self.to(current_device)
                                raise cpu_error
                        else:
                            # Not a device error, let it bubble up
                            raise

                return device_aware_rope_wrapper

            # Apply wrapper
            module.get_axial_freqs = types.MethodType(
                make_device_aware_wrapper(name, current_method),
                module
            )
            module._blockswap_wrapped = True

            # Store for cleanup (use original or previously stored)
            original_method = getattr(module, '_original_get_axial_freqs', current_method)
            rope_patches.append((module, original_method))

    if rope_patches:
        model._rope_patches = rope_patches
        debug.log(f"Patched {len(rope_patches)} RoPE modules with device handling", category="success")


def _protect_model_from_move(
    model: torch.nn.Module,
    runner: 'VideoDiffusionInfer',
    debug: 'Debug'
) -> None:
    """
    Protect model from unintended full device movement during BlockSwap.

    Wraps model.to() method to prevent other code from accidentally moving
    the entire model to GPU, which would defeat BlockSwap's memory savings.
    Allows movement only when explicitly bypassed via model flag.

    Args:
        model: DiT model to protect
        runner: VideoDiffusionInfer instance (for active status check)
        debug: Debug instance for logging (required)
    """
    if not hasattr(model, '_original_to'):
        # Store runner reference as weak reference to avoid circular refs
        model._blockswap_runner_ref = weakref.ref(runner)
        model._original_to = model.to

        # Define the protected method without closures
        def protected_model_to(self, device, *args, **kwargs):
            # Check if protection is temporarily bypassed for offloading
            # Flag is stored on model itself (not runner) to survive runner recreation
            if getattr(self, "_blockswap_bypass_protection", False):
                # Protection bypassed, allow movement
                if hasattr(self, '_original_to'):
                    return self._original_to(device, *args, **kwargs)

            # Get configured offload device directly from model
            blockswap_offload_device = "cpu"  # default
            if hasattr(self, "_block_swap_config"):
                blockswap_offload_device = self._block_swap_config.get("offload_device", "cpu")

            # Check if BlockSwap is currently active via runner weak reference
            runner_ref = getattr(self, '_blockswap_runner_ref', None)
            blockswap_is_active = False
            if runner_ref:
                runner_obj = runner_ref()
                if runner_obj and hasattr(runner_obj, "_blockswap_active"):
                    blockswap_is_active = runner_obj._blockswap_active

            # Block attempts to move model away from configured offload device when active
            if blockswap_is_active and str(device) != str(blockswap_offload_device):
                # Get debug instance from runner if available
                debug_instance = None
                if runner_ref:
                    runner_obj = runner_ref()
                    if runner_obj and hasattr(runner_obj, 'debug'):
                        debug_instance = runner_obj.debug

                if debug_instance:
                    debug_instance.log(
                        f"Blocked attempt to move BlockSwap model from {blockswap_offload_device} to {device}",
                        level="WARNING", category="blockswap", force=True
                    )
                return self

            # Allow movement (either bypass is enabled or target is offload device)
            if hasattr(self, '_original_to'):
                return self._original_to(device, *args, **kwargs)
            else:
                # Fallback - shouldn't happen
                return super(type(self), self).to(device, *args, **kwargs)

        # Bind as a method to the model instance
        model.to = types.MethodType(protected_model_to, model)


def set_blockswap_bypass(runner, bypass: bool, debug):
    """
    Set or unset bypass flag for BlockSwap protection.
    Used for offloading to temporarily allow model movement.

    Args:
        runner: Runner instance with BlockSwap
        bypass: True to bypass protection, False to enforce it
        debug: Debug instance for logging
    """
    if not hasattr(runner, "_blockswap_active") or not runner._blockswap_active:
        return

    # Get the actual model (handle CompatibleDiT wrapper)
    model = runner.dit
    if hasattr(model, "dit_model"):
        model = model.dit_model

    # Store on model so it survives runner recreation during caching
    model._blockswap_bypass_protection = bypass

    if bypass:
        debug.log("BlockSwap protection disabled to allow model DiT offloading", category="success")
    else:
        debug.log("BlockSwap protection re-enabled to avoid accidentally offloading the entire DiT model", category="success")


def cleanup_blockswap(runner, keep_state_for_cache=False):
    """
    Clean up BlockSwap configuration based on caching mode.

    When caching (keep_state_for_cache=True):
    - Keep all BlockSwap configuration intact
    - Only mark as inactive for safety during non-inference operations

    When not caching (keep_state_for_cache=False):
    - Full cleanup of all BlockSwap state

    Args:
        runner: VideoDiffusionInfer instance to clean up
        keep_state_for_cache: If True, preserve BlockSwap state for reuse
    """
    # Get debug instance from runner
    if not hasattr(runner, 'debug') or runner.debug is None:
        raise ValueError("Debug instance must be available on runner for cleanup_blockswap")

    debug = runner.debug

    # Get the actual model (handle CompatibleDiT wrapper)
    model = runner.dit
    if hasattr(model, "dit_model"):
        model = model.dit_model

    # Check if there's any BlockSwap state to clean up (check both runner and model)
    has_blockswap_state = (
        hasattr(runner, "_blockswap_active") or
        hasattr(model, "_block_swap_config") or
        hasattr(model, "_blockswap_bypass_protection")
    )

    if not has_blockswap_state:
        return

    debug.log("Starting BlockSwap cleanup", category="cleanup")

    if keep_state_for_cache:
        # Minimal cleanup for caching - just mark as inactive and allow offloading
        # Everything else stays intact for fast reactivation
        if hasattr(runner, "_blockswap_active") and runner._blockswap_active:
            if not getattr(model, "_blockswap_bypass_protection", False):
                set_blockswap_bypass(runner=runner, bypass=True, debug=debug)
            runner._blockswap_active = False
            debug.log("BlockSwap deactivated for caching (configuration preserved)", category="success")
        return

    # Full cleanup when not caching
    # (model was already resolved from the CompatibleDiT wrapper above)

    # 1. Restore block forward methods
    if hasattr(model, 'blocks'):
        restored_count = 0
        for block in model.blocks:
            if hasattr(block, '_original_forward'):
                block.forward = block._original_forward
                delattr(block, '_original_forward')
                restored_count += 1

            # Clean up wrapper attributes
            for attr in ['_block_idx', '_model_ref', '_debug_ref', '_blockswap_wrapped']:
                if hasattr(block, attr):
                    delattr(block, attr)

        if restored_count > 0:
            debug.log(f"Restored {restored_count} block forward methods", category="success")

    # 2. Restore RoPE patches
    if hasattr(model, '_rope_patches'):
        for module, original_method in model._rope_patches:
            module.get_axial_freqs = original_method
            # Clean up wrapper attributes
            for attr in ['_rope_wrapped', '_original_get_axial_freqs']:
                if hasattr(module, attr):
                    delattr(module, attr)
        debug.log(f"Restored {len(model._rope_patches)} RoPE methods", category="success")
        delattr(model, '_rope_patches')

    # 3. Restore I/O component forward methods and move to offload device
    if hasattr(model, '_io_swappers'):
        for module, module_name in model._io_swappers:
            if hasattr(module, '_original_forward'):
                module.forward = module._original_forward
            # Clean up wrapper attributes
            for attr in ['_original_forward', '_model_ref', '_debug_ref',
                         '_module_name', '_is_io_wrapped']:
                if hasattr(module, attr):
                    delattr(module, attr)
        debug.log(f"Restored {len(model._io_swappers)} I/O components", category="success")
        delattr(model, '_io_swappers')

    # Move all IO components to offload device during full cleanup
    if hasattr(model, 'offload_device'):
        offload_device = model.offload_device
        moved_count = 0
        for name, module in model.named_children():
            if name != "blocks":
                module.to(offload_device)
                moved_count += 1
        if moved_count > 0:
            debug.log(f"Moved {moved_count} IO components to offload device", category="success")

    # 4. Restore original .to() method
    if hasattr(model, '_original_to'):
        model.to = model._original_to
        delattr(model, '_original_to')
        debug.log("Restored original .to() method", category="success")

    # 5. Clean up BlockSwap-specific attributes
    for attr in ['_blockswap_runner_ref', 'blocks_to_swap', 'main_device',
                 'offload_device']:
        if hasattr(model, attr):
            delattr(model, attr)

    # 6. Clean up runner attributes
    runner._blockswap_active = False

    # Clean up pipelining resources on model (synchronize first)
    if hasattr(model, '_swap_stream'):
        try:
            model._swap_stream.synchronize()
        except Exception:
            pass
    for attr in ['_swap_stream', '_block_ready_events']:
        if hasattr(model, attr):
            delattr(model, attr)

    # Remove all config attributes
    for attr in ['_cached_blockswap_config', '_block_swap_config', '_blockswap_debug']:
        if hasattr(runner, attr):
            delattr(runner, attr)

    debug.log("BlockSwap cleanup complete", category="success")
src/optimization/blockswap.py.bak ADDED
@@ -0,0 +1,938 @@
1
+ """
2
+ BlockSwap Module for SeedVR2
3
+
4
+ This module implements dynamic block swapping between GPU and CPU memory
5
+ to enable running large models on limited VRAM systems.
6
+
7
+ Key Features:
8
+ - Dynamic transformer block offloading during inference
9
+ - Non-blocking GPU transfers for optimal performance
10
+ - RoPE computation fallback to CPU on OOM
11
+ - Minimal performance overhead with intelligent caching
12
+ - I/O component offloading for maximum memory savings
13
+ """
14
+
15
+ import time
16
+ import types
17
+ import torch
18
+ import weakref
19
+
20
+ from typing import Dict, Any, List, Optional
21
+ from .memory_manager import clear_memory
22
+ from .compatibility import call_rope_with_stability
23
+ from ..common.distributed import get_device
24
+
25
+
26
+ def is_blockswap_enabled(config: Optional[Dict[str, Any]]) -> bool:
27
+ """
28
+ Check if BlockSwap configuration indicates BlockSwap should be enabled.
29
+
30
+ BlockSwap is enabled if either blocks_to_swap > 0 OR swap_io_components is True.
31
+ This is the authoritative function for determining BlockSwap status from configuration.
32
+
33
+ Args:
34
+ config: BlockSwap configuration dictionary with optional keys:
35
+ - blocks_to_swap: Number of blocks to offload (0 = disabled)
36
+ - swap_io_components: Whether to offload I/O components
37
+
38
+ Returns:
39
+ True if BlockSwap should be active, False otherwise
40
+ """
41
+ if not config:
42
+ return False
43
+
44
+ blocks_to_swap = config.get("blocks_to_swap", 0)
45
+ swap_io_components = config.get("swap_io_components", False)
46
+
47
+ return blocks_to_swap > 0 or swap_io_components
48
+
49
+
50
+ def validate_blockswap_config(
51
+ block_swap_config: Optional[Dict[str, Any]],
52
+ dit_device: 'torch.device',
53
+ dit_offload_device: Optional['torch.device'],
54
+ debug: 'Debug'
55
+ ) -> Optional[Dict[str, Any]]:
56
+ """
57
+ Validate and potentially modify BlockSwap configuration.
58
+
59
+ Performs platform-specific validation and configuration adjustment:
60
+ - On macOS (MPS): Auto-disables BlockSwap since unified memory makes it meaningless
61
+ - On other platforms: Validates that offload_device is properly configured
62
+
63
+ This is the single authoritative validation point for BlockSwap configuration,
64
+ called early in configure_runner() before any model loading.
65
+
66
+ Args:
67
+ block_swap_config: BlockSwap configuration dictionary (may be None)
68
+ dit_device: Target device for DiT model inference
69
+ dit_offload_device: Device for offloading DiT blocks (may be None)
70
+ debug: Debug instance for logging warnings/errors
71
+
72
+ Returns:
73
+ Validated/modified block_swap_config (may be None or modified copy)
74
+
75
+ Raises:
76
+ ValueError: If BlockSwap is enabled but offload_device is invalid (non-MPS only)
77
+ """
78
+ if not is_blockswap_enabled(block_swap_config):
79
+ return block_swap_config
80
+
81
+ blocks_to_swap = block_swap_config.get("blocks_to_swap", 0)
82
+ swap_io_components = block_swap_config.get("swap_io_components", False)
83
+
84
+ # Check for macOS unified memory - BlockSwap is meaningless there
85
+ if dit_device.type == "mps":
86
+ debug.log(
87
+ f"BlockSwap disabled: macOS uses unified memory (no separate VRAM/RAM). "
88
+ f"Ignoring blocks_to_swap={blocks_to_swap}, swap_io_components={swap_io_components}",
89
+ level="WARNING", category="blockswap", force=True
90
+ )
91
+ # Return disabled config
92
+ return {
93
+ **block_swap_config,
94
+ "blocks_to_swap": 0,
95
+ "swap_io_components": False
96
+ }
97
+
98
+ # Validate offload_device is set and different from dit_device
99
+ offload_device_valid = (
100
+ dit_offload_device is not None and
101
+ str(dit_offload_device) != str(dit_device)
102
+ )
103
+
104
+ if not offload_device_valid:
105
+ config_details = []
106
+ if blocks_to_swap > 0:
107
+ config_details.append(f"blocks_to_swap={blocks_to_swap}")
108
+ if swap_io_components:
109
+ config_details.append("swap_io_components=True")
110
+
111
+ offload_str = str(dit_offload_device) if dit_offload_device else "none"
112
+ raise ValueError(
113
+ f"BlockSwap enabled ({', '.join(config_details)}) but dit_offload_device is invalid. "
114
+ f"Current: device='{dit_device}', dit_offload_device='{offload_str}'. "
115
+ f"BlockSwap requires offload_device on the DiT Model to be set and different from device. "
116
+ f"Set --dit_offload_device cpu or disable BlockSwap."
117
+ )
118
+
119
+ return block_swap_config
120
+
121
+
122
+ # Timing helpers marked to skip torch.compile tracing
123
+ # These functions are excluded from Dynamo's graph tracing to avoid warnings
124
+ # about non-traceable builtins like time.time(), but they still execute normally
125
+ @torch._dynamo.disable
126
+ def _get_swap_start_time(debug, enabled: bool) -> Optional[float]:
127
+ """Get start time for swap operation if debug is enabled."""
128
+ return time.time() if debug and enabled else None
129
+
130
+
131
+ @torch._dynamo.disable
132
+ def _log_swap_timing(debug, t_start: Optional[float], component_id, component_type: str) -> None:
133
+ """Log swap timing if start time was captured."""
134
+ if debug and t_start is not None:
135
+ debug.log_swap_time(
136
+ component_id=component_id,
137
+ duration=time.time() - t_start,
138
+ component_type=component_type
139
+ )
140
+
141
+
142
+ def get_module_memory_mb(module: torch.nn.Module) -> float:
143
+ """
144
+ Calculate memory usage of a module in MB.
145
+
146
+ Args:
147
+ module: PyTorch module to measure
148
+
149
+ Returns:
150
+ Memory usage in megabytes
151
+ """
152
+ total_bytes = sum(
153
+ param.nelement() * param.element_size()
154
+ for param in module.parameters()
155
+ if param.data is not None
156
+ )
157
+ return total_bytes / (1024 * 1024)
158
+
159
+
160
+ def apply_block_swap_to_dit(
161
+ runner: 'VideoDiffusionInfer',
162
+ block_swap_config: Dict[str, Any],
163
+ debug: 'Debug'
164
+ ) -> None:
165
+ """
166
+ Apply block swapping configuration to a DiT model with OOM protection.
167
+
168
+ This is the main entry point for configuring block swapping on a model.
169
+ Handles block selection, I/O component offloading, device placement, and
170
+ forward method wrapping for dynamic memory management.
171
+
172
+ Args:
173
+ runner: VideoDiffusionInfer instance containing the model
174
+ block_swap_config: Configuration dictionary with keys:
175
+ - blocks_to_swap: Number of blocks to swap (from the start)
176
+ - swap_io_components: Whether to offload I/O components
177
+ - enable_debug: Whether to enable debug logging
178
+ - offload_device: Device to offload to (default: 'cpu')
179
+ debug: Debug instance for logging (required)
180
+ """
181
+ # Early return if BlockSwap not enabled
182
+ if not is_blockswap_enabled(block_swap_config):
183
+ return
184
+
185
+ blocks_to_swap = block_swap_config.get("blocks_to_swap", 0)
186
+ swap_io_components = block_swap_config.get("swap_io_components", False)
187
+
188
+ # Early return only if both block swap and I/O swap are disabled
189
+ if blocks_to_swap <= 0 and not swap_io_components:
190
+ return
191
+
192
+ if debug is None:
193
+ if hasattr(runner, 'debug') and runner.debug is not None:
194
+ debug = runner.debug
195
+ else:
196
+ raise ValueError("Debug instance must be provided to apply_block_swap_to_dit")
197
+
198
+ debug.start_timer("apply_blockswap")
199
+
200
+ # Get the actual model (handle CompatibleDiT wrapper)
201
+ model = runner.dit
202
+ if hasattr(model, "dit_model"):
203
+ model = model.dit_model
204
+
205
+ # Determine devices
206
+ if hasattr(runner, '_dit_device'):
207
+ device = runner._dit_device
208
+ else:
209
+ device = get_device()
210
+ offload_device = block_swap_config.get("offload_device", torch.device('cpu'))
211
+
212
+ # Validate model structure
213
+ if not hasattr(model, "blocks"):
214
+ debug.log("Model doesn't have 'blocks' attribute for BlockSwap", level="ERROR", category="blockswap", force=True)
215
+ return
216
+
217
+ total_blocks = len(model.blocks)
218
+
219
+ # Clamp blocks_to_swap to available blocks BEFORE logging
220
+ effective_blocks = min(blocks_to_swap, total_blocks) if blocks_to_swap > 0 else 0
221
+
222
+ # Log configuration clearly based on what's enabled
223
+ block_text = "block" if effective_blocks <= 1 else "blocks"
224
+ if effective_blocks > 0 and swap_io_components:
225
+ debug.log(f"BlockSwap: {effective_blocks}/{total_blocks} transformer {block_text} + I/O components offloaded to {str(offload_device).upper()}", category="blockswap", force=True)
226
+ elif effective_blocks > 0:
227
+ debug.log(f"BlockSwap: {effective_blocks}/{total_blocks} transformer {block_text} offloaded to {str(offload_device).upper()}", category="blockswap", force=True)
228
+ elif swap_io_components:
229
+ debug.log(f"BlockSwap: I/O components offloaded to {str(offload_device).upper()} (0/{total_blocks} blocks swapped)", category="blockswap", force=True)
230
+
231
+ # Configure model with blockswap attributes
232
+ if blocks_to_swap > 0:
233
+ model.blocks_to_swap = effective_blocks - 1 # Convert to 0-indexed
234
+ else:
235
+ # No block swapping, set to -1 so no blocks match the swap condition
236
+ model.blocks_to_swap = -1
237
+
238
+ model.main_device = device
239
+ model.offload_device = offload_device
240
+
241
+ # Configure I/O components
242
+ io_config = _configure_io_components(model, device, offload_device,
243
+ swap_io_components, debug)
244
+ memory_stats = _configure_blocks(model, device, offload_device, debug)
245
+ memory_stats['io_components'] = io_config['components']
246
+ memory_stats['io_memory_mb'] = io_config['memory_mb']
247
+ memory_stats['gpu_components'] = io_config['gpu_components']
248
+ memory_stats['io_gpu_memory_mb'] = io_config['gpu_memory_mb']
249
+
250
+ # Log memory summary
251
+ _log_memory_summary(memory_stats, offload_device, device, swap_io_components,
252
+ debug)
253
+
254
+ # Wrap block forward methods for dynamic swapping (only if blocks_to_swap > 0)
255
+ if blocks_to_swap > 0:
256
+ for b, block in enumerate(model.blocks):
257
+ if b <= model.blocks_to_swap:
258
+ _wrap_block_forward(block, b, model, debug)
259
+
260
+ # Patch RoPE modules for robust error handling
261
+ _patch_rope_for_blockswap(model, debug)
262
+
263
+ # Mark BlockSwap as active
264
+ runner._blockswap_active = True
265
+
266
+ # Store configuration for debugging and cleanup
267
+ model._block_swap_config = {
268
+ "blocks_swapped": blocks_to_swap,
269
+ "swap_io_components": swap_io_components,
270
+ "total_blocks": total_blocks,
271
+ "offload_device": offload_device,
272
+ "main_device": device,
273
+ "offload_memory": memory_stats['offload_memory'],
274
+ "main_memory": memory_stats['main_memory']
275
+ }
276
+
277
+ # Protect model from being moved entirely
278
+ _protect_model_from_move(model, runner, debug)
279
+
280
+ debug.log("BlockSwap configuration complete", category="success")
281
+ debug.end_timer("apply_blockswap", "BlockSwap configuration application")
282
+
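+ # Usage sketch (illustrative, assuming the signature apply_block_swap_to_dit(runner,
+ # block_swap_config, debug)): a typical call site with a hypothetical config:
+ #
+ #   block_swap_config = {
+ #       "blocks_to_swap": 16,
+ #       "swap_io_components": True,
+ #       "offload_device": torch.device("cpu"),
+ #   }
+ #   apply_block_swap_to_dit(runner, block_swap_config, debug=runner.debug)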
283
+
284
+ def _configure_io_components(
285
+ model: torch.nn.Module,
286
+ device: torch.device,
287
+ offload_device: torch.device,
288
+ swap_io_components: bool,
289
+ debug: 'Debug'
290
+ ) -> Dict[str, Any]:
291
+ """
292
+ Configure I/O component placement and wrapping with memory tracking.
293
+
294
+ Handles all non-block modules (embeddings, normalization layers, etc.) by
295
+ either keeping them on GPU or offloading them with dynamic swapping wrappers.
296
+
297
+ Args:
298
+ model: DiT model containing named children to configure
299
+ device: Main computation device (typically GPU)
300
+ offload_device: Device for offloaded components (typically CPU)
301
+ swap_io_components: If True, offload I/O components with dynamic swapping
302
+ debug: Debug instance for logging (required)
303
+
304
+ Returns:
305
+ Dictionary containing:
306
+ - components: List of offloaded component names
307
+ - memory_mb: Total memory of offloaded components in MB
308
+ - gpu_components: List of components remaining on GPU
309
+ - gpu_memory_mb: Total memory of GPU components in MB
310
+ """
311
+ io_components_offloaded = []
312
+ io_components_on_gpu = []
313
+ io_memory_mb = 0.0
314
+ io_gpu_memory_mb = 0.0
315
+
316
+ # Handle I/O modules with dynamic swapping
317
+ for name, module in model.named_children():
318
+ if name != "blocks":
319
+ module_memory = get_module_memory_mb(module)
320
+
321
+ if swap_io_components:
322
+ module.to(offload_device)
323
+ _wrap_io_forward(module, name, model, debug)
324
+ io_components_offloaded.append(name)
325
+ io_memory_mb += module_memory
326
+ debug.log(f"{name} → {str(offload_device).upper()} ({module_memory:.2f}MB, dynamic swapping)", category="blockswap", indent_level=1)
327
+ else:
328
+ module.to(device)
329
+ io_components_on_gpu.append(name)
330
+ io_gpu_memory_mb += module_memory
331
+ debug.log(f"{name} → {str(device).upper()} ({module_memory:.2f}MB)", category="blockswap", indent_level=1)
332
+
333
+ return {
334
+ 'components': io_components_offloaded,
335
+ 'memory_mb': io_memory_mb,
336
+ 'gpu_components': io_components_on_gpu,
337
+ 'gpu_memory_mb': io_gpu_memory_mb
338
+ }
339
+
340
+
341
+ def _configure_blocks(
342
+ model: torch.nn.Module,
343
+ device: torch.device,
344
+ offload_device: torch.device,
345
+ debug: 'Debug'
346
+ ) -> Dict[str, float]:
347
+ """
348
+ Configure transformer block placement and calculate memory statistics.
349
+
350
+ Moves blocks to their designated devices based on model.blocks_to_swap
351
+ attribute. Blocks with index <= blocks_to_swap go to offload device,
352
+ others stay on main device.
353
+
354
+ Args:
355
+ model: DiT model with blocks attribute and blocks_to_swap configured
356
+ device: Main computation device for non-swapped blocks
357
+ offload_device: Device for swapped blocks
358
+ debug: Debug instance for logging (required)
359
+
360
+ Returns:
361
+ Dictionary containing:
362
+ - offload_memory: Total memory of offloaded blocks in MB
363
+ - main_memory: Total memory of blocks on main device in MB
364
+ - io_components: Empty list (populated by caller)
365
+ """
366
+ total_offload_memory = 0.0
367
+ total_main_memory = 0.0
368
+
369
+ # Move blocks based on swap configuration
370
+ for b, block in enumerate(model.blocks):
371
+ block_memory = get_module_memory_mb(block)
372
+
373
+ if b > model.blocks_to_swap:
374
+ block.to(device)
375
+ total_main_memory += block_memory
376
+ else:
377
+ block.to(offload_device, non_blocking=False)
378
+ total_offload_memory += block_memory
379
+
380
+ # Ensure all buffers match their containing module's device
381
+ for b, block in enumerate(model.blocks):
382
+ target_device = device if b > model.blocks_to_swap else offload_device
383
+ for name, buffer in block.named_buffers():
384
+ if buffer.device != torch.device(target_device):
385
+ buffer.data = buffer.data.to(target_device, non_blocking=False)
386
+
387
+ return {
388
+ "offload_memory": total_offload_memory,
389
+ "main_memory": total_main_memory,
390
+ "io_components": [] # Will be populated by caller
391
+ }
392
+
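+ # Worked example of the placement rule above: with blocks_to_swap=16 in the user
+ # config, apply_block_swap_to_dit stores model.blocks_to_swap = 15 (0-indexed), so
+ # blocks 0-15 move to the offload device and blocks 16..total_blocks-1 stay on the
+ # main device (the `b > model.blocks_to_swap` branch).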
393
+
394
+ def _log_memory_summary(
395
+ memory_stats: Dict[str, float],
396
+ offload_device: torch.device,
397
+ device: torch.device,
398
+ swap_io_components: bool,
399
+ debug: 'Debug'
400
+ ) -> None:
401
+ """
402
+ Log comprehensive memory usage summary for BlockSwap configuration.
403
+
404
+ Displays detailed breakdown of memory distribution across devices,
405
+ including transformer blocks and I/O components.
406
+
407
+ Args:
408
+ memory_stats: Dictionary containing:
409
+ - offload_memory: Memory offloaded from blocks (MB)
410
+ - main_memory: Memory remaining on main device (MB)
411
+ - io_memory_mb: Memory from offloaded I/O components (MB)
412
+ - io_gpu_memory_mb: Memory from I/O components on GPU (MB)
413
+ offload_device: Device used for offloading
414
+ device: Main computation device
415
+ swap_io_components: Whether I/O components are being swapped
416
+ debug: Debug instance for logging (required)
417
+ """
418
+ debug.log("BlockSwap memory configuration:", category="blockswap")
419
+
420
+ # Log transformer blocks memory
421
+ blocks_offloaded = memory_stats['offload_memory']
422
+ blocks_on_gpu = memory_stats['main_memory']
423
+
424
+ offload_str = str(offload_device)
425
+ device_str = str(device)
426
+
427
+ if blocks_on_gpu == 0:
428
+ debug.log(f"Transformer blocks: {blocks_offloaded:.2f}MB on {offload_str} (dynamic swapping)", category="blockswap", indent_level=1)
429
+ else:
430
+ debug.log(f"Transformer blocks: {blocks_on_gpu:.2f}MB on {device_str}, {blocks_offloaded:.2f}MB on {offload_str}", category="blockswap", indent_level=1)
431
+
432
+ # Always log I/O components (whether swapping or not)
433
+ io_memory = memory_stats.get('io_memory_mb', 0.0)
434
+ io_gpu_memory = memory_stats.get('io_gpu_memory_mb', 0.0)
435
+
436
+ if swap_io_components and io_memory > 0:
437
+ io_components = memory_stats.get('io_components', [])
438
+ debug.log(f"I/O components: {io_memory:.2f}MB on {offload_str} (dynamic swapping)", category="blockswap", indent_level=1)
439
+ debug.log(f"{', '.join(io_components)}", category="blockswap", indent_level=2)
440
+ elif io_gpu_memory > 0:
441
+ io_gpu_components = memory_stats.get('gpu_components', [])
442
+ debug.log(f"I/O components: {io_gpu_memory:.2f}MB on {device_str}", category="blockswap", indent_level=1)
443
+ debug.log(f"{', '.join(io_gpu_components)}", category="blockswap", indent_level=2)
444
+
445
+ # Log total VRAM savings
446
+ total_offloaded = blocks_offloaded + (io_memory if swap_io_components else 0)
447
+ if total_offloaded > 0:
448
+ debug.log(f"Total VRAM saved: {total_offloaded:.2f}MB (~{total_offloaded/1024:.2f}GB)", category="blockswap", indent_level=1)
449
+
450
+
451
+ def _wrap_block_forward(
452
+ block: torch.nn.Module,
453
+ block_idx: int,
454
+ model: torch.nn.Module,
455
+ debug: 'Debug'
456
+ ) -> None:
457
+ """
458
+ Wrap individual transformer block forward for dynamic device swapping.
459
+
460
+ Creates a wrapped forward method that automatically:
461
+ 1. Moves block to GPU before computation
462
+ 2. Executes original forward pass
463
+ 3. Moves block back to offload device after computation
464
+ 4. Logs timing and manages memory pressure
465
+
466
+ Uses weak references to prevent memory leaks from closure retention.
467
+
468
+ Args:
469
+ block: Individual transformer block to wrap
470
+ block_idx: Index of this block in model.blocks
471
+ model: Parent DiT model (used for device references)
472
+ debug: Debug instance for logging (required)
473
+ """
474
+ if hasattr(block, '_original_forward'):
475
+ return # Already wrapped
476
+
477
+ # Store original forward method
478
+ original_forward = block.forward
479
+
480
+ # Create weak references
481
+ model_ref = weakref.ref(model)
482
+ debug_ref = weakref.ref(debug)
483
+
484
+ # Store block_idx on the block itself to avoid closure issues
485
+ block._block_idx = block_idx
486
+
487
+ def wrapped_forward(self, *args, **kwargs):
488
+ # Retrieve weak references
489
+ model = model_ref()
490
+ debug = debug_ref()
491
+
492
+ if not model:
493
+ # Model has been garbage collected, fall back to original
494
+ return original_forward(*args, **kwargs)
495
+
496
+ # Check if block swap is active for this block
497
+ if hasattr(model, 'blocks_to_swap') and self._block_idx <= model.blocks_to_swap:
498
+ # Use dynamo-disabled helper to get start time (avoids compilation warnings)
499
+ t_start = _get_swap_start_time(debug, debug.enabled if debug else False)
500
+
501
+ # Only move to GPU if necessary
502
+ current_device = next(self.parameters()).device
503
+ target_device = torch.device(model.main_device)
504
+
505
+ if current_device != target_device:
506
+ self.to(model.main_device, non_blocking=False)
507
+
508
+ # Execute the original forward pass
509
+ output = original_forward(*args, **kwargs)
510
+
511
+ # Move back to offload device
512
+ self.to(model.offload_device, non_blocking=False)
513
+
514
+ # Use dynamo-disabled helper to log timing (avoids compilation warnings)
515
+ _log_swap_timing(debug, t_start, self._block_idx, "block")
516
+
517
+ # Only clear cache under memory pressure
518
+ clear_memory(debug=debug, deep=False, force=False, timer_name="wrap_block_forward")
519
+ else:
520
+ output = original_forward(*args, **kwargs)
521
+
522
+ return output
523
+
524
+ # Bind the wrapped function as a method to the block
525
+ block.forward = types.MethodType(wrapped_forward, block)
526
+
527
+ # Store reference to original forward for cleanup
528
+ block._original_forward = original_forward
529
+
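+ # Toy illustration of the same swap-on-forward pattern on a plain module (a sketch,
+ # not part of this file):
+ #
+ #   import torch, types
+ #   layer = torch.nn.Linear(8, 8)          # parameters live on CPU by default
+ #   _orig = layer.forward
+ #   def _swapping_forward(self, x):
+ #       self.to(x.device)                  # hop to the input's device
+ #       out = _orig(x)                     # run the original computation
+ #       self.to('cpu')                     # hop back to the offload device
+ #       return out
+ #   layer.forward = types.MethodType(_swapping_forward, layer)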
530
+
531
+ def _wrap_io_forward(
532
+ module: torch.nn.Module,
533
+ module_name: str,
534
+ model: torch.nn.Module,
535
+ debug: 'Debug'
536
+ ) -> None:
537
+ """
538
+ Wrap I/O component forward for dynamic device swapping.
539
+
540
+ Similar to _wrap_block_forward but for I/O components (embeddings,
541
+ normalization layers, etc.). Handles swapping between GPU and CPU
542
+ during forward passes.
543
+
544
+ Uses weak references to prevent circular dependencies and memory leaks.
545
+
546
+ Args:
547
+ module: I/O component module to wrap
548
+ module_name: Name identifier for logging (e.g., 'x_embedder')
549
+ model: Parent DiT model (used for device references)
550
+ debug: Debug instance for logging (required)
551
+ """
552
+ if hasattr(module, '_is_io_wrapped') and module._is_io_wrapped:
553
+ debug.log(f"Reusing existing I/O wrapper for {module_name}", category="reuse")
554
+ return # Already wrapped
555
+
556
+ # Store original forward method
557
+ original_forward = module.forward
558
+
559
+ # Create weak references
560
+ model_ref = weakref.ref(model)
561
+ debug_ref = weakref.ref(debug) if debug else lambda: None
562
+
563
+ # Store module name on the module itself
564
+ module._module_name = module_name
565
+ module._original_forward = original_forward
566
+
567
+ def wrapped_io_forward(self, *args, **kwargs):
568
+ # Retrieve weak references
569
+ model = model_ref()
570
+ debug = debug_ref()
571
+
572
+ if not model:
573
+ # Model has been garbage collected, fall back to original
574
+ return self._original_forward(*args, **kwargs)
575
+
576
+ # Use dynamo-disabled helper to get start time (avoids compilation warnings)
577
+ t_start = _get_swap_start_time(debug, debug.enabled if debug else False)
578
+
579
+ # Check current device to avoid unnecessary moves
580
+ current_device = next(self.parameters()).device
581
+ target_device = torch.device(model.main_device)
582
+
583
+ # Move to GPU for computation if needed
584
+ if current_device != target_device:
585
+ self.to(model.main_device, non_blocking=False)
586
+
587
+ # Execute forward pass
588
+ output = self._original_forward(*args, **kwargs)
589
+
590
+ # Move back to offload device
591
+ self.to(model.offload_device, non_blocking=False)
592
+
593
+ # Use dynamo-disabled helper to log timing (avoids compilation warnings)
594
+ _log_swap_timing(debug, t_start, self._module_name, "I/O")
595
+
596
+ # Only clear cache under memory pressure
597
+ clear_memory(debug=debug, deep=False, force=False, timer_name="wrap_io_forward")
598
+
599
+ return output
600
+
601
+ # Bind as a method
602
+ module.forward = types.MethodType(wrapped_io_forward, module)
603
+ module._is_io_wrapped = True
604
+
605
+ # Store module reference for restoration
606
+ if not hasattr(model, '_io_swappers'):
607
+ model._io_swappers = []
608
+ model._io_swappers.append((module, module_name))
609
+
610
+
611
+ def _patch_rope_for_blockswap(
612
+ model: torch.nn.Module,
613
+ debug: 'Debug'
614
+ ) -> None:
615
+ """
616
+ Patch RoPE (Rotary Position Embedding) modules for device-aware fallback.
617
+
618
+ Adds CPU fallback logic to RoPE modules to handle device mismatch errors
619
+ that can occur during BlockSwap operations. Complements the stability
620
+ wrapper from compatibility.py with device-specific error handling.
621
+
622
+ Args:
623
+ model: DiT model containing RoPE modules to patch
624
+ debug: Debug instance for logging (required)
625
+ """
626
+ rope_patches = []
627
+
628
+ for name, module in model.named_modules():
629
+ if "rope" in name.lower() and hasattr(module, "get_axial_freqs"):
630
+ # Skip if already wrapped by blockswap
631
+ if hasattr(module, '_blockswap_wrapped') and module._blockswap_wrapped:
632
+ continue
633
+
634
+ # Get current method (might be stability-wrapped)
635
+ current_method = module.get_axial_freqs
636
+
637
+ # Create device-aware wrapper with proper closure handling
638
+ def make_device_aware_wrapper(module_name, current_fn):
639
+ def device_aware_rope_wrapper(self, *args, **kwargs):
640
+ try:
641
+ # Try current method (original or stability-wrapped)
642
+ return current_fn(*args, **kwargs)
643
+ except (RuntimeError, torch.cuda.OutOfMemoryError) as e:
644
+ error_msg = str(e).lower()
645
+ # Only handle device/memory specific errors
646
+ if any(x in error_msg for x in ["device", "memory", "allocation"]):
647
+ debug.log(f"RoPE OOM for {module_name}", level="WARNING", category="rope", force=True)
648
+ debug.log(f"Clearing RoPE cache and retrying", category="info", force=True)
649
+
650
+ # Get current device from parameters
651
+ try:
652
+ current_device = next(self.parameters()).device
653
+ except StopIteration:
654
+ # Fallback: use model's main_device if BlockSwap has set it, else offload_device, else CPU
654
+ if hasattr(model, 'main_device'):
655
+ current_device = torch.device(model.main_device)
656
+ elif hasattr(model, 'offload_device'):
657
+ current_device = torch.device(model.offload_device)
+ else:
+ current_device = torch.device('cpu')  # last-resort default so current_device is always bound
659
+
660
+ # Try clearing cache first (non-invasive fix)
661
+ if hasattr(current_fn, 'cache_clear'):
662
+ current_fn.cache_clear()
663
+ try:
664
+ # Retry on same device after clearing cache
665
+ return current_fn(*args, **kwargs)
666
+ except Exception as retry_error:
667
+ # Cache clear wasn't enough, need more drastic measures
668
+ debug.log(f"Cache clear insufficient for {module_name}, falling back to CPU", level="WARNING", category="rope", force=True)
669
+
670
+ # Fallback to CPU computation with stability
671
+ self.cpu()
672
+
673
+ try:
674
+ # Use call_rope_with_stability for CPU computation
675
+ # This ensures cache is cleared and autocast disabled
676
+ original_fn = getattr(self, '_original_get_axial_freqs', current_fn)
677
+ result = call_rope_with_stability(original_fn, *args, **kwargs)
678
+
679
+ # Move module back to original device
680
+ self.to(current_device)
681
+
682
+ # Move result to appropriate device if it's a tensor
683
+ if hasattr(result, 'to'):
684
+ target_device = args[0].device if len(args) > 0 and hasattr(args[0], 'device') else current_device
685
+ return result.to(target_device)
686
+ return result
687
+
688
+ except Exception as cpu_error:
689
+ # Always restore device even on error
690
+ self.to(current_device)
691
+ raise cpu_error
692
+ else:
693
+ # Not a device error, let it bubble up
694
+ raise
695
+
696
+ return device_aware_rope_wrapper
697
+
698
+ # Apply wrapper
699
+ module.get_axial_freqs = types.MethodType(
700
+ make_device_aware_wrapper(name, current_method),
701
+ module
702
+ )
703
+ module._blockswap_wrapped = True
704
+
705
+ # Store for cleanup (use original or previously stored)
706
+ original_method = getattr(module, '_original_get_axial_freqs', current_method)
707
+ rope_patches.append((module, original_method))
708
+
709
+ if rope_patches:
710
+ model._rope_patches = rope_patches
711
+ debug.log(f"Patched {len(rope_patches)} RoPE modules with device handling", category="success")
712
+
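+ # Background note: the `cache_clear` attribute probed in the wrapper above is the
+ # standard hook that functools.lru_cache attaches to a decorated callable, e.g.
+ # (hypothetical function):
+ #
+ #   from functools import lru_cache
+ #
+ #   @lru_cache(maxsize=None)
+ #   def axial_freqs(*dims): ...
+ #
+ #   axial_freqs.cache_clear()   # drops every cached frequency tensor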
713
+
714
+ def _protect_model_from_move(
715
+ model: torch.nn.Module,
716
+ runner: 'VideoDiffusionInfer',
717
+ debug: 'Debug'
718
+ ) -> None:
719
+ """
720
+ Protect model from unintended full device movement during BlockSwap.
721
+
722
+ Wraps model.to() method to prevent other code from accidentally moving
723
+ the entire model to GPU, which would defeat BlockSwap's memory savings.
724
+ Allows movement only when explicitly bypassed via model flag.
725
+
726
+ Args:
727
+ model: DiT model to protect
728
+ runner: VideoDiffusionInfer instance (for active status check)
729
+ debug: Debug instance for logging (required)
730
+ """
731
+ if not hasattr(model, '_original_to'):
732
+ # Store runner reference as weak reference to avoid circular refs
733
+ model._blockswap_runner_ref = weakref.ref(runner)
734
+ model._original_to = model.to
735
+
736
+ # Define the protected method without closures
737
+ def protected_model_to(self, device, *args, **kwargs):
738
+ # Check if protection is temporarily bypassed for offloading
739
+ # Flag is stored on model itself (not runner) to survive runner recreation
740
+ if getattr(self, "_blockswap_bypass_protection", False):
741
+ # Protection bypassed, allow movement
742
+ if hasattr(self, '_original_to'):
743
+ return self._original_to(device, *args, **kwargs)
744
+
745
+ # Get configured offload device directly from model
746
+ blockswap_offload_device = "cpu" # default
747
+ if hasattr(self, "_block_swap_config"):
748
+ blockswap_offload_device = self._block_swap_config.get("offload_device", "cpu")
749
+
750
+ # Check if BlockSwap is currently active via runner weak reference
751
+ runner_ref = getattr(self, '_blockswap_runner_ref', None)
752
+ blockswap_is_active = False
753
+ if runner_ref:
754
+ runner_obj = runner_ref()
755
+ if runner_obj and hasattr(runner_obj, "_blockswap_active"):
756
+ blockswap_is_active = runner_obj._blockswap_active
757
+
758
+ # Block attempts to move model away from configured offload device when active
759
+ if blockswap_is_active and str(device) != str(blockswap_offload_device):
760
+ # Get debug instance from runner if available
761
+ debug_instance = None
762
+ if runner_ref:
763
+ runner_obj = runner_ref()
764
+ if runner_obj and hasattr(runner_obj, 'debug'):
765
+ debug_instance = runner_obj.debug
766
+
767
+ if debug_instance:
768
+ debug_instance.log(
769
+ f"Blocked attempt to move BlockSwap model from {blockswap_offload_device} to {device}",
770
+ level="WARNING", category="blockswap", force=True
771
+ )
772
+ return self
773
+
774
+ # Allow movement (either bypass is enabled or target is offload device)
775
+ if hasattr(self, '_original_to'):
776
+ return self._original_to(device, *args, **kwargs)
777
+ else:
778
+ # Fallback - shouldn't happen
779
+ return super(type(self), self).to(device, *args, **kwargs)
780
+
781
+ # Bind as a method to the model instance
782
+ model.to = types.MethodType(protected_model_to, model)
783
+
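+ # Resulting behaviour (illustrative, BlockSwap active with offload_device='cpu'):
+ #
+ #   model.to('cuda:0')   # blocked: a warning is logged and `model` is returned unchanged
+ #   model.to('cpu')      # allowed: target matches the configured offload device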
784
+
785
+ def set_blockswap_bypass(runner, bypass: bool, debug):
786
+ """
787
+ Set or unset bypass flag for BlockSwap protection.
788
+ Used for offloading to temporarily allow model movement.
789
+
790
+ Args:
791
+ runner: Runner instance with BlockSwap
792
+ bypass: True to bypass protection, False to enforce it
793
+ debug: Debug instance for logging
794
+ """
795
+ if not hasattr(runner, "_blockswap_active") or not runner._blockswap_active:
796
+ return
797
+
798
+ # Get the actual model (handle CompatibleDiT wrapper)
799
+ model = runner.dit
800
+ if hasattr(model, "dit_model"):
801
+ model = model.dit_model
802
+
803
+ # Store on model so it survives runner recreation during caching
804
+ model._blockswap_bypass_protection = bypass
805
+
806
+ if bypass:
807
+ debug.log("BlockSwap protection disabled to allow model DiT offloading", category="success")
808
+ else:
809
+ debug.log("BlockSwap protection renabled to avoid accidentally offloading the entire DiT model", category="success")
810
+
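+ # Usage sketch: bracket an intentional whole-model move with the bypass flag,
+ # assuming a hypothetical `offload_device` variable:
+ #
+ #   set_blockswap_bypass(runner, bypass=True, debug=debug)
+ #   runner.dit.to(offload_device)   # whole-model move now permitted
+ #   set_blockswap_bypass(runner, bypass=False, debug=debug)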
811
+
812
+ def cleanup_blockswap(runner, keep_state_for_cache=False):
813
+ """
814
+ Clean up BlockSwap configuration based on caching mode.
815
+
816
+ When caching (keep_state_for_cache=True):
817
+ - Keep all BlockSwap configuration intact
818
+ - Only mark as inactive for safety during non-inference operations
819
+
820
+ When not caching (keep_state_for_cache=False):
821
+ - Full cleanup of all BlockSwap state
822
+
823
+ Args:
824
+ runner: VideoDiffusionInfer instance to clean up
825
+ keep_state_for_cache: If True, preserve BlockSwap state for reuse
826
+ """
827
+ # Get debug instance from runner
828
+ if not hasattr(runner, 'debug') or runner.debug is None:
829
+ raise ValueError("Debug instance must be available on runner for cleanup_blockswap")
830
+
831
+ debug = runner.debug
832
+
833
+ # Get the actual model (handle CompatibleDiT wrapper)
834
+ model = runner.dit
835
+ if hasattr(model, "dit_model"):
836
+ model = model.dit_model
837
+
838
+ # Check if there's any BlockSwap state to clean up (check both runner and model)
839
+ has_blockswap_state = (
840
+ hasattr(runner, "_blockswap_active") or
841
+ hasattr(model, "_block_swap_config") or
842
+ hasattr(model, "_blockswap_bypass_protection")
843
+ )
844
+
845
+ if not has_blockswap_state:
846
+ return
847
+
848
+ debug.log("Starting BlockSwap cleanup", category="cleanup")
849
+
850
+ if keep_state_for_cache:
851
+ # Minimal cleanup for caching - just mark as inactive and allow offloading
852
+ # Everything else stays intact for fast reactivation
853
+ if hasattr(runner, "_blockswap_active") and runner._blockswap_active:
854
+ if not getattr(model, "_blockswap_bypass_protection", False):
855
+ set_blockswap_bypass(runner=runner, bypass=True, debug=debug)
856
+ runner._blockswap_active = False
857
+ debug.log("BlockSwap deactivated for caching (configuration preserved)", category="success")
858
+ return
859
+
860
+ # Full cleanup when not caching
861
+ # Get the actual model (handle CompatibleDiT wrapper)
862
+ model = runner.dit
863
+ if hasattr(model, "dit_model"):
864
+ model = model.dit_model
865
+
866
+ # 1. Restore block forward methods
867
+ if hasattr(model, 'blocks'):
868
+ restored_count = 0
869
+ for block in model.blocks:
870
+ if hasattr(block, '_original_forward'):
871
+ block.forward = block._original_forward
872
+ delattr(block, '_original_forward')
873
+ restored_count += 1
874
+
875
+ # Clean up wrapper attributes
876
+ for attr in ['_block_idx', '_model_ref', '_debug_ref', '_blockswap_wrapped']:
877
+ if hasattr(block, attr):
878
+ delattr(block, attr)
879
+
880
+ if restored_count > 0:
881
+ debug.log(f"Restored {restored_count} block forward methods", category="success")
882
+
883
+ # 2. Restore RoPE patches
884
+ if hasattr(model, '_rope_patches'):
885
+ for module, original_method in model._rope_patches:
886
+ module.get_axial_freqs = original_method
887
+ # Clean up wrapper attributes
888
+ for attr in ['_rope_wrapped', '_original_get_axial_freqs']:
889
+ if hasattr(module, attr):
890
+ delattr(module, attr)
891
+ debug.log(f"Restored {len(model._rope_patches)} RoPE methods", category="success")
892
+ delattr(model, '_rope_patches')
893
+
894
+ # 3. Restore I/O component forward methods and move to offload device
895
+ if hasattr(model, '_io_swappers'):
896
+ for module, module_name in model._io_swappers:
897
+ if hasattr(module, '_original_forward'):
898
+ module.forward = module._original_forward
899
+ # Clean up wrapper attributes
900
+ for attr in ['_original_forward', '_model_ref', '_debug_ref',
901
+ '_module_name', '_is_io_wrapped']:
902
+ if hasattr(module, attr):
903
+ delattr(module, attr)
904
+ debug.log(f"Restored {len(model._io_swappers)} I/O components", category="success")
905
+ delattr(model, '_io_swappers')
906
+
907
+ # Move all IO components to offload device during full cleanup
908
+ if hasattr(model, 'offload_device'):
909
+ offload_device = model.offload_device
910
+ moved_count = 0
911
+ for name, module in model.named_children():
912
+ if name != "blocks":
913
+ module.to(offload_device)
914
+ moved_count += 1
915
+ if moved_count > 0:
916
+ debug.log(f"Moved {moved_count} IO components to offload device", category="success")
917
+
918
+ # 4. Restore original .to() method
919
+ if hasattr(model, '_original_to'):
920
+ model.to = model._original_to
921
+ delattr(model, '_original_to')
922
+ debug.log("Restored original .to() method", category="success")
923
+
924
+ # 5. Clean up BlockSwap-specific attributes
925
+ for attr in ['_blockswap_runner_ref', 'blocks_to_swap', 'main_device',
926
+ 'offload_device']:
927
+ if hasattr(model, attr):
928
+ delattr(model, attr)
929
+
930
+ # 6. Clean up runner attributes
931
+ runner._blockswap_active = False
932
+
933
+ # Remove all config attributes
934
+ for attr in ['_cached_blockswap_config', '_block_swap_config', '_blockswap_debug']:
935
+ if hasattr(runner, attr):
936
+ delattr(runner, attr)
937
+
938
+ debug.log("BlockSwap cleanup complete", category="success")
src/optimization/memory_manager.py ADDED
@@ -0,0 +1,1285 @@
1
+ """
2
+ Memory management module for SeedVR2
3
+ Handles VRAM usage, cache management, and memory optimization
4
+
5
+ Extracted from: seedvr2.py (lines 373-405, 607-626, 1016-1044)
6
+ """
7
+
8
+ import torch
9
+ import gc
10
+ import sys
11
+ import time
12
+ import psutil
13
+ import platform
14
+ from typing import Tuple, Dict, Any, Optional, List, Union
15
+
16
+
17
+ def _device_str(device: Union[torch.device, str]) -> str:
18
+ """Normalized uppercase device string for comparison and logging. MPS variants → 'MPS'."""
19
+ s = str(device).upper()
20
+ return 'MPS' if s.startswith('MPS') else s
21
+
22
+
23
+ def is_mps_available() -> bool:
24
+ """Check if MPS (Apple Metal) backend is available."""
25
+ return hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()
26
+
27
+
28
+ def is_cuda_available() -> bool:
29
+ """Check if CUDA backend is available."""
30
+ return torch.cuda.is_available()
31
+
32
+
33
+ def get_gpu_backend() -> str:
34
+ """Get the active GPU backend type.
35
+
36
+ Returns:
37
+ 'cuda': NVIDIA CUDA
38
+ 'mps': Apple Metal Performance Shaders
39
+ 'cpu': No GPU backend available
40
+ """
41
+ if is_cuda_available():
42
+ return 'cuda'
43
+ if is_mps_available():
44
+ return 'mps'
45
+ return 'cpu'
46
+
47
+
48
+ def get_device_list(include_none: bool = False, include_cpu: bool = False) -> List[str]:
49
+ """
50
+ Get list of available compute devices for SeedVR2
51
+
52
+ Args:
53
+ include_none: If True, prepend "none" to the device list (for offload options)
54
+ include_cpu: If True, include "cpu" in the device list (for offload options only)
55
+ Note: On MPS-only systems, "cpu" is automatically excluded since
56
+ unified memory architecture makes CPU offloading meaningless
57
+
58
+ Returns:
59
+ List of device strings (e.g., ["cuda:0", "cuda:1"] or ["none", "cpu", "cuda:0", "cuda:1"])
60
+ """
61
+ devs = []
62
+ has_cuda = False
63
+ has_mps = False
64
+
65
+ try:
66
+ if is_cuda_available():
67
+ devs += [f"cuda:{i}" for i in range(torch.cuda.device_count())]
68
+ has_cuda = True
69
+ except Exception:
70
+ pass
71
+
72
+ try:
73
+ if is_mps_available():
74
+ devs.append("mps") # MPS doesn't use device indices
75
+ has_mps = True
76
+ except Exception:
77
+ pass
78
+
79
+ # Build result list with optional prefixes
80
+ result = []
81
+ if include_none:
82
+ result.append("none")
83
+
84
+ # Only include "cpu" option if:
85
+ # 1. It was requested (include_cpu=True), AND
86
+ # 2. Either CUDA is available OR MPS is not the only option
87
+ # Rationale: On MPS-only systems with unified memory architecture,
88
+ # CPU offloading is semantically meaningless as CPU and GPU share the same memory pool
89
+ if include_cpu and (has_cuda or not has_mps):
90
+ result.append("cpu")
91
+
92
+ result.extend(devs)
93
+
94
+ return result if result else []
95
+
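+ # Example return values (hypothetical machines):
+ #   two-GPU CUDA host:  get_device_list() -> ["cuda:0", "cuda:1"]
+ #   same host, offload: get_device_list(include_none=True, include_cpu=True)
+ #                       -> ["none", "cpu", "cuda:0", "cuda:1"]
+ #   Apple Silicon:      get_device_list(include_cpu=True) -> ["mps"] (cpu excluded by design)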
96
+
97
+ def get_basic_vram_info(device: Optional[torch.device] = None) -> Dict[str, Any]:
98
+ """
99
+ Get basic VRAM availability info (free and total memory).
100
+ Used for capacity planning and initial checks.
101
+
102
+ Args:
103
+ device: Optional device to query. If None, uses cuda:0
104
+
105
+ Returns:
106
+ dict: {"free_gb": float, "total_gb": float} or {"error": str}
107
+ """
108
+ try:
109
+ if is_cuda_available():
110
+ if device is None:
111
+ device = torch.device("cuda:0")
112
+ elif not isinstance(device, torch.device):
113
+ device = torch.device(device)
114
+ free_memory, total_memory = torch.cuda.mem_get_info(device)
115
+ elif is_mps_available():
116
+ # MPS doesn't support per-device queries or mem_get_info
117
+ # Use system memory as proxy
118
+ mem = psutil.virtual_memory()
119
+ free_memory = mem.total - mem.used
120
+ total_memory = mem.total
121
+ else:
122
+ return {"error": "No GPU backend available (CUDA/MPS)"}
123
+
124
+ return {
125
+ "free_gb": free_memory / (1024**3),
126
+ "total_gb": total_memory / (1024**3)
127
+ }
128
+ except Exception as e:
129
+ return {"error": f"Failed to get memory info: {str(e)}"}
130
+
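+ # Example results (hypothetical values):
+ #   24GB CUDA card: get_basic_vram_info() -> {"free_gb": 21.4, "total_gb": 24.0}
+ #   CPU-only host:  get_basic_vram_info() -> {"error": "No GPU backend available (CUDA/MPS)"}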
131
+
132
+ # Initial VRAM check at module load
133
+ vram_info = get_basic_vram_info(device=None)
134
+ if "error" not in vram_info:
135
+ backend = "MPS" if is_mps_available() else "CUDA"
136
+ print(f"📊 Initial {backend} memory: {vram_info['free_gb']:.2f}GB free / {vram_info['total_gb']:.2f}GB total")
137
+ else:
138
+ print(f"⚠️ Memory check failed: {vram_info['error']} - No available backend!")
139
+
140
+
141
+ def get_vram_usage(device: Optional[torch.device] = None, debug: Optional['Debug'] = None) -> Tuple[float, float, float, float]:
142
+ """
143
+ Get current VRAM usage metrics for monitoring.
144
+ Used for tracking memory consumption during processing.
145
+
146
+ Args:
147
+ device: Optional device to query. If None, uses cuda:0
148
+ debug: Optional debug instance for logging
149
+
150
+ Returns:
151
+ tuple: (allocated_gb, reserved_gb, peak_allocated_gb, peak_reserved_gb)
152
+ Returns (0, 0, 0, 0) if no GPU available
153
+ """
154
+ try:
155
+ if is_cuda_available():
156
+ if device is None:
157
+ device = torch.device("cuda:0")
158
+ elif not isinstance(device, torch.device):
159
+ device = torch.device(device)
160
+ allocated = torch.cuda.memory_allocated(device) / (1024**3)
161
+ reserved = torch.cuda.memory_reserved(device) / (1024**3)
162
+ peak_allocated = torch.cuda.max_memory_allocated(device) / (1024**3)
163
+ peak_reserved = torch.cuda.max_memory_reserved(device) / (1024**3)
164
+ return allocated, reserved, peak_allocated, peak_reserved
165
+ elif is_mps_available():
166
+ # MPS doesn't support per-device queries - uses global memory tracking
167
+ allocated = torch.mps.current_allocated_memory() / (1024**3)
168
+ reserved = torch.mps.driver_allocated_memory() / (1024**3)
169
+ # MPS doesn't track peak separately
170
+ return allocated, reserved, allocated, reserved
171
+ except Exception as e:
172
+ if debug:
173
+ debug.log(f"Failed to get VRAM usage: {e}", level="WARNING", category="memory", force=True)
174
+ return 0.0, 0.0, 0.0, 0.0
175
+
176
+
177
+ def get_ram_usage(debug: Optional['Debug'] = None) -> Tuple[float, float, float, float]:
178
+ """
179
+ Get current RAM usage metrics for the current process.
180
+ Provides accurate tracking of process-specific memory consumption.
181
+
182
+ Args:
183
+ debug: Optional debug instance for logging
184
+
185
+ Returns:
186
+ tuple: (process_gb, available_gb, total_gb, used_by_others_gb)
187
+ Returns (0, 0, 0, 0) if psutil not available or on error
188
+ """
189
+ try:
190
+ if not psutil:
191
+ return 0.0, 0.0, 0.0, 0.0
192
+
193
+ # Get current process memory
194
+ process = psutil.Process()
195
+ process_memory = process.memory_info()
196
+ process_gb = process_memory.rss / (1024**3)
197
+
198
+ # Get system memory
199
+ sys_memory = psutil.virtual_memory()
200
+ total_gb = sys_memory.total / (1024**3)
201
+ available_gb = sys_memory.available / (1024**3)
202
+
203
+ # Calculate memory used by other processes
204
+ # This is the CORRECT calculation:
205
+ total_used_gb = total_gb - available_gb # Total memory used by ALL processes
206
+ used_by_others_gb = max(0, total_used_gb - process_gb) # Subtract current process
207
+
208
+ return process_gb, available_gb, total_gb, used_by_others_gb
209
+
210
+ except Exception as e:
211
+ if debug:
212
+ debug.log(f"Failed to get RAM usage: {e}", level="WARNING", category="memory", force=True)
213
+ return 0.0, 0.0, 0.0, 0.0
214
+
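+ # Worked example (hypothetical numbers): 32GB total, 20GB available, process RSS 6GB:
+ #   total_used_gb     = 32 - 20 = 12
+ #   used_by_others_gb = 12 - 6  = 6   -> returns (6.0, 20.0, 32.0, 6.0)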
215
+
216
+ # Global cache for OS libraries (initialized once)
217
+ _os_memory_lib = None
218
+
219
+
220
+ def clear_memory(debug: Optional['Debug'] = None, deep: bool = False, force: bool = True,
221
+ timer_name: Optional[str] = None) -> None:
222
+ """
223
+ Clear memory caches with two-tier approach for optimal performance.
224
+
225
+ Args:
226
+ debug: Debug instance for logging (optional)
227
+ force: If True, always clear. If False, only clear when <5% free
228
+ deep: If True, perform deep cleanup including GC and OS operations.
229
+ If False (default), only perform minimal GPU cache clearing.
230
+ timer_name: Optional suffix for timer names to make them unique per invocation
231
+
232
+ Two-tier approach:
233
+ - Minimal mode (deep=False): GPU cache operations (~1-5ms)
234
+ Used for frequent calls during batch processing
235
+ - Deep mode (deep=True): Complete cleanup with GC and OS operations (~10-50ms)
236
+ Used at key points like model switches or final cleanup
237
+ """
238
+ global _os_memory_lib
239
+
240
+ # Create unique timer names if suffix provided
241
+ if timer_name:
242
+ main_timer = f"memory_clear_{timer_name}"
243
+ gpu_timer = f"gpu_cache_clear_{timer_name}"
244
+ gc_timer = f"garbage_collection_{timer_name}"
245
+ os_timer = f"os_memory_release_{timer_name}"
246
+ completion_msg = f"clear_memory() completion ({timer_name})"
247
+ else:
248
+ main_timer = "memory_clear"
249
+ gpu_timer = "gpu_cache_clear"
250
+ gc_timer = "garbage_collection"
251
+ os_timer = "os_memory_release"
252
+ completion_msg = "clear_memory() completion"
253
+
254
+ # Start timer for entire operation
255
+ if debug:
256
+ debug.start_timer(main_timer)
257
+
258
+ # Check if we should clear based on memory pressure
259
+ if not force:
260
+ should_clear = False
261
+
262
+ # Use existing function for memory info
263
+ mem_info = get_basic_vram_info(device=None)
264
+
265
+ if "error" not in mem_info and mem_info["total_gb"] > 0:
266
+ # Check VRAM/MPS memory pressure (5% free threshold)
267
+ free_ratio = mem_info["free_gb"] / mem_info["total_gb"]
268
+ if free_ratio < 0.05:
269
+ should_clear = True
270
+ if debug:
271
+ backend = "Unified Memory" if is_mps_available() else "VRAM"
272
+ debug.log(f"{backend} pressure: {mem_info['free_gb']:.2f}GB free of {mem_info['total_gb']:.2f}GB", category="memory")
273
+
274
+ # For non-MPS systems, also check system RAM separately
275
+ if not should_clear and not is_mps_available():
276
+ mem = psutil.virtual_memory()
277
+ if mem.available < mem.total * 0.05:
278
+ should_clear = True
279
+ if debug:
280
+ debug.log(f"RAM pressure: {mem.available/(1024**3):.2f}GB free of {mem.total/(1024**3):.2f}GB", category="memory")
281
+
282
+ if not should_clear:
283
+ # End timer before early return to keep stack clean
284
+ if debug:
285
+ debug.end_timer(main_timer)
286
+ return
287
+
288
+ # Determine cleanup level
289
+ cleanup_mode = "deep" if deep else "minimal"
290
+ if debug:
291
+ debug.log(f"Clearing memory caches ({cleanup_mode})...", category="cleanup")
292
+
293
+ # ===== MINIMAL OPERATIONS (Always performed) =====
294
+ # Step 1: Clear GPU caches - Fast operations (~1-5ms)
295
+ if debug:
296
+ debug.start_timer(gpu_timer)
297
+
298
+ if is_cuda_available():
299
+ torch.cuda.empty_cache()
300
+ torch.cuda.ipc_collect()
301
+ elif is_mps_available():
302
+ torch.mps.empty_cache()
303
+
304
+ if debug:
305
+ debug.end_timer(gpu_timer, "GPU cache clearing")
306
+
307
+ # ===== DEEP OPERATIONS (Only when deep=True) =====
308
+ if deep:
309
+ # Step 2: Deep garbage collection (expensive ~5-20ms)
310
+ if debug:
311
+ debug.start_timer(gc_timer)
312
+
313
+ gc.collect(2)
314
+
315
+ if debug:
316
+ debug.end_timer(gc_timer, "Garbage collection")
317
+
318
+ # Step 3: Return memory to OS (platform-specific, ~5-30ms)
319
+ if debug:
320
+ debug.start_timer(os_timer)
321
+
322
+ try:
323
+ if sys.platform == 'linux':
324
+ # Linux: malloc_trim
325
+ import ctypes # Import only when needed
326
+ if _os_memory_lib is None:
327
+ _os_memory_lib = ctypes.CDLL("libc.so.6")
328
+ _os_memory_lib.malloc_trim(0)
329
+
330
+ elif sys.platform == 'win32':
331
+ # Windows: Trim working set
332
+ import ctypes # Import only when needed
333
+ if _os_memory_lib is None:
334
+ _os_memory_lib = ctypes.windll.kernel32
335
+ handle = _os_memory_lib.GetCurrentProcess()
336
+ _os_memory_lib.SetProcessWorkingSetSize(handle, -1, -1)
337
+
338
+ elif is_mps_available():
339
+ # macOS with MPS
340
+ import ctypes # Import only when needed
341
+ import ctypes.util
342
+ if _os_memory_lib is None:
343
+ libc_path = ctypes.util.find_library('c')
344
+ if libc_path:
345
+ _os_memory_lib = ctypes.CDLL(libc_path)
346
+
347
+ if _os_memory_lib:
348
+ _os_memory_lib.sync()
349
+ except Exception as e:
350
+ if debug:
351
+ debug.log(f"Failed to perform OS memory operations: {e}", level="WARNING", category="memory", force=True)
352
+
353
+ if debug:
354
+ debug.end_timer(os_timer, "OS memory release")
355
+
356
+ # End overall timer
357
+ if debug:
358
+ debug.end_timer(main_timer, completion_msg)
359
+
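+ # Usage sketch of the two tiers: a cheap call inside a batch loop versus a deep
+ # cleanup at a model switch:
+ #
+ #   clear_memory(debug=debug, force=False)            # minimal; only acts when <5% is free
+ #   clear_memory(debug=debug, deep=True, force=True)  # adds GC and OS release (~10-50ms)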
360
+
361
+ def retry_on_oom(func, *args, debug=None, operation_name="operation", **kwargs):
362
+ """
363
+ Execute function with single OOM retry after memory cleanup.
364
+
365
+ Args:
366
+ func: Callable to execute
367
+ *args: Positional arguments for func
368
+ debug: Debug instance for logging (optional)
369
+ operation_name: Name for logging
370
+ **kwargs: Keyword arguments for func
371
+
372
+ Returns:
373
+ Result of func(*args, **kwargs)
374
+ """
375
+ try:
376
+ return func(*args, **kwargs)
377
+ except (torch.cuda.OutOfMemoryError, RuntimeError) as e:
378
+ # Only handle OOM errors
379
+ if not any(x in str(e).lower() for x in ["out of memory", "allocation on device"]):
380
+ raise
381
+
382
+ if debug:
383
+ debug.log(f"OOM during {operation_name}: {e}", level="WARNING", category="memory", force=True)
384
+ debug.log(f"Clearing memory and retrying", category="info", force=True)
385
+
386
+ # Clear memory
387
+ clear_memory(debug=debug, deep=True, force=True, timer_name=operation_name)
388
+ # Let memory settle
389
+ time.sleep(0.5)
390
+ if debug: debug.log_memory_state("After memory clearing", show_tensors=False, detailed_tensors=False)
391
+
392
+ # Single retry
393
+ try:
394
+ result = func(*args, **kwargs)
395
+ if debug:
396
+ debug.log(f"Retry successful for {operation_name}", category="success", force=True)
397
+ return result
398
+ except Exception as retry_e:
399
+ if debug:
400
+ debug.log(f"Retry failed for {operation_name}: {retry_e}", level="ERROR", category="memory", force=True)
401
+ raise
402
+
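+ # Usage sketch (hypothetical `vae.decode` callable): give an OOM-prone call one
+ # cleanup-and-retry cycle:
+ #
+ #   result = retry_on_oom(vae.decode, latents, debug=debug, operation_name="vae decode")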
403
+
404
+ def reset_vram_peak(device: Optional[torch.device] = None, debug: Optional['Debug'] = None) -> None:
405
+ """
406
+ Reset VRAM peak memory statistics for fresh tracking.
407
+
408
+ Args:
409
+ device: Optional device to reset stats for. If None, uses cuda:0
410
+ debug: Optional debug instance for logging
411
+ """
412
+ if debug and debug.enabled:
413
+ debug.log("Resetting VRAM peak memory statistics", category="memory")
414
+ try:
415
+ if is_cuda_available():
416
+ if device is None:
417
+ device = torch.device("cuda:0")
418
+ elif not isinstance(device, torch.device):
419
+ device = torch.device(device)
420
+ torch.cuda.reset_peak_memory_stats(device)
421
+ # Note: MPS doesn't support peak memory reset - no action needed
422
+ except Exception as e:
423
+ if debug and debug.enabled:
424
+ debug.log(f"Failed to reset peak memory stats: {e}", level="WARNING", category="memory", force=True)
425
+
426
+
427
+ def clear_rope_lru_caches(model: Optional[torch.nn.Module], debug: Optional['Debug'] = None) -> int:
428
+ """
429
+ Clear ALL LRU caches from RoPE modules.
430
+
431
+ Args:
432
+ model: PyTorch model to clear caches from
433
+ debug: Optional debug instance for logging
434
+
435
+ Returns:
436
+ Number of caches cleared
437
+ """
438
+ if model is None:
439
+ return 0
440
+
441
+ cleared_count = 0
442
+ try:
443
+ for name, module in model.named_modules():
444
+ if hasattr(module, 'get_axial_freqs') and hasattr(module.get_axial_freqs, 'cache_clear'):
445
+ try:
446
+ module.get_axial_freqs.cache_clear()
447
+ cleared_count += 1
448
+ except Exception as e:
449
+ if debug:
450
+ debug.log(f"Failed to clear RoPE LRU cache for module {name}: {e}", level="WARNING", category="memory", force=True)
451
+ except (AttributeError, RuntimeError) as e:
452
+ if debug:
453
+ debug.log(f"Failed to iterate model modules for RoPE LRU cache clearing: {e}", level="WARNING", category="memory", force=True)
454
+
455
+ return cleared_count
456
+
457
+
458
+ def release_tensor_memory(tensor: Optional[torch.Tensor]) -> None:
459
+ """Release tensor memory from any device (CPU/CUDA/MPS)"""
460
+ if tensor is not None and torch.is_tensor(tensor):
461
+ # Release storage for all devices (CPU, CUDA, MPS)
462
+ if tensor.numel() > 0:
463
+ tensor.data.set_()
464
+ tensor.grad = None
465
+
466
+
467
+ def release_tensor_collection(collection: Any, recursive: bool = True) -> None:
468
+ """
469
+ Release GPU memory from tensors in any collection (list, tuple, dict, or single tensor).
470
+
471
+ Args:
472
+ collection: Tensor, list, tuple, dict, or nested structure to release
473
+ recursive: If True, handle nested structures recursively
474
+
475
+ Examples:
476
+ release_tensor_collection(tensor) # Single tensor
477
+ release_tensor_collection([tensor1, tensor2]) # List of tensors
478
+ release_tensor_collection([[t1, t2], [t3, t4]]) # Nested lists
479
+ release_tensor_collection({'a': tensor}) # Dict values
480
+ """
481
+ if collection is None:
482
+ return
483
+
484
+ if torch.is_tensor(collection):
485
+ release_tensor_memory(collection)
486
+ elif isinstance(collection, dict):
487
+ for value in collection.values():
488
+ if recursive:
489
+ release_tensor_collection(value, recursive=True)
490
+ elif torch.is_tensor(value):
491
+ release_tensor_memory(value)
492
+ elif isinstance(collection, (list, tuple)):
493
+ for item in collection:
494
+ if recursive:
495
+ release_tensor_collection(item, recursive=True)
496
+ elif torch.is_tensor(item):
497
+ release_tensor_memory(item)
498
+
499
+
500
+ def release_text_embeddings(*embeddings: torch.Tensor, debug: Optional['Debug'] = None, names: Optional[List[str]] = None) -> None:
501
+ """
502
+ Release memory for text embeddings
503
+
504
+ Args:
505
+ *embeddings: Variable number of embedding tensors to release
506
+ debug: Optional debug instance for logging
507
+ names: Optional list of names for logging
508
+ """
509
+ for i, embedding in enumerate(embeddings):
510
+ if embedding is not None:
511
+ release_tensor_memory(embedding)
512
+ if debug and names and i < len(names):
513
+ debug.log(f"Cleaned up {names[i]}", category="cleanup")
514
+
515
+
516
+ def cleanup_text_embeddings(ctx: Dict[str, Any], debug: Optional['Debug'] = None) -> None:
517
+ """
518
+ Clean up text embeddings from a context dictionary.
519
+ Extracts embeddings, releases memory, and clears the context entry.
520
+
521
+ Args:
522
+ ctx: Context dictionary potentially containing 'text_embeds'
523
+ debug: Optional debug instance for logging
524
+ """
525
+ if not ctx or not ctx.get('text_embeds'):
526
+ return
527
+
528
+ embeddings = []
529
+ names = []
530
+ for key, embeds_list in ctx['text_embeds'].items():
531
+ if embeds_list:
532
+ embeddings.extend(embeds_list)
533
+ names.append(key)
534
+
535
+ if embeddings:
536
+ release_text_embeddings(*embeddings, debug=debug, names=names)
537
+
538
+ if debug:
539
+ debug.log(f"Cleaned up text embeddings: {', '.join(names)}", category="cleanup")
540
+
541
+ ctx['text_embeds'] = None
542
+
543
+
544
+ def release_model_memory(model: Optional[torch.nn.Module], debug: Optional['Debug'] = None) -> None:
545
+ """
546
+ Release all GPU/MPS memory from model in-place without CPU transfer.
547
+
548
+ Args:
549
+ model: PyTorch model to release memory from
550
+ debug: Optional debug instance for logging
551
+ """
552
+ if model is None:
553
+ return
554
+
555
+ # If the model has pipelining resources (swap stream), synchronize to ensure no pending async ops
556
+ try:
557
+ if hasattr(model, "_swap_stream"):
558
+ try:
559
+ model._swap_stream.synchronize()
560
+ except Exception:
561
+ if debug:
562
+ debug.log("Failed to synchronize model._swap_stream before releasing memory", level="WARNING", category="memory", force=True)
563
+ except Exception:
564
+ pass
565
+
566
+ try:
567
+ # Clear gradients first
568
+ model.zero_grad(set_to_none=True)
569
+
570
+ # Release GPU memory directly without CPU transfer
571
+ released_params = 0
572
+ released_buffers = 0
573
+
574
+ for param in model.parameters():
575
+ if param.is_cuda or param.is_mps:
576
+ if param.numel() > 0:
577
+ param.data.set_()
578
+ released_params += 1
579
+ param.grad = None
580
+
581
+ for buffer in model.buffers():
582
+ if buffer.is_cuda or buffer.is_mps:
583
+ if buffer.numel() > 0:
584
+ buffer.data.set_()
585
+ released_buffers += 1
586
+
587
+ if debug and (released_params > 0 or released_buffers > 0):
588
+ debug.log(f"Released memory from {released_params} params and {released_buffers} buffers", category="success")
589
+
590
+ except (AttributeError, RuntimeError) as e:
591
+ if debug:
592
+ debug.log(f"Failed to release model memory: {e}", level="WARNING", category="memory", force=True)
593
+
594
+
595
+ def manage_tensor(
596
+ tensor: torch.Tensor,
597
+ target_device: torch.device,
598
+ tensor_name: str = "tensor",
599
+ dtype: Optional[torch.dtype] = None,
600
+ non_blocking: bool = False,
601
+ debug: Optional['Debug'] = None,
602
+ reason: Optional[str] = None,
603
+ indent_level: int = 0
604
+ ) -> torch.Tensor:
605
+ """
606
+ Unified tensor management for device movement and dtype conversion.
607
+
608
+ Handles both device transfers (CPU ↔ GPU) and dtype conversions (e.g., float16 → bfloat16)
609
+ with intelligent early-exit optimization and comprehensive logging.
610
+
611
+ Args:
612
+ tensor: Tensor to manage
613
+ target_device: Target device (torch.device object)
614
+ tensor_name: Descriptive name for logging (e.g., "latent", "sample", "alpha_channel")
615
+ dtype: Optional target dtype to cast to (if None, keeps original dtype)
616
+ non_blocking: Whether to use non-blocking transfer
617
+ debug: Debug instance for logging
618
+ reason: Optional reason for the operation (e.g., "inference", "offload", "dtype alignment")
619
+ indent_level: Indentation level for debug logging (0=no indent, 1=2 spaces, etc.)
620
+
621
+ Returns:
622
+ Tensor on target device with optional dtype conversion
623
+
624
+ Note:
625
+ - Skips operation if tensor already has target device and dtype (zero-copy)
626
+ - Uses PyTorch's optimized .to() for efficient device/dtype handling
627
+ - Logs all operations consistently for tracking and debugging
628
+ """
629
+ if tensor is None:
630
+ return tensor
631
+
632
+ # Get current state
633
+ current_device = tensor.device
634
+ current_dtype = tensor.dtype
635
+ target_dtype = dtype if dtype is not None else current_dtype
636
+
637
+ # Check if movement is actually needed
638
+ needs_device_move = _device_str(current_device) != _device_str(target_device)
639
+ needs_dtype_change = dtype is not None and current_dtype != target_dtype
640
+
641
+ if not needs_device_move and not needs_dtype_change:
642
+ # Already on target device and dtype - skip
643
+ return tensor
644
+
645
+ # Determine reason for movement
646
+ if reason is None:
647
+ if needs_device_move and needs_dtype_change:
648
+ reason = "device and dtype conversion"
649
+ elif needs_device_move:
650
+ reason = "device movement"
651
+ else:
652
+ reason = "dtype conversion"
653
+
654
+ # Log the movement
655
+ if debug:
656
+ current_device_str = _device_str(current_device)
657
+ target_device_str = _device_str(target_device)
658
+
659
+ dtype_info = ""
660
+ if needs_dtype_change:
661
+ dtype_info = f", {current_dtype} → {target_dtype}"
662
+
663
+ debug.log(
664
+ f"Moving {tensor_name} from {current_device_str} to {target_device_str}{dtype_info} ({reason})",
665
+ category="general",
666
+ indent_level=indent_level
667
+ )
668
+
669
+ # Perform the operation based on what needs to change
670
+ if needs_device_move and needs_dtype_change:
671
+ # Both device and dtype need to change
672
+ return tensor.to(target_device, dtype=target_dtype, non_blocking=non_blocking)
673
+ elif needs_device_move:
674
+ # Only device needs to change
675
+ return tensor.to(target_device, non_blocking=non_blocking)
676
+ else:
677
+ # Only dtype needs to change
678
+ return tensor.to(dtype=target_dtype)
679
+
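+ # Usage sketch: move a latent to the GPU and align its dtype in one call:
+ #
+ #   latent = manage_tensor(latent, torch.device('cuda:0'), tensor_name="latent",
+ #                          dtype=torch.bfloat16, debug=debug, reason="inference")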
680
+
681
+ def manage_model_device(model: torch.nn.Module, target_device: torch.device, model_name: str,
682
+ debug: Optional['Debug'] = None, reason: Optional[str] = None,
683
+ runner: Optional[Any] = None) -> bool:
684
+ """
685
+ Move model to target device with optimizations.
686
+ Handles BlockSwap-enabled models transparently.
687
+
688
+ Args:
689
+ model: The model to move
690
+ target_device: Target device (torch.device object, e.g., torch.device('cuda:0'))
691
+ model_name: Name for logging (e.g., "VAE", "DiT")
692
+ debug: Debug instance for logging
693
+ reason: Optional custom reason for the movement
694
+ runner: Optional runner instance for BlockSwap detection
695
+
696
+ Returns:
697
+ bool: True if model was moved, False if already on target device
698
+ """
699
+ if model is None:
700
+ return False
701
+
702
+ # Check if this is a BlockSwap-enabled DiT model
703
+ is_blockswap_model = False
704
+ actual_model = model
705
+ if runner and model_name == "DiT":
706
+ # Import here to avoid circular dependency
707
+ from .blockswap import is_blockswap_enabled
708
+ # Check if BlockSwap config exists and is enabled
709
+ has_blockswap_config = (
710
+ hasattr(runner, '_dit_block_swap_config') and
711
+ is_blockswap_enabled(runner._dit_block_swap_config)
712
+ )
713
+
714
+ if has_blockswap_config:
715
+ is_blockswap_model = True
716
+ # Get the actual model (handle CompatibleDiT wrapper)
717
+ if hasattr(model, "dit_model"):
718
+ actual_model = model.dit_model
719
+
720
+ # Get current device
721
+ try:
722
+ current_device = next(model.parameters()).device
723
+ except StopIteration:
724
+ return False
725
+
726
+ # Extract device type for comparison (both are torch.device objects)
727
+ target_type = target_device.type
728
+ current_device_upper = _device_str(current_device)
729
+ target_device_upper = _device_str(target_device)
730
+
731
+ # Compare normalized device types
732
+ if current_device_upper == target_device_upper and not is_blockswap_model:
733
+ # Already on target device type, no movement needed
734
+ if debug:
735
+ debug.log(f"{model_name} already on {current_device_upper}, skipping movement", category="general")
736
+ return False
737
+
738
+ # Handle BlockSwap models specially
739
+ if is_blockswap_model:
740
+ return _handle_blockswap_model_movement(
741
+ runner, actual_model, current_device, target_device, target_type,
742
+ model_name, debug, reason
743
+ )
744
+
745
+ # Standard model movement (non-BlockSwap)
746
+ return _standard_model_movement(
747
+ model, current_device, target_device, target_type, model_name,
748
+ debug, reason
749
+ )
750
+
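+ # Usage sketch (hypothetical `vae` module): offload between stages, reload for decoding:
+ #
+ #   manage_model_device(vae, torch.device('cpu'), "VAE", debug=debug, reason="offload")
+ #   manage_model_device(vae, torch.device('cuda:0'), "VAE", debug=debug, reason="decode")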
751
+
752
+ def _handle_blockswap_model_movement(runner: Any, model: torch.nn.Module,
753
+ current_device: torch.device, target_device: torch.device,
754
+ target_type: str, model_name: str,
755
+ debug: Optional['Debug'] = None, reason: Optional[str] = None) -> bool:
756
+ """
757
+ Handle device movement for BlockSwap-enabled models.
758
+
759
+ Args:
760
+ runner: Runner instance with BlockSwap configuration
761
+ model: Model to move (actual unwrapped model)
762
+ current_device: Current device of the model
763
+ target_device: Target device (torch.device object)
764
+ target_type: Target device type (cpu/cuda/mps)
765
+ model_name: Model name for logging
766
+ debug: Debug instance
767
+ reason: Movement reason
768
+
769
+ Returns:
770
+ bool: True if model was moved
771
+ """
772
+ # Import here to avoid circular dependency
773
+ from .blockswap import set_blockswap_bypass
774
+
775
+ if target_type == "cpu":
776
+ # Moving to offload device (typically CPU)
777
+ # Check if any parameter is on GPU (for accurate logging)
778
+ actual_source_device = None
779
+ for param in model.parameters():
780
+ if param.device.type in ['cuda', 'mps']:
781
+ actual_source_device = param.device
782
+ break
783
+
784
+ source_device_desc = _device_str(actual_source_device) if actual_source_device else _device_str(target_device)
785
+
786
+ if debug:
787
+ debug.log(f"Moving {model_name} from {source_device_desc} to {_device_str(target_device)} ({reason or 'model caching'})", category="general")
788
+
789
+ # Enable bypass to allow movement
790
+ set_blockswap_bypass(runner=runner, bypass=True, debug=debug)
791
+
792
+ # If a pipelined swap stream exists, synchronize it to ensure no pending async transfers
793
+ if hasattr(model, "_swap_stream"):
794
+ try:
795
+ model._swap_stream.synchronize()
796
+ except Exception:
797
+ # Best-effort; don't fail the movement if synchronize not supported
798
+ if debug:
799
+ debug.log("Failed to synchronize model._swap_stream before offload", level="WARNING", category="memory", force=True)
800
+
801
+ # Start timer
802
+ timer_name = f"{model_name.lower()}_to_{target_type}"
803
+ if debug:
804
+ debug.start_timer(timer_name)
805
+
806
+ # Move entire model to target offload device
807
+ model.to(target_device)
808
+ model.zero_grad(set_to_none=True)
809
+
810
+ # After moving to CPU, attempt to pin CPU tensors to enable non-blocking async copies later.
811
+ try:
812
+ for p in model.parameters():
813
+ if p.device.type == "cpu" and p.numel() > 0 and not p.data.is_pinned():
814
+ p.data = p.data.pin_memory()
815
+ for b in model.buffers():
816
+ if b.device.type == "cpu" and b.numel() > 0 and not b.data.is_pinned():
817
+ b.data = b.data.pin_memory()
818
+ except Exception as e:
819
+ # Pinning is best-effort; log and continue
820
+ if debug:
821
+ debug.log(f"Pin-memory on offloaded model failed: {e}", level="WARNING", category="memory", force=True)
822
+
823
+ if debug:
824
+ debug.end_timer(timer_name, f"BlockSwap model offloaded to {_device_str(target_device)}")
832
+
833
+ return True
834
+
835
+ else:
836
+ # Moving to GPU (reload)
837
+ # Check if we're in bypass mode (coming from offload)
838
+ if not getattr(model, "_blockswap_bypass_protection", False):
839
+ # Not in bypass mode, blocks are already configured
840
+ if debug:
841
+ debug.log(f"{model_name} with BlockSwap active - blocks already distributed across devices, skipping movement", category="general")
842
+ return False
843
+
844
+ # Get actual current device for accurate logging
845
+ actual_current_device = None
846
+ for param in model.parameters():
847
+ if param.device.type != 'meta':
848
+ actual_current_device = param.device
849
+ break
850
+
851
+ current_device_desc = _device_str(actual_current_device) if actual_current_device else "OFFLOAD"
852
+
853
+ if debug:
854
+ debug.log(f"Moving {model_name} from {current_device_desc} to {_device_str(target_device)} ({reason or 'inference requirement'})", category="general")
855
+
856
+ timer_name = f"{model_name.lower()}_to_gpu"
857
+ if debug:
858
+ debug.start_timer(timer_name)
859
+
860
+ # Restore blocks to their configured devices
861
+ if hasattr(model, "blocks") and hasattr(model, "blocks_to_swap"):
862
+ # Use configured offload_device from BlockSwap config
863
+ offload_device = model._block_swap_config.get("offload_device")
864
+ if not offload_device:
865
+ raise ValueError("BlockSwap config missing offload_device")
866
+
867
+ # Move blocks according to BlockSwap configuration
868
+ for b, block in enumerate(model.blocks):
869
+ if b > model.blocks_to_swap:
870
+ # This block should be on GPU
871
+ block.to(target_device)
872
+ else:
873
+ # This block stays on offload device (will be swapped during forward)
874
+ block.to(offload_device)
875
+
876
+ # Handle I/O components
877
+ if not model._block_swap_config.get("swap_io_components", False):
878
+ # I/O components should be on GPU if not offloaded
879
+ for name, module in model.named_children():
880
+ if name != "blocks":
881
+ module.to(target_device)
882
+ else:
883
+ # I/O components stay on offload device
884
+ for name, module in model.named_children():
885
+ if name != "blocks":
886
+ module.to(offload_device)
887
+
888
+ if debug:
889
+ # Get actual configuration from runner
890
+ if hasattr(model, '_block_swap_config'):
891
+ blocks_on_gpu = model._block_swap_config.get('total_blocks', 32) - model._block_swap_config.get('blocks_swapped', 16)
892
+ total_blocks = model._block_swap_config.get('total_blocks', 32)
893
+ main_device = model._block_swap_config.get('main_device', 'GPU')
894
+ debug.log(f"BlockSwap blocks restored to configured devices ({blocks_on_gpu}/{total_blocks} blocks on {_device_str(main_device)})", category="success")
895
+ else:
896
+ debug.log("BlockSwap blocks restored to configured devices", category="success")
897
+
898
+
899
+ # Reactivate BlockSwap now that blocks are restored to their configured devices
900
+ runner._blockswap_active = True
901
+
902
+ # Disable bypass, re-enable protection
903
+ set_blockswap_bypass(runner=runner, bypass=False, debug=debug)
904
+
905
+ if debug:
906
+ debug.end_timer(timer_name, "BlockSwap model restored")
907
+
908
+ return True
909
+
910
+
911
+ def _standard_model_movement(model: torch.nn.Module, current_device: torch.device,
912
+ target_device: torch.device, target_type: str, model_name: str,
913
+ debug: Optional['Debug'] = None, reason: Optional[str] = None) -> bool:
914
+ """
915
+ Handle standard (non-BlockSwap) model movement.
916
+
917
+ Args:
918
+ model: Model to move
919
+ current_device: Current device of the model
920
+ target_device: Target device (torch.device object)
921
+ target_type: Target device type
922
+ model_name: Model name for logging
923
+ debug: Debug instance
924
+ reason: Movement reason
925
+
926
+ Returns:
927
+ bool: True if model was moved
928
+ """
929
+ # Check if model is on meta device - can't move meta tensors
930
+ if current_device.type == 'meta':
931
+ if debug:
932
+ debug.log(f"{model_name} is on meta device - skipping movement (will materialize when needed)",
933
+ category=model_name.lower())
934
+ return False
935
+
936
+ # Determine reason for movement
937
+ reason = reason or "inference requirement"
938
+
939
+ # Log the movement with full device strings
940
+ if debug:
941
+ current_device_str = _device_str(current_device)
942
+ target_device_str = _device_str(target_device)
943
+ debug.log(f"Moving {model_name} from {current_device_str} to {target_device_str} ({reason})", category="general")
944
+
945
+ # Start timer based on direction
946
+ timer_name = f"{model_name.lower()}_to_{'gpu' if target_type != 'cpu' else 'cpu'}"
947
+ if debug:
948
+ debug.start_timer(timer_name)
949
+
950
+ # Move model and clear gradients
951
+ model.to(target_device)
952
+ model.zero_grad(set_to_none=True)
953
+
954
+ # Clear VAE memory buffers when moving to CPU
955
+ if target_type == 'cpu' and model_name == "VAE":
956
+ cleared_count = 0
957
+ for module in model.modules():
958
+ if hasattr(module, 'memory') and module.memory is not None:
959
+ if torch.is_tensor(module.memory) and (module.memory.is_cuda or module.memory.is_mps):
960
+ module.memory = None
961
+ cleared_count += 1
962
+ if cleared_count > 0 and debug:
963
+ debug.log(f"Cleared {cleared_count} VAE memory buffers", category="success")
964
+
965
+ # End timer
966
+ if debug:
967
+ debug.end_timer(timer_name, f"{model_name} moved to {_device_str(target_device)}")
968
+
969
+ return True
970
+
971
+
972
+ def clear_runtime_caches(runner: Any, debug: Optional['Debug'] = None) -> int:
973
+ """
974
+ Clear all runtime caches and temporary attributes.
975
+ """
976
+ if not runner:
977
+ return 0
978
+
979
+ if debug:
980
+ debug.start_timer("runtime_cache_clear")
981
+
982
+ cleaned_items = 0
983
+
984
+ # 1. Clear main runner cache
985
+ if hasattr(runner, 'cache') and hasattr(runner.cache, 'cache'):
986
+ if debug:
987
+ debug.start_timer("runner_cache_clear")
988
+
989
+ cache_entries = len(runner.cache.cache)
990
+
991
+ # Properly release tensor memory and delete as we go
992
+ for key in list(runner.cache.cache.keys()):
993
+ value = runner.cache.cache[key]
994
+ if torch.is_tensor(value):
995
+ release_tensor_memory(value)
996
+ elif isinstance(value, (list, tuple)):
997
+ for item in value:
998
+ if torch.is_tensor(item):
999
+ release_tensor_memory(item)
1000
+ # Delete immediately to release reference
1001
+ del runner.cache.cache[key]
1002
+
1003
+ # Final clear for safety
1004
+ runner.cache.cache.clear()
1005
+ cleaned_items += cache_entries
1006
+
1007
+ if debug:
1008
+ debug.end_timer("runner_cache_clear", "Clearing main runner cache entries")
1009
+
1010
+ if cache_entries > 0:
1011
+ debug.log(f"Cleared {cache_entries} runtime cache entries", category="success")
1012
+
1013
+ # 2. Clear RoPE caches
1014
+ if hasattr(runner, 'dit'):
1015
+ if debug:
1016
+ debug.start_timer("rope_cache_clear")
1017
+
1018
+ model = runner.dit
1019
+ if hasattr(model, 'dit_model'): # Handle wrapper
1020
+ model = model.dit_model
1021
+
1022
+ rope_cleared = clear_rope_lru_caches(model=model, debug=debug)
1023
+ cleaned_items += rope_cleared
1024
+ if debug:
1025
+ debug.end_timer("rope_cache_clear", "Clearing RoPE LRU caches")
1026
+
1027
+ if rope_cleared > 0:
1028
+ debug.log(f"Cleared {rope_cleared} RoPE LRU caches", category="success")
1029
+
1030
+ # 3. Clear temporary attributes
1031
+ temp_attrs = ['_temp_cache', '_block_cache', '_swap_cache', '_generation_cache',
1032
+ '_rope_cache', '_intermediate_cache', '_backward_cache']
1033
+
1034
+ for obj in [runner, getattr(runner, 'dit', None), getattr(runner, 'vae', None)]:
1035
+ if obj is None:
1036
+ continue
1037
+
1038
+ actual_obj = obj.dit_model if hasattr(obj, 'dit_model') else obj
1039
+
1040
+ for attr in temp_attrs:
1041
+ if hasattr(actual_obj, attr):
1042
+ delattr(actual_obj, attr)
1043
+ cleaned_items += 1
1044
+
1045
+ if debug:
1046
+ debug.end_timer("runtime_cache_clear", "clear_runtime_caches() completion")
1047
+
1048
+ return cleaned_items
1049
+
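+ # Illustrative call between generations (assumes the runner cache layout
+ # handled above; `debug` may be None):
+ #
+ #   cleaned = clear_runtime_caches(runner=runner, debug=debug)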
1050
+
1051
+ def cleanup_dit(runner: Any, debug: Optional['Debug'] = None, cache_model: bool = False) -> None:
1052
+ """
1053
+ Cleanup DiT model and BlockSwap state after upscaling phase.
1054
+ Called at the end of upscale_all_batches when DiT is no longer needed.
1055
+
1056
+ Args:
1057
+ runner: Runner instance containing DiT model
1058
+ debug: Debug instance for logging
1059
+ cache_model: If True, move DiT to offload_device; if False, delete completely
1060
+ """
1061
+ if not runner or not hasattr(runner, 'dit'):
1062
+ return
1063
+
1064
+ if debug:
1065
+ debug.log("Cleaning up DiT components", category="cleanup")
1066
+
1067
+ # 1. Clear DiT-specific runtime caches first
1068
+ if hasattr(runner, 'dit'):
1069
+ model = runner.dit
1070
+ if hasattr(model, 'dit_model'): # Handle wrapper
1071
+ model = model.dit_model
1072
+
1073
+ # Clear RoPE caches
1074
+ rope_cleared = clear_rope_lru_caches(model=model, debug=debug)
1075
+ if rope_cleared > 0 and debug:
1076
+ debug.log(f"Cleared {rope_cleared} RoPE LRU caches", category="success")
1077
+
1078
+ # Clear DiT temporary attributes
1079
+ temp_attrs = ['_temp_cache', '_block_cache', '_swap_cache', '_generation_cache',
1080
+ '_rope_cache', '_intermediate_cache', '_backward_cache']
1081
+
1082
+ actual_obj = model.dit_model if hasattr(model, 'dit_model') else model
1083
+ for attr in temp_attrs:
1084
+ if hasattr(actual_obj, attr):
1085
+ delattr(actual_obj, attr)
1086
+
1087
+ # 2. Handle model offloading (for caching or before deletion)
1088
+ try:
1089
+ param_device = next(runner.dit.parameters()).device
1090
+
1091
+ # Move model off GPU if needed
1092
+ if param_device.type not in ['meta', 'cpu']:
1093
+ # MPS: skip CPU movement before deletion (unified memory, just causes sync)
1094
+ if param_device.type == 'mps' and not cache_model:
1095
+ if debug:
1096
+ debug.log("DiT on MPS - skipping CPU movement before deletion", category="cleanup")
1097
+ else:
1098
+ offload_target = getattr(runner, '_dit_offload_device', None)
1099
+ if offload_target is None or offload_target == 'none':
1100
+ offload_target = torch.device('cpu')
1101
+ reason = "model caching" if cache_model else "releasing GPU memory"
1102
+ manage_model_device(model=runner.dit, target_device=offload_target, model_name="DiT",
1103
+ debug=debug, reason=reason, runner=runner)
1104
+ elif param_device.type == 'meta' and debug:
1105
+ debug.log("DiT on meta device - keeping structure for cache", category="cleanup")
1106
+ except StopIteration:
1107
+ pass
1108
+
1109
+ # 3. Clean BlockSwap after model movement
1110
+ if hasattr(runner, "_blockswap_active") and runner._blockswap_active:
1111
+ # Import here to avoid circular dependency
1112
+ from .blockswap import cleanup_blockswap
1113
+
1114
+ # If model had a swap stream, synchronize before cleanup to avoid races
1115
+ try:
1116
+ model_for_sync = runner.dit.dit_model if hasattr(runner.dit, 'dit_model') else runner.dit
1117
+ if hasattr(model_for_sync, "_swap_stream"):
1118
+ try:
1119
+ model_for_sync._swap_stream.synchronize()
1120
+ except Exception:
1121
+ if debug:
1122
+ debug.log("Failed to synchronize model._swap_stream before cleanup_blockswap", level="WARNING", category="cleanup", force=True)
1123
+ except Exception:
1124
+ pass
1125
+
1126
+ cleanup_blockswap(runner=runner, keep_state_for_cache=cache_model)
1127
+
1128
+
1129
+ # 4. Complete cleanup if not caching
1130
+ if not cache_model:
1131
+ release_model_memory(model=runner.dit, debug=debug)
1132
+ runner.dit = None
1133
+ if debug:
1134
+ debug.log("DiT model deleted", category="cleanup")
1135
+
1136
+ # Clear DiT config attributes - not needed when model is not cached (will be recreated)
1137
+ if hasattr(runner, '_dit_compile_args'):
1138
+ delattr(runner, '_dit_compile_args')
1139
+ if hasattr(runner, '_dit_block_swap_config'):
1140
+ delattr(runner, '_dit_block_swap_config')
1141
+ if hasattr(runner, '_dit_attention_mode'):
1142
+ delattr(runner, '_dit_attention_mode')
1143
+
1144
+ # 5. Clear DiT temporary attributes (should already be cleared in materialize_model)
1145
+ runner._dit_checkpoint = None
1146
+ runner._dit_dtype_override = None
1147
+
1148
+ # 6. Clear DiT-related components and temporary attributes
1149
+ runner.sampler = None
1150
+ runner.sampling_timesteps = None
1151
+ runner.schedule = None
1152
+
1153
+
1154
+ def cleanup_vae(runner: Any, debug: Optional['Debug'] = None, cache_model: bool = False) -> None:
1155
+ """
1156
+ Cleanup VAE model after decoding phase.
1157
+ Called at the end of decode_all_batches when VAE is no longer needed.
1158
+
1159
+ Args:
1160
+ runner: Runner instance containing VAE model
1161
+ debug: Debug instance for logging
1162
+ cache_model: If True, move VAE to offload_device; if False, delete completely
1163
+ """
1164
+ if not runner or not hasattr(runner, 'vae'):
1165
+ return
1166
+
1167
+ if debug:
1168
+ debug.log("Cleaning up VAE components", category="cleanup")
1169
+
1170
+ # 1. Clear VAE-specific temporary attributes
1171
+ if hasattr(runner, 'vae'):
1172
+ temp_attrs = ['_temp_cache', '_block_cache', '_swap_cache', '_generation_cache',
1173
+ '_rope_cache', '_intermediate_cache', '_backward_cache']
1174
+
1175
+ for attr in temp_attrs:
1176
+ if hasattr(runner.vae, attr):
1177
+ delattr(runner.vae, attr)
1178
+
1179
+ # 2. Handle model offloading (for caching or before deletion)
1180
+ try:
1181
+ param_device = next(runner.vae.parameters()).device
1182
+
1183
+ # Move model off GPU if needed
1184
+ if param_device.type not in ['meta', 'cpu']:
1185
+ # MPS: skip CPU movement before deletion (unified memory, just causes sync)
1186
+ if param_device.type == 'mps' and not cache_model:
1187
+ if debug:
1188
+ debug.log("VAE on MPS - skipping CPU movement before deletion", category="cleanup")
1189
+ else:
1190
+ offload_target = getattr(runner, '_vae_offload_device', None)
1191
+ if offload_target is None or offload_target == 'none':
1192
+ offload_target = torch.device('cpu')
1193
+ reason = "model caching" if cache_model else "releasing GPU memory"
1194
+ manage_model_device(model=runner.vae, target_device=offload_target, model_name="VAE",
1195
+ debug=debug, reason=reason, runner=runner)
1196
+ elif param_device.type == 'meta' and debug:
1197
+ debug.log("VAE on meta device - keeping structure for cache", category="cleanup")
1198
+ except StopIteration:
1199
+ pass
1200
+
1201
+ # 3. Complete cleanup if not caching
1202
+ if not cache_model:
1203
+ release_model_memory(model=runner.vae, debug=debug)
1204
+ runner.vae = None
1205
+ if debug:
1206
+ debug.log("VAE model deleted", category="cleanup")
1207
+
1208
+ # Clear VAE config attributes - not needed when model is not cached (will be recreated)
1209
+ if hasattr(runner, '_vae_compile_args'):
1210
+ delattr(runner, '_vae_compile_args')
1211
+ if hasattr(runner, '_vae_tiling_config'):
1212
+ delattr(runner, '_vae_tiling_config')
1213
+
1214
+ # 4. Clear VAE temporary attributes (should already be cleared in materialize_model)
1215
+ runner._vae_checkpoint = None
1216
+ runner._vae_dtype_override = None
1217
+
1218
+
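+ # Phase-end sketch (hypothetical `runner`): release the DiT after upscaling
+ # but keep the VAE cached on its offload device for the next run:
+ #
+ #   cleanup_dit(runner=runner, debug=debug, cache_model=False)
+ #   cleanup_vae(runner=runner, debug=debug, cache_model=True)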
1219
+ def complete_cleanup(runner: Any, debug: Optional['Debug'] = None, dit_cache: bool = False, vae_cache: bool = False) -> None:
1220
+ """
1221
+ Complete cleanup of runner and remaining components with independent model caching support.
1222
+ This is a lightweight cleanup for final stage, as model-specific cleanup
1223
+ happens in their respective phases (cleanup_dit, cleanup_vae).
1224
+
1225
+ Args:
1226
+ runner: Runner instance to clean up
1227
+ debug: Debug instance for logging
1228
+ dit_cache: If True, preserve DiT model on offload_device for future runs
1229
+ vae_cache: If True, preserve VAE model on offload_device for future runs
1230
+
1231
+ Behavior:
1232
+ - Can cache DiT and VAE independently for flexible memory management
1233
+ - Preserves _dit_model_name and _vae_model_name when either model is cached for change detection
1234
+ - Clears all temporary attributes and runtime caches
1235
+ - Performs deep memory cleanup only when both models are fully released
1236
+
1237
+ Note:
1238
+ Model name tracking (_dit_model_name, _vae_model_name) is only cleared if neither
1239
+ model is cached, enabling proper model change detection on subsequent runs.
1240
+ """
1241
+ if not runner:
1242
+ return
1243
+
1244
+ if debug:
1245
+ cleanup_type = "partial cleanup" if (dit_cache or vae_cache) else "full cleanup"
1246
+ debug.log(f"Starting {cleanup_type}", category="cleanup")
1247
+
1248
+ # 1. Cleanup any remaining models if they still exist
1249
+ # (This handles cases where phases were skipped or errored)
1250
+ if hasattr(runner, 'dit') and runner.dit is not None:
1251
+ cleanup_dit(runner=runner, debug=debug, cache_model=dit_cache)
1252
+
1253
+ if hasattr(runner, 'vae') and runner.vae is not None:
1254
+ cleanup_vae(runner=runner, debug=debug, cache_model=vae_cache)
1255
+
1256
+ # 2. Clear remaining runtime caches
1257
+ clear_runtime_caches(runner=runner, debug=debug)
1258
+
1259
+ # 3. Clear config and other non-model components when fully releasing runner
1260
+ if not (dit_cache or vae_cache):
1261
+ # Full cleanup - clear config and model tracking
1262
+ runner.config = None
1263
+ runner._dit_model_name = None
1264
+ runner._vae_model_name = None
1265
+
1266
+ # 4. Final memory cleanup
1267
+ clear_memory(debug=debug, deep=True, force=True, timer_name="complete_cleanup")
1268
+
1269
+ # 5. Clear cuBLAS workspaces
1270
+ if hasattr(torch._C, '_cuda_clearCublasWorkspaces'): torch._C._cuda_clearCublasWorkspaces()
1271
+
1272
+ # Log what models are cached for next run
1273
+ if dit_cache or vae_cache:
1274
+ cached_models = []
1275
+ if dit_cache and hasattr(runner, '_dit_model_name'):
1276
+ cached_models.append(f"DiT ({runner._dit_model_name})")
1277
+ if vae_cache and hasattr(runner, '_vae_model_name'):
1278
+ cached_models.append(f"VAE ({runner._vae_model_name})")
1279
+
1280
+ if cached_models and debug:
1281
+ models_str = " and ".join(cached_models)
1282
+ debug.log(f"Models cached for next run: {models_str}", category="cache", force=True)
1283
+
1284
+ if debug:
1285
+ debug.log(f"Completed {cleanup_type}", category="success")
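+ # Final-stage sketch: cache both models across runs (pass False/False to
+ # release everything, triggering the deep memory cleanup above):
+ #
+ #   complete_cleanup(runner=runner, debug=debug, dit_cache=True, vae_cache=True)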
src/optimization/memory_manager.py.bak ADDED
@@ -0,0 +1,1231 @@
1
+ """
2
+ Memory management module for SeedVR2
3
+ Handles VRAM usage, cache management, and memory optimization
4
+
5
+ Extracted from: seedvr2.py (lines 373-405, 607-626, 1016-1044)
6
+ """
7
+
8
+ import torch
9
+ import gc
10
+ import sys
11
+ import time
12
+ import psutil
13
+ import platform
14
+ from typing import Tuple, Dict, Any, Optional, List, Union
15
+
16
+
17
+ def _device_str(device: Union[torch.device, str]) -> str:
18
+ """Normalized uppercase device string for comparison and logging. MPS variants → 'MPS'."""
19
+ s = str(device).upper()
20
+ return 'MPS' if s.startswith('MPS') else s
21
+
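+ # Normalization examples: _device_str(torch.device("cuda:0")) -> "CUDA:0";
+ # _device_str("mps:0") -> "MPS", so all MPS variants compare equal.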
22
+
23
+ def is_mps_available() -> bool:
24
+ """Check if MPS (Apple Metal) backend is available."""
25
+ return hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()
26
+
27
+
28
+ def is_cuda_available() -> bool:
29
+ """Check if CUDA backend is available."""
30
+ return torch.cuda.is_available()
31
+
32
+
33
+ def get_gpu_backend() -> str:
34
+ """Get the active GPU backend type.
35
+
36
+ Returns:
37
+ 'cuda': NVIDIA CUDA
38
+ 'mps': Apple Metal Performance Shaders
39
+ 'cpu': No GPU backend available
40
+ """
41
+ if is_cuda_available():
42
+ return 'cuda'
43
+ if is_mps_available():
44
+ return 'mps'
45
+ return 'cpu'
46
+
47
+
48
+ def get_device_list(include_none: bool = False, include_cpu: bool = False) -> List[str]:
49
+ """
50
+ Get list of available compute devices for SeedVR2
51
+
52
+ Args:
53
+ include_none: If True, prepend "none" to the device list (for offload options)
54
+ include_cpu: If True, include "cpu" in the device list (for offload options only)
55
+ Note: On MPS-only systems, "cpu" is automatically excluded since
56
+ unified memory architecture makes CPU offloading meaningless
57
+
58
+ Returns:
59
+ List of device strings (e.g., ["cuda:0", "cuda:1"] or ["none", "cpu", "cuda:0", "cuda:1"])
60
+ """
61
+ devs = []
62
+ has_cuda = False
63
+ has_mps = False
64
+
65
+ try:
66
+ if is_cuda_available():
67
+ devs += [f"cuda:{i}" for i in range(torch.cuda.device_count())]
68
+ has_cuda = True
69
+ except Exception:
70
+ pass
71
+
72
+ try:
73
+ if is_mps_available():
74
+ devs.append("mps") # MPS doesn't use device indices
75
+ has_mps = True
76
+ except Exception:
77
+ pass
78
+
79
+ # Build result list with optional prefixes
80
+ result = []
81
+ if include_none:
82
+ result.append("none")
83
+
84
+ # Only include "cpu" option if:
85
+ # 1. It was requested (include_cpu=True), AND
86
+ # 2. Either CUDA is available OR MPS is not the only option
87
+ # Rationale: On MPS-only systems with unified memory architecture,
88
+ # CPU offloading is semantically meaningless as CPU and GPU share the same memory pool
89
+ if include_cpu and (has_cuda or not has_mps):
90
+ result.append("cpu")
91
+
92
+ result.extend(devs)
93
+
94
+ return result if result else []
95
+
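+ # Example return values: a two-GPU CUDA box gives
+ # get_device_list() -> ["cuda:0", "cuda:1"] and
+ # get_device_list(include_none=True, include_cpu=True) -> ["none", "cpu", "cuda:0", "cuda:1"];
+ # an MPS-only Mac gives ["none", "mps"] for the same call, with "cpu" dropped
+ # per the unified-memory rationale above.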
96
+
97
+ def get_basic_vram_info(device: Optional[torch.device] = None) -> Dict[str, Any]:
98
+ """
99
+ Get basic VRAM availability info (free and total memory).
100
+ Used for capacity planning and initial checks.
101
+
102
+ Args:
103
+ device: Optional device to query. If None, uses cuda:0
104
+
105
+ Returns:
106
+ dict: {"free_gb": float, "total_gb": float} or {"error": str}
107
+ """
108
+ try:
109
+ if is_cuda_available():
110
+ if device is None:
111
+ device = torch.device("cuda:0")
112
+ elif not isinstance(device, torch.device):
113
+ device = torch.device(device)
114
+ free_memory, total_memory = torch.cuda.mem_get_info(device)
115
+ elif is_mps_available():
116
+ # MPS doesn't support per-device queries or mem_get_info
117
+ # Use system memory as proxy
118
+ mem = psutil.virtual_memory()
119
+ free_memory = mem.total - mem.used
120
+ total_memory = mem.total
121
+ else:
122
+ return {"error": "No GPU backend available (CUDA/MPS)"}
123
+
124
+ return {
125
+ "free_gb": free_memory / (1024**3),
126
+ "total_gb": total_memory / (1024**3)
127
+ }
128
+ except Exception as e:
129
+ return {"error": f"Failed to get memory info: {str(e)}"}
130
+
131
+
132
+ # Initial VRAM check at module load
133
+ vram_info = get_basic_vram_info(device=None)
134
+ if "error" not in vram_info:
135
+ backend = "MPS" if is_mps_available() else "CUDA"
136
+ print(f"📊 Initial {backend} memory: {vram_info['free_gb']:.2f}GB free / {vram_info['total_gb']:.2f}GB total")
137
+ else:
138
+ print(f"⚠️ Memory check failed: {vram_info['error']} - No available backend!")
139
+
140
+
141
+ def get_vram_usage(device: Optional[torch.device] = None, debug: Optional['Debug'] = None) -> Tuple[float, float, float, float]:
142
+ """
143
+ Get current VRAM usage metrics for monitoring.
144
+ Used for tracking memory consumption during processing.
145
+
146
+ Args:
147
+ device: Optional device to query. If None, uses cuda:0
148
+ debug: Optional debug instance for logging
149
+
150
+ Returns:
151
+ tuple: (allocated_gb, reserved_gb, peak_allocated_gb, peak_reserved_gb)
152
+ Returns (0, 0, 0, 0) if no GPU available
153
+ """
154
+ try:
155
+ if is_cuda_available():
156
+ if device is None:
157
+ device = torch.device("cuda:0")
158
+ elif not isinstance(device, torch.device):
159
+ device = torch.device(device)
160
+ allocated = torch.cuda.memory_allocated(device) / (1024**3)
161
+ reserved = torch.cuda.memory_reserved(device) / (1024**3)
162
+ peak_allocated = torch.cuda.max_memory_allocated(device) / (1024**3)
163
+ peak_reserved = torch.cuda.max_memory_reserved(device) / (1024**3)
164
+ return allocated, reserved, peak_allocated, peak_reserved
165
+ elif is_mps_available():
166
+ # MPS doesn't support per-device queries - uses global memory tracking
167
+ allocated = torch.mps.current_allocated_memory() / (1024**3)
168
+ reserved = torch.mps.driver_allocated_memory() / (1024**3)
169
+ # MPS doesn't track peak separately
170
+ return allocated, reserved, allocated, reserved
171
+ except Exception as e:
172
+ if debug:
173
+ debug.log(f"Failed to get VRAM usage: {e}", level="WARNING", category="memory", force=True)
174
+ return 0.0, 0.0, 0.0, 0.0
175
+
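+ # Monitoring sketch combining the two helpers above (CUDA assumed;
+ # `debug` may be None):
+ #
+ #   info = get_basic_vram_info(device=torch.device("cuda:0"))
+ #   if "error" not in info:
+ #       print(f"{info['free_gb']:.1f}/{info['total_gb']:.1f} GB free")
+ #   allocated, reserved, peak_alloc, peak_res = get_vram_usage(debug=debug)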
176
+
177
+ def get_ram_usage(debug: Optional['Debug'] = None) -> Tuple[float, float, float, float]:
178
+ """
179
+ Get current RAM usage metrics for the current process.
180
+ Provides accurate tracking of process-specific memory consumption.
181
+
182
+ Args:
183
+ debug: Optional debug instance for logging
184
+
185
+ Returns:
186
+ tuple: (process_gb, available_gb, total_gb, used_by_others_gb)
187
+ Returns (0, 0, 0, 0) if psutil not available or on error
188
+ """
189
+ try:
190
+ if not psutil:
191
+ return 0.0, 0.0, 0.0, 0.0
192
+
193
+ # Get current process memory
194
+ process = psutil.Process()
195
+ process_memory = process.memory_info()
196
+ process_gb = process_memory.rss / (1024**3)
197
+
198
+ # Get system memory
199
+ sys_memory = psutil.virtual_memory()
200
+ total_gb = sys_memory.total / (1024**3)
201
+ available_gb = sys_memory.available / (1024**3)
202
+
203
+ # Calculate memory used by other processes
204
+ # This is the CORRECT calculation:
205
+ total_used_gb = total_gb - available_gb # Total memory used by ALL processes
206
+ used_by_others_gb = max(0, total_used_gb - process_gb) # Subtract current process
207
+
208
+ return process_gb, available_gb, total_gb, used_by_others_gb
209
+
210
+ except Exception as e:
211
+ if debug:
212
+ debug.log(f"Failed to get RAM usage: {e}", level="WARNING", category="memory", force=True)
213
+ return 0.0, 0.0, 0.0, 0.0
214
+
215
+
216
+ # Global cache for OS libraries (initialized once)
217
+ _os_memory_lib = None
218
+
219
+
220
+ def clear_memory(debug: Optional['Debug'] = None, deep: bool = False, force: bool = True,
221
+ timer_name: Optional[str] = None) -> None:
222
+ """
223
+ Clear memory caches with two-tier approach for optimal performance.
224
+
225
+ Args:
226
+ debug: Debug instance for logging (optional)
227
+ force: If True, always clear. If False, only clear when <5% free
228
+ deep: If True, perform deep cleanup including GC and OS operations.
229
+ If False (default), only perform minimal GPU cache clearing.
230
+ timer_name: Optional suffix for timer names to make them unique per invocation
231
+
232
+ Two-tier approach:
233
+ - Minimal mode (deep=False): GPU cache operations (~1-5ms)
234
+ Used for frequent calls during batch processing
235
+ - Deep mode (deep=True): Complete cleanup with GC and OS operations (~10-50ms)
236
+ Used at key points like model switches or final cleanup
237
+ """
238
+ global _os_memory_lib
239
+
240
+ # Create unique timer names if suffix provided
241
+ if timer_name:
242
+ main_timer = f"memory_clear_{timer_name}"
243
+ gpu_timer = f"gpu_cache_clear_{timer_name}"
244
+ gc_timer = f"garbage_collection_{timer_name}"
245
+ os_timer = f"os_memory_release_{timer_name}"
246
+ completion_msg = f"clear_memory() completion ({timer_name})"
247
+ else:
248
+ main_timer = "memory_clear"
249
+ gpu_timer = "gpu_cache_clear"
250
+ gc_timer = "garbage_collection"
251
+ os_timer = "os_memory_release"
252
+ completion_msg = "clear_memory() completion"
253
+
254
+ # Start timer for entire operation
255
+ if debug:
256
+ debug.start_timer(main_timer)
257
+
258
+ # Check if we should clear based on memory pressure
259
+ if not force:
260
+ should_clear = False
261
+
262
+ # Use existing function for memory info
263
+ mem_info = get_basic_vram_info(device=None)
264
+
265
+ if "error" not in mem_info and mem_info["total_gb"] > 0:
266
+ # Check VRAM/MPS memory pressure (5% free threshold)
267
+ free_ratio = mem_info["free_gb"] / mem_info["total_gb"]
268
+ if free_ratio < 0.05:
269
+ should_clear = True
270
+ if debug:
271
+ backend = "Unified Memory" if is_mps_available() else "VRAM"
272
+ debug.log(f"{backend} pressure: {mem_info['free_gb']:.2f}GB free of {mem_info['total_gb']:.2f}GB", category="memory")
273
+
274
+ # For non-MPS systems, also check system RAM separately
275
+ if not should_clear and not is_mps_available():
276
+ mem = psutil.virtual_memory()
277
+ if mem.available < mem.total * 0.05:
278
+ should_clear = True
279
+ if debug:
280
+ debug.log(f"RAM pressure: {mem.available/(1024**3):.2f}GB free of {mem.total/(1024**3):.2f}GB", category="memory")
281
+
282
+ if not should_clear:
283
+ # End timer before early return to keep stack clean
284
+ if debug:
285
+ debug.end_timer(main_timer)
286
+ return
287
+
288
+ # Determine cleanup level
289
+ cleanup_mode = "deep" if deep else "minimal"
290
+ if debug:
291
+ debug.log(f"Clearing memory caches ({cleanup_mode})...", category="cleanup")
292
+
293
+ # ===== MINIMAL OPERATIONS (Always performed) =====
294
+ # Step 1: Clear GPU caches - Fast operations (~1-5ms)
295
+ if debug:
296
+ debug.start_timer(gpu_timer)
297
+
298
+ if is_cuda_available():
299
+ torch.cuda.empty_cache()
300
+ torch.cuda.ipc_collect()
301
+ elif is_mps_available():
302
+ torch.mps.empty_cache()
303
+
304
+ if debug:
305
+ debug.end_timer(gpu_timer, "GPU cache clearing")
306
+
307
+ # ===== DEEP OPERATIONS (Only when deep=True) =====
308
+ if deep:
309
+ # Step 2: Deep garbage collection (expensive ~5-20ms)
310
+ if debug:
311
+ debug.start_timer(gc_timer)
312
+
313
+ gc.collect(2)
314
+
315
+ if debug:
316
+ debug.end_timer(gc_timer, "Garbage collection")
317
+
318
+ # Step 3: Return memory to OS (platform-specific, ~5-30ms)
319
+ if debug:
320
+ debug.start_timer(os_timer)
321
+
322
+ try:
323
+ if sys.platform == 'linux':
324
+ # Linux: malloc_trim
325
+ import ctypes # Import only when needed
326
+ if _os_memory_lib is None:
327
+ _os_memory_lib = ctypes.CDLL("libc.so.6")
328
+ _os_memory_lib.malloc_trim(0)
329
+
330
+ elif sys.platform == 'win32':
331
+ # Windows: Trim working set
332
+ import ctypes # Import only when needed
333
+ if _os_memory_lib is None:
334
+ _os_memory_lib = ctypes.windll.kernel32
335
+ handle = _os_memory_lib.GetCurrentProcess()
336
+ _os_memory_lib.SetProcessWorkingSetSize(handle, -1, -1)
337
+
338
+ elif is_mps_available():
339
+ # macOS with MPS
340
+ import ctypes # Import only when needed
341
+ import ctypes.util
342
+ if _os_memory_lib is None:
343
+ libc_path = ctypes.util.find_library('c')
344
+ if libc_path:
345
+ _os_memory_lib = ctypes.CDLL(libc_path)
346
+
347
+ if _os_memory_lib:
348
+ _os_memory_lib.sync()
349
+ except Exception as e:
350
+ if debug:
351
+ debug.log(f"Failed to perform OS memory operations: {e}", level="WARNING", category="memory", force=True)
352
+
353
+ if debug:
354
+ debug.end_timer(os_timer, "OS memory release")
355
+
356
+ # End overall timer
357
+ if debug:
358
+ debug.end_timer(main_timer, completion_msg)
359
+
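+ # Two-tier usage sketch: cheap call inside the batch loop, deep call at
+ # model-switch boundaries (`debug` may be None):
+ #
+ #   clear_memory(debug=debug)                    # minimal: GPU cache only, ~1-5ms
+ #   clear_memory(debug=debug, deep=True, force=True,
+ #                timer_name="model_switch")      # adds GC + OS release, ~10-50ms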
360
+
361
+ def retry_on_oom(func, *args, debug=None, operation_name="operation", **kwargs):
362
+ """
363
+ Execute function with single OOM retry after memory cleanup.
364
+
365
+ Args:
366
+ func: Callable to execute
367
+ *args: Positional arguments for func
368
+ debug: Debug instance for logging (optional)
369
+ operation_name: Name for logging
370
+ **kwargs: Keyword arguments for func
371
+
372
+ Returns:
373
+ Result of func(*args, **kwargs)
374
+ """
375
+ try:
376
+ return func(*args, **kwargs)
377
+ except (torch.cuda.OutOfMemoryError, RuntimeError) as e:
378
+ # Only handle OOM errors
379
+ if not any(x in str(e).lower() for x in ["out of memory", "allocation on device"]):
380
+ raise
381
+
382
+ if debug:
383
+ debug.log(f"OOM during {operation_name}: {e}", level="WARNING", category="memory", force=True)
384
+ debug.log("Clearing memory and retrying", category="info", force=True)
385
+
386
+ # Clear memory
387
+ clear_memory(debug=debug, deep=True, force=True, timer_name=operation_name)
388
+ # Let memory settle
389
+ time.sleep(0.5)
390
+ if debug: debug.log_memory_state("After memory clearing", show_tensors=False, detailed_tensors=False)
391
+
392
+ # Single retry
393
+ try:
394
+ result = func(*args, **kwargs)
395
+ if debug:
396
+ debug.log(f"Retry successful for {operation_name}", category="success", force=True)
397
+ return result
398
+ except Exception as retry_e:
399
+ if debug:
400
+ debug.log(f"Retry failed for {operation_name}: {retry_e}", level="ERROR", category="memory", force=True)
401
+ raise
402
+
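+ # Wrapping sketch (hypothetical `vae.decode` callable): the call is retried
+ # once after a deep memory clear if it hits an OOM error:
+ #
+ #   samples = retry_on_oom(vae.decode, latents, debug=debug,
+ #                          operation_name="vae_decode")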
403
+
404
+ def reset_vram_peak(device: Optional[torch.device] = None, debug: Optional['Debug'] = None) -> None:
405
+ """
406
+ Reset VRAM peak memory statistics for fresh tracking.
407
+
408
+ Args:
409
+ device: Optional device to reset stats for. If None, uses cuda:0
410
+ debug: Optional debug instance for logging
411
+ """
412
+ if debug and debug.enabled:
413
+ debug.log("Resetting VRAM peak memory statistics", category="memory")
414
+ try:
415
+ if is_cuda_available():
416
+ if device is None:
417
+ device = torch.device("cuda:0")
418
+ elif not isinstance(device, torch.device):
419
+ device = torch.device(device)
420
+ torch.cuda.reset_peak_memory_stats(device)
421
+ # Note: MPS doesn't support peak memory reset - no action needed
422
+ except Exception as e:
423
+ if debug and debug.enabled:
424
+ debug.log(f"Failed to reset peak memory stats: {e}", level="WARNING", category="memory", force=True)
425
+
426
+
427
+ def clear_rope_lru_caches(model: Optional[torch.nn.Module], debug: Optional['Debug'] = None) -> int:
428
+ """
429
+ Clear ALL LRU caches from RoPE modules.
430
+
431
+ Args:
432
+ model: PyTorch model to clear caches from
433
+ debug: Optional debug instance for logging
434
+
435
+ Returns:
436
+ Number of caches cleared
437
+ """
438
+ if model is None:
439
+ return 0
440
+
441
+ cleared_count = 0
442
+ try:
443
+ for name, module in model.named_modules():
444
+ if hasattr(module, 'get_axial_freqs') and hasattr(module.get_axial_freqs, 'cache_clear'):
445
+ try:
446
+ module.get_axial_freqs.cache_clear()
447
+ cleared_count += 1
448
+ except Exception as e:
449
+ if debug:
450
+ debug.log(f"Failed to clear RoPE LRU cache for module {name}: {e}", level="WARNING", category="memory", force=True)
451
+ except (AttributeError, RuntimeError) as e:
452
+ if debug:
453
+ debug.log(f"Failed to iterate model modules for RoPE LRU cache clearing: {e}", level="WARNING", category="memory", force=True)
454
+
455
+ return cleared_count
456
+
457
+
458
+ def release_tensor_memory(tensor: Optional[torch.Tensor]) -> None:
459
+ """Release tensor memory from any device (CPU/CUDA/MPS)"""
460
+ if tensor is not None and torch.is_tensor(tensor):
461
+ # Release storage for all devices (CPU, CUDA, MPS)
462
+ if tensor.numel() > 0:
463
+ tensor.data.set_()
464
+ tensor.grad = None
465
+
466
+
467
+ def release_tensor_collection(collection: Any, recursive: bool = True) -> None:
468
+ """
469
+ Release GPU memory from tensors in any collection (list, tuple, dict, or single tensor).
470
+
471
+ Args:
472
+ collection: Tensor, list, tuple, dict, or nested structure to release
473
+ recursive: If True, handle nested structures recursively
474
+
475
+ Examples:
476
+ release_tensor_collection(tensor) # Single tensor
477
+ release_tensor_collection([tensor1, tensor2]) # List of tensors
478
+ release_tensor_collection([[t1, t2], [t3, t4]]) # Nested lists
479
+ release_tensor_collection({'a': tensor}) # Dict values
480
+ """
481
+ if collection is None:
482
+ return
483
+
484
+ if torch.is_tensor(collection):
485
+ release_tensor_memory(collection)
486
+ elif isinstance(collection, dict):
487
+ for value in collection.values():
488
+ if recursive:
489
+ release_tensor_collection(value, recursive=True)
490
+ elif torch.is_tensor(value):
491
+ release_tensor_memory(value)
492
+ elif isinstance(collection, (list, tuple)):
493
+ for item in collection:
494
+ if recursive:
495
+ release_tensor_collection(item, recursive=True)
496
+ elif torch.is_tensor(item):
497
+ release_tensor_memory(item)
498
+
499
+
500
+ def release_text_embeddings(*embeddings: torch.Tensor, debug: Optional['Debug'] = None, names: Optional[List[str]] = None) -> None:
501
+ """
502
+ Release memory for text embeddings
503
+
504
+ Args:
505
+ *embeddings: Variable number of embedding tensors to release
506
+ debug: Optional debug instance for logging
507
+ names: Optional list of names for logging
508
+ """
509
+ for i, embedding in enumerate(embeddings):
510
+ if embedding is not None:
511
+ release_tensor_memory(embedding)
512
+ if debug and names and i < len(names):
513
+ debug.log(f"Cleaned up {names[i]}", category="cleanup")
514
+
515
+
516
+ def cleanup_text_embeddings(ctx: Dict[str, Any], debug: Optional['Debug'] = None) -> None:
517
+ """
518
+ Clean up text embeddings from a context dictionary.
519
+ Extracts embeddings, releases memory, and clears the context entry.
520
+
521
+ Args:
522
+ ctx: Context dictionary potentially containing 'text_embeds'
523
+ debug: Optional debug instance for logging
524
+ """
525
+ if not ctx or not ctx.get('text_embeds'):
526
+ return
527
+
528
+ embeddings = []
529
+ names = []
530
+ for key, embeds_list in ctx['text_embeds'].items():
531
+ if embeds_list:
532
+ embeddings.extend(embeds_list)
533
+ names.append(key)
534
+
535
+ if embeddings:
536
+ release_text_embeddings(*embeddings, debug=debug, names=names)
537
+
538
+ if debug:
539
+ debug.log(f"Cleaned up text embeddings: {', '.join(names)}", category="cleanup")
540
+
541
+ ctx['text_embeds'] = None
542
+
543
+
544
+ def release_model_memory(model: Optional[torch.nn.Module], debug: Optional['Debug'] = None) -> None:
545
+ """
546
+ Release all GPU/MPS memory from model in-place without CPU transfer.
547
+
548
+ Args:
549
+ model: PyTorch model to release memory from
550
+ debug: Optional debug instance for logging
551
+ """
552
+ if model is None:
553
+ return
554
+
555
+ try:
556
+ # Clear gradients first
557
+ model.zero_grad(set_to_none=True)
558
+
559
+ # Release GPU memory directly without CPU transfer
560
+ released_params = 0
561
+ released_buffers = 0
562
+
563
+ for param in model.parameters():
564
+ if param.is_cuda or param.is_mps:
565
+ if param.numel() > 0:
566
+ param.data.set_()
567
+ released_params += 1
568
+ param.grad = None
569
+
570
+ for buffer in model.buffers():
571
+ if buffer.is_cuda or buffer.is_mps:
572
+ if buffer.numel() > 0:
573
+ buffer.data.set_()
574
+ released_buffers += 1
575
+
576
+ if debug and (released_params > 0 or released_buffers > 0):
577
+ debug.log(f"Released memory from {released_params} params and {released_buffers} buffers", category="success")
578
+
579
+ except (AttributeError, RuntimeError) as e:
580
+ if debug:
581
+ debug.log(f"Failed to release model memory: {e}", level="WARNING", category="memory", force=True)
582
+
583
+
584
+ def manage_tensor(
585
+ tensor: torch.Tensor,
586
+ target_device: torch.device,
587
+ tensor_name: str = "tensor",
588
+ dtype: Optional[torch.dtype] = None,
589
+ non_blocking: bool = False,
590
+ debug: Optional['Debug'] = None,
591
+ reason: Optional[str] = None,
592
+ indent_level: int = 0
593
+ ) -> torch.Tensor:
594
+ """
595
+ Unified tensor management for device movement and dtype conversion.
596
+
597
+ Handles both device transfers (CPU ↔ GPU) and dtype conversions (e.g., float16 → bfloat16)
598
+ with intelligent early-exit optimization and comprehensive logging.
599
+
600
+ Args:
601
+ tensor: Tensor to manage
602
+ target_device: Target device (torch.device object)
603
+ tensor_name: Descriptive name for logging (e.g., "latent", "sample", "alpha_channel")
604
+ dtype: Optional target dtype to cast to (if None, keeps original dtype)
605
+ non_blocking: Whether to use non-blocking transfer
606
+ debug: Debug instance for logging
607
+ reason: Optional reason for the operation (e.g., "inference", "offload", "dtype alignment")
608
+ indent_level: Indentation level for debug logging (0=no indent, 1=2 spaces, etc.)
609
+
610
+ Returns:
611
+ Tensor on target device with optional dtype conversion
612
+
613
+ Note:
614
+ - Skips operation if tensor already has target device and dtype (zero-copy)
615
+ - Uses PyTorch's optimized .to() for efficient device/dtype handling
616
+ - Logs all operations consistently for tracking and debugging
617
+ """
618
+ if tensor is None:
619
+ return tensor
620
+
621
+ # Get current state
622
+ current_device = tensor.device
623
+ current_dtype = tensor.dtype
624
+ target_dtype = dtype if dtype is not None else current_dtype
625
+
626
+ # Check if movement is actually needed
627
+ needs_device_move = _device_str(current_device) != _device_str(target_device)
628
+ needs_dtype_change = dtype is not None and current_dtype != target_dtype
629
+
630
+ if not needs_device_move and not needs_dtype_change:
631
+ # Already on target device and dtype - skip
632
+ return tensor
633
+
634
+ # Determine reason for movement
635
+ if reason is None:
636
+ if needs_device_move and needs_dtype_change:
637
+ reason = "device and dtype conversion"
638
+ elif needs_device_move:
639
+ reason = "device movement"
640
+ else:
641
+ reason = "dtype conversion"
642
+
643
+ # Log the movement
644
+ if debug:
645
+ current_device_str = _device_str(current_device)
646
+ target_device_str = _device_str(target_device)
647
+
648
+ dtype_info = ""
649
+ if needs_dtype_change:
650
+ dtype_info = f", {current_dtype} → {target_dtype}"
651
+
652
+ debug.log(
653
+ f"Moving {tensor_name} from {current_device_str} to {target_device_str}{dtype_info} ({reason})",
654
+ category="general",
655
+ indent_level=indent_level
656
+ )
657
+
658
+ # Perform the operation based on what needs to change
659
+ if needs_device_move and needs_dtype_change:
660
+ # Both device and dtype need to change
661
+ return tensor.to(target_device, dtype=target_dtype, non_blocking=non_blocking)
662
+ elif needs_device_move:
663
+ # Only device needs to change
664
+ return tensor.to(target_device, non_blocking=non_blocking)
665
+ else:
666
+ # Only dtype needs to change
667
+ return tensor.to(dtype=target_dtype)
668
+
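+ # Example: move a latent to GPU and align dtype in one call (tensor and
+ # device names are illustrative):
+ #
+ #   latent = manage_tensor(tensor=latent, target_device=torch.device("cuda:0"),
+ #                          tensor_name="latent", dtype=torch.bfloat16,
+ #                          non_blocking=True, debug=debug, reason="inference")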
669
+
670
+ def manage_model_device(model: torch.nn.Module, target_device: torch.device, model_name: str,
671
+ debug: Optional['Debug'] = None, reason: Optional[str] = None,
672
+ runner: Optional[Any] = None) -> bool:
673
+ """
674
+ Move model to target device with optimizations.
675
+ Handles BlockSwap-enabled models transparently.
676
+
677
+ Args:
678
+ model: The model to move
679
+ target_device: Target device (torch.device object, e.g., torch.device('cuda:0'))
680
+ model_name: Name for logging (e.g., "VAE", "DiT")
681
+ debug: Debug instance for logging
682
+ reason: Optional custom reason for the movement
683
+ runner: Optional runner instance for BlockSwap detection
684
+
685
+ Returns:
686
+ bool: True if model was moved, False if already on target device
687
+ """
688
+ if model is None:
689
+ return False
690
+
691
+ # Check if this is a BlockSwap-enabled DiT model
692
+ is_blockswap_model = False
693
+ actual_model = model
694
+ if runner and model_name == "DiT":
695
+ # Import here to avoid circular dependency
696
+ from .blockswap import is_blockswap_enabled
697
+ # Check if BlockSwap config exists and is enabled
698
+ has_blockswap_config = (
699
+ hasattr(runner, '_dit_block_swap_config') and
700
+ is_blockswap_enabled(runner._dit_block_swap_config)
701
+ )
702
+
703
+ if has_blockswap_config:
704
+ is_blockswap_model = True
705
+ # Get the actual model (handle CompatibleDiT wrapper)
706
+ if hasattr(model, "dit_model"):
707
+ actual_model = model.dit_model
708
+
709
+ # Get current device
710
+ try:
711
+ current_device = next(model.parameters()).device
712
+ except StopIteration:
713
+ return False
714
+
715
+ # Extract device type for comparison (both are torch.device objects)
716
+ target_type = target_device.type
717
+ current_device_upper = _device_str(current_device)
718
+ target_device_upper = _device_str(target_device)
719
+
720
+ # Compare normalized device types
721
+ if current_device_upper == target_device_upper and not is_blockswap_model:
722
+ # Already on target device type, no movement needed
723
+ if debug:
724
+ debug.log(f"{model_name} already on {current_device_upper}, skipping movement", category="general")
725
+ return False
726
+
727
+ # Handle BlockSwap models specially
728
+ if is_blockswap_model:
729
+ return _handle_blockswap_model_movement(
730
+ runner, actual_model, current_device, target_device, target_type,
731
+ model_name, debug, reason
732
+ )
733
+
734
+ # Standard model movement (non-BlockSwap)
735
+ return _standard_model_movement(
736
+ model, current_device, target_device, target_type, model_name,
737
+ debug, reason
738
+ )
739
+
740
+
741
+ def _handle_blockswap_model_movement(runner: Any, model: torch.nn.Module,
742
+ current_device: torch.device, target_device: torch.device,
743
+ target_type: str, model_name: str,
744
+ debug: Optional['Debug'] = None, reason: Optional[str] = None) -> bool:
745
+ """
746
+ Handle device movement for BlockSwap-enabled models.
747
+
748
+ Args:
749
+ runner: Runner instance with BlockSwap configuration
750
+ model: Model to move (actual unwrapped model)
751
+ current_device: Current device of the model
752
+ target_device: Target device (torch.device object)
753
+ target_type: Target device type (cpu/cuda/mps)
754
+ model_name: Model name for logging
755
+ debug: Debug instance
756
+ reason: Movement reason
757
+
758
+ Returns:
759
+ bool: True if model was moved
760
+ """
761
+ # Import here to avoid circular dependency
762
+ from .blockswap import set_blockswap_bypass
763
+
764
+ if target_type == "cpu":
765
+ # Moving to offload device (typically CPU)
766
+ # Check if any parameter is on GPU (for accurate logging)
767
+ actual_source_device = None
768
+ for param in model.parameters():
769
+ if param.device.type in ['cuda', 'mps']:
770
+ actual_source_device = param.device
771
+ break
772
+
773
+ source_device_desc = _device_str(actual_source_device) if actual_source_device else _device_str(target_device)
774
+
775
+ if debug:
776
+ debug.log(f"Moving {model_name} from {source_device_desc} to {_device_str(target_device)} ({reason or 'model caching'})", category="general")
777
+
778
+ # Enable bypass to allow movement
779
+ set_blockswap_bypass(runner=runner, bypass=True, debug=debug)
780
+
781
+ # Start timer
782
+ timer_name = f"{model_name.lower()}_to_{target_type}"
783
+ if debug:
784
+ debug.start_timer(timer_name)
785
+
786
+ # Move entire model to target offload device
787
+ model.to(target_device)
788
+ model.zero_grad(set_to_none=True)
789
+
790
+ if debug:
791
+ debug.end_timer(timer_name, f"BlockSwap model offloaded to {_device_str(target_device)}")
792
+
793
+ return True
794
+
795
+ else:
796
+ # Moving to GPU (reload)
797
+ # Check if we're in bypass mode (coming from offload)
798
+ if not getattr(model, "_blockswap_bypass_protection", False):
799
+ # Not in bypass mode, blocks are already configured
800
+ if debug:
801
+ debug.log(f"{model_name} with BlockSwap active - blocks already distributed across devices, skipping movement", category="general")
802
+ return False
803
+
804
+ # Get actual current device for accurate logging
805
+ actual_current_device = None
806
+ for param in model.parameters():
807
+ if param.device.type != 'meta':
808
+ actual_current_device = param.device
809
+ break
810
+
811
+ current_device_desc = _device_str(actual_current_device) if actual_current_device else "OFFLOAD"
812
+
813
+ if debug:
814
+ debug.log(f"Moving {model_name} from {current_device_desc} to {_device_str(target_device)} ({reason or 'inference requirement'})", category="general")
815
+
816
+ timer_name = f"{model_name.lower()}_to_gpu"
817
+ if debug:
818
+ debug.start_timer(timer_name)
819
+
820
+ # Restore blocks to their configured devices
821
+ if hasattr(model, "blocks") and hasattr(model, "blocks_to_swap"):
822
+ # Use configured offload_device from BlockSwap config
823
+ offload_device = model._block_swap_config.get("offload_device")
824
+ if not offload_device:
825
+ raise ValueError("BlockSwap config missing offload_device")
826
+
827
+ # Move blocks according to BlockSwap configuration
828
+ for b, block in enumerate(model.blocks):
829
+ if b > model.blocks_to_swap:
830
+ # This block should be on GPU
831
+ block.to(target_device)
832
+ else:
833
+ # This block stays on offload device (will be swapped during forward)
834
+ block.to(offload_device)
835
+
836
+ # Handle I/O components
837
+ if not model._block_swap_config.get("swap_io_components", False):
838
+ # I/O components should be on GPU if not offloaded
839
+ for name, module in model.named_children():
840
+ if name != "blocks":
841
+ module.to(target_device)
842
+ else:
843
+ # I/O components stay on offload device
844
+ for name, module in model.named_children():
845
+ if name != "blocks":
846
+ module.to(offload_device)
847
+
848
+ if debug:
849
+ # Get actual configuration from runner
850
+ if hasattr(model, '_block_swap_config'):
851
+ blocks_on_gpu = model._block_swap_config.get('total_blocks', 32) - model._block_swap_config.get('blocks_swapped', 16)
852
+ total_blocks = model._block_swap_config.get('total_blocks', 32)
853
+ main_device = model._block_swap_config.get('main_device', 'GPU')
854
+ debug.log(f"BlockSwap blocks restored to configured devices ({blocks_on_gpu}/{total_blocks} blocks on {_device_str(main_device)})", category="success")
855
+ else:
856
+ debug.log("BlockSwap blocks restored to configured devices", category="success")
857
+
858
+
859
+ # Reactivate BlockSwap now that blocks are restored to their configured devices
860
+ runner._blockswap_active = True
861
+
862
+ # Disable bypass, re-enable protection
863
+ set_blockswap_bypass(runner=runner, bypass=False, debug=debug)
864
+
865
+ if debug:
866
+ debug.end_timer(timer_name, "BlockSwap model restored")
867
+
868
+ return True
869
+
870
+
871
+ def _standard_model_movement(model: torch.nn.Module, current_device: torch.device,
+                              target_device: torch.device, target_type: str, model_name: str,
+                              debug: Optional['Debug'] = None, reason: Optional[str] = None) -> bool:
+     """
+     Handle standard (non-BlockSwap) model movement.
+
+     Args:
+         model: Model to move
+         current_device: Current device of the model
+         target_device: Target device (torch.device object)
+         target_type: Target device type
+         model_name: Model name for logging
+         debug: Debug instance
+         reason: Movement reason
+
+     Returns:
+         bool: True if the model was moved
+     """
+     # Check if model is on meta device - can't move meta tensors
+     if current_device.type == 'meta':
+         if debug:
+             debug.log(f"{model_name} is on meta device - skipping movement (will materialize when needed)",
+                       category=model_name.lower())
+         return False
+
+     # Determine reason for movement
+     reason = reason or "inference requirement"
+
+     # Log the movement with full device strings
+     if debug:
+         current_device_str = _device_str(current_device)
+         target_device_str = _device_str(target_device)
+         debug.log(f"Moving {model_name} from {current_device_str} to {target_device_str} ({reason})", category="general")
+
+     # Start timer based on direction
+     timer_name = f"{model_name.lower()}_to_{'gpu' if target_type != 'cpu' else 'cpu'}"
+     if debug:
+         debug.start_timer(timer_name)
+
+     # Move model and clear gradients
+     model.to(target_device)
+     model.zero_grad(set_to_none=True)
+
+     # Clear VAE memory buffers when moving to CPU
+     if target_type == 'cpu' and model_name == "VAE":
+         cleared_count = 0
+         for module in model.modules():
+             if hasattr(module, 'memory') and module.memory is not None:
+                 if torch.is_tensor(module.memory) and (module.memory.is_cuda or module.memory.is_mps):
+                     module.memory = None
+                     cleared_count += 1
+         if cleared_count > 0 and debug:
+             debug.log(f"Cleared {cleared_count} VAE memory buffers", category="success")
+
+     # End timer
+     if debug:
+         debug.end_timer(timer_name, f"{model_name} moved to {_device_str(target_device)}")
+
+     return True
+
+
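Two details in the function above are easy to miss: meta-device parameters have no storage and must not be moved with `.to()`, and `zero_grad(set_to_none=True)` frees gradient storage outright rather than keeping zeroed tensors alive on the new device. A self-contained sketch of the same guard-then-move pattern, using only PyTorch:

import torch

def safe_move(model: torch.nn.Module, target: torch.device) -> bool:
    """Move a model unless its parameters live on the meta device."""
    try:
        current = next(model.parameters()).device
    except StopIteration:
        return False  # no parameters, nothing to move
    if current.type == 'meta':
        return False  # meta tensors are shape-only; they cannot be moved
    model.to(target)
    model.zero_grad(set_to_none=True)  # release grad storage on the old device
    return True

print(safe_move(torch.nn.Linear(4, 4), torch.device('cpu')))  # True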
932
+ def clear_runtime_caches(runner: Any, debug: Optional['Debug'] = None) -> int:
+     """
+     Clear all runtime caches and temporary attributes.
+     """
+     if not runner:
+         return 0
+
+     if debug:
+         debug.start_timer("runtime_cache_clear")
+
+     cleaned_items = 0
+
+     # 1. Clear main runner cache
+     if hasattr(runner, 'cache') and hasattr(runner.cache, 'cache'):
+         if debug:
+             debug.start_timer("runner_cache_clear")
+
+         cache_entries = len(runner.cache.cache)
+
+         # Properly release tensor memory and delete as we go
+         for key in list(runner.cache.cache.keys()):
+             value = runner.cache.cache[key]
+             if torch.is_tensor(value):
+                 release_tensor_memory(value)
+             elif isinstance(value, (list, tuple)):
+                 for item in value:
+                     if torch.is_tensor(item):
+                         release_tensor_memory(item)
+             # Delete immediately to release the reference
+             del runner.cache.cache[key]
+
+         # Final clear for safety
+         runner.cache.cache.clear()
+         cleaned_items += cache_entries
+
+         if debug:
+             debug.end_timer("runner_cache_clear", "Clearing main runner cache entries")
+
+             if cache_entries > 0:
+                 debug.log(f"Cleared {cache_entries} runtime cache entries", category="success")
+
+     # 2. Clear RoPE caches
+     if hasattr(runner, 'dit'):
+         if debug:
+             debug.start_timer("rope_cache_clear")
+
+         model = runner.dit
+         if hasattr(model, 'dit_model'):  # Handle wrapper
+             model = model.dit_model
+
+         rope_cleared = clear_rope_lru_caches(model=model, debug=debug)
+         cleaned_items += rope_cleared
+         if debug:
+             debug.end_timer("rope_cache_clear", "Clearing RoPE LRU caches")
+
+             if rope_cleared > 0:
+                 debug.log(f"Cleared {rope_cleared} RoPE LRU caches", category="success")
+
+     # 3. Clear temporary attributes
+     temp_attrs = ['_temp_cache', '_block_cache', '_swap_cache', '_generation_cache',
+                   '_rope_cache', '_intermediate_cache', '_backward_cache']
+
+     for obj in [runner, getattr(runner, 'dit', None), getattr(runner, 'vae', None)]:
+         if obj is None:
+             continue
+
+         actual_obj = obj.dit_model if hasattr(obj, 'dit_model') else obj
+
+         for attr in temp_attrs:
+             if hasattr(actual_obj, attr):
+                 delattr(actual_obj, attr)
+                 cleaned_items += 1
+
+     if debug:
+         debug.end_timer("runtime_cache_clear", "clear_runtime_caches() completion")
+
+     return cleaned_items
+
+
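The per-key `del` inside the loop above is deliberate: clearing the dict only at the end would keep every tensor reachable until the final `clear()`, delaying reuse by the allocator. A small sketch of the same release-as-you-iterate pattern; `release_tensor` is a hypothetical stand-in for this module's `release_tensor_memory`:

import torch

def release_tensor(t: torch.Tensor) -> None:
    # Hypothetical stand-in for release_tensor_memory():
    # detach in place so autograd no longer keeps the storage alive
    t.detach_()

def drain_cache(cache: dict) -> int:
    """Release tensor entries one key at a time so memory frees incrementally."""
    entries = len(cache)
    for key in list(cache.keys()):      # list() allows mutation while iterating
        value = cache.pop(key)          # popping drops the dict's reference now,
        if torch.is_tensor(value):      # not at a final clear() after the loop
            release_tensor(value)
        elif isinstance(value, (list, tuple)):
            for item in value:
                if torch.is_tensor(item):
                    release_tensor(item)
    return entries

print(drain_cache({'a': torch.zeros(4), 'b': [torch.ones(2)]}))  # -> 2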
1011
+ def cleanup_dit(runner: Any, debug: Optional['Debug'] = None, cache_model: bool = False) -> None:
+     """
+     Cleanup DiT model and BlockSwap state after the upscaling phase.
+     Called at the end of upscale_all_batches when DiT is no longer needed.
+
+     Args:
+         runner: Runner instance containing the DiT model
+         debug: Debug instance for logging
+         cache_model: If True, move DiT to offload_device; if False, delete completely
+     """
+     if not runner or not hasattr(runner, 'dit'):
+         return
+
+     if debug:
+         debug.log("Cleaning up DiT components", category="cleanup")
+
+     # 1. Clear DiT-specific runtime caches first
+     if hasattr(runner, 'dit'):
+         model = runner.dit
+         if hasattr(model, 'dit_model'):  # Handle wrapper
+             model = model.dit_model
+
+         # Clear RoPE caches
+         rope_cleared = clear_rope_lru_caches(model=model, debug=debug)
+         if rope_cleared > 0 and debug:
+             debug.log(f"Cleared {rope_cleared} RoPE LRU caches", category="success")
+
+         # Clear DiT temporary attributes
+         temp_attrs = ['_temp_cache', '_block_cache', '_swap_cache', '_generation_cache',
+                       '_rope_cache', '_intermediate_cache', '_backward_cache']
+
+         actual_obj = model.dit_model if hasattr(model, 'dit_model') else model
+         for attr in temp_attrs:
+             if hasattr(actual_obj, attr):
+                 delattr(actual_obj, attr)
+
+     # 2. Handle model offloading (for caching or before deletion)
+     try:
+         param_device = next(runner.dit.parameters()).device
+
+         # Move model off GPU if needed
+         if param_device.type not in ['meta', 'cpu']:
+             # MPS: skip CPU movement before deletion (unified memory, just causes sync)
+             if param_device.type == 'mps' and not cache_model:
+                 if debug:
+                     debug.log("DiT on MPS - skipping CPU movement before deletion", category="cleanup")
+             else:
+                 offload_target = getattr(runner, '_dit_offload_device', None)
+                 if offload_target is None or offload_target == 'none':
+                     offload_target = torch.device('cpu')
+                 reason = "model caching" if cache_model else "releasing GPU memory"
+                 manage_model_device(model=runner.dit, target_device=offload_target, model_name="DiT",
+                                     debug=debug, reason=reason, runner=runner)
+         elif param_device.type == 'meta' and debug:
+             debug.log("DiT on meta device - keeping structure for cache", category="cleanup")
+     except StopIteration:
+         pass
+
+     # 3. Clean BlockSwap after model movement
+     if hasattr(runner, "_blockswap_active") and runner._blockswap_active:
+         # Import here to avoid a circular dependency
+         from .blockswap import cleanup_blockswap
+         cleanup_blockswap(runner=runner, keep_state_for_cache=cache_model)
+
+     # 4. Complete cleanup if not caching
+     if not cache_model:
+         release_model_memory(model=runner.dit, debug=debug)
+         runner.dit = None
+         if debug:
+             debug.log("DiT model deleted", category="cleanup")
+
+         # Clear DiT config attributes - not needed when the model is not cached (it will be recreated)
+         if hasattr(runner, '_dit_compile_args'):
+             delattr(runner, '_dit_compile_args')
+         if hasattr(runner, '_dit_block_swap_config'):
+             delattr(runner, '_dit_block_swap_config')
+         if hasattr(runner, '_dit_attention_mode'):
+             delattr(runner, '_dit_attention_mode')
+
+     # 5. Clear DiT temporary attributes (should already be cleared in materialize_model)
+     runner._dit_checkpoint = None
+     runner._dit_dtype_override = None
+
+     # 6. Clear DiT-related components and temporary attributes
+     runner.sampler = None
+     runner.sampling_timesteps = None
+     runner.schedule = None
+
+
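In practice the `cache_model` flag is what separates "keep the DiT warm on the offload device" from "free everything". A hedged sketch of the two call sites implied by the docstring above; `runner` and `debug` are assumed to come from the surrounding pipeline:

# End of upscale_all_batches, user wants fast re-runs: keep DiT on the offload device
cleanup_dit(runner=runner, debug=debug, cache_model=True)

# One-shot run: reclaim everything (also tears down BlockSwap state)
cleanup_dit(runner=runner, debug=debug, cache_model=False)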
1100
+ def cleanup_vae(runner: Any, debug: Optional['Debug'] = None, cache_model: bool = False) -> None:
+     """
+     Cleanup VAE model after the decoding phase.
+     Called at the end of decode_all_batches when the VAE is no longer needed.
+
+     Args:
+         runner: Runner instance containing the VAE model
+         debug: Debug instance for logging
+         cache_model: If True, move VAE to offload_device; if False, delete completely
+     """
+     if not runner or not hasattr(runner, 'vae'):
+         return
+
+     if debug:
+         debug.log("Cleaning up VAE components", category="cleanup")
+
+     # 1. Clear VAE-specific temporary attributes
+     if hasattr(runner, 'vae'):
+         temp_attrs = ['_temp_cache', '_block_cache', '_swap_cache', '_generation_cache',
+                       '_rope_cache', '_intermediate_cache', '_backward_cache']
+
+         for attr in temp_attrs:
+             if hasattr(runner.vae, attr):
+                 delattr(runner.vae, attr)
+
+     # 2. Handle model offloading (for caching or before deletion)
+     try:
+         param_device = next(runner.vae.parameters()).device
+
+         # Move model off GPU if needed
+         if param_device.type not in ['meta', 'cpu']:
+             # MPS: skip CPU movement before deletion (unified memory, just causes sync)
+             if param_device.type == 'mps' and not cache_model:
+                 if debug:
+                     debug.log("VAE on MPS - skipping CPU movement before deletion", category="cleanup")
+             else:
+                 offload_target = getattr(runner, '_vae_offload_device', None)
+                 if offload_target is None or offload_target == 'none':
+                     offload_target = torch.device('cpu')
+                 reason = "model caching" if cache_model else "releasing GPU memory"
+                 manage_model_device(model=runner.vae, target_device=offload_target, model_name="VAE",
+                                     debug=debug, reason=reason, runner=runner)
+         elif param_device.type == 'meta' and debug:
+             debug.log("VAE on meta device - keeping structure for cache", category="cleanup")
+     except StopIteration:
+         pass
+
+     # 3. Complete cleanup if not caching
+     if not cache_model:
+         release_model_memory(model=runner.vae, debug=debug)
+         runner.vae = None
+         if debug:
+             debug.log("VAE model deleted", category="cleanup")
+
+         # Clear VAE config attributes - not needed when the model is not cached (it will be recreated)
+         if hasattr(runner, '_vae_compile_args'):
+             delattr(runner, '_vae_compile_args')
+         if hasattr(runner, '_vae_tiling_config'):
+             delattr(runner, '_vae_tiling_config')
+
+     # 4. Clear VAE temporary attributes (should already be cleared in materialize_model)
+     runner._vae_checkpoint = None
+     runner._vae_dtype_override = None
+
+
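Both cleanup paths probe the model's device with `next(model.parameters())`, and the `except StopIteration: pass` is what keeps them safe on parameterless modules (for example, a bare container left behind after weights were released). A small self-contained illustration of that guard:

import torch
from typing import Optional

def first_param_device(model: torch.nn.Module) -> Optional[torch.device]:
    try:
        return next(model.parameters()).device
    except StopIteration:
        # Modules with no registered parameters raise StopIteration here,
        # so report "unknown" instead of crashing the cleanup path
        return None

print(first_param_device(torch.nn.Linear(2, 2)))   # cpu
print(first_param_device(torch.nn.Sequential()))   # None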
1165
+ def complete_cleanup(runner: Any, debug: Optional['Debug'] = None, dit_cache: bool = False, vae_cache: bool = False) -> None:
+     """
+     Complete cleanup of the runner and remaining components, with independent model caching support.
+     This is a lightweight cleanup for the final stage, as model-specific cleanup
+     happens in the respective phases (cleanup_dit, cleanup_vae).
+
+     Args:
+         runner: Runner instance to clean up
+         debug: Debug instance for logging
+         dit_cache: If True, preserve the DiT model on offload_device for future runs
+         vae_cache: If True, preserve the VAE model on offload_device for future runs
+
+     Behavior:
+         - Can cache DiT and VAE independently for flexible memory management
+         - Preserves _dit_model_name and _vae_model_name when either model is cached, for change detection
+         - Clears all temporary attributes and runtime caches
+         - Performs deep memory cleanup only when both models are fully released
+
+     Note:
+         Model name tracking (_dit_model_name, _vae_model_name) is only cleared if neither
+         model is cached, enabling proper model change detection on subsequent runs.
+     """
+     if not runner:
+         return
+
+     cleanup_type = "partial cleanup" if (dit_cache or vae_cache) else "full cleanup"
+     if debug:
+         debug.log(f"Starting {cleanup_type}", category="cleanup")
+
+     # 1. Cleanup any remaining models if they still exist
+     # (This handles cases where phases were skipped or errored)
+     if hasattr(runner, 'dit') and runner.dit is not None:
+         cleanup_dit(runner=runner, debug=debug, cache_model=dit_cache)
+
+     if hasattr(runner, 'vae') and runner.vae is not None:
+         cleanup_vae(runner=runner, debug=debug, cache_model=vae_cache)
+
+     # 2. Clear remaining runtime caches
+     clear_runtime_caches(runner=runner, debug=debug)
+
+     # 3. Clear config and other non-model components when fully releasing the runner
+     if not (dit_cache or vae_cache):
+         # Full cleanup - clear config and model tracking
+         runner.config = None
+         runner._dit_model_name = None
+         runner._vae_model_name = None
+
+     # 4. Final memory cleanup
+     clear_memory(debug=debug, deep=True, force=True, timer_name="complete_cleanup")
+
+     # 5. Clear cuBLAS workspaces
+     if hasattr(torch._C, '_cuda_clearCublasWorkspaces'):
+         torch._C._cuda_clearCublasWorkspaces()
+
+     # Log which models are cached for the next run
+     if debug and (dit_cache or vae_cache):
+         cached_models = []
+         if dit_cache and hasattr(runner, '_dit_model_name'):
+             cached_models.append(f"DiT ({runner._dit_model_name})")
+         if vae_cache and hasattr(runner, '_vae_model_name'):
+             cached_models.append(f"VAE ({runner._vae_model_name})")
+
+         if cached_models:
+             models_str = " and ".join(cached_models)
+             debug.log(f"Models cached for next run: {models_str}", category="cache", force=True)
+
+     if debug:
+         debug.log(f"Completed {cleanup_type}", category="success")
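Because DiT and VAE caching are independent flags, the final-stage call can express mixed policies. A hedged usage sketch; only the flag combinations come from this module, the surrounding `runner` and `debug` objects are assumed:

# Keep the (large) DiT resident for the next run, but rebuild the VAE each time
complete_cleanup(runner=runner, debug=debug, dit_cache=True, vae_cache=False)

# Full teardown: also clears runner.config and the *_model_name trackers,
# so the next run is treated as a cold start
complete_cleanup(runner=runner, debug=debug, dit_cache=False, vae_cache=False)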
webui.bat ADDED
@@ -0,0 +1,187 @@
+ @echo off
+
+ chcp 65001
+ set PYTHONUTF8=1
+ :: The original source of the webui.bat file is stable-diffusion-webui
+ :: Modified and enhanced by Gemini with features for venv management and requirements handling.
+
+ :: --------- Configuration ---------
+ set COMMANDLINE_ARGS=
+
+ :: Define the application directory (folder name)
+ :: Leave empty if the app is in the root directory.
+ set APP_DIR=
+
+ :: Define the name of the launch application
+ set APPLICATION_NAME=app.py
+
+ :: Define the requirements filename; the default is requirements.txt
+ set REQUIREMENTS_FILE=requirements.txt
+
+ :: Define the name of the virtual environment directory
+ set VENV_NAME=venv
+
+ :: Set to 1 to always attempt to update packages from requirements.txt on every launch
+ set ALWAYS_UPDATE_REQS=1
+ :: ---------------------------------
+
+ :: --------- Path Setup Logic ---------
+ :: Handle paths based on whether APP_DIR is set
+ if defined APP_DIR (
+     set "TARGET_REQ=%~dp0%APP_DIR%\%REQUIREMENTS_FILE%"
+     set "TARGET_SCRIPT=%~dp0%APP_DIR%\%APPLICATION_NAME%"
+     echo Working in subdirectory: %APP_DIR%
+ ) else (
+     set "TARGET_REQ=%~dp0%REQUIREMENTS_FILE%"
+     set "TARGET_SCRIPT=%~dp0%APPLICATION_NAME%"
+     echo Working in root directory.
+ )
+ :: ------------------------------------
+
+ :: Set PYTHON executable if not already defined
+ if not defined PYTHON (set PYTHON=python)
+ :: Set VENV_DIR using VENV_NAME if not already defined
+ if not defined VENV_DIR (set "VENV_DIR=%~dp0%VENV_NAME%")
+
+ mkdir tmp 2>NUL
+
+ :: Check if Python is callable
+ %PYTHON% -c "" >tmp/stdout.txt 2>tmp/stderr.txt
+ if %ERRORLEVEL% == 0 goto :check_pip
+ echo Couldn't launch python
+ goto :show_stdout_stderr
+
+ :check_pip
+ :: Check if pip is available
+ %PYTHON% -mpip --help >tmp/stdout.txt 2>tmp/stderr.txt
+ if %ERRORLEVEL% == 0 goto :start_venv
+ :: If pip is not available and PIP_INSTALLER_LOCATION is set, try to install pip
+ if "%PIP_INSTALLER_LOCATION%" == "" goto :show_stdout_stderr
+ %PYTHON% "%PIP_INSTALLER_LOCATION%" >tmp/stdout.txt 2>tmp/stderr.txt
+ if %ERRORLEVEL% == 0 goto :start_venv
+ echo Couldn't install pip
+ goto :show_stdout_stderr
+
+ :start_venv
+ :: Skip venv creation/activation if VENV_DIR is explicitly set to "-"
+ if ["%VENV_DIR%"] == ["-"] goto :skip_venv_entirely
+ :: Skip venv creation/activation if SKIP_VENV is set to "1"
+ if ["%SKIP_VENV%"] == ["1"] goto :skip_venv_entirely
+
+ :: Check if the venv already exists by looking for Python.exe in its Scripts directory
+ dir "%VENV_DIR%\Scripts\Python.exe" >tmp/stdout.txt 2>tmp/stderr.txt
+ if %ERRORLEVEL% == 0 goto :activate_venv_and_maybe_update
+
+ :: Venv does not exist, create it
+ echo Virtual environment not found in "%VENV_DIR%". Creating a new one.
+ for /f "delims=" %%i in ('CALL %PYTHON% -c "import sys; print(sys.executable)"') do set PYTHON_FULLNAME="%%i"
+ echo Creating venv in directory %VENV_DIR% using python %PYTHON_FULLNAME%
+ %PYTHON_FULLNAME% -m venv "%VENV_DIR%" >tmp/stdout.txt 2>tmp/stderr.txt
+ if %ERRORLEVEL% NEQ 0 (
+     echo Unable to create venv in directory "%VENV_DIR%"
+     goto :show_stdout_stderr
+ )
+ echo Venv created.
+
+ :: Install requirements for the first time if the venv was just created
+ :: This section handles the initial installation of packages from requirements.txt
+ :: immediately after a new virtual environment is created.
+ echo Checking for %REQUIREMENTS_FILE% for initial setup...
+ if exist "%TARGET_REQ%" (
+     echo Found %REQUIREMENTS_FILE% at "%TARGET_REQ%", attempting to install for initial setup...
+     call "%VENV_DIR%\Scripts\activate.bat"
+     echo Installing packages from %REQUIREMENTS_FILE% ^(initial setup^)...
+     "%VENV_DIR%\Scripts\python.exe" -m pip install -r "%TARGET_REQ%"
+     REM "if errorlevel 1" is evaluated at run time; %ERRORLEVEL% would be
+     REM expanded once when this whole parenthesized block is parsed.
+     if errorlevel 1 (
+         echo Failed to install requirements during initial setup. Please check the output above.
+         pause
+         goto :show_stdout_stderr_custom_pip_initial
+     )
+     echo Initial requirements installed successfully.
+     call "%VENV_DIR%\Scripts\deactivate.bat"
+ ) else (
+     echo No %REQUIREMENTS_FILE% found at "%TARGET_REQ%", skipping package installation.
+ )
+ goto :activate_venv_and_maybe_update
+
+
+ :activate_venv_and_maybe_update
+ :: This label is reached if the venv exists or was just created.
+ :: Set PYTHON to point to the venv's Python interpreter.
+ set PYTHON="%VENV_DIR%\Scripts\Python.exe"
+ echo Activating venv: %PYTHON%
+
+ :: Always update requirements if ALWAYS_UPDATE_REQS is 1
+ :: This section allows packages from requirements.txt to be updated on every launch
+ :: when the ALWAYS_UPDATE_REQS variable is set to 1.
+ if defined ALWAYS_UPDATE_REQS (
+     if "%ALWAYS_UPDATE_REQS%"=="1" (
+         echo ALWAYS_UPDATE_REQS is enabled.
+         if exist "%TARGET_REQ%" (
+             echo Attempting to update packages from "%TARGET_REQ%"...
+             REM No need to call activate.bat here again; PYTHON already points to the venv's python
+             %PYTHON% -m pip install -r "%TARGET_REQ%"
+             if errorlevel 1 (
+                 echo Failed to update requirements. Please check the output above.
+                 pause
+                 goto :endofscript
+             )
+             echo Requirements updated successfully.
+         ) else (
+             echo ALWAYS_UPDATE_REQS is enabled, but no %REQUIREMENTS_FILE% found. Skipping update.
+         )
+     ) else (
+         echo ALWAYS_UPDATE_REQS is not enabled or not set to 1. Skipping routine update.
+     )
+ )
+
+ goto :launch
+
+ :skip_venv_entirely
+ :: This label is reached if venv usage is explicitly skipped.
+ echo Skipping venv.
+ goto :launch
+
+ :launch
+ :: Launch the main application
+ echo Launching Web UI with arguments: %COMMANDLINE_ARGS% %*
+ echo Script path: %TARGET_SCRIPT%
+ %PYTHON% "%TARGET_SCRIPT%" %COMMANDLINE_ARGS% %*
+ echo Launch finished.
+ pause
+ exit /b
+
+ :show_stdout_stderr_custom_pip_initial
+ :: Custom error handler for failures during the initial pip install process.
+ echo.
+ echo exit code ^(pip initial install^): %errorlevel%
+ echo Errors during initial pip install. See output above.
+ echo.
+ echo Launch unsuccessful. Exiting.
+ pause
+ exit /b
+
+
+ :show_stdout_stderr
+ :: General error handler: displays stdout and stderr from the tmp directory.
+ echo.
+ echo exit code: %errorlevel%
+
+ for /f %%i in ("tmp\stdout.txt") do set size=%%~zi
+ if %size% equ 0 goto :show_stderr
+ echo.
+ echo stdout:
+ type tmp\stdout.txt
+
+ :show_stderr
+ for /f %%i in ("tmp\stderr.txt") do set size=%%~zi
+ if %size% equ 0 goto :endofscript
+ echo.
+ echo stderr:
+ type tmp\stderr.txt
+
+ :endofscript
+ echo.
+ echo Launch unsuccessful. Exiting.
+ pause
+ exit /b
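Note on argument handling in the launcher above: `%COMMANDLINE_ARGS%` is prepended and `%*` forwards whatever the user typed, so a call such as `webui.bat --share` would reach `app.py` unchanged, assuming the Python side parses its own arguments; `--share` is only an illustrative flag here, not one this script is known to define.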