Fix multi-GPU support and memory management (#17)

* Ensure projector is on the same device as the matrix for multi-GPU support
* Optimize memory management for loaded model weights
* Refactor memory management by removing unnecessary gc.collect() calls
* Optimize memory usage (#1)
* Improve memory management by explicitly deleting model layers and optimizing projector usage
* Optimize memory management by explicitly deleting the model and forcing garbage collection
* Add back deleted `empty_cache` call
* Fix broken file
* Remove unnecessary deletions
* Remove unnecessary empty_cache() calls
* Remove unused import of gc
* Duplicate `gc.collect` call in `empty_cache()`
* Move additional `gc.collect` call in front of `torch.x.empty_cache`
2025-11-19 00:39:12 +01:00
parent 61fdf72b42
commit c8b6663b93
3 changed files with 11 additions and 1 deletions
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # Copyright (C) 2025  Philipp Emanuel Weidmann <pew@worldwidemann.com>
 import os
 import math
 import sys
 import time
@@ -34,6 +35,7 @@ from .config import Settings
 from .evaluator import Evaluator
 from .model import AbliterationParameters, Model
 from .utils import (
    empty_cache,
    format_duration,
    get_readme_intro,
    get_trial_parameters,
@@ -44,6 +46,10 @@ from .utils import (
 def run():
    # Enable expandable segments to reduce memory fragmentation on multi-GPU setups.
    if "PYTORCH_CUDA_ALLOC_CONF" not in os.environ:
        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
    # Modified "Pagga" font from https://budavariam.github.io/asciiart-text/
    print(f"[cyan]█░█░█▀▀░█▀▄░█▀▀░▀█▀░█░█▀▀[/]  v{version('heretic-llm')}")
    print("[cyan]█▀█░█▀▀░█▀▄░█▀▀░░█░░█░█░░[/]")
@@ -213,8 +213,10 @@ class Model:
                ).to(self.model.dtype)
                for matrix in matrices:
                    # Ensure projector is on the same device as the matrix for multi-GPU support.
                    device_projector = projector.to(matrix.device)
                    # In-place subtraction is safe as we're not using Autograd.
-                    matrix.sub_(weight * (projector @ matrix))
+                    matrix.sub_(weight * (device_projector @ matrix))
    def get_chat(self, prompt: str) -> list[dict[str, str]]:
        return [
@@ -48,6 +48,8 @@ def batchify(items: list[T], batch_size: int) -> list[list[T]]:
 def empty_cache():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    elif is_xpu_available():