Fix multi-GPU support and memory management (#17)

* Ensure projector is on the same device as the matrix for multi-GPU support

* Optimize memory management for loaded model weights

* Refactor memory management by removing unnecessary gc.collect() calls

* Optimize memory usage (#1)

* Improve memory management by explicitly deleting model layers and optimizing projector usage

* Optimize memory management by explicitly deleting the model and forcing garbage collection

* Add back deleted `empty_cache` call

* Fix broken file

* Remove unnecessary deletions

* Remove unnecessary empty_cache() calls

* Remove unused import of gc

* Duplicate `gc.collect` call in `empty_cache()`

* Move additional `gc.collect` call in front of `torch.x.empty_cache`
This commit is contained in:
Nikolai Kolodziej
2025-11-19 00:39:12 +01:00
committed by GitHub
parent 61fdf72b42
commit c8b6663b93
3 changed files with 11 additions and 1 deletions
+6
View File
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# Copyright (C) 2025 Philipp Emanuel Weidmann <pew@worldwidemann.com>
import os
import math
import sys
import time
@@ -34,6 +35,7 @@ from .config import Settings
from .evaluator import Evaluator
from .model import AbliterationParameters, Model
from .utils import (
empty_cache,
format_duration,
get_readme_intro,
get_trial_parameters,
@@ -44,6 +46,10 @@ from .utils import (
def run():
# Enable expandable segments to reduce memory fragmentation on multi-GPU setups.
if "PYTORCH_CUDA_ALLOC_CONF" not in os.environ:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# Modified "Pagga" font from https://budavariam.github.io/asciiart-text/
print(f"[cyan]█░█░█▀▀░█▀▄░█▀▀░▀█▀░█░█▀▀[/] v{version('heretic-llm')}")
print("[cyan]█▀█░█▀▀░█▀▄░█▀▀░░█░░█░█░░[/]")
+3 -1
View File
@@ -213,8 +213,10 @@ class Model:
).to(self.model.dtype)
for matrix in matrices:
# Ensure projector is on the same device as the matrix for multi-GPU support.
device_projector = projector.to(matrix.device)
# In-place subtraction is safe as we're not using Autograd.
matrix.sub_(weight * (projector @ matrix))
matrix.sub_(weight * (device_projector @ matrix))
def get_chat(self, prompt: str) -> list[dict[str, str]]:
return [
+2
View File
@@ -48,6 +48,8 @@ def batchify(items: list[T], batch_size: int) -> list[list[T]]:
def empty_cache():
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
elif is_xpu_available():