Improve support for loading local datasets (#33)

* Handle loading local datasets * Reorder branches to avoid chain of negatives
2025-11-23 06:45:34 +01:00
parent 83cbf0612a
commit b79b8b1475
1 changed files with 36 additions and 2 deletions
@@ -2,8 +2,10 @@
 # Copyright (C) 2025  Philipp Emanuel Weidmann <pew@worldwidemann.com>

 import gc
+import os
 from dataclasses import asdict
 from importlib.metadata import version
+from pathlib import Path
 from typing import TypeVar

 import torch
@@ -13,7 +15,10 @@ from accelerate.utils import (
    is_sdaa_available,
    is_xpu_available,
 )
-from datasets import load_dataset
+from datasets import ReadInstruction, load_dataset, load_from_disk
+from datasets.config import DATASET_STATE_JSON_FILENAME
+from datasets.download.download_manager import DownloadMode
+from datasets.utils.info_utils import VerificationMode
 from optuna import Trial
 from rich.console import Console

@@ -36,7 +41,36 @@ def format_duration(seconds: float) -> str:


 def load_prompts(specification: DatasetSpecification) -> list[str]:
     """Load the prompts described by a dataset specification.

     Three kinds of ``specification.dataset`` values are handled:

     * A local directory written by ``datasets.save_to_disk`` (detected by the
       presence of the dataset state file). It is loaded with
       ``load_from_disk`` and ``specification.split`` is applied manually.
     * Any other local directory, which is handed to ``load_dataset`` with
       verification disabled and caching bypassed.
     * Anything else, which is passed to ``load_dataset`` unchanged.

     Args:
         specification: Supplies the dataset path (``dataset``), the split
             instruction string (``split``) and the name of the column that
             holds the prompts (``column``).

     Returns:
         The values of the requested column for the requested split, as a list.
     """
     path = specification.dataset
     split_str = specification.split

     if os.path.isdir(path):
         if Path(path, DATASET_STATE_JSON_FILENAME).exists():
             # Dataset saved with datasets.save_to_disk; needs special handling.
             # Path should be the subdirectory for a particular split.
             dataset = load_from_disk(path)
             # Parse the split instructions.
             ri = ReadInstruction.from_spec(split_str)
             # Associate the split with its number of examples (lines).
             name2len = {str(dataset.split): len(dataset)}
             # A split specification may combine several parts with "+"
             # (e.g. "train[:100]+train[200:300]"), which yields one absolute
             # instruction per part. Honor all of them, in order, instead of
             # silently dropping everything after the first; this matches what
             # load_dataset does for the same specification.
             prompts: list[str] = []
             for abs_i in ri.to_absolute(name2len):
                 # Slicing the dataset yields a mapping from column name to
                 # the list of values in that range.
                 rows = dataset[abs_i.from_ : abs_i.to]
                 prompts.extend(rows[specification.column])
             return prompts

         # Path is a plain local directory.
         dataset = load_dataset(
             path,
             split=split_str,
             # Don't require the number of examples (lines) per split to be pre-defined.
             verification_mode=VerificationMode.NO_CHECKS,
             # But also don't use cached data, as the dataset may have changed on disk.
             download_mode=DownloadMode.FORCE_REDOWNLOAD,
         )
     else:
         # Probably a repository path; let load_dataset figure it out.
         dataset = load_dataset(path, split=split_str)

     return list(dataset[specification.column])