Bring the cleanup work into the Python script and drop the shell scripts
Some checks failed
Download Missing Models / download-models (push) Has been cancelled

This commit is contained in:
zcourts
2025-09-27 18:29:05 +01:00
parent 0c748f1497
commit 849e7c4699
4 changed files with 203 additions and 175 deletions

1
.gitignore vendored
View File

@@ -1 +1,2 @@
.idea .idea
tmp

View File

@@ -1,49 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail
# cleanup.sh - Commits, pushes, and prunes LFS files.
#
# - Detects *untracked* files (git status --porcelain), so we don't skip commits.
# - Uses 'git add --renormalize .' so new/changed .gitattributes rules convert
#   existing files into LFS pointers on re-add.
# - Keeps the prune step to free local disk space after a successful push.
#
# Usage: ./tools/cleanup.sh <commit-message>

# Print an error to stderr and abort.
die() { printf '%s\n' "$*" >&2; exit 1; }

if [[ "$#" -ne 1 ]]; then
  die "Usage: $0 <commit-message>"
fi
readonly COMMIT_MESSAGE="$1"

# Detect any changes, including untracked files.
if [[ -z "$(git status --porcelain=v1)" ]]; then
  printf '%s\n' "No new files or changes to commit. Skipping commit and push."
  exit 0
fi

printf '%s\n' "Committing and pushing changes..."

# Make sure .gitattributes changes are included and normalization runs,
# so LFS filters rewrite eligible files as pointers.
# '|| true' keeps 'set -e' from killing us when .gitattributes doesn't exist.
git add .gitattributes || true
git add --renormalize .

# If nothing ended up staged (e.g. only ignored files changed), exit gracefully.
# 'git diff --cached --quiet' exits 0 exactly when nothing is staged.
if git diff --cached --quiet; then
  printf '%s\n' "No staged changes after normalization. Skipping commit and push."
  exit 0
fi

git commit -m "$COMMIT_MESSAGE"
git push

# Optional but useful: ensure all LFS objects are on the remote.
# Uncomment if you want belt-and-suspenders uploads.
# git lfs push origin --all

printf '%s\n' "Pruning local LFS files..."
git lfs prune --force
printf '%s\n' "✅ Cleanup complete."

View File

@@ -1,25 +1,36 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
download.py - Download/repair model files and update model.yaml metadata. download.py - Download/repair model files, update model.yaml metadata,
and commit/push changes with proper Git LFS handling (no bash script needed).
Usage: Usage:
./tools/download.py models/llama-2-7b-chat/model.yaml ./tools/download.py models/llama-2-7b-chat/model.yaml
- Always (re)runs snapshot_download with resume support, so partially What this does:
fetched directories get completed instead of being skipped. - (Re)runs snapshot_download with resume support, so partially fetched directories
get completed instead of being skipped.
- Avoids adding Hugging Face housekeeping like ".cache/**" to your YAML.
- Updates YAML after each variant with fresh file list + total size. - Updates YAML after each variant with fresh file list + total size.
- Tracks LFS via sensible patterns (plus a size threshold fallback). - Tracks LFS via sensible patterns (plus a size threshold fallback) using
- Emits clear logs so you can see progress per variant. repo-relative paths so it actually applies.
- Runs a built-in cleanup step (commit, push, optional LFS push, and prune),
replacing the old cleanup.sh.
""" """
import sys from __future__ import annotations
import os import os
import sys
import yaml import yaml
import subprocess import subprocess
from pathlib import Path from pathlib import Path
from typing import Iterable from typing import Iterable, List, Optional
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
# ----------------------------
# Configuration
# ----------------------------
LFS_PATTERNS: list[str] = [ LFS_PATTERNS: list[str] = [
# Extensions commonly used for model artifacts # Extensions commonly used for model artifacts
"*.safetensors", "*.safetensors",
@@ -37,14 +48,68 @@ LFS_PATTERNS: list[str] = [
SIZE_THRESHOLD_BYTES = 1_000_000 # 1 MB fallback if a file doesn't match any pattern SIZE_THRESHOLD_BYTES = 1_000_000 # 1 MB fallback if a file doesn't match any pattern
def run(cmd: list[str], check: bool = True) -> None: # By default we skip pushing all LFS objects (same as prior bash script).
subprocess.run(cmd, check=check) # Set env GIT_LFS_PUSH_ALL=1 to force a full "git lfs push origin --all".
LFS_PUSH_ALL = os.environ.get("GIT_LFS_PUSH_ALL", "0") == "1"
def track_lfs_patterns(patterns: Iterable[str]) -> None: # ----------------------------
# Small subprocess helpers
# ----------------------------
def run(cmd: list[str], check: bool = True, cwd: Optional[Path] = None) -> None:
subprocess.run(cmd, check=check, cwd=str(cwd) if cwd else None)
def run_capture(cmd: list[str], cwd: Optional[Path] = None) -> str:
out = subprocess.check_output(cmd, cwd=str(cwd) if cwd else None, stderr=subprocess.DEVNULL)
return out.decode().strip()
# ----------------------------
# Git / LFS utilities
# ----------------------------
def ensure_repo_root() -> Path:
""" """
Track a set of patterns in Git LFS. This is idempotent; it just Ensure we're in a git repo; install LFS filters locally; return repo root.
appends to .gitattributes as needed. """
try:
subprocess.run(
["git", "rev-parse", "--is-inside-work-tree"],
check=True,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
# Make sure LFS filters are active in this repo (idempotent)
subprocess.run(
["git", "lfs", "install", "--local"],
check=False,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
root = run_capture(["git", "rev-parse", "--show-toplevel"])
return Path(root)
except Exception:
print("⚠️ Not inside a Git repository? Git/LFS steps may fail.", file=sys.stderr)
return Path.cwd()
def repo_relative_path(repo_root: Path, p: Path) -> Path:
"""
Return a path to p relative to repo_root. Works even if p is not a strict subpath
(falls back to os.path.relpath).
"""
try:
return p.resolve().relative_to(repo_root.resolve())
except Exception:
# Fallback (handles symlinks / different mounts)
return Path(os.path.relpath(p.resolve(), repo_root.resolve()))
def lfs_track_patterns(patterns: Iterable[str]) -> None:
"""
Track a set of glob patterns in Git LFS (idempotent).
""" """
for patt in patterns: for patt in patterns:
try: try:
@@ -54,18 +119,111 @@ def track_lfs_patterns(patterns: Iterable[str]) -> None:
pass pass
def list_files_under(root: Path) -> list[Path]: def lfs_track_file(repo_root: Path, path_in_repo: Path) -> None:
return [p for p in root.rglob("*") if p.is_file()] """
Track an individual file in Git LFS using a repo-relative path.
"""
def ensure_repo_root() -> None: # Normalize to POSIX-like string for .gitattributes consistency
# best effort: warn (but dont die) if not in a git repo rel = str(path_in_repo.as_posix())
try: try:
subprocess.run(["git", "rev-parse", "--is-inside-work-tree"], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) run(["git", "lfs", "track", rel], check=False, cwd=repo_root)
except Exception: except Exception:
print("⚠️ Not inside a Git repository? Git/LFS steps may fail.", file=sys.stderr) pass
def git_status_has_changes(repo_root: Path) -> bool:
try:
status = run_capture(["git", "status", "--porcelain=v1"], cwd=repo_root)
return bool(status.strip())
except Exception:
return False
def git_stage_and_commit_push(
repo_root: Path,
scope_paths: list[Path],
commit_message: str,
lfs_push_all: bool = False,
) -> None:
"""
Stages .gitattributes + renormalizes only the provided scope paths (e.g., 'models/…'),
then commits, pushes, optionally pushes all LFS objects, and finally prunes LFS.
"""
# Stage .gitattributes explicitly (ignore failures)
try:
run(["git", "add", ".gitattributes"], check=False, cwd=repo_root)
except Exception:
pass
# Renormalize only the relevant directories to avoid sweeping the whole repo.
# If scope_paths is empty, fall back to full repo (conservative).
if scope_paths:
for sp in scope_paths:
rel = repo_relative_path(repo_root, sp)
run(["git", "add", "--renormalize", str(rel)], check=False, cwd=repo_root)
else:
run(["git", "add", "--renormalize", "."], check=False, cwd=repo_root)
# If nothing is staged, skip
staged_is_empty = False
try:
# 'git diff --cached --quiet' exits 0 when no staged changes
subprocess.run(["git", "diff", "--cached", "--quiet"], cwd=str(repo_root))
staged_is_empty = True
except Exception:
staged_is_empty = False
if staged_is_empty:
print("No staged changes after normalization. Skipping commit and push.")
return
print("Committing and pushing changes...")
run(["git", "commit", "-m", commit_message], cwd=repo_root)
# Push main refs
run(["git", "push"], cwd=repo_root)
# Optionally ensure all LFS objects are uploaded
if lfs_push_all:
try:
run(["git", "lfs", "push", "origin", "--all"], check=True, cwd=repo_root)
except subprocess.CalledProcessError as e:
print(f"⚠️ 'git lfs push --all' failed: {e}. Continuing.", file=sys.stderr)
# Prune local LFS to save disk
try:
run(["git", "lfs", "prune", "--force"], check=False, cwd=repo_root)
except Exception as e:
print(f"⚠️ 'git lfs prune' failed: {e}. Continuing.", file=sys.stderr)
print("✅ Cleanup complete.")
# ----------------------------
# Filesystem helpers
# ----------------------------
def list_files_under(root: Path) -> list[Path]:
"""
Recursively collect files under `root`, skipping housekeeping dirs.
"""
skip_dirs = {".git", ".cache", ".hf_mirror_cache"}
files: list[Path] = []
for p in root.rglob("*"):
if not p.is_file():
continue
rel_parts = p.relative_to(root).parts
# Skip files if any parent is a hidden/skip dir
if any(part in skip_dirs or part.startswith(".") for part in rel_parts[:-1]):
continue
files.append(p)
return files
# ----------------------------
# Main routine
# ----------------------------
def main() -> None: def main() -> None:
if len(sys.argv) != 2: if len(sys.argv) != 2:
print(f"Usage: {sys.argv[0]} <path-to-model.yaml>", file=sys.stderr) print(f"Usage: {sys.argv[0]} <path-to-model.yaml>", file=sys.stderr)
@@ -76,7 +234,7 @@ def main() -> None:
print(f"Model YAML not found: {model_yaml_path}", file=sys.stderr) print(f"Model YAML not found: {model_yaml_path}", file=sys.stderr)
sys.exit(1) sys.exit(1)
ensure_repo_root() repo_root = ensure_repo_root()
# Load YAML # Load YAML
with open(model_yaml_path, "r", encoding="utf-8") as f: with open(model_yaml_path, "r", encoding="utf-8") as f:
@@ -85,7 +243,7 @@ def main() -> None:
model_dir = model_yaml_path.parent model_dir = model_yaml_path.parent
# Proactively set up LFS tracking by patterns (idempotent) # Proactively set up LFS tracking by patterns (idempotent)
track_lfs_patterns(LFS_PATTERNS) lfs_track_patterns(LFS_PATTERNS)
# Iterate formats & variants # Iterate formats & variants
formats = (data.get("model") or {}).get("formats") or [] formats = (data.get("model") or {}).get("formats") or []
@@ -104,19 +262,14 @@ def main() -> None:
repo_id = hf_repo.replace("https://huggingface.co/", "") repo_id = hf_repo.replace("https://huggingface.co/", "")
print(f"\n[DL] Downloading/resuming variant '{variant_id}' from '{repo_id}' into '{dest_path}'") print(f"\n[DL] Downloading/resuming variant '{variant_id}' from '{repo_id}' into '{dest_path}'")
# Always call snapshot_download with resume enabled. This will: # Always call snapshot_download with resume enabled. Filter out .cache.
# - no-op for already-complete files
# - resume partials
# - fetch any missing files
try: try:
snapshot_download( snapshot_download(
repo_id=repo_id, repo_id=repo_id,
local_dir=str(dest_path), local_dir=str(dest_path),
local_dir_use_symlinks=False, local_dir_use_symlinks=False,
resume_download=True, # explicit resume_download=True, # explicit
# You can add allow_patterns / ignore_patterns if you want to filter ignore_patterns=[".cache/**"], # prevent housekeeping into tree
# allow_patterns=None,
# ignore_patterns=None,
) )
except Exception as e: except Exception as e:
print(f"❌ snapshot_download failed for {variant_id}: {e}", file=sys.stderr) print(f"❌ snapshot_download failed for {variant_id}: {e}", file=sys.stderr)
@@ -127,8 +280,8 @@ def main() -> None:
total_size_bytes = 0 total_size_bytes = 0
for p in list_files_under(dest_path): for p in list_files_under(dest_path):
rel = p.relative_to(model_dir) rel_to_model = p.relative_to(model_dir)
files_list.append(str(rel)) files_list.append(str(rel_to_model).replace("\\", "/"))
try: try:
size = p.stat().st_size size = p.stat().st_size
except FileNotFoundError: except FileNotFoundError:
@@ -138,8 +291,8 @@ def main() -> None:
# Fallback: ensure big files get tracked even if patterns miss them # Fallback: ensure big files get tracked even if patterns miss them
if size > SIZE_THRESHOLD_BYTES: if size > SIZE_THRESHOLD_BYTES:
# Idempotent; harmless if already tracked. rel_to_repo = repo_relative_path(repo_root, p)
run(["git", "lfs", "track", str(p)], check=False) lfs_track_file(repo_root, rel_to_repo)
files_list.sort() files_list.sort()
variant["files"] = files_list variant["files"] = files_list
@@ -150,15 +303,25 @@ def main() -> None:
yaml.dump(data, f, sort_keys=False, allow_unicode=True) yaml.dump(data, f, sort_keys=False, allow_unicode=True)
print(f"✅ Updated {model_yaml_path} for variant '{variant_id}'") print(f"✅ Updated {model_yaml_path} for variant '{variant_id}'")
# Run cleanup script to commit, push, and prune
commit_message = f"Add/update model files for {model_dir.name}/{variant_id}" # ---- Built-in cleanup (replaces cleanup.sh) ----
print(f"🧹 Running cleanup for {variant_id}...") print(f"🧹 Running cleanup for {variant_id}...")
try: try:
run(["./tools/cleanup.sh", commit_message], check=True) # Only scope renormalization to model_dir to keep things fast.
git_changed = git_status_has_changes(repo_root)
if not git_changed:
print("No new files or changes to commit. Skipping commit and push.")
else:
commit_message = f"Add/update model files for {model_dir.name}/{variant_id}"
git_stage_and_commit_push(
repo_root=repo_root,
scope_paths=[model_dir],
commit_message=commit_message,
lfs_push_all=LFS_PUSH_ALL,
)
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
print(f"cleanup.sh failed (continue to next variant): {e}", file=sys.stderr) print(f"Cleanup failed (continue to next variant): {e}", file=sys.stderr)
# Decide whether to continue or abort; continuing is usually fine. # Continue to next variant
# raise # uncomment to abort on failure
print(f"\n✅ Download and YAML update complete for {model_yaml_path}.") print(f"\n✅ Download and YAML update complete for {model_yaml_path}.")

View File

@@ -1,87 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail
# download.sh - Download model files and update model.yaml metadata.
#
# This script reads a model.yaml file, downloads the complete model data from
# the specified Hugging Face repository, and then updates the 'files' array
# in the YAML with the paths of the downloaded files.
#
# This approach is more robust than specifying files manually, as it ensures
# the YAML reflects the actual downloaded content.
#
# Usage: ./tools/download.sh models/llama-2-7b/model.yaml

# Print an error to stderr and abort.
die() { printf '%s\n' "$*" >&2; exit 1; }

if [[ "$#" -ne 1 ]]; then
  die "Usage: $0 <path-to-model.yaml>"
fi

MODEL_YAML="$1"
MODEL_DIR=$(dirname "$MODEL_YAML")

[[ -f "$MODEL_YAML" ]] || die "Model YAML not found: $MODEL_YAML"

# Ensure required tools are present ('command -v' is the portable check).
command -v yq >/dev/null 2>&1 \
  || die "Error: yq is not installed. Install it with: pip install yq or brew install yq"
command -v huggingface-cli >/dev/null 2>&1 \
  || die "Error: huggingface-cli is not installed. Install it with: pip install huggingface_hub"

printf 'Reading metadata from %s...\n' "$MODEL_YAML"

# Create a temporary file to store the updated YAML content
TMP_YAML=$(mktemp)
trap 'rm -f "$TMP_YAML"' EXIT
cp "$MODEL_YAML" "$TMP_YAML"

# Loop over each format and variant to download files.
# NOTE: the loop body runs in a pipeline subshell; that is fine here because
# all state is written to $TMP_YAML on disk, not to shell variables.
yq -r '.formats[] | . as $format | .variants[] | . as $variant | "\($format.type)\|\($variant.id)\|\($variant.hf_repo)"' "$MODEL_YAML" | while IFS='|' read -r format_type variant_id hf_repo; do
  printf '\nProcessing variant: %s (format: %s) from %s\n' "$variant_id" "$format_type" "$hf_repo"

  DEST_PATH="$MODEL_DIR/$variant_id"
  mkdir -p "$DEST_PATH"

  # Always invoke the downloader: huggingface-cli resumes partial downloads
  # and no-ops on complete files, so a half-fetched directory gets completed
  # instead of being skipped by a naive "directory is non-empty" check.
  repo_id=${hf_repo#https://huggingface.co/}
  printf '[DL] Downloading/resuming files for %s from %s...\n' "$variant_id" "$repo_id"
  huggingface-cli download "$repo_id" --local-dir "$DEST_PATH" --local-dir-use-symlinks False

  # List the downloaded files relative to the model directory.
  # Parameter expansion replaces GNU-only 'realpath --relative-to', which is
  # missing on macOS/BSD; LC_ALL=C sort makes the YAML file order deterministic.
  downloaded_files=()
  while IFS= read -r file; do
    downloaded_files+=("${file#"$MODEL_DIR"/}")
  done < <(find "$DEST_PATH" -type f | LC_ALL=C sort)

  # Update the YAML file with the list of downloaded files for the current variant
  printf 'Updating %s with downloaded file paths for %s...\n' "$MODEL_YAML" "$variant_id"

  # Reset the variant's files array, then append each path.
  # NOTE(review): paths containing '"' or '\' would break this interpolated
  # yq expression — acceptable for HF repo contents, but worth knowing.
  yq_exp="(.formats[] | select(.type == \"$format_type\") | .variants[] | select(.id == \"$variant_id\") | .files) = []"
  yq eval -i "$yq_exp" "$TMP_YAML"

  for file in "${downloaded_files[@]}"; do
    yq_exp="(.formats[] | select(.type == \"$format_type\") | .variants[] | select(.id == \"$variant_id\") | .files) += [\"$file\"]"
    yq eval -i "$yq_exp" "$TMP_YAML"
  done
done

# Replace the original YAML with the updated one
mv "$TMP_YAML" "$MODEL_YAML"
# The trap's 'rm -f' on the now-moved temp path is a harmless no-op.
printf '\n✅ Download and YAML update complete for %s.\n' "$MODEL_YAML"