Bring the cleanup work into the Python script and drop the shell scripts
Some checks failed
Download Missing Models / download-models (push) Has been cancelled

This commit is contained in:
zcourts
2025-09-27 18:29:05 +01:00
parent 0c748f1497
commit 849e7c4699
4 changed files with 203 additions and 175 deletions

1
.gitignore vendored
View File

@@ -1 +1,2 @@
.idea .idea
tmp

View File

@@ -1,49 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail
# cleanup.sh - Commits, pushes, and prunes LFS files.
#
# - Detects *untracked* files (git status --porcelain), so we don't skip commits.
# - Uses 'git add --renormalize .' so new/changed .gitattributes rules convert
#   existing files into LFS pointers on re-add.
# - Keeps the prune step to free local disk space after a successful push.
#
# Usage: ./tools/cleanup.sh <commit-message>

# Print an error to stderr and abort.
die() { printf '%s\n' "$*" >&2; exit 1; }

if [[ "$#" -ne 1 ]]; then
  die "Usage: $0 <commit-message>"
fi
readonly COMMIT_MESSAGE="$1"

# Detect any changes, including untracked files.
if [[ -z "$(git status --porcelain=v1)" ]]; then
  printf '%s\n' "No new files or changes to commit. Skipping commit and push."
  exit 0
fi

printf '%s\n' "Committing and pushing changes..."

# Make sure .gitattributes changes are included and normalization runs,
# so LFS filters rewrite eligible files as pointers.
# '|| true' keeps 'set -e' from killing us when .gitattributes doesn't exist.
git add .gitattributes || true
git add --renormalize .

# If nothing ended up staged (e.g. only ignored files changed), exit gracefully.
# 'git diff --cached --quiet' exits 0 exactly when nothing is staged.
if git diff --cached --quiet; then
  printf '%s\n' "No staged changes after normalization. Skipping commit and push."
  exit 0
fi

git commit -m "$COMMIT_MESSAGE"
git push

# Optional but useful: ensure all LFS objects are on the remote.
# Uncomment if you want belt-and-suspenders uploads.
# git lfs push origin --all

printf '%s\n' "Pruning local LFS files..."
git lfs prune --force
printf '%s\n' "✅ Cleanup complete."

View File

@@ -1,25 +1,36 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
download.py - Download/repair model files and update model.yaml metadata. download.py - Download/repair model files, update model.yaml metadata,
and commit/push changes with proper Git LFS handling (no bash script needed).
Usage: Usage:
./tools/download.py models/llama-2-7b-chat/model.yaml ./tools/download.py models/llama-2-7b-chat/model.yaml
- Always (re)runs snapshot_download with resume support, so partially What this does:
fetched directories get completed instead of being skipped. - (Re)runs snapshot_download with resume support, so partially fetched directories
get completed instead of being skipped.
- Avoids adding Hugging Face housekeeping like ".cache/**" to your YAML.
- Updates YAML after each variant with fresh file list + total size. - Updates YAML after each variant with fresh file list + total size.
- Tracks LFS via sensible patterns (plus a size threshold fallback). - Tracks LFS via sensible patterns (plus a size threshold fallback) using
- Emits clear logs so you can see progress per variant. repo-relative paths so it actually applies.
- Runs a built-in cleanup step (commit, push, optional LFS push, and prune),
replacing the old cleanup.sh.
""" """
import sys from __future__ import annotations
import os import os
import sys
import yaml import yaml
import subprocess import subprocess
from pathlib import Path from pathlib import Path
from typing import Iterable from typing import Iterable, List, Optional
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
# ----------------------------
# Configuration
# ----------------------------
LFS_PATTERNS: list[str] = [ LFS_PATTERNS: list[str] = [
# Extensions commonly used for model artifacts # Extensions commonly used for model artifacts
"*.safetensors", "*.safetensors",
@@ -37,14 +48,68 @@ LFS_PATTERNS: list[str] = [
SIZE_THRESHOLD_BYTES = 1_000_000 # 1 MB fallback if a file doesn't match any pattern SIZE_THRESHOLD_BYTES = 1_000_000 # 1 MB fallback if a file doesn't match any pattern
def run(cmd: list[str], check: bool = True) -> None: # By default we skip pushing all LFS objects (same as prior bash script).
subprocess.run(cmd, check=check) # Set env GIT_LFS_PUSH_ALL=1 to force a full "git lfs push origin --all".
LFS_PUSH_ALL = os.environ.get("GIT_LFS_PUSH_ALL", "0") == "1"
def track_lfs_patterns(patterns: Iterable[str]) -> None: # ----------------------------
# Small subprocess helpers
# ----------------------------
def run(cmd: list[str], check: bool = True, cwd: Optional[Path] = None) -> None:
subprocess.run(cmd, check=check, cwd=str(cwd) if cwd else None)
def run_capture(cmd: list[str], cwd: Optional[Path] = None) -> str:
out = subprocess.check_output(cmd, cwd=str(cwd) if cwd else None, stderr=subprocess.DEVNULL)
return out.decode().strip()
# ----------------------------
# Git / LFS utilities
# ----------------------------
def ensure_repo_root() -> Path:
""" """
Track a set of patterns in Git LFS. This is idempotent; it just Ensure we're in a git repo; install LFS filters locally; return repo root.
appends to .gitattributes as needed. """
try:
subprocess.run(
["git", "rev-parse", "--is-inside-work-tree"],
check=True,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
# Make sure LFS filters are active in this repo (idempotent)
subprocess.run(
["git", "lfs", "install", "--local"],
check=False,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
root = run_capture(["git", "rev-parse", "--show-toplevel"])
return Path(root)
except Exception:
print("⚠️ Not inside a Git repository? Git/LFS steps may fail.", file=sys.stderr)
return Path.cwd()
def repo_relative_path(repo_root: Path, p: Path) -> Path:
"""
Return a path to p relative to repo_root. Works even if p is not a strict subpath
(falls back to os.path.relpath).
"""
try:
return p.resolve().relative_to(repo_root.resolve())
except Exception:
# Fallback (handles symlinks / different mounts)
return Path(os.path.relpath(p.resolve(), repo_root.resolve()))
def lfs_track_patterns(patterns: Iterable[str]) -> None:
"""
Track a set of glob patterns in Git LFS (idempotent).
""" """
for patt in patterns: for patt in patterns:
try: try:
@@ -54,18 +119,111 @@ def track_lfs_patterns(patterns: Iterable[str]) -> None:
pass pass
def list_files_under(root: Path) -> list[Path]: def lfs_track_file(repo_root: Path, path_in_repo: Path) -> None:
return [p for p in root.rglob("*") if p.is_file()] """
Track an individual file in Git LFS using a repo-relative path.
"""
def ensure_repo_root() -> None: # Normalize to POSIX-like string for .gitattributes consistency
# best effort: warn (but dont die) if not in a git repo rel = str(path_in_repo.as_posix())
try: try:
subprocess.run(["git", "rev-parse", "--is-inside-work-tree"], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) run(["git", "lfs", "track", rel], check=False, cwd=repo_root)
except Exception: except Exception:
print("⚠️ Not inside a Git repository? Git/LFS steps may fail.", file=sys.stderr) pass
def git_status_has_changes(repo_root: Path) -> bool:
try:
status = run_capture(["git", "status", "--porcelain=v1"], cwd=repo_root)
return bool(status.strip())
except Exception:
return False
def git_stage_and_commit_push(
repo_root: Path,
scope_paths: list[Path],
commit_message: str,
lfs_push_all: bool = False,
) -> None:
"""
Stages .gitattributes + renormalizes only the provided scope paths (e.g., 'models/…'),
then commits, pushes, optionally pushes all LFS objects, and finally prunes LFS.
"""
# Stage .gitattributes explicitly (ignore failures)
try:
run(["git", "add", ".gitattributes"], check=False, cwd=repo_root)
except Exception:
pass
# Renormalize only the relevant directories to avoid sweeping the whole repo.
# If scope_paths is empty, fall back to full repo (conservative).
if scope_paths:
for sp in scope_paths:
rel = repo_relative_path(repo_root, sp)
run(["git", "add", "--renormalize", str(rel)], check=False, cwd=repo_root)
else:
run(["git", "add", "--renormalize", "."], check=False, cwd=repo_root)
# If nothing is staged, skip
staged_is_empty = False
try:
# 'git diff --cached --quiet' exits 0 when no staged changes
subprocess.run(["git", "diff", "--cached", "--quiet"], cwd=str(repo_root))
staged_is_empty = True
except Exception:
staged_is_empty = False
if staged_is_empty:
print("No staged changes after normalization. Skipping commit and push.")
return
print("Committing and pushing changes...")
run(["git", "commit", "-m", commit_message], cwd=repo_root)
# Push main refs
run(["git", "push"], cwd=repo_root)
# Optionally ensure all LFS objects are uploaded
if lfs_push_all:
try:
run(["git", "lfs", "push", "origin", "--all"], check=True, cwd=repo_root)
except subprocess.CalledProcessError as e:
print(f"⚠️ 'git lfs push --all' failed: {e}. Continuing.", file=sys.stderr)
# Prune local LFS to save disk
try:
run(["git", "lfs", "prune", "--force"], check=False, cwd=repo_root)
except Exception as e:
print(f"⚠️ 'git lfs prune' failed: {e}. Continuing.", file=sys.stderr)
print("✅ Cleanup complete.")
# ----------------------------
# Filesystem helpers
# ----------------------------
def list_files_under(root: Path) -> list[Path]:
"""
Recursively collect files under `root`, skipping housekeeping dirs.
"""
skip_dirs = {".git", ".cache", ".hf_mirror_cache"}
files: list[Path] = []
for p in root.rglob("*"):
if not p.is_file():
continue
rel_parts = p.relative_to(root).parts
# Skip files if any parent is a hidden/skip dir
if any(part in skip_dirs or part.startswith(".") for part in rel_parts[:-1]):
continue
files.append(p)
return files
# ----------------------------
# Main routine
# ----------------------------
def main() -> None: def main() -> None:
if len(sys.argv) != 2: if len(sys.argv) != 2:
print(f"Usage: {sys.argv[0]} <path-to-model.yaml>", file=sys.stderr) print(f"Usage: {sys.argv[0]} <path-to-model.yaml>", file=sys.stderr)
@@ -76,7 +234,7 @@ def main() -> None:
print(f"Model YAML not found: {model_yaml_path}", file=sys.stderr) print(f"Model YAML not found: {model_yaml_path}", file=sys.stderr)
sys.exit(1) sys.exit(1)
ensure_repo_root() repo_root = ensure_repo_root()
# Load YAML # Load YAML
with open(model_yaml_path, "r", encoding="utf-8") as f: with open(model_yaml_path, "r", encoding="utf-8") as f:
@@ -85,7 +243,7 @@ def main() -> None:
model_dir = model_yaml_path.parent model_dir = model_yaml_path.parent
# Proactively set up LFS tracking by patterns (idempotent) # Proactively set up LFS tracking by patterns (idempotent)
track_lfs_patterns(LFS_PATTERNS) lfs_track_patterns(LFS_PATTERNS)
# Iterate formats & variants # Iterate formats & variants
formats = (data.get("model") or {}).get("formats") or [] formats = (data.get("model") or {}).get("formats") or []
@@ -104,19 +262,14 @@ def main() -> None:
repo_id = hf_repo.replace("https://huggingface.co/", "") repo_id = hf_repo.replace("https://huggingface.co/", "")
print(f"\n[DL] Downloading/resuming variant '{variant_id}' from '{repo_id}' into '{dest_path}'") print(f"\n[DL] Downloading/resuming variant '{variant_id}' from '{repo_id}' into '{dest_path}'")
# Always call snapshot_download with resume enabled. This will: # Always call snapshot_download with resume enabled. Filter out .cache.
# - no-op for already-complete files
# - resume partials
# - fetch any missing files
try: try:
snapshot_download( snapshot_download(
repo_id=repo_id, repo_id=repo_id,
local_dir=str(dest_path), local_dir=str(dest_path),
local_dir_use_symlinks=False, local_dir_use_symlinks=False,
resume_download=True, # explicit resume_download=True, # explicit
# You can add allow_patterns / ignore_patterns if you want to filter ignore_patterns=[".cache/**"], # prevent housekeeping into tree
# allow_patterns=None,
# ignore_patterns=None,
) )
except Exception as e: except Exception as e:
print(f"❌ snapshot_download failed for {variant_id}: {e}", file=sys.stderr) print(f"❌ snapshot_download failed for {variant_id}: {e}", file=sys.stderr)
@@ -127,8 +280,8 @@ def main() -> None:
total_size_bytes = 0 total_size_bytes = 0
for p in list_files_under(dest_path): for p in list_files_under(dest_path):
rel = p.relative_to(model_dir) rel_to_model = p.relative_to(model_dir)
files_list.append(str(rel)) files_list.append(str(rel_to_model).replace("\\", "/"))
try: try:
size = p.stat().st_size size = p.stat().st_size
except FileNotFoundError: except FileNotFoundError:
@@ -138,8 +291,8 @@ def main() -> None:
# Fallback: ensure big files get tracked even if patterns miss them # Fallback: ensure big files get tracked even if patterns miss them
if size > SIZE_THRESHOLD_BYTES: if size > SIZE_THRESHOLD_BYTES:
# Idempotent; harmless if already tracked. rel_to_repo = repo_relative_path(repo_root, p)
run(["git", "lfs", "track", str(p)], check=False) lfs_track_file(repo_root, rel_to_repo)
files_list.sort() files_list.sort()
variant["files"] = files_list variant["files"] = files_list
@@ -150,15 +303,25 @@ def main() -> None:
yaml.dump(data, f, sort_keys=False, allow_unicode=True) yaml.dump(data, f, sort_keys=False, allow_unicode=True)
print(f"✅ Updated {model_yaml_path} for variant '{variant_id}'") print(f"✅ Updated {model_yaml_path} for variant '{variant_id}'")
# Run cleanup script to commit, push, and prune
commit_message = f"Add/update model files for {model_dir.name}/{variant_id}" # ---- Built-in cleanup (replaces cleanup.sh) ----
print(f"🧹 Running cleanup for {variant_id}...") print(f"🧹 Running cleanup for {variant_id}...")
try: try:
run(["./tools/cleanup.sh", commit_message], check=True) # Only scope renormalization to model_dir to keep things fast.
git_changed = git_status_has_changes(repo_root)
if not git_changed:
print("No new files or changes to commit. Skipping commit and push.")
else:
commit_message = f"Add/update model files for {model_dir.name}/{variant_id}"
git_stage_and_commit_push(
repo_root=repo_root,
scope_paths=[model_dir],
commit_message=commit_message,
lfs_push_all=LFS_PUSH_ALL,
)
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
print(f"cleanup.sh failed (continue to next variant): {e}", file=sys.stderr) print(f"Cleanup failed (continue to next variant): {e}", file=sys.stderr)
# Decide whether to continue or abort; continuing is usually fine. # Continue to next variant
# raise # uncomment to abort on failure
print(f"\n✅ Download and YAML update complete for {model_yaml_path}.") print(f"\n✅ Download and YAML update complete for {model_yaml_path}.")

View File

@@ -1,87 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail
# download.sh - Download model files and update model.yaml metadata.
#
# This script reads a model.yaml file, downloads the complete model data from
# the specified Hugging Face repository, and then updates the 'files' array
# in the YAML with the paths of the downloaded files.
#
# This approach is more robust than specifying files manually, as it ensures
# the YAML reflects the actual downloaded content.
#
# Usage: ./tools/download.sh models/llama-2-7b/model.yaml

# Print an error to stderr and abort.
die() { printf '%s\n' "$*" >&2; exit 1; }

if [[ "$#" -ne 1 ]]; then
  die "Usage: $0 <path-to-model.yaml>"
fi

MODEL_YAML="$1"
MODEL_DIR=$(dirname "$MODEL_YAML")

[[ -f "$MODEL_YAML" ]] || die "Model YAML not found: $MODEL_YAML"

# Ensure required tools are present ('command -v' is the portable check).
command -v yq >/dev/null 2>&1 \
  || die "Error: yq is not installed. Install it with: pip install yq or brew install yq"
command -v huggingface-cli >/dev/null 2>&1 \
  || die "Error: huggingface-cli is not installed. Install it with: pip install huggingface_hub"

printf 'Reading metadata from %s...\n' "$MODEL_YAML"

# Create a temporary file to store the updated YAML content
TMP_YAML=$(mktemp)
trap 'rm -f "$TMP_YAML"' EXIT
cp "$MODEL_YAML" "$TMP_YAML"

# Loop over each format and variant to download files.
# NOTE: the loop body runs in a pipeline subshell; that is fine here because
# all state is written to $TMP_YAML on disk, not to shell variables.
yq -r '.formats[] | . as $format | .variants[] | . as $variant | "\($format.type)\|\($variant.id)\|\($variant.hf_repo)"' "$MODEL_YAML" | while IFS='|' read -r format_type variant_id hf_repo; do
  printf '\nProcessing variant: %s (format: %s) from %s\n' "$variant_id" "$format_type" "$hf_repo"

  DEST_PATH="$MODEL_DIR/$variant_id"
  mkdir -p "$DEST_PATH"

  # Always invoke the downloader: huggingface-cli resumes partial downloads
  # and no-ops on complete files, so a half-fetched directory gets completed
  # instead of being skipped by a naive "directory is non-empty" check.
  repo_id=${hf_repo#https://huggingface.co/}
  printf '[DL] Downloading/resuming files for %s from %s...\n' "$variant_id" "$repo_id"
  huggingface-cli download "$repo_id" --local-dir "$DEST_PATH" --local-dir-use-symlinks False

  # List the downloaded files relative to the model directory.
  # Parameter expansion replaces GNU-only 'realpath --relative-to', which is
  # missing on macOS/BSD; LC_ALL=C sort makes the YAML file order deterministic.
  downloaded_files=()
  while IFS= read -r file; do
    downloaded_files+=("${file#"$MODEL_DIR"/}")
  done < <(find "$DEST_PATH" -type f | LC_ALL=C sort)

  # Update the YAML file with the list of downloaded files for the current variant
  printf 'Updating %s with downloaded file paths for %s...\n' "$MODEL_YAML" "$variant_id"

  # Reset the variant's files array, then append each path.
  # NOTE(review): paths containing '"' or '\' would break this interpolated
  # yq expression — acceptable for HF repo contents, but worth knowing.
  yq_exp="(.formats[] | select(.type == \"$format_type\") | .variants[] | select(.id == \"$variant_id\") | .files) = []"
  yq eval -i "$yq_exp" "$TMP_YAML"

  for file in "${downloaded_files[@]}"; do
    yq_exp="(.formats[] | select(.type == \"$format_type\") | .variants[] | select(.id == \"$variant_id\") | .files) += [\"$file\"]"
    yq eval -i "$yq_exp" "$TMP_YAML"
  done
done

# Replace the original YAML with the updated one
mv "$TMP_YAML" "$MODEL_YAML"
# The trap's 'rm -f' on the now-moved temp path is a harmless no-op.
printf '\n✅ Download and YAML update complete for %s.\n' "$MODEL_YAML"