diff --git a/.gitignore b/.gitignore index 485dee6..17952c7 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ .idea +tmp diff --git a/tools/cleanup.sh b/tools/cleanup.sh deleted file mode 100755 index 02cdce6..0000000 --- a/tools/cleanup.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -# cleanup.sh - Commits, pushes, and prunes LFS files. -# -# - Detects *untracked* files (git status --porcelain), so we don’t skip commits. -# - Uses 'git add --renormalize .' so new/changed .gitattributes rules convert -# existing files into LFS pointers on re-add. -# - Keeps the prune step to free local disk space after a successful push. -# -# Usage: ./tools/cleanup.sh - -if [ "$#" -ne 1 ]; then - echo "Usage: $0 " >&2 - exit 1 -fi - -COMMIT_MESSAGE="$1" - -# Detect any changes, including untracked. -if [[ -z "$(git status --porcelain=v1)" ]]; then - echo "No new files or changes to commit. Skipping commit and push." - exit 0 -fi - -echo "Committing and pushing changes..." - -# Make sure .gitattributes changes are included and normalization runs, -# so LFS filters rewrite eligible files as pointers. -git add .gitattributes || true -git add --renormalize . - -# If nothing ended up staged (e.g. only ignored files changed), exit gracefully. -if git diff --cached --quiet; then - echo "No staged changes after normalization. Skipping commit and push." - exit 0 -fi - -git commit -m "$COMMIT_MESSAGE" -git push - -# Optional but useful: ensure all LFS objects are on the remote. -# Uncomment if you want belt-and-suspenders uploads. -# git lfs push origin --all - -echo "Pruning local LFS files..." -git lfs prune --force - -echo "✅ Cleanup complete." diff --git a/tools/download.py b/tools/download.py index b948b3d..55847c5 100644 --- a/tools/download.py +++ b/tools/download.py @@ -1,25 +1,36 @@ #!/usr/bin/env python3 """ -download.py - Download/repair model files and update model.yaml metadata. 
+download.py - Download/repair model files, update model.yaml metadata, +and commit/push changes with proper Git LFS handling (no bash script needed). Usage: ./tools/download.py models/llama-2-7b-chat/model.yaml -- Always (re)runs snapshot_download with resume support, so partially - fetched directories get completed instead of being skipped. +What this does: +- (Re)runs snapshot_download with resume support, so partially fetched directories + get completed instead of being skipped. +- Avoids adding Hugging Face housekeeping like ".cache/**" to your YAML. - Updates YAML after each variant with fresh file list + total size. -- Tracks LFS via sensible patterns (plus a size threshold fallback). -- Emits clear logs so you can see progress per variant. +- Tracks LFS via sensible patterns (plus a size threshold fallback) using + repo-relative paths so it actually applies. +- Runs a built-in cleanup step (commit, push, optional LFS push, and prune), + replacing the old cleanup.sh. """ -import sys +from __future__ import annotations + import os +import sys import yaml import subprocess from pathlib import Path -from typing import Iterable +from typing import Iterable, List, Optional from huggingface_hub import snapshot_download +# ---------------------------- +# Configuration +# ---------------------------- + LFS_PATTERNS: list[str] = [ # Extensions commonly used for model artifacts "*.safetensors", @@ -37,14 +48,68 @@ LFS_PATTERNS: list[str] = [ SIZE_THRESHOLD_BYTES = 1_000_000 # 1 MB fallback if a file doesn't match any pattern -def run(cmd: list[str], check: bool = True) -> None: - subprocess.run(cmd, check=check) +# By default we skip pushing all LFS objects (same as prior bash script). +# Set env GIT_LFS_PUSH_ALL=1 to force a full "git lfs push origin --all". 
+LFS_PUSH_ALL = os.environ.get("GIT_LFS_PUSH_ALL", "0") == "1" -def track_lfs_patterns(patterns: Iterable[str]) -> None: +# ---------------------------- +# Small subprocess helpers +# ---------------------------- + +def run(cmd: list[str], check: bool = True, cwd: Optional[Path] = None) -> None: + subprocess.run(cmd, check=check, cwd=str(cwd) if cwd else None) + + +def run_capture(cmd: list[str], cwd: Optional[Path] = None) -> str: + out = subprocess.check_output(cmd, cwd=str(cwd) if cwd else None, stderr=subprocess.DEVNULL) + return out.decode().strip() + + +# ---------------------------- +# Git / LFS utilities +# ---------------------------- + +def ensure_repo_root() -> Path: """ - Track a set of patterns in Git LFS. This is idempotent; it just - appends to .gitattributes as needed. + Ensure we're in a git repo; install LFS filters locally; return repo root. + """ + try: + subprocess.run( + ["git", "rev-parse", "--is-inside-work-tree"], + check=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + # Make sure LFS filters are active in this repo (idempotent) + subprocess.run( + ["git", "lfs", "install", "--local"], + check=False, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + root = run_capture(["git", "rev-parse", "--show-toplevel"]) + return Path(root) + except Exception: + print("⚠️ Not inside a Git repository? Git/LFS steps may fail.", file=sys.stderr) + return Path.cwd() + + +def repo_relative_path(repo_root: Path, p: Path) -> Path: + """ + Return a path to p relative to repo_root. Works even if p is not a strict subpath + (falls back to os.path.relpath). + """ + try: + return p.resolve().relative_to(repo_root.resolve()) + except Exception: + # Fallback (handles symlinks / different mounts) + return Path(os.path.relpath(p.resolve(), repo_root.resolve())) + + +def lfs_track_patterns(patterns: Iterable[str]) -> None: + """ + Track a set of glob patterns in Git LFS (idempotent). 
""" for patt in patterns: try: @@ -54,18 +119,111 @@ def track_lfs_patterns(patterns: Iterable[str]) -> None: pass -def list_files_under(root: Path) -> list[Path]: - return [p for p in root.rglob("*") if p.is_file()] - - -def ensure_repo_root() -> None: - # best effort: warn (but don’t die) if not in a git repo +def lfs_track_file(repo_root: Path, path_in_repo: Path) -> None: + """ + Track an individual file in Git LFS using a repo-relative path. + """ + # Normalize to POSIX-like string for .gitattributes consistency + rel = str(path_in_repo.as_posix()) try: - subprocess.run(["git", "rev-parse", "--is-inside-work-tree"], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + run(["git", "lfs", "track", rel], check=False, cwd=repo_root) except Exception: - print("⚠️ Not inside a Git repository? Git/LFS steps may fail.", file=sys.stderr) + pass +def git_status_has_changes(repo_root: Path) -> bool: + try: + status = run_capture(["git", "status", "--porcelain=v1"], cwd=repo_root) + return bool(status.strip()) + except Exception: + return False + + +def git_stage_and_commit_push( + repo_root: Path, + scope_paths: list[Path], + commit_message: str, + lfs_push_all: bool = False, +) -> None: + """ + Stages .gitattributes + renormalizes only the provided scope paths (e.g., 'models/…'), + then commits, pushes, optionally pushes all LFS objects, and finally prunes LFS. + """ + # Stage .gitattributes explicitly (ignore failures) + try: + run(["git", "add", ".gitattributes"], check=False, cwd=repo_root) + except Exception: + pass + + # Renormalize only the relevant directories to avoid sweeping the whole repo. + # If scope_paths is empty, fall back to full repo (conservative). 
+    if scope_paths: +        for sp in scope_paths: +            rel = repo_relative_path(repo_root, sp) +            run(["git", "add", "--renormalize", str(rel)], check=False, cwd=repo_root) +    else: +        run(["git", "add", "--renormalize", "."], check=False, cwd=repo_root) + +    # If nothing is staged, skip +    staged_is_empty = False +    try: +        # 'git diff --cached --quiet' exits 0 only when nothing is staged +        result = subprocess.run(["git", "diff", "--cached", "--quiet"], cwd=str(repo_root)) +        staged_is_empty = (result.returncode == 0) +    except Exception: +        staged_is_empty = False + +    if staged_is_empty: +        print("No staged changes after normalization. Skipping commit and push.") +        return + +    print("Committing and pushing changes...") +    run(["git", "commit", "-m", commit_message], cwd=repo_root) + +    # Push main refs +    run(["git", "push"], cwd=repo_root) + +    # Optionally ensure all LFS objects are uploaded +    if lfs_push_all: +        try: +            run(["git", "lfs", "push", "origin", "--all"], check=True, cwd=repo_root) +        except subprocess.CalledProcessError as e: +            print(f"⚠️ 'git lfs push --all' failed: {e}. Continuing.", file=sys.stderr) + +    # Prune local LFS to save disk +    try: +        run(["git", "lfs", "prune", "--force"], check=False, cwd=repo_root) +    except Exception as e: +        print(f"⚠️ 'git lfs prune' failed: {e}. Continuing.", file=sys.stderr) + +    print("✅ Cleanup complete.") + + +# ---------------------------- +# Filesystem helpers +# ---------------------------- + +def list_files_under(root: Path) -> list[Path]: +    """ +    Recursively collect files under `root`, skipping housekeeping dirs. 
+ """ + skip_dirs = {".git", ".cache", ".hf_mirror_cache"} + files: list[Path] = [] + for p in root.rglob("*"): + if not p.is_file(): + continue + rel_parts = p.relative_to(root).parts + # Skip files if any parent is a hidden/skip dir + if any(part in skip_dirs or part.startswith(".") for part in rel_parts[:-1]): + continue + files.append(p) + return files + + +# ---------------------------- +# Main routine +# ---------------------------- + def main() -> None: if len(sys.argv) != 2: print(f"Usage: {sys.argv[0]} ", file=sys.stderr) @@ -76,7 +234,7 @@ def main() -> None: print(f"Model YAML not found: {model_yaml_path}", file=sys.stderr) sys.exit(1) - ensure_repo_root() + repo_root = ensure_repo_root() # Load YAML with open(model_yaml_path, "r", encoding="utf-8") as f: @@ -85,7 +243,7 @@ def main() -> None: model_dir = model_yaml_path.parent # Proactively set up LFS tracking by patterns (idempotent) - track_lfs_patterns(LFS_PATTERNS) + lfs_track_patterns(LFS_PATTERNS) # Iterate formats & variants formats = (data.get("model") or {}).get("formats") or [] @@ -104,19 +262,14 @@ def main() -> None: repo_id = hf_repo.replace("https://huggingface.co/", "") print(f"\n[DL] Downloading/resuming variant '{variant_id}' from '{repo_id}' into '{dest_path}'") - # Always call snapshot_download with resume enabled. This will: - # - no-op for already-complete files - # - resume partials - # - fetch any missing files + # Always call snapshot_download with resume enabled. Filter out .cache. 
try: snapshot_download( repo_id=repo_id, local_dir=str(dest_path), local_dir_use_symlinks=False, resume_download=True, # explicit - # You can add allow_patterns / ignore_patterns if you want to filter - # allow_patterns=None, - # ignore_patterns=None, + ignore_patterns=[".cache/**"], # prevent housekeeping into tree ) except Exception as e: print(f"❌ snapshot_download failed for {variant_id}: {e}", file=sys.stderr) @@ -127,8 +280,8 @@ def main() -> None: total_size_bytes = 0 for p in list_files_under(dest_path): - rel = p.relative_to(model_dir) - files_list.append(str(rel)) + rel_to_model = p.relative_to(model_dir) + files_list.append(str(rel_to_model).replace("\\", "/")) try: size = p.stat().st_size except FileNotFoundError: @@ -138,8 +291,8 @@ def main() -> None: # Fallback: ensure big files get tracked even if patterns miss them if size > SIZE_THRESHOLD_BYTES: - # Idempotent; harmless if already tracked. - run(["git", "lfs", "track", str(p)], check=False) + rel_to_repo = repo_relative_path(repo_root, p) + lfs_track_file(repo_root, rel_to_repo) files_list.sort() variant["files"] = files_list @@ -150,15 +303,25 @@ def main() -> None: yaml.dump(data, f, sort_keys=False, allow_unicode=True) print(f"✅ Updated {model_yaml_path} for variant '{variant_id}'") - # Run cleanup script to commit, push, and prune - commit_message = f"Add/update model files for {model_dir.name}/{variant_id}" + + # ---- Built-in cleanup (replaces cleanup.sh) ---- print(f"🧹 Running cleanup for {variant_id}...") try: - run(["./tools/cleanup.sh", commit_message], check=True) + # Only scope renormalization to model_dir to keep things fast. + git_changed = git_status_has_changes(repo_root) + if not git_changed: + print("No new files or changes to commit. 
Skipping commit and push.") + else: + commit_message = f"Add/update model files for {model_dir.name}/{variant_id}" + git_stage_and_commit_push( + repo_root=repo_root, + scope_paths=[model_dir], + commit_message=commit_message, + lfs_push_all=LFS_PUSH_ALL, + ) except subprocess.CalledProcessError as e: - print(f"❌ cleanup.sh failed (continue to next variant): {e}", file=sys.stderr) - # Decide whether to continue or abort; continuing is usually fine. - # raise # uncomment to abort on failure + print(f"❌ Cleanup failed (continue to next variant): {e}", file=sys.stderr) + # Continue to next variant print(f"\n✅ Download and YAML update complete for {model_yaml_path}.") diff --git a/tools/download.sh b/tools/download.sh deleted file mode 100755 index 3c074d8..0000000 --- a/tools/download.sh +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -# download.sh - Download model files and update model.yaml metadata. -# -# This script reads a model.yaml file, downloads the complete model data from -# the specified Hugging Face repository, and then updates the 'files' array -# in the YAML with the paths of the downloaded files. -# -# This approach is more robust than specifying files manually, as it ensures -# the YAML reflects the actual downloaded content. -# -# Usage: ./tools/download.sh models/llama-2-7b/model.yaml - -if [ "$#" -ne 1 ]; then - echo "Usage: $0 " >&2 - exit 1 -fi - -MODEL_YAML="$1" -MODEL_DIR=$(dirname "$MODEL_YAML") - -if [ ! -f "$MODEL_YAML" ]; then - echo "Model YAML not found: $MODEL_YAML" >&2 - exit 1 -fi - -# Ensure yq is installed -if ! command -v yq &> /dev/null; then - echo "Error: yq is not installed. Install it with: pip install yq or brew install yq" >&2 - exit 1 -fi - -# Ensure huggingface-cli is installed -if ! command -v huggingface-cli &> /dev/null; then - echo "Error: huggingface-cli is not installed. Install it with: pip install huggingface_hub" >&2 - exit 1 -fi - -echo "Reading metadata from $MODEL_YAML..." 
- -# Create a temporary file to store the updated YAML content -TMP_YAML=$(mktemp) -trap 'rm -f "$TMP_YAML"' EXIT - -cp "$MODEL_YAML" "$TMP_YAML" - -# Loop over each format and variant to download files -yq -r '.formats[] | . as $format | .variants[] | . as $variant | "\($format.type)\|\($variant.id)\|\($variant.hf_repo)"' "$MODEL_YAML" | while IFS='|' read -r format_type variant_id hf_repo; do - echo - echo "Processing variant: $variant_id (format: $format_type) from $hf_repo" - - DEST_PATH="$MODEL_DIR/$variant_id" - mkdir -p "$DEST_PATH" - - # Check if files are already downloaded by checking for a non-empty directory - if [ -n "$(ls -A "$DEST_PATH" 2>/dev/null)" ]; then - echo "[OK] Files for $variant_id already exist in $DEST_PATH. Skipping download." - else - repo_id=${hf_repo#https://huggingface.co/} - echo "[DL] Downloading files for $variant_id from $repo_id..." - huggingface-cli download "$repo_id" --local-dir "$DEST_PATH" --local-dir-use-symlinks False - fi - - # After downloading, list the downloaded files relative to the model directory - downloaded_files=() - while IFS= read -r file; do - downloaded_files+=("$(realpath --relative-to="$MODEL_DIR" "$file")") - done < <(find "$DEST_PATH" -type f) - - # Update the YAML file with the list of downloaded files for the current variant - echo "Updating $MODEL_YAML with downloaded file paths for $variant_id..." - # Create a yq expression to update the files for the specific variant - yq_exp="(.formats[] | select(.type == \"$format_type\") | .variants[] | select(.id == \"$variant_id\") | .files) = []" - yq eval -i "$yq_exp" "$TMP_YAML" - - for file in "${downloaded_files[@]}"; do - yq_exp="(.formats[] | select(.type == \"$format_type\") | .variants[] | select(.id == \"$variant_id\") | .files) += [\"$file\"]" - yq eval -i "$yq_exp" "$TMP_YAML" - done -done - -# Replace the original YAML with the updated one -mv "$TMP_YAML" "$MODEL_YAML" - -echo -echo "✅ Download and YAML update complete for $MODEL_YAML."