Bring the cleanup work into the python script and drop the shell scripts
Some checks failed
Download Missing Models / download-models (push) Has been cancelled
.gitignore (vendored)
@@ -1 +1,2 @@
 .idea
+tmp
tools/cleanup.sh (deleted)
@@ -1,49 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-# cleanup.sh - Commits, pushes, and prunes LFS files.
-#
-# - Detects *untracked* files (git status --porcelain), so we don’t skip commits.
-# - Uses 'git add --renormalize .' so new/changed .gitattributes rules convert
-#   existing files into LFS pointers on re-add.
-# - Keeps the prune step to free local disk space after a successful push.
-#
-# Usage: ./tools/cleanup.sh <commit-message>
-
-if [ "$#" -ne 1 ]; then
-  echo "Usage: $0 <commit-message>" >&2
-  exit 1
-fi
-
-COMMIT_MESSAGE="$1"
-
-# Detect any changes, including untracked.
-if [[ -z "$(git status --porcelain=v1)" ]]; then
-  echo "No new files or changes to commit. Skipping commit and push."
-  exit 0
-fi
-
-echo "Committing and pushing changes..."
-
-# Make sure .gitattributes changes are included and normalization runs,
-# so LFS filters rewrite eligible files as pointers.
-git add .gitattributes || true
-git add --renormalize .
-
-# If nothing ended up staged (e.g. only ignored files changed), exit gracefully.
-if git diff --cached --quiet; then
-  echo "No staged changes after normalization. Skipping commit and push."
-  exit 0
-fi
-
-git commit -m "$COMMIT_MESSAGE"
-git push
-
-# Optional but useful: ensure all LFS objects are on the remote.
-# Uncomment if you want belt-and-suspenders uploads.
-# git lfs push origin --all
-
-echo "Pruning local LFS files..."
-git lfs prune --force
-
-echo "✅ Cleanup complete."
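For anyone who still wants to run the old flow by hand, the deleted script boiled down to the following sequence. This is a sketch, not the script itself; it assumes you are at the repo root with Git LFS installed, and <commit-message> is a placeholder.

# Manual equivalent of the removed cleanup.sh (sketch)
git add .gitattributes || true
git add --renormalize .          # re-run LFS filters so eligible files become pointers
if ! git diff --cached --quiet; then
  git commit -m "<commit-message>"
  git push
fi
git lfs prune --force            # free local disk once the push has succeeded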
tools/download.py
@@ -1,25 +1,36 @@
 #!/usr/bin/env python3
 """
-download.py - Download/repair model files and update model.yaml metadata.
+download.py - Download/repair model files, update model.yaml metadata,
+and commit/push changes with proper Git LFS handling (no bash script needed).
 
 Usage:
     ./tools/download.py models/llama-2-7b-chat/model.yaml
 
-- Always (re)runs snapshot_download with resume support, so partially
-  fetched directories get completed instead of being skipped.
+What this does:
+- (Re)runs snapshot_download with resume support, so partially fetched directories
+  get completed instead of being skipped.
+- Avoids adding Hugging Face housekeeping like ".cache/**" to your YAML.
 - Updates YAML after each variant with fresh file list + total size.
-- Tracks LFS via sensible patterns (plus a size threshold fallback).
-- Emits clear logs so you can see progress per variant.
+- Tracks LFS via sensible patterns (plus a size threshold fallback) using
+  repo-relative paths so it actually applies.
+- Runs a built-in cleanup step (commit, push, optional LFS push, and prune),
+  replacing the old cleanup.sh.
 """
 
-import sys
+from __future__ import annotations
 
 import os
+import sys
 import yaml
 import subprocess
 from pathlib import Path
-from typing import Iterable
+from typing import Iterable, List, Optional
 from huggingface_hub import snapshot_download
 
+# ----------------------------
+# Configuration
+# ----------------------------
 
 LFS_PATTERNS: list[str] = [
     # Extensions commonly used for model artifacts
     "*.safetensors",
@@ -37,14 +48,68 @@ LFS_PATTERNS: list[str] = [
 
 SIZE_THRESHOLD_BYTES = 1_000_000 # 1 MB fallback if a file doesn't match any pattern
 
-def run(cmd: list[str], check: bool = True) -> None:
-    subprocess.run(cmd, check=check)
+# By default we skip pushing all LFS objects (same as prior bash script).
+# Set env GIT_LFS_PUSH_ALL=1 to force a full "git lfs push origin --all".
+LFS_PUSH_ALL = os.environ.get("GIT_LFS_PUSH_ALL", "0") == "1"
 
 
-def track_lfs_patterns(patterns: Iterable[str]) -> None:
+# ----------------------------
+# Small subprocess helpers
+# ----------------------------
+
+def run(cmd: list[str], check: bool = True, cwd: Optional[Path] = None) -> None:
+    subprocess.run(cmd, check=check, cwd=str(cwd) if cwd else None)
+
+
+def run_capture(cmd: list[str], cwd: Optional[Path] = None) -> str:
+    out = subprocess.check_output(cmd, cwd=str(cwd) if cwd else None, stderr=subprocess.DEVNULL)
+    return out.decode().strip()
+
+
+# ----------------------------
+# Git / LFS utilities
+# ----------------------------
+
+def ensure_repo_root() -> Path:
     """
-    Track a set of patterns in Git LFS. This is idempotent; it just
-    appends to .gitattributes as needed.
+    Ensure we're in a git repo; install LFS filters locally; return repo root.
     """
+    try:
+        subprocess.run(
+            ["git", "rev-parse", "--is-inside-work-tree"],
+            check=True,
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+        )
+        # Make sure LFS filters are active in this repo (idempotent)
+        subprocess.run(
+            ["git", "lfs", "install", "--local"],
+            check=False,
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+        )
+        root = run_capture(["git", "rev-parse", "--show-toplevel"])
+        return Path(root)
+    except Exception:
+        print("⚠️ Not inside a Git repository? Git/LFS steps may fail.", file=sys.stderr)
+        return Path.cwd()
+
+
+def repo_relative_path(repo_root: Path, p: Path) -> Path:
+    """
+    Return a path to p relative to repo_root. Works even if p is not a strict subpath
+    (falls back to os.path.relpath).
+    """
+    try:
+        return p.resolve().relative_to(repo_root.resolve())
+    except Exception:
+        # Fallback (handles symlinks / different mounts)
+        return Path(os.path.relpath(p.resolve(), repo_root.resolve()))
+
+
+def lfs_track_patterns(patterns: Iterable[str]) -> None:
+    """
+    Track a set of glob patterns in Git LFS (idempotent).
+    """
     for patt in patterns:
         try:
@@ -54,18 +119,111 @@ def track_lfs_patterns(patterns: Iterable[str]) -> None:
             pass
 
 
-def list_files_under(root: Path) -> list[Path]:
-    return [p for p in root.rglob("*") if p.is_file()]
+def lfs_track_file(repo_root: Path, path_in_repo: Path) -> None:
+    """
+    Track an individual file in Git LFS using a repo-relative path.
+    """
+    # Normalize to POSIX-like string for .gitattributes consistency
+    rel = str(path_in_repo.as_posix())
+    try:
+        run(["git", "lfs", "track", rel], check=False, cwd=repo_root)
+    except Exception:
+        pass
 
 
-def ensure_repo_root() -> None:
-    # best effort: warn (but don’t die) if not in a git repo
-    try:
-        subprocess.run(["git", "rev-parse", "--is-inside-work-tree"], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
-    except Exception:
-        print("⚠️ Not inside a Git repository? Git/LFS steps may fail.", file=sys.stderr)
+def git_status_has_changes(repo_root: Path) -> bool:
+    try:
+        status = run_capture(["git", "status", "--porcelain=v1"], cwd=repo_root)
+        return bool(status.strip())
+    except Exception:
+        return False
+
+
+def git_stage_and_commit_push(
+    repo_root: Path,
+    scope_paths: list[Path],
+    commit_message: str,
+    lfs_push_all: bool = False,
+) -> None:
+    """
+    Stages .gitattributes + renormalizes only the provided scope paths (e.g., 'models/…'),
+    then commits, pushes, optionally pushes all LFS objects, and finally prunes LFS.
+    """
+    # Stage .gitattributes explicitly (ignore failures)
+    try:
+        run(["git", "add", ".gitattributes"], check=False, cwd=repo_root)
+    except Exception:
+        pass
+
+    # Renormalize only the relevant directories to avoid sweeping the whole repo.
+    # If scope_paths is empty, fall back to full repo (conservative).
+    if scope_paths:
+        for sp in scope_paths:
+            rel = repo_relative_path(repo_root, sp)
+            run(["git", "add", "--renormalize", str(rel)], check=False, cwd=repo_root)
+    else:
+        run(["git", "add", "--renormalize", "."], check=False, cwd=repo_root)
+
+    # If nothing is staged, skip
+    staged_is_empty = False
+    try:
+        # 'git diff --cached --quiet' exits 0 when no staged changes
+        subprocess.run(["git", "diff", "--cached", "--quiet"], cwd=str(repo_root))
+        staged_is_empty = True
+    except Exception:
+        staged_is_empty = False
+
+    if staged_is_empty:
+        print("No staged changes after normalization. Skipping commit and push.")
+        return
+
+    print("Committing and pushing changes...")
+    run(["git", "commit", "-m", commit_message], cwd=repo_root)
+
+    # Push main refs
+    run(["git", "push"], cwd=repo_root)
+
+    # Optionally ensure all LFS objects are uploaded
+    if lfs_push_all:
+        try:
+            run(["git", "lfs", "push", "origin", "--all"], check=True, cwd=repo_root)
+        except subprocess.CalledProcessError as e:
+            print(f"⚠️ 'git lfs push --all' failed: {e}. Continuing.", file=sys.stderr)
+
+    # Prune local LFS to save disk
+    try:
+        run(["git", "lfs", "prune", "--force"], check=False, cwd=repo_root)
+    except Exception as e:
+        print(f"⚠️ 'git lfs prune' failed: {e}. Continuing.", file=sys.stderr)
+
+    print("✅ Cleanup complete.")
+
+
+# ----------------------------
+# Filesystem helpers
+# ----------------------------
+
+def list_files_under(root: Path) -> list[Path]:
+    """
+    Recursively collect files under `root`, skipping housekeeping dirs.
+    """
+    skip_dirs = {".git", ".cache", ".hf_mirror_cache"}
+    files: list[Path] = []
+    for p in root.rglob("*"):
+        if not p.is_file():
+            continue
+        rel_parts = p.relative_to(root).parts
+        # Skip files if any parent is a hidden/skip dir
+        if any(part in skip_dirs or part.startswith(".") for part in rel_parts[:-1]):
+            continue
+        files.append(p)
+    return files
+
+
+# ----------------------------
+# Main routine
+# ----------------------------
 
 def main() -> None:
     if len(sys.argv) != 2:
         print(f"Usage: {sys.argv[0]} <path-to-model.yaml>", file=sys.stderr)
@@ -76,7 +234,7 @@ def main() -> None:
         print(f"Model YAML not found: {model_yaml_path}", file=sys.stderr)
         sys.exit(1)
 
-    ensure_repo_root()
+    repo_root = ensure_repo_root()
 
     # Load YAML
     with open(model_yaml_path, "r", encoding="utf-8") as f:
@@ -85,7 +243,7 @@ def main() -> None:
     model_dir = model_yaml_path.parent
 
     # Proactively set up LFS tracking by patterns (idempotent)
-    track_lfs_patterns(LFS_PATTERNS)
+    lfs_track_patterns(LFS_PATTERNS)
 
     # Iterate formats & variants
    formats = (data.get("model") or {}).get("formats") or []
@@ -104,19 +262,14 @@ def main() -> None:
             repo_id = hf_repo.replace("https://huggingface.co/", "")
             print(f"\n[DL] Downloading/resuming variant '{variant_id}' from '{repo_id}' into '{dest_path}'")
 
-            # Always call snapshot_download with resume enabled. This will:
-            # - no-op for already-complete files
-            # - resume partials
-            # - fetch any missing files
+            # Always call snapshot_download with resume enabled. Filter out .cache.
            try:
                 snapshot_download(
                     repo_id=repo_id,
                     local_dir=str(dest_path),
                     local_dir_use_symlinks=False,
                     resume_download=True, # explicit
-                    # You can add allow_patterns / ignore_patterns if you want to filter
-                    # allow_patterns=None,
-                    # ignore_patterns=None,
+                    ignore_patterns=[".cache/**"], # prevent housekeeping into tree
                 )
             except Exception as e:
                 print(f"❌ snapshot_download failed for {variant_id}: {e}", file=sys.stderr)
@@ -127,8 +280,8 @@ def main() -> None:
             total_size_bytes = 0
 
             for p in list_files_under(dest_path):
-                rel = p.relative_to(model_dir)
-                files_list.append(str(rel))
+                rel_to_model = p.relative_to(model_dir)
+                files_list.append(str(rel_to_model).replace("\\", "/"))
                 try:
                     size = p.stat().st_size
                 except FileNotFoundError:
@@ -138,8 +291,8 @@ def main() -> None:
 
                 # Fallback: ensure big files get tracked even if patterns miss them
                 if size > SIZE_THRESHOLD_BYTES:
-                    # Idempotent; harmless if already tracked.
-                    run(["git", "lfs", "track", str(p)], check=False)
+                    rel_to_repo = repo_relative_path(repo_root, p)
+                    lfs_track_file(repo_root, rel_to_repo)
 
             files_list.sort()
             variant["files"] = files_list
@@ -150,15 +303,25 @@ def main() -> None:
                 yaml.dump(data, f, sort_keys=False, allow_unicode=True)
 
             print(f"✅ Updated {model_yaml_path} for variant '{variant_id}'")
-            # Run cleanup script to commit, push, and prune
-            commit_message = f"Add/update model files for {model_dir.name}/{variant_id}"
+            # ---- Built-in cleanup (replaces cleanup.sh) ----
             print(f"🧹 Running cleanup for {variant_id}...")
             try:
-                run(["./tools/cleanup.sh", commit_message], check=True)
+                # Only scope renormalization to model_dir to keep things fast.
+                git_changed = git_status_has_changes(repo_root)
+                if not git_changed:
+                    print("No new files or changes to commit. Skipping commit and push.")
+                else:
+                    commit_message = f"Add/update model files for {model_dir.name}/{variant_id}"
+                    git_stage_and_commit_push(
+                        repo_root=repo_root,
+                        scope_paths=[model_dir],
+                        commit_message=commit_message,
+                        lfs_push_all=LFS_PUSH_ALL,
+                    )
             except subprocess.CalledProcessError as e:
-                print(f"❌ cleanup.sh failed (continue to next variant): {e}", file=sys.stderr)
-                # Decide whether to continue or abort; continuing is usually fine.
-                # raise # uncomment to abort on failure
+                print(f"❌ Cleanup failed (continue to next variant): {e}", file=sys.stderr)
+                # Continue to next variant
 
     print(f"\n✅ Download and YAML update complete for {model_yaml_path}.")
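With the cleanup folded in, a single invocation now downloads/resumes, rewrites the YAML, and commits/pushes per variant. A usage sketch based on the docstring and the GIT_LFS_PUSH_ALL switch added above; the model path is just the example from the docstring:

# Normal run: download/resume, update model.yaml, commit, push, prune
./tools/download.py models/llama-2-7b-chat/model.yaml

# Belt-and-suspenders run: also push every local LFS object before pruning
GIT_LFS_PUSH_ALL=1 ./tools/download.py models/llama-2-7b-chat/model.yaml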
tools/download.sh (deleted)
@@ -1,87 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-# download.sh - Download model files and update model.yaml metadata.
-#
-# This script reads a model.yaml file, downloads the complete model data from
-# the specified Hugging Face repository, and then updates the 'files' array
-# in the YAML with the paths of the downloaded files.
-#
-# This approach is more robust than specifying files manually, as it ensures
-# the YAML reflects the actual downloaded content.
-#
-# Usage: ./tools/download.sh models/llama-2-7b/model.yaml
-
-if [ "$#" -ne 1 ]; then
-  echo "Usage: $0 <path-to-model.yaml>" >&2
-  exit 1
-fi
-
-MODEL_YAML="$1"
-MODEL_DIR=$(dirname "$MODEL_YAML")
-
-if [ ! -f "$MODEL_YAML" ]; then
-  echo "Model YAML not found: $MODEL_YAML" >&2
-  exit 1
-fi
-
-# Ensure yq is installed
-if ! command -v yq &> /dev/null; then
-  echo "Error: yq is not installed. Install it with: pip install yq or brew install yq" >&2
-  exit 1
-fi
-
-# Ensure huggingface-cli is installed
-if ! command -v huggingface-cli &> /dev/null; then
-  echo "Error: huggingface-cli is not installed. Install it with: pip install huggingface_hub" >&2
-  exit 1
-fi
-
-echo "Reading metadata from $MODEL_YAML..."
-
-# Create a temporary file to store the updated YAML content
-TMP_YAML=$(mktemp)
-trap 'rm -f "$TMP_YAML"' EXIT
-
-cp "$MODEL_YAML" "$TMP_YAML"
-
-# Loop over each format and variant to download files
-yq -r '.formats[] | . as $format | .variants[] | . as $variant | "\($format.type)|\($variant.id)|\($variant.hf_repo)"' "$MODEL_YAML" | while IFS='|' read -r format_type variant_id hf_repo; do
-  echo
-  echo "Processing variant: $variant_id (format: $format_type) from $hf_repo"
-
-  DEST_PATH="$MODEL_DIR/$variant_id"
-  mkdir -p "$DEST_PATH"
-
-  # Check if files are already downloaded by checking for a non-empty directory
-  if [ -n "$(ls -A "$DEST_PATH" 2>/dev/null)" ]; then
-    echo "[OK] Files for $variant_id already exist in $DEST_PATH. Skipping download."
-  else
-    repo_id=${hf_repo#https://huggingface.co/}
-    echo "[DL] Downloading files for $variant_id from $repo_id..."
-    huggingface-cli download "$repo_id" --local-dir "$DEST_PATH" --local-dir-use-symlinks False
-  fi
-
-  # After downloading, list the downloaded files relative to the model directory
-  downloaded_files=()
-  while IFS= read -r file; do
-    downloaded_files+=("$(realpath --relative-to="$MODEL_DIR" "$file")")
-  done < <(find "$DEST_PATH" -type f)
-
-  # Update the YAML file with the list of downloaded files for the current variant
-  echo "Updating $MODEL_YAML with downloaded file paths for $variant_id..."
-  # Create a yq expression to update the files for the specific variant
-  yq_exp="(.formats[] | select(.type == \"$format_type\") | .variants[] | select(.id == \"$variant_id\") | .files) = []"
-  yq eval -i "$yq_exp" "$TMP_YAML"
-
-  for file in "${downloaded_files[@]}"; do
-    yq_exp="(.formats[] | select(.type == \"$format_type\") | .variants[] | select(.id == \"$variant_id\") | .files) += [\"$file\"]"
-    yq eval -i "$yq_exp" "$TMP_YAML"
-  done
-done
-
-# Replace the original YAML with the updated one
-mv "$TMP_YAML" "$MODEL_YAML"
-
-echo
-echo "✅ Download and YAML update complete for $MODEL_YAML."