Setup repo with Phi 3

zcourts
2025-09-27 18:26:34 +01:00
commit 0c748f1497
16 changed files with 1122 additions and 0 deletions

tools/README.md Normal file

@@ -0,0 +1,5 @@
# Setup
```
pip install pyyaml huggingface_hub
```
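
These tools assume each model lives under `models/<model-name>/` with a `model.yaml` describing it. A typical workflow might look like this (a sketch; the `phi-3-mini` path is only an illustrative placeholder):

```
# Fetch (or resume) every variant listed in the YAML, track large files with LFS, commit and push
./tools/download.py models/phi-3-mini/model.yaml

# Verify downloaded files against any checksums recorded in the YAML
./tools/verify-checksums.py models/phi-3-mini/model.yaml
```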

tools/cleanup.sh Executable file

@@ -0,0 +1,49 @@
#!/usr/bin/env bash
set -euo pipefail

# cleanup.sh - Commits, pushes, and prunes LFS files.
#
# - Detects *untracked* files (git status --porcelain), so we don't skip commits.
# - Uses 'git add --renormalize .' so new/changed .gitattributes rules convert
#   existing files into LFS pointers on re-add.
# - Keeps the prune step to free local disk space after a successful push.
#
# Usage: ./tools/cleanup.sh <commit-message>

if [ "$#" -ne 1 ]; then
  echo "Usage: $0 <commit-message>" >&2
  exit 1
fi

COMMIT_MESSAGE="$1"

# Detect any changes, including untracked files.
if [[ -z "$(git status --porcelain=v1)" ]]; then
  echo "No new files or changes to commit. Skipping commit and push."
  exit 0
fi

echo "Committing and pushing changes..."

# Make sure .gitattributes changes are included and normalization runs,
# so LFS filters rewrite eligible files as pointers.
git add .gitattributes || true
git add --renormalize .

# If nothing ended up staged (e.g. only ignored files changed), exit gracefully.
if git diff --cached --quiet; then
  echo "No staged changes after normalization. Skipping commit and push."
  exit 0
fi

git commit -m "$COMMIT_MESSAGE"
git push

# Optional but useful: ensure all LFS objects are on the remote.
# Uncomment if you want belt-and-suspenders uploads.
# git lfs push origin --all

echo "Pruning local LFS files..."
git lfs prune --force

echo "✅ Cleanup complete."

tools/download.py Normal file

@@ -0,0 +1,167 @@
#!/usr/bin/env python3
"""
download.py - Download/repair model files and update model.yaml metadata.

Usage:
    ./tools/download.py models/llama-2-7b-chat/model.yaml

- Always (re)runs snapshot_download with resume support, so partially
  fetched directories get completed instead of being skipped.
- Updates the YAML after each variant with a fresh file list + total size.
- Tracks LFS via sensible patterns (plus a size-threshold fallback).
- Emits clear logs so you can see progress per variant.
"""
import sys
import os
import yaml
import subprocess
from pathlib import Path
from typing import Iterable

from huggingface_hub import snapshot_download

LFS_PATTERNS: list[str] = [
    # Extensions commonly used for model artifacts
    "*.safetensors",
    "*.bin",
    "*.pt",
    "*.gguf",
    "*.onnx",
    "*.ckpt",
    "*.tensors",
    "*.npz",
    "*.tar",
    "*.tar.gz",
    "*.zip",
]

SIZE_THRESHOLD_BYTES = 1_000_000  # 1 MB fallback if a file doesn't match any pattern


def run(cmd: list[str], check: bool = True) -> None:
    subprocess.run(cmd, check=check)


def track_lfs_patterns(patterns: Iterable[str]) -> None:
    """
    Track a set of patterns in Git LFS. This is idempotent; it just
    appends to .gitattributes as needed.
    """
    for patt in patterns:
        try:
            run(["git", "lfs", "track", patt], check=False)
        except Exception:
            # Non-fatal: we'll still fall back to the per-file size rule below.
            pass


def list_files_under(root: Path) -> list[Path]:
    return [p for p in root.rglob("*") if p.is_file()]


def ensure_repo_root() -> None:
    # Best effort: warn (but don't die) if not in a git repo.
    try:
        subprocess.run(
            ["git", "rev-parse", "--is-inside-work-tree"],
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except Exception:
        print("⚠️ Not inside a Git repository? Git/LFS steps may fail.", file=sys.stderr)


def main() -> None:
    if len(sys.argv) != 2:
        print(f"Usage: {sys.argv[0]} <path-to-model.yaml>", file=sys.stderr)
        sys.exit(1)

    model_yaml_path = Path(sys.argv[1])
    if not model_yaml_path.exists():
        print(f"Model YAML not found: {model_yaml_path}", file=sys.stderr)
        sys.exit(1)

    ensure_repo_root()

    # Load YAML
    with open(model_yaml_path, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f) or {}

    model_dir = model_yaml_path.parent

    # Proactively set up LFS tracking by patterns (idempotent)
    track_lfs_patterns(LFS_PATTERNS)

    # Iterate over formats & variants
    formats = (data.get("model") or {}).get("formats") or []
    for fmt in formats:
        variants = fmt.get("variants") or []
        for variant in variants:
            variant_id = variant.get("id")
            hf_repo = variant.get("hf_repo")
            if not hf_repo or not variant_id:
                continue

            dest_path = model_dir / variant_id
            dest_path.mkdir(parents=True, exist_ok=True)

            repo_id = hf_repo.replace("https://huggingface.co/", "")
            print(f"\n[DL] Downloading/resuming variant '{variant_id}' from '{repo_id}' into '{dest_path}'")

            # Always call snapshot_download with resume enabled. This will:
            #   - no-op for already-complete files
            #   - resume partials
            #   - fetch any missing files
            try:
                snapshot_download(
                    repo_id=repo_id,
                    local_dir=str(dest_path),
                    local_dir_use_symlinks=False,
                    resume_download=True,  # explicit
                    # You can add allow_patterns / ignore_patterns if you want to filter
                    # allow_patterns=None,
                    # ignore_patterns=None,
                )
            except Exception as e:
                print(f"❌ snapshot_download failed for {variant_id}: {e}", file=sys.stderr)
                raise

            # Scan files, compute total size, and ensure big files are tracked by LFS
            files_list: list[str] = []
            total_size_bytes = 0
            for p in list_files_under(dest_path):
                rel = p.relative_to(model_dir)
                files_list.append(str(rel))
                try:
                    size = p.stat().st_size
                except FileNotFoundError:
                    # If a file was removed mid-scan, skip it.
                    continue
                total_size_bytes += size
                # Fallback: ensure big files get tracked even if the patterns miss them
                if size > SIZE_THRESHOLD_BYTES:
                    # Idempotent; harmless if already tracked.
                    run(["git", "lfs", "track", str(p)], check=False)

            files_list.sort()
            variant["files"] = files_list
            variant["size_bytes"] = int(total_size_bytes)

            # Save the updated YAML progressively after each variant
            with open(model_yaml_path, "w", encoding="utf-8") as f:
                yaml.dump(data, f, sort_keys=False, allow_unicode=True)
            print(f"✅ Updated {model_yaml_path} for variant '{variant_id}'")

            # Run the cleanup script to commit, push, and prune
            commit_message = f"Add/update model files for {model_dir.name}/{variant_id}"
            print(f"🧹 Running cleanup for {variant_id}...")
            try:
                run(["./tools/cleanup.sh", commit_message], check=True)
            except subprocess.CalledProcessError as e:
                print(f"❌ cleanup.sh failed (continuing to next variant): {e}", file=sys.stderr)
                # Decide whether to continue or abort; continuing is usually fine.
                # raise  # uncomment to abort on failure

    print(f"\n✅ Download and YAML update complete for {model_yaml_path}.")


if __name__ == "__main__":
    main()
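
For reference, download.py expects roughly the model.yaml layout sketched below (field names are taken from the code above; the ids and repo URL are placeholders). `files` and `size_bytes` are rewritten by the script after each variant is fetched:

```
model:
  name: phi-3-mini                                      # placeholder
  formats:
    - type: gguf
      variants:
        - id: q4_k_m                                    # subdirectory the variant is downloaded into
          hf_repo: https://huggingface.co/<org>/<repo>  # reduced to an HF repo id before download
          files: []                                     # filled in by download.py
          size_bytes: 0                                 # filled in by download.py
```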

tools/download.sh Executable file

@@ -0,0 +1,87 @@
#!/usr/bin/env bash
set -euo pipefail

# download.sh - Download model files and update model.yaml metadata.
#
# This script reads a model.yaml file, downloads the complete model data from
# the specified Hugging Face repository, and then updates the 'files' array
# in the YAML with the paths of the downloaded files.
#
# This approach is more robust than specifying files manually, as it ensures
# the YAML reflects the actual downloaded content.
#
# Usage: ./tools/download.sh models/llama-2-7b/model.yaml

if [ "$#" -ne 1 ]; then
  echo "Usage: $0 <path-to-model.yaml>" >&2
  exit 1
fi

MODEL_YAML="$1"
MODEL_DIR=$(dirname "$MODEL_YAML")

if [ ! -f "$MODEL_YAML" ]; then
  echo "Model YAML not found: $MODEL_YAML" >&2
  exit 1
fi

# Ensure yq is installed
if ! command -v yq &> /dev/null; then
  echo "Error: yq is not installed. Install it with: pip install yq or brew install yq" >&2
  exit 1
fi

# Ensure huggingface-cli is installed
if ! command -v huggingface-cli &> /dev/null; then
  echo "Error: huggingface-cli is not installed. Install it with: pip install huggingface_hub" >&2
  exit 1
fi

echo "Reading metadata from $MODEL_YAML..."

# Create a temporary file to store the updated YAML content
TMP_YAML=$(mktemp)
trap 'rm -f "$TMP_YAML"' EXIT
cp "$MODEL_YAML" "$TMP_YAML"

# Loop over each format and variant to download files
yq -r '.formats[] | . as $format | .variants[] | . as $variant | "\($format.type)|\($variant.id)|\($variant.hf_repo)"' "$MODEL_YAML" | while IFS='|' read -r format_type variant_id hf_repo; do
  echo
  echo "Processing variant: $variant_id (format: $format_type) from $hf_repo"

  DEST_PATH="$MODEL_DIR/$variant_id"
  mkdir -p "$DEST_PATH"

  # Check whether files are already downloaded by looking for a non-empty directory
  if [ -n "$(ls -A "$DEST_PATH" 2>/dev/null)" ]; then
    echo "[OK] Files for $variant_id already exist in $DEST_PATH. Skipping download."
  else
    repo_id=${hf_repo#https://huggingface.co/}
    echo "[DL] Downloading files for $variant_id from $repo_id..."
    huggingface-cli download "$repo_id" --local-dir "$DEST_PATH" --local-dir-use-symlinks False
  fi

  # After downloading, list the downloaded files relative to the model directory
  downloaded_files=()
  while IFS= read -r file; do
    downloaded_files+=("$(realpath --relative-to="$MODEL_DIR" "$file")")
  done < <(find "$DEST_PATH" -type f)

  # Update the YAML file with the list of downloaded files for the current variant
  echo "Updating $MODEL_YAML with downloaded file paths for $variant_id..."

  # Reset the files list for this variant, then append each downloaded file
  yq_exp="(.formats[] | select(.type == \"$format_type\") | .variants[] | select(.id == \"$variant_id\") | .files) = []"
  yq eval -i "$yq_exp" "$TMP_YAML"
  for file in "${downloaded_files[@]}"; do
    yq_exp="(.formats[] | select(.type == \"$format_type\") | .variants[] | select(.id == \"$variant_id\") | .files) += [\"$file\"]"
    yq eval -i "$yq_exp" "$TMP_YAML"
  done
done

# Replace the original YAML with the updated one
mv "$TMP_YAML" "$MODEL_YAML"

echo
echo "✅ Download and YAML update complete for $MODEL_YAML."


@@ -0,0 +1,34 @@
#!/usr/bin/env python3
import os
import sys
import yaml
import json


def collect_models(models_root):
    registry = []
    for root, dirs, files in os.walk(models_root):
        if "model.yaml" in files:
            model_path = os.path.join(root, "model.yaml")
            try:
                with open(model_path, 'r', encoding='utf-8') as f:
                    model_data = yaml.safe_load(f)
                registry.append(model_data)
            except Exception as e:
                print(f"❌ Failed to parse {model_path}: {e}", file=sys.stderr)
    return registry


if __name__ == "__main__":
    repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    models_root = os.path.join(repo_root, "models")
    output_path = os.path.join(repo_root, "registry.json")

    if not os.path.isdir(models_root):
        print(f"❌ Models directory not found: {models_root}")
        sys.exit(1)

    registry = collect_models(models_root)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(registry, f, indent=2, ensure_ascii=False)

    print(f"✅ Registry written to {output_path} with {len(registry)} models.")


@@ -0,0 +1,134 @@
from huggingface_hub import HfApi, HfFileSystem
from pathlib import Path
import yaml
import requests
import os
from datetime import datetime
from collections import defaultdict
import re
import sys


def generate_model_bundle(repo_id: str, output_dir: str):
    api = HfApi()
    fs = HfFileSystem()
    model_info = api.model_info(repo_id)

    # Create output path
    out_path = Path(output_dir)
    out_path.mkdir(parents=True, exist_ok=True)

    # ----- 1. Fetch metadata -----
    model_card = model_info.cardData or {}
    tags = model_info.tags or []
    files = api.list_repo_files(repo_id)

    # ----- 2. Filter files -----
    model_files = [f for f in files if f.endswith(".gguf") or f.endswith(".safetensors")]
    tokenizer_files = [f for f in files if "tokenizer" in f.lower()]
    license_file = next((f for f in files if "license" in f.lower()), None)

    # ----- 3. Fetch README -----
    readme_url = f"https://huggingface.co/{repo_id}/raw/main/README.md"
    readme_path = out_path / "README.md"
    try:
        r = requests.get(readme_url)
        r.raise_for_status()
        readme_path.write_text(r.text)
    except Exception:
        readme_path.write_text(f"# README for {repo_id}\n(Not found on HuggingFace)")

    # ----- 4. Fetch LICENSE -----
    if license_file:
        license_text = api.hf_hub_download(repo_id, license_file)
        license_dst = out_path / Path(license_file).name
        Path(license_dst).write_text(Path(license_text).read_text())

    # ----- 5. Build variant groups -----
    variants = []
    shard_groups = defaultdict(list)
    unsharded_files = []

    for f in model_files:
        match = re.match(r"(.+)-\d+-of-\d+\.safetensors$", f)
        if match:
            prefix = match.group(1)
            shard_groups[prefix].append(f)
        else:
            unsharded_files.append(f)

    for prefix, files_group in shard_groups.items():
        total_size = sum(fs.info(f"hf://{repo_id}/{f}").get("size", 0) for f in files_group)
        context_length = 128000 if "128k" in prefix.lower() else 4096
        bits = 16  # Assume safetensors shards are FP16
        variants.append({
            "id": prefix,
            "label": prefix,
            "bits": bits,
            "context_length": context_length,
            "size_bytes": total_size,
            "hf_repo": f"https://huggingface.co/{repo_id}",
            "files": sorted(files_group),
        })

    for f in unsharded_files:
        ext = Path(f).suffix
        size_bytes = fs.info(f"hf://{repo_id}/{f}").get("size", 0)
        bits = 16 if "fp16" in f.lower() or ext == ".safetensors" else 4 if "q4" in f.lower() else 8
        context_length = 128000 if "128k" in f.lower() else 4096
        variants.append({
            "id": Path(f).stem,
            "label": f,
            "bits": bits,
            "context_length": context_length,
            "size_bytes": size_bytes,
            "hf_repo": f"https://huggingface.co/{repo_id}",
            "files": [f],
        })

    # ----- 6. Handle date -----
    last_modified = model_info.lastModified
    if isinstance(last_modified, str):
        last_modified = datetime.fromisoformat(last_modified.replace("Z", "+00:00"))

    # ----- 7. YAML data -----
    yaml_data = {
        "model": {
            "name": repo_id.split("/")[-1],
            "display_name": model_card.get("title", repo_id),
            "description": model_card.get("summary", "No description available."),
            "publisher_original": model_card.get("license", "other"),
            "publisher_quantized": "Community",
            "license": model_card.get("license", "other"),
            "license_url": f"https://huggingface.co/{repo_id}/blob/main/{license_file}" if license_file else "N/A",
            "publish_date": last_modified.date().isoformat(),
            "modality": "text",
            "thinking_model": True,
            "tokenizer": {"files": tokenizer_files},
            "architecture": model_card.get("model_architecture", "transformer"),
            "formats": [{
                "type": "gguf" if any(f.endswith(".gguf") for f in model_files) else "safetensors",
                "variants": variants,
            }],
        }
    }

    with open(out_path / "model.yaml", "w") as f:
        yaml.dump(yaml_data, f, sort_keys=False)

    return str(out_path)


# -------- Entry point for CLI --------
if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python generate_model_yaml.py <huggingface/repo-id> <output-folder>")
        sys.exit(1)

    repo_id = sys.argv[1]
    output_dir = sys.argv[2]
    output_path = generate_model_bundle(repo_id, output_dir)
    print(f"✅ Model bundle generated at: {output_path}")

tools/verify-checksums.py Normal file

@@ -0,0 +1,60 @@
#!/usr/bin/env python3
import sys
import os
import yaml
import hashlib


def sha256sum(filename, buf_size=65536):
    sha256 = hashlib.sha256()
    with open(filename, 'rb') as f:
        while True:
            data = f.read(buf_size)
            if not data:
                break
            sha256.update(data)
    return sha256.hexdigest()


def verify_model(model_yaml_path):
    if not os.path.isfile(model_yaml_path):
        print(f"❌ Model YAML not found: {model_yaml_path}")
        sys.exit(1)

    with open(model_yaml_path, 'r', encoding='utf-8') as f:
        model_data = yaml.safe_load(f)

    base_dir = os.path.dirname(model_yaml_path)
    all_ok = True

    for fmt in model_data.get("formats", []):
        for variant in fmt.get("variants", []):
            for file_path in variant.get("files", []):
                checksum_expected = variant.get("checksums", {}).get(file_path)
                abs_path = os.path.join(base_dir, file_path)

                if not os.path.isfile(abs_path):
                    print(f"❌ Missing file: {abs_path}")
                    all_ok = False
                    continue

                if not checksum_expected:
                    print(f"⚠️ No checksum for {file_path}, skipping verification.")
                    continue

                checksum_actual = sha256sum(abs_path)
                if checksum_actual.lower() == checksum_expected.lower():
                    print(f"{file_path} OK")
                else:
                    print(f"{file_path} checksum mismatch! Expected {checksum_expected}, got {checksum_actual}")
                    all_ok = False

    if all_ok:
        print("✅ All files verified successfully.")
    else:
        print("❌ Verification failed.")


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print(f"Usage: {sys.argv[0]} <path-to-model.yaml>")
        sys.exit(1)
    verify_model(sys.argv[1])
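
The script looks up expected digests in a per-variant `checksums` map keyed by the same relative paths used in `files` (note that it reads `formats` at the top level of the YAML, unlike download.py, which looks under `model:`). A minimal sketch of the section it expects, with a placeholder path and digest:

```
formats:
  - type: gguf
    variants:
      - id: q4_k_m
        files:
          - q4_k_m/model.gguf
        checksums:
          q4_k_m/model.gguf: "3b5d5c3712955042212316173ccf37be3b5d5c3712955042212316173ccf37be"
```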

tools/watcher.sh Executable file

@@ -0,0 +1,38 @@
#!/usr/bin/env bash
set -euo pipefail

# watcher.sh - Watches for new models and downloads their files.
#
# This script continuously scans the 'models' directory for 'model.yaml' files.
# For each model, it runs 'tools/download.py' to fetch model files from
# Hugging Face; that script also handles Git LFS tracking for large files and
# the commit/push/prune cleanup for each variant.
#
# Usage: ./tools/watcher.sh
# Run from the root of the repository.

if [ ! -d ".git" ]; then
  echo "Error: This script must be run from the root of the repository." >&2
  exit 1
fi

while true; do
  echo "🔍 Starting model discovery cycle..."

  # Find all model.yaml files in the models directory
  find models -name model.yaml | while read -r MODEL_YAML; do
    MODEL_DIR=$(dirname "$MODEL_YAML")
    echo "--------------------------------------------------"
    echo "Processing model in $MODEL_DIR"
    # download.py handles downloading, LFS tracking, and cleanup for each variant.
    python3 ./tools/download.py "$MODEL_YAML"
  done

  echo "--------------------------------------------------"
  echo "✅ Watcher finished a cycle. Sleeping for 60 seconds before the next scan."
  echo "Press [CTRL+C] to stop."
  sleep 60
done