Setup repo with Phi 3

zcourts
2025-09-27 18:26:34 +01:00
commit 0c748f1497
16 changed files with 1122 additions and 0 deletions

tools/README.md Normal file

@@ -0,0 +1,5 @@
# Setup
```
pip install pyyaml huggingface_hub
```
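
These tools assume each model lives under `models/<model-name>/` with a `model.yaml` describing it. A typical workflow might look like this (a sketch; the `phi-3-mini` path is only an illustrative placeholder):

```
# Fetch (or resume) every variant listed in the YAML, track large files with LFS, commit and push
./tools/download.py models/phi-3-mini/model.yaml

# Verify downloaded files against any checksums recorded in the YAML
./tools/verify-checksums.py models/phi-3-mini/model.yaml
```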

tools/cleanup.sh Executable file

@@ -0,0 +1,49 @@
#!/usr/bin/env bash
set -euo pipefail

# cleanup.sh - Commits, pushes, and prunes LFS files.
#
# - Detects *untracked* files (git status --porcelain), so we don't skip commits.
# - Uses 'git add --renormalize .' so new/changed .gitattributes rules convert
#   existing files into LFS pointers on re-add.
# - Keeps the prune step to free local disk space after a successful push.
#
# Usage: ./tools/cleanup.sh <commit-message>

if [ "$#" -ne 1 ]; then
  echo "Usage: $0 <commit-message>" >&2
  exit 1
fi

COMMIT_MESSAGE="$1"

# Detect any changes, including untracked files.
if [[ -z "$(git status --porcelain=v1)" ]]; then
  echo "No new files or changes to commit. Skipping commit and push."
  exit 0
fi

echo "Committing and pushing changes..."

# Make sure .gitattributes changes are included and normalization runs,
# so LFS filters rewrite eligible files as pointers.
git add .gitattributes || true
git add --renormalize .

# If nothing ended up staged (e.g. only ignored files changed), exit gracefully.
if git diff --cached --quiet; then
  echo "No staged changes after normalization. Skipping commit and push."
  exit 0
fi

git commit -m "$COMMIT_MESSAGE"
git push

# Optional but useful: ensure all LFS objects are on the remote.
# Uncomment if you want belt-and-suspenders uploads.
# git lfs push origin --all

echo "Pruning local LFS files..."
git lfs prune --force

echo "✅ Cleanup complete."

tools/download.py Normal file

@@ -0,0 +1,167 @@
#!/usr/bin/env python3
"""
download.py - Download/repair model files and update model.yaml metadata.

Usage:
    ./tools/download.py models/llama-2-7b-chat/model.yaml

- Always (re)runs snapshot_download with resume support, so partially
  fetched directories get completed instead of being skipped.
- Updates the YAML after each variant with a fresh file list + total size.
- Tracks LFS via sensible patterns (plus a size-threshold fallback).
- Emits clear logs so you can see progress per variant.
"""
import sys
import os
import yaml
import subprocess
from pathlib import Path
from typing import Iterable

from huggingface_hub import snapshot_download

LFS_PATTERNS: list[str] = [
    # Extensions commonly used for model artifacts
    "*.safetensors",
    "*.bin",
    "*.pt",
    "*.gguf",
    "*.onnx",
    "*.ckpt",
    "*.tensors",
    "*.npz",
    "*.tar",
    "*.tar.gz",
    "*.zip",
]

SIZE_THRESHOLD_BYTES = 1_000_000  # 1 MB fallback if a file doesn't match any pattern


def run(cmd: list[str], check: bool = True) -> None:
    subprocess.run(cmd, check=check)


def track_lfs_patterns(patterns: Iterable[str]) -> None:
    """
    Track a set of patterns in Git LFS. This is idempotent; it just
    appends to .gitattributes as needed.
    """
    for patt in patterns:
        try:
            run(["git", "lfs", "track", patt], check=False)
        except Exception:
            # Non-fatal: we'll still fall back to the per-file size rule below.
            pass


def list_files_under(root: Path) -> list[Path]:
    return [p for p in root.rglob("*") if p.is_file()]


def ensure_repo_root() -> None:
    # Best effort: warn (but don't die) if not in a git repo.
    try:
        subprocess.run(
            ["git", "rev-parse", "--is-inside-work-tree"],
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except Exception:
        print("⚠️ Not inside a Git repository? Git/LFS steps may fail.", file=sys.stderr)


def main() -> None:
    if len(sys.argv) != 2:
        print(f"Usage: {sys.argv[0]} <path-to-model.yaml>", file=sys.stderr)
        sys.exit(1)

    model_yaml_path = Path(sys.argv[1])
    if not model_yaml_path.exists():
        print(f"Model YAML not found: {model_yaml_path}", file=sys.stderr)
        sys.exit(1)

    ensure_repo_root()

    # Load YAML
    with open(model_yaml_path, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f) or {}

    model_dir = model_yaml_path.parent

    # Proactively set up LFS tracking by patterns (idempotent)
    track_lfs_patterns(LFS_PATTERNS)

    # Iterate over formats & variants
    formats = (data.get("model") or {}).get("formats") or []
    for fmt in formats:
        variants = fmt.get("variants") or []
        for variant in variants:
            variant_id = variant.get("id")
            hf_repo = variant.get("hf_repo")
            if not hf_repo or not variant_id:
                continue

            dest_path = model_dir / variant_id
            dest_path.mkdir(parents=True, exist_ok=True)

            repo_id = hf_repo.replace("https://huggingface.co/", "")
            print(f"\n[DL] Downloading/resuming variant '{variant_id}' from '{repo_id}' into '{dest_path}'")

            # Always call snapshot_download with resume enabled. This will:
            #   - no-op for already-complete files
            #   - resume partials
            #   - fetch any missing files
            try:
                snapshot_download(
                    repo_id=repo_id,
                    local_dir=str(dest_path),
                    local_dir_use_symlinks=False,
                    resume_download=True,  # explicit
                    # You can add allow_patterns / ignore_patterns if you want to filter
                    # allow_patterns=None,
                    # ignore_patterns=None,
                )
            except Exception as e:
                print(f"❌ snapshot_download failed for {variant_id}: {e}", file=sys.stderr)
                raise

            # Scan files, compute total size, and ensure big files are tracked by LFS
            files_list: list[str] = []
            total_size_bytes = 0
            for p in list_files_under(dest_path):
                rel = p.relative_to(model_dir)
                files_list.append(str(rel))
                try:
                    size = p.stat().st_size
                except FileNotFoundError:
                    # If a file was removed mid-scan, skip it.
                    continue
                total_size_bytes += size
                # Fallback: ensure big files get tracked even if the patterns miss them
                if size > SIZE_THRESHOLD_BYTES:
                    # Idempotent; harmless if already tracked.
                    run(["git", "lfs", "track", str(p)], check=False)

            files_list.sort()
            variant["files"] = files_list
            variant["size_bytes"] = int(total_size_bytes)

            # Save the updated YAML progressively after each variant
            with open(model_yaml_path, "w", encoding="utf-8") as f:
                yaml.dump(data, f, sort_keys=False, allow_unicode=True)
            print(f"✅ Updated {model_yaml_path} for variant '{variant_id}'")

            # Run the cleanup script to commit, push, and prune
            commit_message = f"Add/update model files for {model_dir.name}/{variant_id}"
            print(f"🧹 Running cleanup for {variant_id}...")
            try:
                run(["./tools/cleanup.sh", commit_message], check=True)
            except subprocess.CalledProcessError as e:
                print(f"❌ cleanup.sh failed (continuing to next variant): {e}", file=sys.stderr)
                # Decide whether to continue or abort; continuing is usually fine.
                # raise  # uncomment to abort on failure

    print(f"\n✅ Download and YAML update complete for {model_yaml_path}.")


if __name__ == "__main__":
    main()
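
For reference, download.py expects roughly the model.yaml layout sketched below (field names are taken from the code above; the ids and repo URL are placeholders). `files` and `size_bytes` are rewritten by the script after each variant is fetched:

```
model:
  name: phi-3-mini                                      # placeholder
  formats:
    - type: gguf
      variants:
        - id: q4_k_m                                    # subdirectory the variant is downloaded into
          hf_repo: https://huggingface.co/<org>/<repo>  # reduced to an HF repo id before download
          files: []                                     # filled in by download.py
          size_bytes: 0                                 # filled in by download.py
```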

tools/download.sh Executable file

@@ -0,0 +1,87 @@
#!/usr/bin/env bash
set -euo pipefail

# download.sh - Download model files and update model.yaml metadata.
#
# This script reads a model.yaml file, downloads the complete model data from
# the specified Hugging Face repository, and then updates the 'files' array
# in the YAML with the paths of the downloaded files.
#
# This approach is more robust than specifying files manually, as it ensures
# the YAML reflects the actual downloaded content.
#
# Usage: ./tools/download.sh models/llama-2-7b/model.yaml

if [ "$#" -ne 1 ]; then
  echo "Usage: $0 <path-to-model.yaml>" >&2
  exit 1
fi

MODEL_YAML="$1"
MODEL_DIR=$(dirname "$MODEL_YAML")

if [ ! -f "$MODEL_YAML" ]; then
  echo "Model YAML not found: $MODEL_YAML" >&2
  exit 1
fi

# Ensure yq is installed
if ! command -v yq &> /dev/null; then
  echo "Error: yq is not installed. Install it with: pip install yq or brew install yq" >&2
  exit 1
fi

# Ensure huggingface-cli is installed
if ! command -v huggingface-cli &> /dev/null; then
  echo "Error: huggingface-cli is not installed. Install it with: pip install huggingface_hub" >&2
  exit 1
fi

echo "Reading metadata from $MODEL_YAML..."

# Create a temporary file to store the updated YAML content
TMP_YAML=$(mktemp)
trap 'rm -f "$TMP_YAML"' EXIT
cp "$MODEL_YAML" "$TMP_YAML"

# Loop over each format and variant to download files
yq -r '.formats[] | . as $format | .variants[] | . as $variant | "\($format.type)|\($variant.id)|\($variant.hf_repo)"' "$MODEL_YAML" | while IFS='|' read -r format_type variant_id hf_repo; do
  echo
  echo "Processing variant: $variant_id (format: $format_type) from $hf_repo"

  DEST_PATH="$MODEL_DIR/$variant_id"
  mkdir -p "$DEST_PATH"

  # Check whether files are already downloaded by looking for a non-empty directory
  if [ -n "$(ls -A "$DEST_PATH" 2>/dev/null)" ]; then
    echo "[OK] Files for $variant_id already exist in $DEST_PATH. Skipping download."
  else
    repo_id=${hf_repo#https://huggingface.co/}
    echo "[DL] Downloading files for $variant_id from $repo_id..."
    huggingface-cli download "$repo_id" --local-dir "$DEST_PATH" --local-dir-use-symlinks False
  fi

  # After downloading, list the downloaded files relative to the model directory
  downloaded_files=()
  while IFS= read -r file; do
    downloaded_files+=("$(realpath --relative-to="$MODEL_DIR" "$file")")
  done < <(find "$DEST_PATH" -type f)

  # Update the YAML file with the list of downloaded files for the current variant
  echo "Updating $MODEL_YAML with downloaded file paths for $variant_id..."

  # Reset the files list for this variant, then append each downloaded file
  yq_exp="(.formats[] | select(.type == \"$format_type\") | .variants[] | select(.id == \"$variant_id\") | .files) = []"
  yq eval -i "$yq_exp" "$TMP_YAML"
  for file in "${downloaded_files[@]}"; do
    yq_exp="(.formats[] | select(.type == \"$format_type\") | .variants[] | select(.id == \"$variant_id\") | .files) += [\"$file\"]"
    yq eval -i "$yq_exp" "$TMP_YAML"
  done
done

# Replace the original YAML with the updated one
mv "$TMP_YAML" "$MODEL_YAML"

echo
echo "✅ Download and YAML update complete for $MODEL_YAML."


@@ -0,0 +1,34 @@
#!/usr/bin/env python3
import os
import sys
import yaml
import json


def collect_models(models_root):
    registry = []
    for root, dirs, files in os.walk(models_root):
        if "model.yaml" in files:
            model_path = os.path.join(root, "model.yaml")
            try:
                with open(model_path, 'r', encoding='utf-8') as f:
                    model_data = yaml.safe_load(f)
                registry.append(model_data)
            except Exception as e:
                print(f"❌ Failed to parse {model_path}: {e}", file=sys.stderr)
    return registry


if __name__ == "__main__":
    repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    models_root = os.path.join(repo_root, "models")
    output_path = os.path.join(repo_root, "registry.json")

    if not os.path.isdir(models_root):
        print(f"❌ Models directory not found: {models_root}")
        sys.exit(1)

    registry = collect_models(models_root)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(registry, f, indent=2, ensure_ascii=False)

    print(f"✅ Registry written to {output_path} with {len(registry)} models.")


@@ -0,0 +1,134 @@
from huggingface_hub import HfApi, HfFileSystem
from pathlib import Path
import yaml
import requests
import os
from datetime import datetime
from collections import defaultdict
import re
import sys


def generate_model_bundle(repo_id: str, output_dir: str):
    api = HfApi()
    fs = HfFileSystem()
    model_info = api.model_info(repo_id)

    # Create output path
    out_path = Path(output_dir)
    out_path.mkdir(parents=True, exist_ok=True)

    # ----- 1. Fetch metadata -----
    model_card = model_info.cardData or {}
    tags = model_info.tags or []
    files = api.list_repo_files(repo_id)

    # ----- 2. Filter files -----
    model_files = [f for f in files if f.endswith(".gguf") or f.endswith(".safetensors")]
    tokenizer_files = [f for f in files if "tokenizer" in f.lower()]
    license_file = next((f for f in files if "license" in f.lower()), None)

    # ----- 3. Fetch README -----
    readme_url = f"https://huggingface.co/{repo_id}/raw/main/README.md"
    readme_path = out_path / "README.md"
    try:
        r = requests.get(readme_url)
        r.raise_for_status()
        readme_path.write_text(r.text)
    except Exception:
        readme_path.write_text(f"# README for {repo_id}\n(Not found on HuggingFace)")

    # ----- 4. Fetch LICENSE -----
    if license_file:
        license_text = api.hf_hub_download(repo_id, license_file)
        license_dst = out_path / Path(license_file).name
        Path(license_dst).write_text(Path(license_text).read_text())

    # ----- 5. Build variant groups -----
    variants = []
    shard_groups = defaultdict(list)
    unsharded_files = []

    for f in model_files:
        match = re.match(r"(.+)-\d+-of-\d+\.safetensors$", f)
        if match:
            prefix = match.group(1)
            shard_groups[prefix].append(f)
        else:
            unsharded_files.append(f)

    for prefix, files_group in shard_groups.items():
        total_size = sum(fs.info(f"hf://{repo_id}/{f}").get("size", 0) for f in files_group)
        context_length = 128000 if "128k" in prefix.lower() else 4096
        bits = 16  # Assume safetensors shards are FP16
        variants.append({
            "id": prefix,
            "label": prefix,
            "bits": bits,
            "context_length": context_length,
            "size_bytes": total_size,
            "hf_repo": f"https://huggingface.co/{repo_id}",
            "files": sorted(files_group),
        })

    for f in unsharded_files:
        ext = Path(f).suffix
        size_bytes = fs.info(f"hf://{repo_id}/{f}").get("size", 0)
        bits = 16 if "fp16" in f.lower() or ext == ".safetensors" else 4 if "q4" in f.lower() else 8
        context_length = 128000 if "128k" in f.lower() else 4096
        variants.append({
            "id": Path(f).stem,
            "label": f,
            "bits": bits,
            "context_length": context_length,
            "size_bytes": size_bytes,
            "hf_repo": f"https://huggingface.co/{repo_id}",
            "files": [f],
        })

    # ----- 6. Handle date -----
    last_modified = model_info.lastModified
    if isinstance(last_modified, str):
        last_modified = datetime.fromisoformat(last_modified.replace("Z", "+00:00"))

    # ----- 7. YAML data -----
    yaml_data = {
        "model": {
            "name": repo_id.split("/")[-1],
            "display_name": model_card.get("title", repo_id),
            "description": model_card.get("summary", "No description available."),
            "publisher_original": model_card.get("license", "other"),
            "publisher_quantized": "Community",
            "license": model_card.get("license", "other"),
            "license_url": f"https://huggingface.co/{repo_id}/blob/main/{license_file}" if license_file else "N/A",
            "publish_date": last_modified.date().isoformat(),
            "modality": "text",
            "thinking_model": True,
            "tokenizer": {"files": tokenizer_files},
            "architecture": model_card.get("model_architecture", "transformer"),
            "formats": [{
                "type": "gguf" if any(f.endswith(".gguf") for f in model_files) else "safetensors",
                "variants": variants,
            }],
        }
    }

    with open(out_path / "model.yaml", "w") as f:
        yaml.dump(yaml_data, f, sort_keys=False)

    return str(out_path)


# -------- Entry point for CLI --------
if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python generate_model_yaml.py <huggingface/repo-id> <output-folder>")
        sys.exit(1)

    repo_id = sys.argv[1]
    output_dir = sys.argv[2]
    output_path = generate_model_bundle(repo_id, output_dir)
    print(f"✅ Model bundle generated at: {output_path}")

tools/verify-checksums.py Normal file

@@ -0,0 +1,60 @@
#!/usr/bin/env python3
import sys
import os
import yaml
import hashlib


def sha256sum(filename, buf_size=65536):
    sha256 = hashlib.sha256()
    with open(filename, 'rb') as f:
        while True:
            data = f.read(buf_size)
            if not data:
                break
            sha256.update(data)
    return sha256.hexdigest()


def verify_model(model_yaml_path):
    if not os.path.isfile(model_yaml_path):
        print(f"❌ Model YAML not found: {model_yaml_path}")
        sys.exit(1)

    with open(model_yaml_path, 'r', encoding='utf-8') as f:
        model_data = yaml.safe_load(f)

    base_dir = os.path.dirname(model_yaml_path)
    all_ok = True

    for fmt in model_data.get("formats", []):
        for variant in fmt.get("variants", []):
            for file_path in variant.get("files", []):
                checksum_expected = variant.get("checksums", {}).get(file_path)
                abs_path = os.path.join(base_dir, file_path)

                if not os.path.isfile(abs_path):
                    print(f"❌ Missing file: {abs_path}")
                    all_ok = False
                    continue

                if not checksum_expected:
                    print(f"⚠️ No checksum for {file_path}, skipping verification.")
                    continue

                checksum_actual = sha256sum(abs_path)
                if checksum_actual.lower() == checksum_expected.lower():
                    print(f"{file_path} OK")
                else:
                    print(f"{file_path} checksum mismatch! Expected {checksum_expected}, got {checksum_actual}")
                    all_ok = False

    if all_ok:
        print("✅ All files verified successfully.")
    else:
        print("❌ Verification failed.")


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print(f"Usage: {sys.argv[0]} <path-to-model.yaml>")
        sys.exit(1)
    verify_model(sys.argv[1])
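
The script looks up expected digests in a per-variant `checksums` map keyed by the same relative paths used in `files` (note that it reads `formats` at the top level of the YAML, unlike download.py, which looks under `model:`). A minimal sketch of the section it expects, with a placeholder path and digest:

```
formats:
  - type: gguf
    variants:
      - id: q4_k_m
        files:
          - q4_k_m/model.gguf
        checksums:
          q4_k_m/model.gguf: "3b5d5c3712955042212316173ccf37be3b5d5c3712955042212316173ccf37be"
```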

tools/watcher.sh Executable file

@@ -0,0 +1,38 @@
#!/usr/bin/env bash
set -euo pipefail

# watcher.sh - Watches for new models and downloads their files.
#
# This script continuously scans the 'models' directory for 'model.yaml' files.
# For each model, it runs 'tools/download.py' to fetch model files from
# Hugging Face; that script also handles Git LFS tracking for large files and
# the commit/push/prune cleanup for each variant.
#
# Usage: ./tools/watcher.sh
# Run from the root of the repository.

if [ ! -d ".git" ]; then
  echo "Error: This script must be run from the root of the repository." >&2
  exit 1
fi

while true; do
  echo "🔍 Starting model discovery cycle..."

  # Find all model.yaml files in the models directory
  find models -name model.yaml | while read -r MODEL_YAML; do
    MODEL_DIR=$(dirname "$MODEL_YAML")
    echo "--------------------------------------------------"
    echo "Processing model in $MODEL_DIR"
    # download.py handles downloading, LFS tracking, and cleanup for each variant.
    python3 ./tools/download.py "$MODEL_YAML"
  done

  echo "--------------------------------------------------"
  echo "✅ Watcher finished a cycle. Sleeping for 60 seconds before the next scan."
  echo "Press [CTRL+C] to stop."
  sleep 60
done