R-Kentaren's picture
Upload folder using huggingface_hub
ff86d3d verified
raw
history blame contribute delete
12.9 kB
"""HuggingFace Hub push and project ZIP packaging.
Creates ZIP archives from extracted project files and pushes
projects to HuggingFace Spaces or model repos.
"""
from __future__ import annotations
import logging
import os
import re
import tempfile
import zipfile
from pathlib import Path
from typing import Any
from code.config.constants import MODEL_ID
from code.huggingface.dockerfile_gen import (
detect_framework,
is_js_project,
scaffold_js_project,
)
logger = logging.getLogger(__name__)
# ─── Import-to-Package Mapping ──────────────────────────────────────────
# Maps a top-level import name (as found by ``_scan_imports``) to the pip
# requirement spec that provides it.  Several import names may point at the
# same distribution (e.g. ``PIL``/``Pillow``, ``bs4``/``beautifulsoup4``).
# NOTE: keys containing a hyphen (``python-docx``) can never be produced by the
# import scanner, because a hyphen is not valid in a Python module name; they
# are kept so lookups by distribution name keep working.
IMPORT_TO_PACKAGE: dict[str, str] = {
    # Web / UI frameworks
    "gradio": "gradio>=4.0.0",
    "flask": "flask>=3.0.0",
    "django": "django>=4.2.0",
    "fastapi": "fastapi>=0.100.0",
    "uvicorn": "uvicorn>=0.23.0",
    "streamlit": "streamlit>=1.28.0",
    # Plotting / imaging / numerics
    "matplotlib": "matplotlib>=3.8.0",
    "PIL": "Pillow>=10.0.0",
    "Pillow": "Pillow>=10.0.0",
    "numpy": "numpy>=1.24.0",
    "pandas": "pandas>=2.0.0",
    "scipy": "scipy>=1.11.0",
    "sklearn": "scikit-learn>=1.3.0",
    "scikit_learn": "scikit-learn>=1.3.0",
    "torch": "torch>=2.1.0",
    "tensorflow": "tensorflow>=2.14.0",
    "transformers": "transformers>=4.35.0",
    # HTTP / scraping / persistence
    "requests": "requests>=2.31.0",
    "beautifulsoup4": "beautifulsoup4>=4.12.0",
    "bs4": "beautifulsoup4>=4.12.0",
    "selenium": "selenium>=4.15.0",
    "sqlalchemy": "sqlalchemy>=2.0.0",
    "pydantic": "pydantic>=2.0.0",
    "httpx": "httpx>=0.25.0",
    "aiohttp": "aiohttp>=3.9.0",
    "opencv": "opencv-python-headless>=4.8.0",
    "cv2": "opencv-python-headless>=4.8.0",
    "plotly": "plotly>=5.18.0",
    "seaborn": "seaborn>=0.13.0",
    "wordcloud": "wordcloud>=1.9.0",
    "networkx": "networkx>=3.2.0",
    "sympy": "sympy>=1.12",
    # (a second, identical "Pillow" entry used to live here; duplicate keys in
    # a dict literal are silently collapsed, so it has been removed)
    "skimage": "scikit-image>=0.21.0",
    # Media / documents
    "soundfile": "soundfile>=0.12.0",
    "pydub": "pydub>=0.25.1",
    "moviepy": "moviepy>=1.0.3",
    "openpyxl": "openpyxl>=3.1.0",
    "xlsxwriter": "xlsxwriter>=3.1.0",
    "python-docx": "python-docx>=0.8.11",
    "docx": "python-docx>=0.8.11",
    "reportlab": "reportlab>=4.0.0",
    # Flask ecosystem
    "jinja2": "jinja2>=3.1.0",
    "wtforms": "wtforms>=3.1.0",
    "flask_sqlalchemy": "flask-sqlalchemy>=3.1.0",
    "flask_login": "flask-login>=0.6.0",
    "flask_wtf": "flask-wtf>=1.2.0",
    "flask_cors": "flask-cors>=4.0.0",
}
def _scan_imports(code: str) -> list[str]:
    """Scan Python source text for imports and return top-level module names.

    Both ``import a.b`` and ``from a.b import c`` contribute ``a``.  A single
    ``import`` statement may list several modules (``import os, numpy as np``);
    every listed module is reported.  Relative imports (``from . import x``)
    are ignored because they do not name an installable package.

    The scan is regex based, so it also works on code that does not parse.

    Returns:
        The distinct top-level names, sorted alphabetically.
    """
    packages: set[str] = set()

    # Match: from xxx import ...
    for m in re.finditer(r"^\s*from\s+([a-zA-Z_][\w.]*)", code, re.MULTILINE):
        packages.add(m.group(1).split(".")[0])

    # Match: import xxx[, yyy as z, ...]
    # The capture stops at a newline, a comment or a ';' so that commas inside
    # a trailing comment or a following statement are not mistaken for modules.
    for m in re.finditer(r"^\s*import\s+([^\n#;]+)", code, re.MULTILINE):
        for clause in m.group(1).split(","):
            module = re.match(r"\s*([a-zA-Z_][\w.]*)", clause)
            if module:
                packages.add(module.group(1).split(".")[0])

    return sorted(packages)
def generate_requirements(code: str) -> str:
    """Build requirements.txt content for *code* from its import statements.

    Imported top-level modules are looked up in ``IMPORT_TO_PACKAGE``; modules
    without an entry (standard library, unknown packages) are skipped.  Each
    requirement spec appears once, in the alphabetical order of the import
    names that produced it.

    Returns:
        The specs joined by newlines with a trailing newline, or an empty
        string when nothing was recognised.
    """
    # dict.fromkeys keeps first-seen order while dropping repeated specs
    # (several import names can map to the same distribution).
    specs = list(
        dict.fromkeys(
            IMPORT_TO_PACKAGE[module]
            for module in _scan_imports(code)
            if module in IMPORT_TO_PACKAGE
        )
    )

    # Safety net: code that mentions a gradio import anywhere always gets
    # gradio listed first, even if the scanner did not pick it up.
    mentions_gradio = "import gradio" in code or "from gradio" in code
    if mentions_gradio:
        listed_names = [spec.split(">=")[0].split("[")[0] for spec in specs]
        if "gradio" not in listed_names:
            specs.insert(0, "gradio>=4.0.0")

    if not specs:
        return ""
    return "\n".join(specs) + "\n"
def _find_entry_point(files: dict[str, str]) -> str:
    """Pick the file that most likely starts the project.

    Selection order:
      1. a well-known Python entry file name,
      2. a well-known JS/TS entry file name,
      3. the first ``.py`` file whose text contains a launcher marker,
      4. the first ``.py`` file,
      5. the first file of any kind,
      6. the literal ``"app.py"`` when *files* is empty.

    Args:
        files: Mapping of relative file path to file content.
    """
    # Python names come first, so they win over JS names when both exist.
    well_known = (
        "app.py",
        "main.py",
        "index.py",
        "server.py",
        "run.py",
        "index.js",
        "server.js",
        "src/index.js",
        "src/main.jsx",
        "src/main.tsx",
    )
    for candidate in well_known:
        if candidate in files:
            return candidate

    python_files = [path for path in files if path.endswith(".py")]

    # A script guard, a Gradio-style .launch( call or a Flask-style app.run(
    # call marks a file as runnable.
    launcher_markers = ("__main__", ".launch(", "app.run(")
    for path in python_files:
        source = files[path]
        if any(marker in source for marker in launcher_markers):
            return path

    if python_files:
        return python_files[0]

    for path in files:
        return path
    return "app.py"
def _detect_sdk(files: dict[str, str], entry: str) -> str:
    """Guess the Space SDK (``streamlit``, ``gradio``, ``docker`` or ``static``).

    The contents of all files are searched for Streamlit and then Gradio
    import text.  Otherwise a JS project (per ``is_js_project``) maps to
    ``docker``, any ``.html`` file maps to ``static``, and a Python entry
    point falls back to ``gradio``.

    Args:
        files: Mapping of relative file path to file content.
        entry: Entry-point path chosen for the project.
    """
    combined = "\n".join(files.values())

    # Checked in order: Streamlit takes precedence over Gradio.
    python_sdks = (
        ("streamlit", ("import streamlit", "from streamlit")),
        ("gradio", ("import gradio", "from gradio")),
    )
    for sdk, markers in python_sdks:
        if any(marker in combined for marker in markers):
            return sdk

    # JS/TS projects need a container to build and serve.
    if is_js_project(files):
        return "docker"

    for path in files:
        if path.endswith(".html"):
            return "static"

    # Plain Python with no recognised UI framework defaults to the Gradio SDK.
    return "gradio" if entry.endswith(".py") else "static"
def create_project_zip(files: dict[str, str], project_name: str) -> str:
    """Create a ZIP archive from extracted project files.

    The archive is written to a fresh temporary directory (which is NOT
    removed by this function) and every file is stored under a single root
    folder named after the project.

    Path separators in *project_name* are replaced by ``_`` so the name is
    usable both as a file name and as the archive root; a name that is empty
    or consists only of dots/spaces falls back to ``"project"``.

    Args:
        files: Mapping of relative file path to text content.
        project_name: Name used for the ``.zip`` file and the archive root.

    Returns:
        The path to the created ZIP file.
    """
    # "/" is always treated as a separator because ZIP member names use it
    # regardless of platform.
    safe_name = project_name
    for separator in {"/", os.sep, os.altsep} - {None}:
        safe_name = safe_name.replace(separator, "_")
    if not safe_name.strip(". "):
        # "", "." and ".." would give a hidden/odd file name and an archive
        # root that points at or above the extraction directory.
        safe_name = "project"

    zip_dir = tempfile.mkdtemp(prefix="fullstack_project_")
    zip_path = os.path.join(zip_dir, f"{safe_name}.zip")
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for filepath, content in files.items():
            zf.writestr(f"{safe_name}/{filepath}", content)
    return zip_path
def _is_inside_dir(root: str, candidate: str) -> bool:
    """Return True when *candidate* resolves to a path strictly below *root*."""
    root_real = os.path.realpath(root)
    candidate_real = os.path.realpath(candidate)
    if candidate_real == root_real:
        return False
    try:
        return os.path.commonpath([root_real, candidate_real]) == root_real
    except ValueError:
        # os.path.commonpath rejects path sets it cannot compare
        # (e.g. different drives on Windows); treat that as "outside".
        return False


def _requirement_key(line: str) -> str:
    """Return a comparison key (normalised package name) for a requirement line."""
    if line.startswith("-"):
        # pip option lines (-r, --extra-index-url, ...) only match themselves.
        return line
    head = re.split(r"[\s<>=!~;@\[]", line, maxsplit=1)[0]
    # Names compare case-insensitively with '-', '_' and '.' interchangeable.
    return re.sub(r"[-_.]+", "-", head).lower() or line


def _merge_requirements(existing: str, generated: str) -> str:
    """Merge two requirements texts, keeping one line per package.

    Lines from *existing* win over lines from *generated* for the same
    package, so a version pinned by the project is never contradicted by an
    auto-generated spec.  Blank lines and comment lines are dropped and the
    result is sorted.
    """
    merged: dict[str, str] = {}
    for raw_line in (existing + "\n" + generated).splitlines():
        line = raw_line.strip()
        if line and not line.startswith("#"):
            merged.setdefault(_requirement_key(line), line)
    return "\n".join(sorted(merged.values())) + "\n"


def _select_app_file(files: dict[str, str], space_sdk: str, entry_point: str) -> str:
    """Choose the ``app_file`` value written to the README front matter."""
    if space_sdk == "docker":
        return "Dockerfile"
    if space_sdk in ("gradio", "streamlit"):
        return "app.py"
    if space_sdk == "static":
        if "index.html" in files:
            return "index.html"
        return next((f for f in files if f.endswith(".html")), entry_point)
    return entry_point


def _build_readme(name: str, space_sdk: str, app_file: str) -> str:
    """Render the default README.md (front matter plus a short body)."""
    return "\n".join(
        [
            "---",
            f"title: {name}",
            "emoji: 🚀",
            "colorFrom: blue",
            "colorTo: purple",
            f"sdk: {space_sdk}",
            f"app_file: {app_file}",
            "pinned: false",
            "---",
            f"# {name}",
            f"Generated by Fullstack Code Builder using {MODEL_ID}.",
            "",
        ]
    )


def _retarget_readme(text: str, space_sdk: str, app_file: str) -> str:
    """Point the ``app_file:`` and ``sdk:`` keys of an existing README at new values.

    Only the first occurrence of each key at the start of a line is rewritten,
    so prose that merely mentions ``sdk:`` is left alone.  Keys that are absent
    are not added.
    """
    for key, value in (("app_file", app_file), ("sdk", space_sdk)):
        replacement = f"{key}: {value}"
        text = re.sub(
            rf"^{key}:[ \t]*\S*",
            # A callable replacement keeps backslashes in the value literal.
            lambda _match, replacement=replacement: replacement,
            text,
            count=1,
            flags=re.MULTILINE,
        )
    return text


def push_to_huggingface(
    files: dict[str, str],
    project_name: str,
    repo_name: str,
    hf_token: str,
    space_sdk: str = "static",
    is_space: bool = True,
) -> dict[str, Any]:
    """Push generated project to HuggingFace Hub.

    Creates the repo if it doesn't exist, stages all files in a temporary
    directory, and adds README.md, Docker scaffolding and requirements.txt
    as needed before uploading the folder.

    Args:
        files: Mapping of relative file path to text content.
        project_name: Project name passed to the JS scaffolding step.
        repo_name: ``"namespace/name"`` or just ``"name"``; in the latter case
            the namespace is taken from ``whoami()`` for the token's account.
        hf_token: HuggingFace access token used for all Hub calls.
        space_sdk: Requested Space SDK.  The default ``"static"`` is replaced
            by the auto-detected SDK when detection finds something else.
        is_space: Push to a Space when True, otherwise to a model repo.

    Returns:
        A dict with ``success`` (bool), ``url``, ``repo_name`` and ``message``.
        Failures are reported through this dict; exceptions are logged and
        not propagated.
    """
    try:
        # Imported lazily so a missing huggingface_hub install is reported as
        # a failed push instead of breaking module import.
        from huggingface_hub import HfApi, create_repo

        api = HfApi(token=hf_token)
        if "/" in repo_name:
            namespace, name = repo_name.split("/", 1)
        else:
            namespace = api.whoami()["name"]
            name = repo_name
        repo_name = f"{namespace}/{name}"

        # Find entry point and auto-detect SDK
        entry_point = _find_entry_point(files)
        detected_sdk = _detect_sdk(files, entry_point)
        # Use detected SDK if user left it as "static" but project needs something else
        if space_sdk == "static" and detected_sdk != "static":
            space_sdk = detected_sdk

        # For JS projects, scaffold Docker support files
        js_project = is_js_project(files)
        if js_project or space_sdk == "docker":
            framework = detect_framework(files)
            if framework == "static":
                has_html = any(f.endswith(".html") for f in files)
                if has_html and not js_project:
                    # Single HTML file or simple page — keep as static
                    space_sdk = "static"
                else:
                    framework = "nodejs"
                    space_sdk = "docker"
            if space_sdk == "docker":
                files = scaffold_js_project(files, framework, project_name)

        repo_type = "space" if is_space else "model"
        try:
            if is_space:
                create_repo(
                    repo_id=repo_name,
                    repo_type="space",
                    space_sdk=space_sdk,
                    token=hf_token,
                    exist_ok=True,
                )
            else:
                create_repo(
                    repo_id=repo_name,
                    repo_type="model",
                    token=hf_token,
                    exist_ok=True,
                )
        except Exception as e:
            # Best effort: if the repo really is unusable the upload below
            # fails and is reported through the outer handler.
            logger.warning("Repo creation warning: %s", e)

        with tempfile.TemporaryDirectory(prefix="hf_push_") as tmp_dir:
            # Write all project files, refusing anything that would land
            # outside the staging directory (absolute paths, "..", "").
            for filepath, content in files.items():
                full_path = os.path.join(tmp_dir, filepath)
                if not _is_inside_dir(tmp_dir, full_path):
                    logger.warning("Skipping unsafe project path: %s", filepath)
                    continue
                os.makedirs(os.path.dirname(full_path), exist_ok=True)
                Path(full_path).write_text(content, encoding="utf-8")

            # Ensure the entry point is named app.py for HF Spaces (Python)
            needs_app_py = (
                is_space
                and space_sdk in ("gradio", "streamlit")
                and entry_point != "app.py"
                and entry_point.endswith(".py")
            )
            if needs_app_py:
                src = os.path.join(tmp_dir, entry_point)
                dst = os.path.join(tmp_dir, "app.py")
                if os.path.exists(src) and not os.path.exists(dst):
                    import shutil

                    shutil.copy2(src, dst)

            app_file = _select_app_file(files, space_sdk, entry_point)

            # Add README.md if not present, otherwise retarget the existing one
            readme_path = Path(tmp_dir) / "README.md"
            if readme_path.exists():
                existing = readme_path.read_text(encoding="utf-8")
                readme_path.write_text(
                    _retarget_readme(existing, space_sdk, app_file),
                    encoding="utf-8",
                )
            else:
                readme_path.write_text(
                    _build_readme(name, space_sdk, app_file), encoding="utf-8"
                )

            # Add/merge requirements.txt for Python projects (Docker builds
            # install their dependencies through the Dockerfile instead).
            req_path = Path(tmp_dir) / "requirements.txt"
            python_sources = [
                content for fname, content in files.items() if fname.endswith(".py")
            ]
            if python_sources and space_sdk != "docker":
                auto_reqs = generate_requirements("\n".join(python_sources))
                if req_path.exists():
                    existing_reqs = req_path.read_text(encoding="utf-8")
                    req_path.write_text(
                        _merge_requirements(existing_reqs, auto_reqs),
                        encoding="utf-8",
                    )
                elif auto_reqs:
                    req_path.write_text(auto_reqs, encoding="utf-8")
                else:
                    # Minimal requirements for Python Spaces
                    req_path.write_text("gradio>=4.0.0\n", encoding="utf-8")

            api.upload_folder(
                folder_path=tmp_dir,
                repo_id=repo_name,
                repo_type=repo_type,
                token=hf_token,
            )

        if is_space:
            repo_url = f"https://huggingface.co/spaces/{repo_name}"
        else:
            repo_url = f"https://huggingface.co/{repo_name}"
        return {
            "success": True,
            "url": repo_url,
            "repo_name": repo_name,
            "message": f"Successfully pushed to {repo_url}",
        }
    except Exception as exc:
        # Top-level boundary: callers get a result dict, never an exception.
        logger.exception("Failed to push to HuggingFace")
        return {
            "success": False,
            "url": "",
            "repo_name": repo_name,
            "message": f"Failed to push: {str(exc)}",
        }