Spaces:
Runtime error
Upload app.py with huggingface_hub
app.py
ADDED
@@ -0,0 +1,234 @@
#!/usr/bin/env python3
"""Gradio Space: MiniScript Code Helper (LoRA + RAG).

Loads the fine-tuned Qwen2.5-Coder-7B-Instruct LoRA adapter and a ChromaDB
vector index built from MiniScript documentation, then serves a chat interface.
"""

import os
import re

os.environ.setdefault("USE_TF", "0")  # keep transformers from importing TensorFlow

import chromadb
import gradio as gr
import torch
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

BASE_MODEL = "Qwen/Qwen2.5-Coder-7B-Instruct"
ADAPTER_REPO = "JoeStrout/miniscript-code-helper-lora"
RAG_DIR = "./RAG_sources"
DB_DIR = "./chroma_db"
COLLECTION = "miniscript_docs"
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
TOP_K = 5
MAX_NEW_TOKENS = 1024
MAX_CHUNK_CHARS = 1500

BASE_SYSTEM_PROMPT = "You are a helpful assistant specializing in MiniScript programming."

# ---------------------------------------------------------------------------
# RAG index builder (inline so app is self-contained)
# ---------------------------------------------------------------------------

def strip_leanpub(text: str) -> str:
    """Strip Leanpub markup from book-source text, keeping figure captions."""
    lines = text.splitlines()
    cleaned = []
    for line in lines:
        # Directive lines ({chapterHead...}, {width...}, etc.): keep only a caption.
        if re.match(r'^\s*\{(chapterHead|width|i:|caption|pagebreak|startingPageNum)', line):
            m = re.search(r'\{caption:\s*"([^"]+)"\}', line)
            if m:
                cleaned.append(f"[{m.group(1)}]")
            continue
        # Drop Markdown image lines entirely.
        if re.match(r'^\s*!\[.*\]\(.*\)\s*$', line):
            continue
        # Remove Leanpub block prefixes (Q>, A>, D>, X>).
        line = re.sub(r'^([QADX])>\s?', '', line)
        cleaned.append(line)
    return '\n'.join(cleaned)
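# Illustration (hypothetical input, not part of the app): given the two lines
#   {caption: "A tiny program"}
#   Q> print "hi"
# strip_leanpub returns '[A tiny program]\nprint "hi"': the caption survives
# as a bracketed label and the Q> block prefix is dropped.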


def split_long_chunk(text: str, max_chars: int = MAX_CHUNK_CHARS) -> list:
    """Greedily pack paragraphs into chunks of at most max_chars characters."""
    if len(text) <= max_chars:
        return [text]
    paragraphs = re.split(r'\n\n+', text)
    chunks, current = [], ""
    for para in paragraphs:
        if current and len(current) + len(para) + 2 > max_chars:
            chunks.append(current.strip())
            current = para
        else:
            current = current + "\n\n" + para if current else para
    if current.strip():
        chunks.append(current.strip())
    return chunks
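# Illustration: with max_chars=10, "aaaa\n\nbbbb\n\ncccc" packs into
# ["aaaa\n\nbbbb", "cccc"]. Note that a single paragraph longer than
# max_chars passes through whole; the split never happens mid-paragraph.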


def chunk_document(text: str, filename: str) -> list:
    """Split a document into section-tagged chunks at Markdown-style headings."""
    is_txt = filename.endswith('.txt')
    if is_txt:
        text = strip_leanpub(text)
    lines = text.splitlines()
    chunks, current_section, current_lines = [], filename, []

    def flush():
        # Emit the accumulated section body as one or more size-limited chunks.
        body = '\n'.join(current_lines).strip()
        if not body:
            return
        for part in split_long_chunk(body):
            if part.strip():
                chunks.append({"text": part, "source": filename, "section": current_section})

    for line in lines:
        heading = None
        if is_txt:
            m = re.match(r'^(#{1,4})\s+(.*)', line)
            if m:
                heading = m.group(2).strip()
        elif re.match(r'^#{1,4}\s', line):
            heading = re.sub(r'^#+\s*', '', line).strip()
        if heading:
            flush()
            current_section = heading
            current_lines = []
        else:
            current_lines.append(line)
    flush()
    return chunks
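# Each chunk is a dict of the form
#   {"text": ..., "source": filename, "section": heading}
# where "section" is the most recent heading seen, or the filename itself for
# any text that precedes the first heading.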


def build_rag_index():
    print(f"Building ChromaDB index from {RAG_DIR}/ ...")
    embedding_fn = SentenceTransformerEmbeddingFunction(model_name=EMBEDDING_MODEL)
    client = chromadb.PersistentClient(path=DB_DIR)

    # Reuse a previously built index across restarts.
    existing = [c.name for c in client.list_collections()]
    if COLLECTION in existing:
        col = client.get_collection(name=COLLECTION, embedding_function=embedding_fn)
        print(f" Reusing existing collection ({col.count()} chunks)")
        return col

    col = client.create_collection(
        name=COLLECTION,
        embedding_function=embedding_fn,
        metadata={"hnsw:space": "cosine"},
    )
    source_files = sorted(f for f in os.listdir(RAG_DIR) if f.endswith(('.md', '.txt')))
    all_chunks = []
    for fname in source_files:
        with open(os.path.join(RAG_DIR, fname), encoding='utf-8') as f:
            text = f.read()
        chunks = chunk_document(text, fname)
        print(f" {fname}: {len(chunks)} chunks")
        all_chunks.extend(chunks)

    # Add in batches; ids stay unique because i advances by BATCH each pass.
    BATCH = 100
    for i in range(0, len(all_chunks), BATCH):
        batch = all_chunks[i:i + BATCH]
        col.add(
            ids=[f"chunk_{i + j}" for j in range(len(batch))],
            documents=[c["text"] for c in batch],
            metadatas=[{"source": c["source"], "section": c["section"]} for c in batch],
        )
    print(f" Indexed {col.count()} chunks total.")
    return col
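# Shape of a query result (per the Chroma API; outer lists are one-per-query):
#   collection.query(query_texts=["how do for loops work?"], n_results=TOP_K)
#   -> {"documents": [[text1, ...]], "metadatas": [[{"source": ..., "section": ...}, ...]], ...}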


# ---------------------------------------------------------------------------
# Model loading
# ---------------------------------------------------------------------------

def load_model():
    print(f"Loading tokenizer from {ADAPTER_REPO} ...")
    tokenizer = AutoTokenizer.from_pretrained(ADAPTER_REPO)

    print(f"Loading base model {BASE_MODEL} in 4-bit ...")
    bnb_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    base = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        quantization_config=bnb_cfg,
        device_map="auto",
    )

    print(f"Loading LoRA adapter from {ADAPTER_REPO} ...")
    model = PeftModel.from_pretrained(base, ADAPTER_REPO)
    model.eval()
    print("Model ready!")
    return tokenizer, model
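# Rough memory budget: 7B parameters at 4 bits is about 3.5 GB of weights,
# plus quantization constants, the LoRA adapter, and the KV cache on top, so
# the model fits comfortably on a 16 GB GPU.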


# ---------------------------------------------------------------------------
# Startup
# ---------------------------------------------------------------------------

collection = build_rag_index()
tokenizer, model = load_model()


# ---------------------------------------------------------------------------
# Chat logic
# ---------------------------------------------------------------------------

def build_system_prompt(results: dict) -> str:
    # Fall back to the bare prompt if retrieval returned nothing.
    if not results or not results["documents"] or not results["documents"][0]:
        return BASE_SYSTEM_PROMPT
    parts = []
    for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
        parts.append(f"[Source: {meta['source']}, Section: {meta['section']}]\n{doc}")
    context = "\n\n".join(parts)
    return (
        f"{BASE_SYSTEM_PROMPT}\n\n"
        f"Use the following reference material to help answer the user's question:\n\n"
        f"{context}"
    )
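# Resulting prompt shape (illustrative; "manual.txt" is a made-up source name):
#   You are a helpful assistant specializing in MiniScript programming.
#
#   Use the following reference material to help answer the user's question:
#
#   [Source: manual.txt, Section: Loops]
#   <chunk text>
#   ... (up to TOP_K retrieved chunks)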


def chat(message: str, history: list) -> str:
    # Retrieve the TOP_K most relevant doc chunks for this message.
    results = collection.query(query_texts=[message], n_results=TOP_K)
    system_prompt = build_system_prompt(results)

    messages = [{"role": "system", "content": system_prompt}]
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer([text], return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
    # Decode only the newly generated tokens, skipping the echoed prompt.
    response = tokenizer.decode(output[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
    return response
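# Note: the history unpacking above assumes Gradio's tuple-style chat history
# (a list of [user, assistant] pairs). If ChatInterface were created with
# type="messages", history would arrive as role/content dicts and this loop
# would need adjusting.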


# ---------------------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------------------

demo = gr.ChatInterface(
    fn=chat,
    title="MiniScript Code Helper",
    description=(
        "Ask questions about the [MiniScript](https://miniscript.org) programming language. "
        "Powered by a fine-tuned Qwen2.5-Coder-7B-Instruct model with RAG over MiniScript documentation."
    ),
    examples=[
        "How do I define a function in MiniScript?",
        "How do I iterate over a list?",
        "What is the difference between `and` and `&&` in MiniScript?",
        "How do I read a file in MiniScript?",
    ],
    cache_examples=False,
)

if __name__ == "__main__":
    demo.launch()