Walid Sobhi committed on
Commit
da6d51e
·
verified ·
1 Parent(s): d4e7c9e

Upload app.py with huggingface_hub

Files changed (1)
  1. app.py +185 -0
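For context, an upload like this is typically produced with a call along the following lines. This is a minimal sketch using the huggingface_hub API, not part of the commit itself; the local path and the Space id are assumptions inferred from the commit message and the docstring below.

# Sketch: how "Upload app.py with huggingface_hub" is typically done.
from huggingface_hub import HfApi

api = HfApi()  # picks up the token from `huggingface-cli login` or the HF_TOKEN env var
api.upload_file(
    path_or_fileobj="app.py",                          # local file (assumed path)
    path_in_repo="app.py",
    repo_id="my-ai-stack/Stack-X-Ultimate-Inference",  # Space id taken from the docstring (assumption)
    repo_type="space",
    commit_message="Upload app.py with huggingface_hub",
)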
app.py ADDED
@@ -0,0 +1,185 @@
+"""
+Stack X Ultimate – Hugging Face Space Inference
+================================================
+A free HF Space that serves our model 24/7 on a T4 GPU.
+Works after training completes – auto-loads the LoRA adapter + base model.
+
+Run on: https://huggingface.co/spaces/my-ai-stack/Stack-X-Ultimate-Inference
+"""
+
+import os
+import torch
+from typing import Optional
+
+import gradio as gr
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from peft import PeftModel
+
+# ─── Config ─────────────────────────────────────────────────────────────────
+BASE_MODEL = "Qwen/Qwen2.5-Coder-3B-Instruct"
+ADAPTER_REPO = "my-ai-stack/Stack-X-Ultimate"
+FALLBACK_ADAPTER = "my-ai-stack/Stack-4.0-Qwen-3B-Agentic"
+
+# ─── Model Loading ──────────────────────────────────────────────────────────
+
+def load_model():
+    """Load model with LoRA adapter."""
+    global model, tokenizer
+
+    print("Loading tokenizer...")
+    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
+    tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.padding_side = "right"
+
+    print(f"Loading base: {BASE_MODEL}")
+    base = AutoModelForCausalLM.from_pretrained(
+        BASE_MODEL,
+        torch_dtype=torch.bfloat16,
+        device_map="auto",
+        trust_remote_code=True,
+    )
+
+    # Try the primary adapter, then the fallback; use the bare base model as a last resort.
+    try:
+        print(f"Loading adapter: {ADAPTER_REPO}")
+        model = PeftModel.from_pretrained(base, ADAPTER_REPO)
+        print(f"✅ Loaded {ADAPTER_REPO}")
+    except Exception as e1:
+        print(f"Failed to load {ADAPTER_REPO}: {e1}")
+        try:
+            print(f"Falling back to: {FALLBACK_ADAPTER}")
+            model = PeftModel.from_pretrained(base, FALLBACK_ADAPTER)
+            print(f"✅ Loaded {FALLBACK_ADAPTER}")
+        except Exception as e2:
+            print(f"Both adapters failed. Using base model. Error: {e2}")
+            model = base
+
+    model.eval()
+    total = sum(p.numel() for p in model.parameters()) / 1e9
+    print(f"Model ready: {total:.1f}B parameters")
+
+
+# Load at startup
+print("Initializing Stack X Ultimate Space...")
+try:
+    load_model()
+    STATUS = "✅ Model loaded"
+except Exception as e:
+    STATUS = f"⚠️ Load error: {e}"
+    model = None
+    tokenizer = None
+
+# ─── Inference Functions ─────────────────────────────────────────────────────
+
+def generate(prompt: str, max_tokens: int = 512, temperature: float = 0.7, top_p: float = 0.9):
+    """Generate text response."""
+    if model is None or tokenizer is None:
+        return "Model not loaded yet. Please try again in a moment."
+
+    if not prompt.strip():
+        return ""
+
+    try:
+        messages = [
+            {"role": "system", "content": "You are Stack X, a helpful AI coding assistant with tool-use capabilities."},
+            {"role": "user", "content": prompt},
+        ]
+        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = tokenizer(text, return_tensors="pt").to(model.device)
+
+        with torch.no_grad():
+            out = model.generate(
+                **inputs,
+                max_new_tokens=max_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                do_sample=temperature > 0,
+                pad_token_id=tokenizer.pad_token_id,
+                eos_token_id=tokenizer.eos_token_id,
+                repetition_penalty=1.1,
+            )
+
+        # Decode only the newly generated tokens, skipping the prompt.
+        response = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
+        return response
+
+    except Exception as e:
+        return f"Error: {e}"
+
+
+def chat(messages: list, max_tokens: int = 512, temperature: float = 0.7):
+    """Chat with message history."""
+    if model is None or tokenizer is None:
+        return "Model not loaded yet."
+
+    if not messages:
+        return ""
+
+    try:
+        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = tokenizer(text, return_tensors="pt").to(model.device)
+
+        with torch.no_grad():
+            out = model.generate(
+                **inputs,
+                max_new_tokens=max_tokens,
+                temperature=temperature,
+                do_sample=temperature > 0,
+                pad_token_id=tokenizer.pad_token_id,
+                eos_token_id=tokenizer.eos_token_id,
+            )
+
+        response = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
+        return response
+
+    except Exception as e:
+        return f"Error: {e}"
+
+
+# ─── Gradio Interface ─────────────────────────────────────────────────────────
+
+with gr.Blocks(title="Stack X Ultimate", theme=gr.themes.Default()) as demo:
+    gr.Markdown("# 🚀 Stack X Ultimate Inference")
+    gr.Markdown(f"**Status:** {STATUS}")
+    gr.Markdown("Built on Qwen2.5-Coder-3B-Instruct + a LoRA adapter trained on NVIDIA Nemotron + Stack-4.0 agentic data.")
+
+    with gr.Tab("Generate"):
+        prompt = gr.Textbox(label="Prompt", placeholder="Write a quicksort in Python...", lines=5)
+        with gr.Row():
+            max_tok = gr.Slider(32, 1024, value=512, step=32, label="Max tokens")
+            temp = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Temperature")
+            top_p = gr.Slider(0.5, 1.0, value=0.9, step=0.05, label="Top-p")
+        generate_btn = gr.Button("Generate", variant="primary")
+        output = gr.Textbox(label="Output", lines=10)
+        generate_btn.click(fn=generate, inputs=[prompt, max_tok, temp, top_p], outputs=output)
+
+    with gr.Tab("Chat"):
+        chatbot = gr.Chatbot(label="Conversation")
+        chat_msg = gr.Textbox(label="Your message", placeholder="Ask me anything...")
+        chat_clear = gr.Button("Clear")
+        chat_send = gr.Button("Send", variant="primary")
+
+        def user_msg(msg, history):
+            # Append the user's turn with an empty assistant slot.
+            return "", history + [[msg, None]]
+
+        def bot_resp(history):
+            if not history:
+                return history
+            # Rebuild the chat-template message list from the [user, assistant] pairs,
+            # skipping the not-yet-filled assistant slot of the last turn.
+            formatted = []
+            for user, assistant in history:
+                formatted.append({"role": "user", "content": user})
+                if assistant is not None:
+                    formatted.append({"role": "assistant", "content": assistant})
+            response = chat(formatted, max_tokens=512, temperature=0.7)
+            history[-1][1] = response
+            return history
+
+        chat_msg.submit(user_msg, [chat_msg, chatbot], [chat_msg, chatbot], queue=False).then(
+            bot_resp, [chatbot], [chatbot]
+        )
+        chat_send.click(user_msg, [chat_msg, chatbot], [chat_msg, chatbot], queue=False).then(
+            bot_resp, [chatbot], [chatbot]
+        )
+        # fn=None would be a no-op here; return an empty list to actually clear the history.
+        chat_clear.click(fn=lambda: [], inputs=None, outputs=chatbot)
+
+demo.launch(share=False)
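Once the Space is live, it can also be queried programmatically. Below is a minimal sketch using gradio_client; it is not part of the uploaded file, and fn_index=0 is an assumption that the Generate button's handler is the first endpoint registered in the Blocks app.

# Sketch: call the Space's Generate endpoint from Python.
from gradio_client import Client

client = Client("my-ai-stack/Stack-X-Ultimate-Inference")
# Arguments mirror the Generate tab: prompt, max tokens, temperature, top-p.
result = client.predict(
    "Write a quicksort in Python.",
    512,
    0.7,
    0.9,
    fn_index=0,  # assumption: Generate is the first registered event handler
)
print(result)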