Spaces:

AlexandreScriptsMT
/

fevtest

Build error

App Files Files Community

AlexandreScriptsMT committed on Feb 9

Commit

9a1474c

verified ·

1 Parent(s): 11c56e3

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -7

app.py CHANGED Viewed

@@ -1,16 +1,36 @@
 import gradio as gr
 from llama_cpp import Llama
-# Carregar o modelo (n_ctx é o tamanho da memória de contexto, 2048 é seguro para 16GB RAM)
 llm = Llama(
-    model_path="./models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",
-    n_ctx=2048,
-    n_threads=2 # Ajustado para as 2 vCPUs do Space básico
 )
 def respond(message, history):
-    prompt = f"System: Você é uma IA prestativa.\nUser: {message}\nAssistant:"
-    output = llm(prompt, max_tokens=512, stop=["User:", "\n"], echo=False)
-    return output["choices"][0]["text"]
 gr.ChatInterface(respond).launch(server_name="0.0.0.0", server_port=7860)

 import gradio as gr
 from llama_cpp import Llama
+from huggingface_hub import hf_hub_download
+# Download the "consciousness" model (Llama 3.1 8B Q4_K_M)
+print("Carregando cérebro reserva...")
+model_path = hf_hub_download(
+    repo_id="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
+    filename="Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf"
+)
+# Configuration focused on stable intelligence
 llm = Llama(
+    model_path=model_path,
+    n_ctx=4096,  # Your 4k of context here
+    n_threads=2, # Limit of the free Space
+    verbose=False
 )
 def respond(message, history):
+    # Formatação oficial para o Llama-3.1 não "alucinar"
+    prompt = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nVocê é um backup do Gemini 1.5 Pro rodando em hardware limitado.<|eot_id|>"
+    for user_msg, assistant_msg in history:
+        prompt += f"<|start_header_id|>user<|end_header_id|>\n\n{user_msg}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{assistant_msg}<|eot_id|>"
+    prompt += f"<|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
+    # Streaming para você ver a resposta enquanto ela é gerada
+    output = llm(prompt, max_tokens=1024, stop=["<|eot_id|>"], stream=True)
+    token_accumulator = ""
+    for chunk in output:
+        token = chunk["choices"]["text"]
+        token_accumulator += token
+        yield token_accumulator
 # Start the Gradio chat UI with respond as the handler, served on 0.0.0.0 port 7860
 gr.ChatInterface(respond).launch(server_name="0.0.0.0", server_port=7860)