AlexandreScriptsMT committed on
Commit
7d2bda7
·
verified ·
1 Parent(s): 172758a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -36
app.py CHANGED
@@ -1,46 +1,27 @@
1
- import os
2
  from fastapi import FastAPI
3
  import gradio as gr
4
- from llama_cpp import Llama
 
5
 
6
- # 1. Configuração do Modelo (Gemma 4 E4B GGUF)
7
- # Usando uma versão quantizada para caber nos 16GB de RAM
8
- model_id = "google/gemma-4-e4b-it-GGUF"
9
- model_file = "gemma-4-e4b-it-Q4_K_M.gguf"
10
 
11
- # Inicializa o modelo (ele será baixado automaticamente se configurado no Space)
12
- llm = Llama.from_pretrained(
13
- repo_id=model_id,
14
- filename=model_file,
15
- n_ctx=2048, # Janela de contexto
16
- n_threads=2 # Limite de 2 vCPUs do Space gratuito
 
 
17
  )
18
 
19
- app = FastAPI()
20
 
21
- def generate_response(message, history):
22
- # Formatação básica para o Gemma 4
23
- prompt = f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
24
-
25
- output = llm(
26
- prompt,
27
- max_tokens=512,
28
- stop=["<|im_end|>"],
29
- echo=False
30
- )
31
-
32
- return output["choices"][0]["text"]
33
 
34
- # 2. Interface Gradio
35
- demo = gr.ChatInterface(
36
- fn=generate_response,
37
- title="Gemma 4 - E4B Thinking (CPU Free Tier)",
38
- description="Rodando Gemma 4 via GGUF no hardware gratuito da Hugging Face."
39
- )
40
 
41
- # 3. Montar Gradio dentro do FastAPI
42
  app = gr.mount_gradio_app(app, demo, path="/")
43
-
44
- if __name__ == "__main__":
45
- import uvicorn
46
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
 
1
  from fastapi import FastAPI
2
  import gradio as gr
3
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
4
+ import torch
5
 
6
+ MODEL = "google/gemma-4-E4B"
 
 
 
7
 
8
+ # Carregamento otimista para CPU
9
+ tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
10
+ model = AutoModelForCausalLM.from_pretrained(
11
+ MODEL,
12
+ device_map={"": "cpu"},
13
+ torch_dtype=torch.float32,
14
+ low_cpu_mem_usage=True,
15
+ trust_remote_code=True
16
  )
17
 
18
+ generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
19
 
20
+ def generate(prompt):
21
+ out = generator(prompt, max_new_tokens=128, do_sample=False)
22
+ return out[0]["generated_text"]
 
 
 
 
 
 
 
 
 
23
 
24
+ demo = gr.Interface(fn=generate, inputs=gr.Textbox(lines=4, label="Prompt"), outputs="text", title="Gemma-4-E4B (CPU)")
 
 
 
 
 
25
 
26
+ app = FastAPI()
27
  app = gr.mount_gradio_app(app, demo, path="/")