AlexandreScriptsMT committed on
Commit
887a4cd
verified
1 Parent(s): 5c1609d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -20
app.py CHANGED
@@ -4,39 +4,31 @@ from fastapi import FastAPI
4
  from transformers import AutoTokenizer, pipeline
5
  import threading
6
 
7
- MODEL = os.environ.get("MODEL_NAME", "google/gemma-4-E4B")
8
- # Para Space grátis, defina MODEL_NAME=google/gemma-4-E2B no settings se E4B falhar.
9
 
10
  tokenizer = None
11
  generator = None
12
  _model_lock = threading.Lock()
13
- _loading = False
14
 
15
  def load_model():
16
- global tokenizer, generator, _loading
17
  with _model_lock:
18
  if tokenizer is not None and generator is not None:
19
  return
20
- _loading = True
21
- try:
22
- tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
23
- # carregamento em CPU
24
- from transformers import AutoModelForCausalLM
25
- model = AutoModelForCausalLM.from_pretrained(
26
- MODEL,
27
- device_map={"": "cpu"},
28
- torch_dtype="float32",
29
- low_cpu_mem_usage=True,
30
- trust_remote_code=True
31
- )
32
- generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
33
- finally:
34
- _loading = False
35
 
36
  def generate(prompt):
37
  if generator is None:
38
  load_model()
39
- # limite de tokens para reduzir uso de memória
40
  out = generator(prompt, max_new_tokens=64, do_sample=False)
41
  return out[0]["generated_text"]
42
 
 
4
  from transformers import AutoTokenizer, pipeline
5
  import threading
6
 
7
+ MODEL = os.environ.get("MODEL_NAME", "google/gemma-4-E2B") # default para E2B (mais leve)
 
8
 
9
  tokenizer = None
10
  generator = None
11
  _model_lock = threading.Lock()
 
12
 
13
  def load_model():
14
+ global tokenizer, generator
15
  with _model_lock:
16
  if tokenizer is not None and generator is not None:
17
  return
18
+ tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
19
+ from transformers import AutoModelForCausalLM
20
+ model = AutoModelForCausalLM.from_pretrained(
21
+ MODEL,
22
+ device_map={"": "cpu"},
23
+ torch_dtype="float32",
24
+ low_cpu_mem_usage=True,
25
+ trust_remote_code=True
26
+ )
27
+ generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
 
 
 
 
 
28
 
29
  def generate(prompt):
30
  if generator is None:
31
  load_model()
 
32
  out = generator(prompt, max_new_tokens=64, do_sample=False)
33
  return out[0]["generated_text"]
34