3morixd committed on
Commit
02f4e01
·
verified ·
1 Parent(s): 6d026a9

Upload inference_test.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. inference_test.py +23 -0
inference_test.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Quick inference test for Qwen2.5-Coder-7B-mobile
# Run: python inference_test.py
import time

# Path of the GGUF model file, resolved relative to the working directory.
MODEL_PATH = "model.gguf"

# Prompts sent to the model one at a time, each as a single-turn chat.
tests = [
    "What is the capital of France?",
    "What is 2+2? Just the number.",
    "Write a one-sentence greeting.",
]


def extract_answer(resp):
    """Return the stripped text of the first choice of a chat completion.

    Args:
        resp: Mapping shaped like ``{"choices": [{"message": {...}}]}``.

    Returns:
        The message ``content`` with surrounding whitespace removed, or an
        empty string when ``content`` is missing or ``None``.
    """
    content = resp["choices"][0]["message"].get("content")
    # Guard against a null content field so .strip() cannot raise.
    return (content or "").strip()


def tokens_per_second(tokens, elapsed):
    """Return ``tokens / elapsed``, or 0.0 when ``elapsed`` is not positive.

    Args:
        tokens: Number of tokens produced.
        elapsed: Duration in seconds over which they were produced.
    """
    return tokens / elapsed if elapsed > 0 else 0.0


def main():
    """Load the model and print an answer and throughput for each prompt."""
    # Imported here so the helpers above stay importable on machines that
    # do not have the native llama_cpp package installed.
    from llama_cpp import Llama

    print("Loading Qwen2.5-Coder-7B-mobile...")
    llm = Llama(
        model_path=MODEL_PATH,
        chat_format="chatml",
        n_ctx=512,
        n_threads=4,
        verbose=False,
    )

    for prompt in tests:
        # perf_counter is monotonic, unlike time.time(), so the measured
        # interval cannot be skewed by system clock adjustments.
        started = time.perf_counter()
        resp = llm.create_chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=30,
            temperature=0.3,
        )
        # The timer wraps the whole call, so the reported rate is
        # completion tokens divided by total request time.
        elapsed = time.perf_counter() - started

        answer = extract_answer(resp)
        tokens = resp.get("usage", {}).get("completion_tokens", 0)
        tps = tokens_per_second(tokens, elapsed)
        print(f"Q: {prompt}")
        print(f"A: {answer} ({tps:.1f} t/s)\n")


if __name__ == "__main__":
    main()