from openai import OpenAI

# Client pointed at a local OpenAI-compatible endpoint (e.g. vLLM/SGLang).
# Such servers typically ignore authentication, so any placeholder key works.
client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="none",
)
# Demo prompts (Chinese): model quantization rationale, learning-rate
# schedules, and a short sci-fi paragraph about a future city.
prompts = [
    "解释一下为什么大模型需要量化。",
    "列出三种常用的学习率调度方法。",
    "写一个关于未来城市的科幻短段落。",
]
def batch_chat(prompts):
    """Send each prompt as a single-turn chat request and collect the replies.

    Args:
        prompts: Iterable of user-message strings.

    Returns:
        list[str]: Assistant reply text, one entry per prompt, in order.
        Requests are issued sequentially against the module-level ``client``.
    """
    responses = []
    for prompt in prompts:
        resp = client.chat.completions.create(
            model="Qwen/Qwen3-30B-A3B-GPTQ-Int4",
            messages=[{"role": "user", "content": prompt}],
            # Cap each reply; raise if longer answers are needed.
            max_tokens=256,
        )
        responses.append(resp.choices[0].message.content)
    return responses
# Run the batch and print each prompt alongside its model response.
results = batch_chat(prompts)
for i, (prompt, reply) in enumerate(zip(prompts, results), start=1):
    print(f"\n=== Prompt {i} ===")
    print("Prompt:", prompt)
    print("Response:", reply)