"""Gradio chat demo that streams replies from NoesisLab/Spartacus-1B-Instruct."""

# `spaces` stays the first import, exactly as in the original file.
# NOTE(review): ZeroGPU setups generally expect `spaces` to be imported before
# torch-based libraries -- confirm before reordering these imports.
import spaces
import torch
from collections.abc import Iterator
from threading import Thread
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
import gradio as gr

MODEL_ID = "NoesisLab/Spartacus-1B-Instruct"
SYSTEM_PROMPT = "You are Spartacus, a helpful assistant."

# Explicit generation budget. Without one, `generate()` falls back to the
# length limit in the model's generation config; the library default for that
# is a 20-token `max_length`, which a chat prompt alone can exceed.
# NOTE(review): 512 is a chosen value, not taken from the model card -- tune it.
MAX_NEW_TOKENS = 512

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# NOTE(review): the model is never moved to an accelerator in this file, so
# `model.device` below is whatever `from_pretrained` chose -- confirm whether a
# `.to("cuda")` is intended for the `@spaces.GPU` function.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    tie_word_embeddings=False,  # attempt to force-disable the weight-tying check
)


def _history_to_messages(history) -> list[dict]:
    """Convert Gradio chat history into a list of role/content dicts.

    Accepts both history shapes Gradio may deliver: a list of
    ``{"role": ..., "content": ...}`` dicts, or the legacy list of
    ``[user_message, assistant_message]`` pairs. Only ``role`` and
    ``content`` are kept from dict entries.
    """
    messages = []
    for entry in history or []:
        if isinstance(entry, dict):
            messages.append({"role": entry["role"], "content": entry["content"]})
            continue
        # Legacy pair format; either side may be None (e.g. no reply yet).
        user_text, assistant_text = entry
        if user_text is not None:
            messages.append({"role": "user", "content": user_text})
        if assistant_text is not None:
            messages.append({"role": "assistant", "content": assistant_text})
    return messages


@spaces.GPU
def respond(message, history) -> Iterator[str]:
    """Stream the assistant's reply to ``message``.

    Args:
        message: The new user message.
        history: Previous turns as provided by ``gr.ChatInterface``.

    Yields:
        The reply accumulated so far, once per chunk emitted by the streamer.

    Raises:
        Exception: Whatever ``model.generate`` raised in the worker thread,
            re-raised here after the stream has been closed.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    messages.extend(_history_to_messages(history))
    messages.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=MAX_NEW_TOKENS,
        temperature=0.5,
        top_p=0.9,
        do_sample=True,
    )

    errors: list[BaseException] = []

    def _generate() -> None:
        # Runs in the worker thread. If generation fails, the streamer would
        # never receive its stop signal and the consumer loop below would
        # block forever, so record the error and close the stream explicitly.
        try:
            model.generate(**generate_kwargs)
        except Exception as exc:  # thread boundary: re-raised by the caller
            errors.append(exc)
            streamer.end()

    thread = Thread(target=_generate)
    thread.start()

    response = ""
    for token in streamer:
        response += token
        yield response

    thread.join()
    if errors:
        raise errors[0]


demo = gr.ChatInterface(
    fn=respond,
    title="Spartacus Chat",
    description="Chat with NoesisLab/Spartacus-1B-Instruct",
)

if __name__ == "__main__":
    demo.launch()