Spaces:

MLDevOps
/

CS553_CaseStudy1

Sleeping

CS553_CaseStudy1 / backend.py

github-actions

Deploy to Spaces with Xet

9826f0b 13 days ago

3.83 kB

	from typing import List
	from huggingface_hub import InferenceClient
	from transformers import pipeline
	from retrieval_utils import get_recommendations

	# Load the known genre names (one per line of genrelist.txt) once at import
	# time. The context manager closes the file handle deterministically instead
	# of leaving it open until garbage collection, and the explicit encoding keeps
	# decoding independent of the host locale.
	with open("genrelist.txt", "r", encoding="utf-8") as genre_file:
	    genre_list = genre_file.read().splitlines()

	def process_user_query(
	    system_message: str,
	    history: List[dict],
	    user_message: str,
	    use_local_model: bool,
	    max_tokens: int,
	    temperature: float,
	    top_p: float,
	    hf_token,
	):
	    """Answer a user message, enriching the prompt with genre recommendations.

	    The message is scanned for known genres; when at least one is found, the
	    result of ``get_recommendations`` for those genres is handed to
	    ``query_model`` as the recommendations string (otherwise an empty string
	    is passed).

	    Args:
	        system_message: Base system prompt for the model.
	        history: Earlier chat messages, forwarded unchanged to ``query_model``.
	        user_message: The current user input.
	        use_local_model: Forwarded to ``query_model`` to pick the backend.
	        max_tokens: Forwarded generation limit.
	        temperature: Forwarded sampling temperature.
	        top_p: Forwarded nucleus-sampling value.
	        hf_token: Forwarded to ``query_model`` for the hosted backend.

	    Yields:
	        Every value produced by ``query_model``, in order.
	    """
	    # 1. Retrieve genres from the user message using the naive approach.
	    # Named differently from the module-level ``genre_list`` (the full list of
	    # known genres) so the two are not confused.
	    requested_genres = detect_genres(user_message)

	    # 2. Retrieve relevant results from the DB only if a genre was detected
	    recommendations_string = ""
	    if requested_genres:
	        recommendations_string = get_recommendations(requested_genres)

	    # 3. Query the model and relay its output to the caller
	    yield from query_model(
	        system_message,
	        history,
	        user_message,
	        recommendations_string,
	        use_local_model,
	        max_tokens,
	        temperature,
	        top_p,
	        hf_token,
	    )


	def detect_genres(message: str) -> List[str]:
	    """Return the known genres that occur as substrings of *message*.

	    Matching is a plain, case-sensitive substring test against every entry of
	    the module-level ``genre_list``; results keep the order of that list.
	    Empty entries (e.g. from a blank line in the genre file) are skipped,
	    because the empty string is a substring of every message and would
	    otherwise always be reported as a requested genre.

	    Args:
	        message: The raw user message to scan.

	    Returns:
	        The matching genre names, possibly empty.
	    """
	    # Simple naive genre check by detecting if any of our system stored genres are within the user query
	    # TODO: Improve genre detection instead to use Retriever and RAG framework in the future
	    return [genre for genre in genre_list if genre and genre in message]


	# Loaded local text-generation pipelines, keyed by model name. Building a
	# pipeline loads the model, so it is done once per process rather than on
	# every request.
	_LOCAL_PIPELINES = {}


	def _get_local_pipeline(model_name: str):
	    """Return the cached text-generation pipeline for *model_name*, creating it on first use."""
	    if model_name not in _LOCAL_PIPELINES:
	        _LOCAL_PIPELINES[model_name] = pipeline(task='text-generation', model=model_name)
	    return _LOCAL_PIPELINES[model_name]


	def query_model(
	    system_message: str,
	    history: List[dict],
	    user_message: str,
	    recommendations_string: str,
	    use_local_model: bool,
	    max_tokens: int,  # TODO: Remove this and hardcode a value in constants.py
	    temperature: float,  # TODO: Remove this and hardcode a value in constants.py
	    top_p: float,  # TODO: Remove this and hardcode a value in constants.py
	    hf_token,
	):
	    """Send the conversation to a language model and yield its reply.

	    The prompt is the system message (with the recommendation JSON appended
	    when ``recommendations_string`` is non-empty), followed by ``history`` and
	    finally the current user message.

	    Args:
	        system_message: Base system prompt.
	        history: Earlier chat messages, inserted between the system prompt and
	            the current user message.
	        user_message: The current user input.
	        recommendations_string: Extra context appended to the system prompt
	            under a ``RECOMMENDATION JSON:`` header; skipped when empty.
	        use_local_model: Truthy to run the local transformers pipeline,
	            falsy to call the hosted model through ``InferenceClient``.
	        max_tokens: Maximum number of tokens to generate.
	        temperature: Sampling temperature.
	        top_p: Nucleus-sampling value.
	        hf_token: Token given to ``InferenceClient``; unused by the local path.

	    Yields:
	        Local model: a single string, the final message's content with
	        everything up to the last ``</think>`` removed and whitespace stripped.
	        Hosted model: the accumulated reply so far, once per streamed chunk
	        that carries content.
	    """
	    # Construct messages for the language model
	    # Start by adding system prompt
	    system_prompt = system_message
	    if recommendations_string:
	        system_prompt += "\nRECOMMENDATION JSON:" + f"\n{recommendations_string}"
	    messages = [{"role": "system", "content": system_prompt}]

	    # Add the rest of the history
	    messages.extend(history)

	    # Add the current user prompt
	    messages.append({"role": "user", "content": user_message})

	    # Determine which model to use (local or external)
	    if use_local_model:
	        # Local Model -- Uses pipeline from transformers library. The pipeline
	        # object is reused across calls; generation settings are supplied per
	        # call so each request still honours its own values.
	        local_pipeline = _get_local_pipeline('Qwen/Qwen3-0.6B')

	        # NOTE(review): do_sample=False requests greedy decoding, in which case
	        # temperature/top_p are expected to have no effect -- confirm that
	        # deterministic output is intended for the local model.
	        response = local_pipeline(
	            messages,
	            max_new_tokens=max_tokens,
	            temperature=temperature,
	            do_sample=False,
	            top_p=top_p,
	        )

	        # Parse the output and yield it: keep only the text after the model's
	        # last '</think>' marker.
	        yield response[0]['generated_text'][-1]['content'].split('</think>')[-1].strip()
	    else:
	        # Non-local Model -- Use InferenceClient
	        client = InferenceClient(
	            token=hf_token,
	            model="openai/gpt-oss-20b",
	        )

	        response = ""
	        for chunk in client.chat_completion(
	            messages=messages,
	            max_tokens=max_tokens,
	            stream=True,
	            temperature=temperature,
	            top_p=top_p,
	        ):
	            # Some stream chunks carry no choices or no text; skip those.
	            if chunk.choices and chunk.choices[0].delta.content:
	                response += chunk.choices[0].delta.content
	                # Yield the whole reply so far so the caller can re-render it.
	                yield response