Spaces:
Sleeping
Sleeping
| from typing import List | |
| from huggingface_hub import InferenceClient | |
| from transformers import pipeline | |
| from retrieval_utils import get_recommendations | |
| genre_list = open("genrelist.txt", "r").read().splitlines() | |
| def process_user_query(system_message: str, history: List[dict], user_message: str, use_local_model: bool, max_tokens: int, temperature: float, top_p: float, hf_token): | |
| # 1. Retrieve genres from the user message using naive approach | |
| genre_list = detect_genres(user_message) | |
| # 2. Retrieve relevant results from DB if the genre_list is not empty | |
| recommendations_string = "" | |
| if len(genre_list) > 0: | |
| recommendations_string = get_recommendations(genre_list) | |
| # 3. Query the model | |
| for result in query_model(system_message, | |
| history, | |
| user_message, | |
| recommendations_string, | |
| use_local_model, | |
| max_tokens, | |
| temperature, | |
| top_p, | |
| hf_token): | |
| yield result | |
| def detect_genres(message: str) -> List[str]: | |
| requested_genres = [] | |
| # Simple naive genre check by detecting if any of our system stored genres are within the user query | |
| # TODO: Improve genre detection instead to use Retriever and RAG framework in the future | |
| for genre in genre_list: | |
| if message.__contains__(genre): | |
| requested_genres.append(genre) | |
| return requested_genres | |
| def query_model( | |
| system_message: str, | |
| history: List[dict], | |
| user_message: str, | |
| recommendations_string: str, | |
| use_local_model: bool, | |
| max_tokens: int, # TODO: Remove this and hardcode a value in constants.py | |
| temperature: float, # TODO: Remove this and hardcode a value in constants.py | |
| top_p: float, # TODO: Remove this and hardcode a value in constants.py | |
| hf_token): | |
| # Construct messages for the language model | |
| # Start by adding system prompt | |
| system_prompt = system_message | |
| if recommendations_string: | |
| system_prompt += "\nRECOMMENDATION JSON:" + f"\n{recommendations_string}" | |
| messages = [{"role": "system", "content": system_prompt}] | |
| # Add the rest of the history | |
| messages.extend(history) | |
| # Add the current user prompt | |
| messages.append({"role": "user", "content": user_message}) | |
| # Determine which model to use (local or external) | |
| if use_local_model: | |
| # Local Model -- Uses pipeline from transformers library | |
| pipeline_local_model = pipeline(task='text-generation', | |
| model='Qwen/Qwen3-0.6B', | |
| max_new_tokens=max_tokens, | |
| temperature=temperature, | |
| do_sample=False, | |
| top_p=top_p | |
| ) | |
| # Get the response from the local model | |
| response = pipeline_local_model(messages) | |
| # Parse the output and yield it | |
| yield response[0]['generated_text'][-1]['content'].split('</think>')[-1].strip() | |
| elif not use_local_model: | |
| # Non-local Model -- Use InferenceClient | |
| client = InferenceClient( | |
| token=hf_token, | |
| model="openai/gpt-oss-20b", | |
| ) | |
| response = "" | |
| for chunk in client.chat_completion( | |
| messages=messages, | |
| max_tokens=max_tokens, | |
| stream=True, | |
| temperature=temperature, | |
| top_p=top_p, | |
| ): | |
| if chunk.choices and chunk.choices[0].delta.content: | |
| token = chunk.choices[0].delta.content | |
| response += token | |
| yield response | |