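"""Minimal RAG pipeline over a single website.

Scrapes the configured page, chunks and embeds its text with a HuggingFace
embedding model, then answers questions about it interactively via
SambaNova's hosted DeepSeek-V3.1. Run directly with a filled-in config.yaml
next to this file; type "exit" or "quit" to leave the question loop.
"""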
import re
import sys
from dataclasses import dataclass
from pathlib import Path

import bs4
import numpy as np
import requests
import yaml
from langchain_huggingface import HuggingFaceEmbeddings
from sambanova import SambaNova

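# The config.yaml next to this script is expected to supply the keys read in
# main() below. A sketch of its shape (the example values are placeholders,
# not defaults shipped with this project):
#
#   sambanova_api_key: "<your-api-key>"
#   website: "https://example.com"
#   system_prompt: "You are a helpful assistant."
#   embedding_model: "sentence-transformers/all-MiniLM-L6-v2"
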
@dataclass
class DocumentChunk:
    """A chunk of scraped page text together with its embedding vector."""
    text: str
    source: str
    vector: np.ndarray

def load_config(path: Path) -> dict:
    with path.open("r", encoding="utf-8") as f:
        return yaml.safe_load(f)

def scrape_website(url: str) -> str:
    """Fetch a page and return its visible text, stripped of page chrome."""
    response = requests.get(url, timeout=15)
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    # Drop non-content elements before extracting text.
    for tag in soup(["script", "style", "header", "footer", "nav", "aside"]):
        tag.decompose()
    text = soup.get_text(separator="\n")
    # Collapse runs of blank lines.
    text = re.sub(r"\n{2,}", "\n", text).strip()
    return text

def split_into_chunks(text: str, chunk_size: int = 400, overlap: int = 100) -> list[str]:
    """Split text into roughly chunk_size-character chunks along sentence
    boundaries, carrying an overlapping tail into each new chunk."""
    sentences = [s.strip() for s in re.split(r"(?<=[\.\?\!])\s+", text) if s.strip()]
    chunks = []
    current = ""
    for sentence in sentences:
        if len(current) + len(sentence) + 1 > chunk_size and current:
            chunks.append(current.strip())
            # Seed the next chunk with the tail of this one so context that
            # spans a boundary is not lost. current[-overlap:] already yields
            # the whole string when overlap >= len(current), but overlap == 0
            # must be guarded or the slice would keep everything.
            current = current[-overlap:] if overlap > 0 else ""
        current += " " + sentence
    if current.strip():
        chunks.append(current.strip())
    return chunks

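# With the defaults, a chunk closes once it would exceed ~400 characters, and
# the last 100 characters of that chunk are carried into the next one, so
# sentences near a boundary keep some of their surrounding context.
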
def embed_texts(texts: list[str], embed_model: HuggingFaceEmbeddings | None = None) -> list[list[float]]:
    """Embed texts with the given model. Returns [] when there is nothing to
    embed or no model was supplied; callers treat [] as a fallback signal."""
    if not texts or embed_model is None:
        return []
    return embed_model.embed_documents(texts)

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    norm_a, norm_b = np.linalg.norm(a), np.linalg.norm(b)
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return float(np.dot(a, b) / (norm_a * norm_b))

def build_rag_corpus(config: dict, embed_model: HuggingFaceEmbeddings, url: str) -> list[DocumentChunk]:
    """Scrape a URL, chunk its text, and embed each chunk.

    `config` is accepted for future use and is currently unused."""
    print(f"Scraping website: {url}")
    page_text = scrape_website(url)
    chunks = split_into_chunks(page_text)
    print(f"Split content into {len(chunks)} chunks")
    embeddings = embed_texts(chunks, embed_model)
    return [
        DocumentChunk(text=chunk, source=url, vector=np.array(vector))
        for chunk, vector in zip(chunks, embeddings)
    ]

def retrieve_relevant_chunks(chunks: list[DocumentChunk], question: str, embed_model: HuggingFaceEmbeddings, top_k: int = 4) -> list[DocumentChunk]:
    """Rank chunks by cosine similarity to the question; return the top_k."""
    question_embeddings = embed_texts([question], embed_model)
    if not question_embeddings:
        # Embedding was skipped or failed; fall back to the first chunks.
        return chunks[:top_k]
    question_vector = np.array(question_embeddings[0])
    scored = [
        (chunk, cosine_similarity(question_vector, chunk.vector))
        for chunk in chunks
    ]
    scored.sort(key=lambda item: item[1], reverse=True)
    return [chunk for chunk, _ in scored[:top_k]]

def build_prompt(system_prompt: str, question: str, context_chunks: list[DocumentChunk]) -> str:
    """Assemble the system prompt, retrieved context, and question."""
    context_text = "\n---\n".join(chunk.text for chunk in context_chunks)
    return (
        f"{system_prompt}\n\n"
        "Use the following extracted website text to answer the question clearly.\n"
        f"Context:\n{context_text}\n\n"
        f"Question: {question}\n"
    )

def create_llm_client(config: dict) -> SambaNova:
    """Create a SambaNova client pointed at the hosted inference API."""
    return SambaNova(
        api_key=config.get("sambanova_api_key"),
        base_url="https://api.sambanova.ai/v1",
        timeout=30,
    )

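# Note: the endpoint at https://api.sambanova.ai/v1 is OpenAI-compatible, so
# if the `sambanova` SDK is unavailable in your environment,
# openai.OpenAI(api_key=..., base_url=...) with the same arguments should
# work as a drop-in replacement (an assumption to verify against the current
# SambaNova docs).
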
def ask_model(prompt: str, client: SambaNova) -> str:
    """Send the prompt as a single user message and return the model's reply."""
    response = client.chat.completions.create(
        model="DeepSeek-V3.1",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=1056,
        temperature=0.2,
    )
    return response.choices[0].message.content.strip()

def format_answer(raw: str, chunks: list[DocumentChunk]) -> str:
    """Post-process the model's answer. Currently a pass-through; the
    retrieved chunks are accepted so citations can be added later."""
    return raw

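# A possible extension (not part of the original behavior): append the
# deduplicated chunk sources so answers cite where the context came from, e.g.
#     sources = sorted({chunk.source for chunk in chunks})
#     return f"{raw}\n\nSources: {', '.join(sources)}"
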
def main() -> int:
    """Load config, build the corpus, then run an interactive Q&A loop."""
    config_path = Path(__file__).parent / "config.yaml"
    if not config_path.exists():
        print(f"Missing config file: {config_path}")
        return 1
    config = load_config(config_path)
    llm_api_key = config.get("sambanova_api_key")
    website = config.get("website")
    system_prompt = config.get("system_prompt", "You are a helpful assistant.")
    if not llm_api_key or not website:
        print("Please set sambanova_api_key and website in config.yaml")
        return 1
    embed_model = HuggingFaceEmbeddings(model_name=config.get("embedding_model"))
    chunks = build_rag_corpus(config, embed_model, website)
    client = create_llm_client(config)
    print("RAG corpus ready. Ask a question or type 'exit'.")
    while True:
        try:
            question = input("Question> ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D or Ctrl-C ends the session cleanly.
            break
        if not question:
            continue
        if question.lower() in {"exit", "quit"}:
            break
        selected = retrieve_relevant_chunks(chunks, question, embed_model)
        prompt = build_prompt(system_prompt, question, selected)
        raw_answer = ask_model(prompt, client)
        response = format_answer(raw_answer, selected)
        print(response)
        print()
    return 0

if __name__ == "__main__":
    sys.exit(main())