NLP-RAG / backend /services /chunks.py
Qar-Raz's picture
hf-space: deploy branch without frontend/data/results
c7256ee
from typing import Any
# TODO(Qamar): this may need changes to expose additional metadata for
# retrieved chunks, such as title and url.
def build_retrieved_chunks(
    contexts: list[str],
    chunk_lookup: dict[str, dict[str, Any]],
) -> list[dict[str, Any]]:
    """Turn retrieved context strings into ranked, metadata-enriched records.

    Each context string is used as the key into ``chunk_lookup`` to find its
    metadata. Contexts without usable metadata still produce a record, filled
    with defaults.

    Args:
        contexts: Retrieved chunk texts, in the order they should be ranked.
        chunk_lookup: Mapping from chunk text to that chunk's metadata dict.
            A ``None`` mapping, or an entry whose value is ``None``/empty, is
            treated as "no metadata" instead of raising.

    Returns:
        One dict per context, in input order, with the keys:

        * ``rank``: 1-based position of the context in ``contexts``.
        * ``text``: the context string itself.
        * ``source_title``: metadata ``title``, or ``"Untitled"`` when
          missing or falsy.
        * ``source_url``: metadata ``url``, or ``""`` when missing or falsy.
        * ``chunk_index``, ``page``, ``section``: copied from the metadata
          (``None`` when absent).
        * ``source_type``: metadata ``source_type``, falling back to the
          value of ``source``.
        * ``image_url``: first truthy value among ``image_url``, ``image``,
          ``thumbnail_url``; otherwise the value of ``media_url``.
        * ``extra_metadata``: every metadata item whose key is not one of
          ``title``, ``url``, ``chunk_index``, ``text``, ``technique`` or
          ``chunking_technique``. Keys such as ``page`` or ``section`` are
          therefore present both here and at the top level.

        An empty ``contexts`` yields an empty list.
    """
    if not contexts:
        return []

    # Metadata keys that are kept out of ``extra_metadata``.
    excluded_keys = {
        "title",
        "url",
        "chunk_index",
        "text",
        "technique",
        "chunking_technique",
    }
    # Tolerate a missing lookup table: every chunk then gets default values.
    lookup = chunk_lookup or {}

    retrieved_chunks: list[dict[str, Any]] = []
    for idx, text in enumerate(contexts, start=1):
        # ``or {}`` covers both an absent key and an entry stored as None,
        # so one bad entry cannot abort the whole result list.
        meta = lookup.get(text) or {}

        image_url = (
            meta.get("image_url")
            or meta.get("image")
            or meta.get("thumbnail_url")
            or meta.get("media_url")
        )
        extra_metadata = {
            key: value
            for key, value in meta.items()
            if key not in excluded_keys
        }

        retrieved_chunks.append(
            {
                "rank": idx,
                "text": text,
                "source_title": meta.get("title") or "Untitled",
                "source_url": meta.get("url") or "",
                "chunk_index": meta.get("chunk_index"),
                "page": meta.get("page"),
                "section": meta.get("section"),
                "source_type": meta.get("source_type") or meta.get("source"),
                "image_url": image_url,
                "extra_metadata": extra_metadata,
            }
        )
    return retrieved_chunks