Spaces:

Qar-Raz
/

NLP-RAG

Running

File size: 1,557 Bytes

c7256ee

from typing import Any


# TODO(Qamar): this may need changes to expose additional metadata for
# retrieved chunks (e.g. title and url).

def build_retrieved_chunks(
    contexts: list[str],
    chunk_lookup: dict[str, dict[str, Any]],
) -> list[dict[str, Any]]:
    """Pair each retrieved context string with the metadata stored for it.

    Args:
        contexts: Retrieved chunk texts, in the order they should be ranked.
        chunk_lookup: Mapping from chunk text to that chunk's metadata dict.
            A missing table, a missing entry, or an entry mapped to ``None``
            is treated as "no metadata".

    Returns:
        One dict per context, in input order, with the keys ``rank``
        (1-based position), ``text``, ``source_title`` (``"Untitled"`` when
        no title is set), ``source_url`` (``""`` when no url is set),
        ``chunk_index``, ``page``, ``section``, ``source_type``,
        ``image_url`` and ``extra_metadata``. An empty list is returned when
        ``contexts`` is empty or ``None``.
    """
    if not contexts:
        return []

    # Treat a missing lookup table like an empty one, mirroring how a missing
    # ``contexts`` is tolerated above.
    lookup = chunk_lookup or {}

    # Keys that are left out of ``extra_metadata``. Other promoted fields
    # (page, section, source_type, image_url, ...) are intentionally NOT
    # listed here, so they still appear in ``extra_metadata`` as well.
    excluded_keys = {
        "title",
        "url",
        "chunk_index",
        "text",
        "technique",
        "chunking_technique",
    }

    retrieved_chunks: list[dict[str, Any]] = []

    for rank, text in enumerate(contexts, start=1):
        # ``or {}`` (rather than a ``get`` default) also covers a key that is
        # present but mapped to None, which would otherwise raise
        # AttributeError on the ``meta.get`` calls below.
        meta = lookup.get(text) or {}

        # First truthy value among the accepted image key spellings.
        image_url = (
            meta.get("image_url")
            or meta.get("image")
            or meta.get("thumbnail_url")
            or meta.get("media_url")
        )

        extra_metadata = {
            key: value for key, value in meta.items() if key not in excluded_keys
        }

        retrieved_chunks.append(
            {
                "rank": rank,
                "text": text,
                "source_title": meta.get("title") or "Untitled",
                "source_url": meta.get("url") or "",
                "chunk_index": meta.get("chunk_index"),
                "page": meta.get("page"),
                "section": meta.get("section"),
                # "source" is accepted as a fallback spelling of "source_type".
                "source_type": meta.get("source_type") or meta.get("source"),
                "image_url": image_url,
                "extra_metadata": extra_metadata,
            }
        )

    return retrieved_chunks