| | import os |
| | import json |
| | from typing import List, Dict |
| | from collections import defaultdict |
| | from dotenv import load_dotenv |
| |
|
| | from evoagentx.core.logging import logger |
| | from evoagentx.storages.base import StorageHandler |
| | from evoagentx.rag.rag import RAGEngine |
| | from evoagentx.models import OpenRouterConfig, OpenRouterLLM |
| | from evoagentx.storages.storages_config import VectorStoreConfig, DBConfig, GraphStoreConfig, StoreConfig |
| | from evoagentx.rag.rag_config import RAGConfig, ReaderConfig, ChunkerConfig, IndexConfig, EmbeddingConfig, RetrievalConfig |
| | from evoagentx.rag.schema import Query, Corpus, Chunk, ChunkMetadata |
| | from evoagentx.benchmark.hotpotqa import HotPotQA, download_raw_hotpotqa_data |
| |
|
| | |
# Load API credentials (AZURE_EMBED_*, OPEN_ROUNTER_API_KEY, ...) from a .env file.
load_dotenv()

# Fetch the HotPotQA distractor dev split into the local debug cache and wrap
# it with the benchmark loader (download is skipped when the file exists).
download_raw_hotpotqa_data("hotpot_dev_distractor_v1.json", "./debug/data/hotpotqa")
datasets = HotPotQA("./debug/data/hotpotqa")
| |
|
| | |
# Storage backends for the RAG pipeline: SQLite for metadata, FAISS for
# vectors, Neo4j for the graph index; `path` is where indexes are persisted.
store_config = StoreConfig(
    dbConfig=DBConfig(
        db_name="sqlite",
        path="./debug/data/hotpotqa/cache/test_hotpotQA.sql"
    ),
    vectorConfig=VectorStoreConfig(
        vector_name="faiss",
        # NOTE(review): the active embedding below is configured with
        # dimensions=3072 — confirm whether this 768 is actually used by the
        # graph index path or needs to match the embedding size.
        dimensions=768,
        index_type="flat_l2",
    ),
    graphConfig=GraphStoreConfig(
        graph_name="neo4j",
        uri="bolt://localhost:7687",
        # Local-dev credentials; do not reuse in any shared environment.
        username= "neo4j",
        password= "12345678",
        database="neo4j",
    ),

    path="./debug/data/hotpotqa/cache/indexing"
)
storage_handler = StorageHandler(storageConfig=store_config)
| |
|
| | |
| | |
# Reference configurations for alternative embedding providers, kept as an
# inert module-level string so exactly one `embedding = ...` assignment is
# active at a time. Swap by copying a variant below over the active one.
"""
# For openai example
embedding=EmbeddingConfig(
    provider="openai",
    model_name="text-embedding-ada-002",
    api_key=os.environ["OPENAI_API_KEY"],
)
# For huggingface example
embedding=EmbeddingConfig(
    provider="huggingface",
    model_name="debug/weights/bge-small-en-v1.5",
    device="cpu"
)
# For ollama example
embedding=EmbeddingConfig(
    provider="ollama",
    model_name="nomic-embed-text",
    base_url="10.168.1.71:17174",
    dimensions=768
)
# For azure openai example
embedding=EmbeddingConfig(
    provider="azure_openai",
    model_name=os.environ["AZURE_EMBED_DEPLOYMENT"],
    api_key=os.environ["AZURE_EMBED_API_KEY"],
    dimensions=3072,
    deployment_name=os.environ["AZURE_EMBED_DEPLOYMENT"],
    azure_endpoint=os.environ["AZURE_EMBED_ENDPOINT"],
    api_version=os.environ["AZURE_EMBED_API_VERSION"],
)

"""

# Active embedding backend: Azure OpenAI deployment taken from the
# environment. Raises KeyError at import time if any AZURE_EMBED_* variable
# is missing from the .env file.
embedding=EmbeddingConfig(
    provider="azure_openai",
    model_name=os.environ["AZURE_EMBED_DEPLOYMENT"],
    api_key=os.environ["AZURE_EMBED_API_KEY"],
    dimensions=3072,
    deployment_name=os.environ["AZURE_EMBED_DEPLOYMENT"],
    azure_endpoint=os.environ["AZURE_EMBED_ENDPOINT"],
    api_version=os.environ["AZURE_EMBED_API_VERSION"],
)
| |
|
| |
|
# End-to-end RAG pipeline configuration: file reading, simple fixed-size
# chunking, the embedding backend configured above, and graph-based
# indexing/retrieval.
rag_config = RAGConfig(
    reader=ReaderConfig(
        recursive=False, exclude_hidden=True,
        num_files_limit=None, custom_metadata_function=None,
        extern_file_extractor=None,
        errors="ignore", encoding="utf-8"
    ),
    chunker=ChunkerConfig(
        strategy="simple",
        chunk_size=512,
        chunk_overlap=0,
        max_chunks=None
    ),
    embedding=embedding,
    index=IndexConfig(index_type="graph"),
    retrieval=RetrievalConfig(
        # "retrivel_type" (sic) presumably matches the field name declared by
        # RetrievalConfig — verify before "correcting" the spelling here.
        retrivel_type="graph",
        postprocessor_type="simple",
        top_k=10,
        similarity_cutoff=0.3,
        keyword_filters=None,
        metadata_filters=None
    )
)
| |
|
# LLM used by the RAG engine, served via OpenRouter.
# NOTE(review): "OPEN_ROUNTER" (sic) must match the variable name users put
# in their .env files; renaming it here would silently break existing setups.
OPEN_ROUNTER_API_KEY = os.environ["OPEN_ROUNTER_API_KEY"]
config = OpenRouterConfig(
    openrouter_key=OPEN_ROUNTER_API_KEY,
    temperature=0.3,
    model="google/gemini-2.5-flash-lite-preview-06-17",
)
llm = OpenRouterLLM(config=config)
| |
|
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| |
|
# Engine that owns indexing, querying, persistence and clearing of indexes.
search_engine = RAGEngine(config=rag_config, storage_handler=storage_handler, llm=llm)
| |
|
| | |
def create_corpus_from_context(context: List[List], corpus_id: str) -> Corpus:
    """Build a Corpus of sentence-level chunks from a HotPotQA context.

    Each context entry is a ``[title, sentences]`` pair; every sentence
    becomes one Chunk whose metadata carries the corpus id, the sentence
    index (as ``doc_id``) and the source title.
    """
    sentence_chunks = []
    for title, sentences in context:
        for sent_idx, sent in enumerate(sentences):
            meta = ChunkMetadata(doc_id=str(sent_idx), corpus_id=corpus_id)
            piece = Chunk(
                chunk_id=f"{title}_{sent_idx}",
                text=sent,
                metadata=meta,
                start_char_idx=0,
                end_char_idx=len(sent),
                excluded_embed_metadata_keys=[],
                excluded_llm_metadata_keys=[],
                relationships={},
            )
            # Title is attached after construction rather than via the
            # ChunkMetadata constructor, mirroring the original call pattern.
            piece.metadata.title = title
            sentence_chunks.append(piece)
    # NOTE(review): only the first 4 chunks are kept — this looks like a
    # debug truncation; confirm before relying on full-context evaluation.
    return Corpus(chunks=sentence_chunks[:4], corpus_id=corpus_id)
| |
|
| | def evaluate_retrieval(retrieved_chunks: List[Chunk], supporting_facts: List[List], top_k: int) -> Dict[str, float]: |
| | """Evaluate retrieved chunks against supporting facts.""" |
| | |
| | relevant = {(fact[0], fact[1]) for fact in supporting_facts} |
| | |
| | |
| | retrieved = [] |
| | for chunk in retrieved_chunks[:top_k]: |
| | title = chunk.metadata.title |
| | sentence_idx = int(chunk.metadata.doc_id) |
| | retrieved.append((title, sentence_idx)) |
| | |
| | |
| | hits = sum(1 for r in retrieved if r in relevant) |
| |
|
| | |
| | precision = hits / top_k if top_k > 0 else 0.0 |
| | recall = hits / len(relevant) if len(relevant) > 0 else 0.0 |
| | f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0 |
| | |
| | |
| | mrr = 0.0 |
| | for rank, r in enumerate(retrieved, 1): |
| | if r in relevant: |
| | mrr = 1.0 / rank |
| | break |
| | |
| | |
| | hit = 1.0 if hits > 0 else 0.0 |
| | |
| | intersection = set((r[0], r[1]) for r in retrieved) & relevant |
| | union = set((r[0], r[1]) for r in retrieved) | relevant |
| | jaccard = len(intersection) / len(union) if union else 0.0 |
| |
|
| | return { |
| | "precision@k": precision, |
| | "recall@k": recall, |
| | "f1@k": f1, |
| | "mrr": mrr, |
| | "hit@k": hit, |
| | "jaccard": jaccard |
| | } |
| |
|
def run_evaluation(samples: List[Dict], top_k: int = 5, *, check_save: bool = False) -> Dict[str, float]:
    """Run retrieval evaluation over HotPotQA samples.

    For each sample, its context is indexed into a graph index keyed by the
    sample's ``_id``, the question is issued as a query, and the retrieved
    chunks are scored against the annotated supporting facts.

    Args:
        samples: HotPotQA records with "question", "context",
            "supporting_facts" and "_id" keys.
        top_k: Number of retrieved chunks to request and score.
        check_save: When True, additionally exercises the save/clear/load
            round-trip of the index with a fresh engine and re-runs the
            query (debug aid; was previously a hard-coded ``CHECK_SAVE``
            constant that was always False).

    Returns:
        Mapping of metric name to its average over all samples (empty dict
        when ``samples`` is empty).
    """
    metrics = defaultdict(list)

    for sample in samples:
        question = sample["question"]
        context = sample["context"]
        supporting_facts = sample["supporting_facts"]
        corpus_id = sample["_id"]

        logger.info(f"Processing sample: {corpus_id}, question: {question}")

        # Index this sample's context under its own corpus id.
        corpus = create_corpus_from_context(context, corpus_id)
        logger.info(f"Created corpus with {len(corpus.chunks)} chunks")
        search_engine.add(index_type="graph", nodes=corpus, corpus_id=corpus_id)

        # Retrieve and score against the gold supporting facts.
        query = Query(query_str=question, top_k=top_k)
        result = search_engine.query(query, corpus_id=corpus_id)
        retrieved_chunks = result.corpus.chunks
        logger.info(f"Retrieved {len(retrieved_chunks)} chunks for query")
        logger.info(f"content:\n{retrieved_chunks}")

        sample_metrics = evaluate_retrieval(retrieved_chunks, supporting_facts, top_k)
        for metric_name, value in sample_metrics.items():
            metrics[metric_name].append(value)
        logger.info(f"Metrics for sample {corpus_id}: {sample_metrics}")

        if check_save:
            # Persist the index, wipe the in-memory copy, reload with a
            # fresh engine and verify retrieval still works after reload.
            # NOTE(review): the original source's indentation was ambiguous
            # about whether clear() ran unconditionally — confirm.
            search_engine.save(graph_exported=True)
            search_engine.clear(corpus_id=corpus_id)

            search_engine1 = RAGEngine(config=rag_config, storage_handler=storage_handler, llm=llm)
            search_engine1.load(index_type="graph")

            query = Query(query_str=question, top_k=top_k)
            result = search_engine1.query(query, corpus_id=corpus_id)
            retrieved_chunks = result.corpus.chunks
            logger.info(f"Retrieved {len(retrieved_chunks)} chunks for query")
            logger.info(f"content:\n{retrieved_chunks}")

            sample_metrics = evaluate_retrieval(retrieved_chunks, supporting_facts, top_k)
            logger.info(f"Metrics for sample {corpus_id}: {sample_metrics}")

    # Average every metric across samples.
    avg_metrics = {name: sum(values) / len(values) for name, values in metrics.items()}
    return avg_metrics
| |
|
| |
|
if __name__ == "__main__":
    # Evaluate graph-based retrieval on the first 20 dev questions.
    eval_samples = datasets._dev_data[:20]
    print(len(datasets._dev_data))

    averages = run_evaluation(eval_samples, top_k=5)

    logger.info("Average Metrics:")
    for name, score in averages.items():
        logger.info(f"{name}: {score:.4f}")

    # Persist the aggregate numbers next to the cached dataset.
    with open("./debug/data/hotpotqa/evaluation_results.json", "w") as out_file:
        json.dump(averages, out_file, indent=2)
| |
|
# Historical results recorded from earlier runs (20 samples, top_k=5), kept
# as an inert module-level string for reference.
"""
Results using 20 samples:
text-embedding-ada-002:
    precision@k:0.3400, recall@k:0.7117, f1@k:0.4539, mrr:0.9250, hit@k: 1.0000, jaccard:0.3089
bge-small-en-v1.5:
    precision@k:0.3100, recall@k:0.6767, f1@k:0.4207, mrr: 0.7667, hit@k: 0.9500, jaccard:0.2837
nomic-embed-text:
    precision@k:0.3500, recall@k:0.7367, f1@k: 0.4682, mrr:0.7958, hit@k: 0.9500, jaccard: 0.3268
"""