"""
Simple benchmark for efficient-context's semantic deduplication.
"""

import logging
import time
import sys

# Configure logging first so that a failure to import the library below is
# reported through the same handler and format as the benchmark results.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
logger.info("Simple deduplication benchmark starting")

# The benchmark cannot run without efficient_context, so a missing package is
# logged and the process exits with a non-zero status instead of continuing.
try:
    from efficient_context import ContextManager
    from efficient_context.compression import SemanticDeduplicator
    from efficient_context.chunking import SemanticChunker
    from efficient_context.retrieval import CPUOptimizedRetriever
    logger.info("Successfully imported efficient_context")
except ImportError as e:
    logger.error(f"Failed to import efficient_context: {e}")
    sys.exit(1)


def create_repetitive_document(repetitions: int = 3) -> str:
    """Create a document with deliberate repetition.

    Two paragraphs are used, one about climate change and one about
    renewable energy. Each paragraph restates the same idea in five
    differently worded sentences, and the pair is then repeated verbatim.

    Args:
        repetitions: How many times the climate + energy paragraph pair is
            concatenated. Defaults to 3. A value of 0 or less yields an
            empty string (standard ``str`` repetition semantics).

    Returns:
        The concatenated document text.
    """
    # Five paraphrases of the same statement about climate change.
    climate_paragraph = """
    Climate change is a significant alteration in weather patterns over extended periods.
    Global warming is the long-term heating of Earth's climate system due to human activities.
    Rising global temperatures are causing substantial changes in our environment and ecosystems.
    The warming of the planet is leading to significant transformations in weather patterns.
    Human activities are causing Earth's temperature to increase, resulting in climate changes.
    """

    # Five paraphrases of the same statement about renewable energy.
    energy_paragraph = """
    Renewable energy comes from sources that are naturally replenishing but flow-limited.
    Clean energy is derived from natural processes that are constantly replenished.
    Sustainable power is generated from resources that won't deplete over time.
    Green energy utilizes sources that don't produce pollution when generating power.
    Alternative energy refers to sources that are an alternative to fossil fuel.
    """

    # Repeating the pair adds exact duplicates on top of the paraphrases.
    return (climate_paragraph + energy_paragraph) * repetitions


def main():
    """Run the benchmark.

    Builds the repetitive test document once, then for each similarity
    threshold constructs a fresh ``ContextManager``, times adding the
    document, times generating context for a fixed query, and logs the
    word counts, their ratio, and both timings.
    """
    document = create_repetitive_document()

    # The input document is the same for every threshold, so its word count
    # is computed once and reused for both the summary line and the ratio.
    original_size = len(document.split())
    logger.info(f"Document size: {original_size} words")

    thresholds = [0.7, 0.8, 0.85, 0.9, 0.95]
    query = "Tell me about climate change and renewable energy"

    for threshold in thresholds:
        logger.info(f"\nTesting with threshold: {threshold}")

        # A new manager per threshold keeps the runs independent of each other.
        context_manager = ContextManager(
            compressor=SemanticDeduplicator(threshold=threshold),
            chunker=SemanticChunker(chunk_size=100),
            retriever=CPUOptimizedRetriever(embedding_model="lightweight"),
        )

        # perf_counter() is the clock intended for measuring short intervals;
        # time.time() follows the wall clock and can jump if it is adjusted.
        start_time = time.perf_counter()
        context_manager.add_document(document)
        processing_time = time.perf_counter() - start_time

        start_time = time.perf_counter()
        context = context_manager.generate_context(query)
        query_time = time.perf_counter() - start_time

        # Sizes are whitespace-separated word counts; the generated context
        # is assumed to be a string (it is split like one).
        context_size = len(context.split())
        # Guard against an empty source document to avoid dividing by zero.
        compression_ratio = context_size / original_size if original_size > 0 else 1.0

        logger.info(f"Results for threshold {threshold}:")
        logger.info(f" - Original size: {original_size} words")
        logger.info(f" - Context size: {context_size} words")
        logger.info(f" - Compression ratio: {compression_ratio:.2f}")
        logger.info(f" - Processing time: {processing_time:.4f} seconds")
        logger.info(f" - Query time: {query_time:.4f} seconds")


# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()