Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Create a small set of sample documents for testing (written locally from built-in content; nothing is downloaded). | |
| """ | |
| import requests | |
| import zipfile | |
| from pathlib import Path | |
| import sys | |
| import os | |
| # Add the parent directory to Python path so we can import config | |
| sys.path.insert(0, str(Path(__file__).parent.parent)) | |
| from config import DATA_DIR | |
# Hand-written sample documents as (file name, file content) pairs.
# Replace or extend these with your own dataset.
_SAMPLE_DOCS = (
    (
        "machine_learning_intro.md",
        """# Machine Learning Introduction

Machine learning is a subset of artificial intelligence that enables systems
to learn and improve from experience without being explicitly programmed.

## Types of Machine Learning

1. Supervised Learning
2. Unsupervised Learning
3. Reinforcement Learning

## Applications

- Natural Language Processing
- Computer Vision
- Recommendation Systems
- Predictive Analytics""",
    ),
    (
        "fastapi_guide.md",
        """# FastAPI Guide

FastAPI is a modern, fast web framework for building APIs with Python 3.7+.

## Key Features

- Fast: Very high performance
- Easy: Easy to use and learn
- Standards-based: Based on OpenAPI and JSON Schema

## Installation

```bash
pip install fastapi uvicorn
```

## Basic Example

```python
from fastapi import FastAPI

app = FastAPI()

@app.get("/")
def read_root():
    return {"Hello": "World"}
```""",
    ),
    (
        "python_basics.txt",
        """Python Programming Basics

Python is an interpreted, high-level programming language known for its readability.
Key features include dynamic typing, automatic memory management, and support for multiple programming paradigms.

Data Types:
- Integers, Floats
- Strings
- Lists, Tuples
- Dictionaries
- Sets

Control Structures:
- if/else statements
- for loops
- while loops
- try/except blocks""",
    ),
    (
        "database_concepts.md",
        """# Database Concepts

## SQL vs NoSQL

SQL databases are relational, NoSQL databases are non-relational.

## Common Databases

1. PostgreSQL
2. MySQL
3. MongoDB
4. Redis

## Indexing

Indexes improve query performance but slow down write operations.
Common index types: B-tree, Hash, Bitmap.""",
    ),
    (
        "web_development.txt",
        """Web Development Overview

Frontend: HTML, CSS, JavaScript
Backend: Python, Node.js, Java, Go
Databases: SQL, NoSQL
DevOps: Docker, Kubernetes, CI/CD

Frameworks:
- React, Vue, Angular (Frontend)
- Django, Flask, FastAPI (Python)
- Express.js (Node.js)
- Spring Boot (Java)""",
    ),
)

# Topics for which a templated "<topic>_overview.txt" file is generated.
_OVERVIEW_TOPICS = ("ai", "databases", "web", "devops", "cloud", "security")


def _build_overview(topic):
    """Return the templated overview text for a single topic."""
    lines = [
        f"# {topic.title()} Overview",
        "",
        f"This document discusses key concepts in {topic}.",
        "",
        "## Key Concepts",
    ]
    for number in range(1, 6):
        lines.append(f"{number}. Important aspect {number} of {topic}")
        lines.extend(
            f" - Detail {number}{letter} about this aspect" for letter in "abc"
        )
        lines.append("")  # blank line after each aspect
    lines.append("## Applications")
    lines.extend(f"- Application {number} of {topic}" for number in range(1, 4))
    # Build once with join instead of repeated "+="; the file ends with a newline.
    return "\n".join(lines) + "\n"


def download_sample_data(data_dir: "Path | str | None" = None) -> None:
    """Write a small sample dataset of documents to disk.

    Despite the name, nothing is fetched over the network: the hand-written
    documents in ``_SAMPLE_DOCS`` and one generated overview file per entry
    in ``_OVERVIEW_TOPICS`` are written as UTF-8 text. Existing files with
    the same names are overwritten.

    Args:
        data_dir: Directory to write into. Defaults to ``DATA_DIR`` from the
            project config when omitted, which is the original behavior.

    Raises:
        OSError: If the directory cannot be created or a file cannot be
            written.
    """
    # DATA_DIR is only looked up when no explicit directory is supplied.
    target = Path(DATA_DIR if data_dir is None else data_dir)

    print(f"Creating sample documents in {target}...")
    # parents=True so a missing parent directory does not abort the run.
    target.mkdir(parents=True, exist_ok=True)

    documents = list(_SAMPLE_DOCS)
    documents.extend(
        (f"{topic}_overview.txt", _build_overview(topic))
        for topic in _OVERVIEW_TOPICS
    )

    for name, content in documents:
        file_path = target / name
        file_path.write_text(content, encoding="utf-8")
        print(f" Created: {file_path}")

    print(f"\nCreated {len(documents)} sample documents in {target}")
    print("You can add your own documents to the data/ directory")
# Generate the sample files only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    download_sample_data()