diff --git a/.cursor/plans/researchmind_rag_agent_7390b536.plan.md b/.cursor/plans/researchmind_rag_agent_7390b536.plan.md new file mode 100644 index 0000000000000000000000000000000000000000..7a5dc09d93a14b329abb3d36afba4f84dfe28f72 --- /dev/null +++ b/.cursor/plans/researchmind_rag_agent_7390b536.plan.md @@ -0,0 +1,366 @@ +--- +name: ResearchMind RAG Agent +overview: "Add ResearchMind: ingest skills (web/PDF/extract) with references and scripts, a persistent MemRAG store (SQLite + embeddings), an agent runner with citation-backed Q&A, and a new Gradio tab. Topic mode suggests URLs via the local model (user confirms); optional auto-search mode via app dropdown and skill flags." +todos: + - id: pkg-researchmind + content: "Create libs/researchmind package: MemRAGStore (SQLite), chunking, sentence-transformers embeddings, retrieve + citations" + status: completed + - id: skills-scrape-extract + content: Add skills/scrape-web, scrape-pdf, extract-content, research-mind with references/ and scripts/ CLIs + status: completed + - id: agent-runner + content: Extend SkillRegistry (flags), ToolRegistry (5 tools), AgentRunner ingest/chat with suggest_urls + auto_search boolean + status: completed + - id: gradio-tab + content: "Add research_mind.py tab: topic/URL/file ingest, mode dropdown, URL confirm, session chat, trace accordion" + status: completed + - id: tests-docs + content: Unit tests for store/retrieve/runner; update .env.example and README for ResearchMind offline Q&A + status: completed +isProject: false +--- + +# ResearchMind — Scraper + RAG + MemRAG Plan + +## Goal + +Ship a **Backyard AI** research agent that: +1. Accepts a **topic**, **URL**, or **PDF/doc** upload +2. **Ingests once** (scrape → extract → chunk → embed → graph persist) +3. Answers questions **offline** across sessions with **citations** +4. 
Uses the **active local preset** from [`models.yaml`](models.yaml) (no new training in MVP) + +## Architecture + +```mermaid +flowchart TB + subgraph gradio [Gradio Research Tab] + Input[Topic / URL / File] + Mode[Ingest mode dropdown] + Confirm[URL confirm list] + Chat[Research chat] + end + + subgraph skills [skills/] + SW[scrape-web] + SP[scrape-pdf] + EX[extract-content] + RM[research-mind] + end + + subgraph lib [libs/researchmind] + Ingest[IngestPipeline] + Store[MemRAGStore] + Retrieve[Retriever] + Cite[CitationFormatter] + end + + subgraph agent [libs/agent] + Runner[AgentRunner.run_researchmind] + Tools[ToolRegistry] + Trace[TraceRecorder] + end + + Input --> Runner + Mode --> Runner + Runner --> SW + Runner --> SP + Runner --> EX + SW --> Ingest + SP --> Ingest + EX --> Ingest + Ingest --> Store + Chat --> Retrieve + Retrieve --> Store + Runner --> Cite + Cite --> Chat + Runner --> Trace +``` + +**Separation of concerns** +- **Skills** (`skills/*/SKILL.md` + `references/` + `scripts/`) — workflow docs and thin CLIs the agent/humans can invoke +- **`libs/researchmind/`** — real Python library: scrape, extract, chunk, embed, SQLite MemRAG, retrieval +- **`libs/agent/`** — orchestration: `AgentRunner.run_researchmind()`, tool handlers, prompts with citations +- **`apps/gradio-space/`** — third top-level tab wired like [`education_pptx.py`](apps/gradio-space/src/gradio_space/tabs/education_pptx.py) + +**Not in MVP scope:** wiring [`research/ensemble/src/ensemble/memory.py`](research/ensemble/src/ensemble/memory.py) toy `Embedder` (token-id bound, research-only). Production path uses **sentence-transformers** (`all-MiniLM-L6-v2`) for arbitrary text, fully offline after first model download. + +--- + +## 1. New package: `libs/researchmind/` + +Add workspace member in root [`pyproject.toml`](pyproject.toml) and depend from `agent` + `gradio-space`. 
+ +| Module | Responsibility | +|--------|----------------| +| `store.py` | **MemRAGStore** — SQLite at `$RESEARCHMIND_DATA_DIR/memory.db` | +| `ingest.py` | **IngestPipeline** — normalize → chunk → embed → graph edges | +| `scrape_web.py` | `httpx` + `trafilatura` fetch/clean HTML | +| `scrape_pdf.py` | `pypdf` text extraction; optional OCR hook stub | +| `extract.py` | Unified `ExtractedDocument` (title, url, mime, text, metadata) | +| `chunking.py` | Sliding-window chunks (~512 tokens / 128 overlap) with stable IDs | +| `embeddings.py` | Lazy-load `SentenceTransformer`, batch encode, L2-normalize | +| `retrieve.py` | Top-k cosine search + optional graph expansion (same-doc neighbors) | +| `citations.py` | Map chunks → `[1]` footnotes with source title/URL/page | +| `search_urls.py` | Optional DuckDuckGo search (`duckduckgo-search`) when `auto_search=True` | +| `url_suggest.py` | LLM prompt: topic → JSON list of suggested URLs (default path) | + +### MemRAG graph schema (SQLite) + +``` +documents(id, source_type, uri, title, ingested_at, content_hash) +chunks(id, doc_id, ordinal, text, embedding_blob, meta_json) +edges(src_id, dst_id, rel) -- doc->chunk, chunk->next_chunk, chunk->cites +sessions(id, topic, created_at) +session_messages(session_id, role, content, chunk_ids_json) +``` + +- **Persistence** enables cross-session memory: chat loads `session_id` or creates new; retrieval searches all ingested docs unless filtered by session/topic tag +- **Dedup**: skip re-ingest when `content_hash` matches +- **Graph expansion (light MemRAG)**: when retrieving chunk `k`, also pull adjacent chunks (`chunk->next_chunk`) from same document for context window assembly + +### Dependencies (add to `libs/researchmind/pyproject.toml`) + +- `httpx`, `trafilatura` — web scrape +- `pypdf` — PDF +- `python-docx` — already in agent; reuse for `.docx` uploads +- `sentence-transformers` — offline embeddings +- `duckduckgo-search` — optional auto-search mode +- `numpy` — vector ops 
(or store as bytes in SQLite) + +Env vars (extend [`.env.example`](.env.example)): + +| Variable | Default | Purpose | +|----------|---------|---------| +| `RESEARCHMIND_DATA_DIR` | `outputs/researchmind` | DB + raw snapshots | +| `RESEARCHMIND_EMBED_MODEL` | `all-MiniLM-L6-v2` | Embedding model | +| `RESEARCHMIND_AUTO_SEARCH` | `false` | Global default for auto-search | +| `RESEARCHMIND_TOP_K` | `5` | Retrieval depth | + +--- + +## 2. Skills layout (with references + scripts) + +Create four skill folders under [`skills/`](skills/), mirroring Cursor skill layout but using existing [`SkillRegistry`](libs/agent/src/agent/skills.py) frontmatter (`name`, `description`, `task`, `tools`): + +### `skills/scrape-web/` + +``` +scrape-web/ +├── SKILL.md +├── references/ +│ ├── allowed-domains.md # robots.txt / rate-limit notes +│ └── html-cleanup.md # trafilatura settings +└── scripts/ + └── scrape_url.py # CLI: python scripts/scrape_url.py --out ... +``` + +- **tools:** `scrape_web` +- Script calls `researchmind.scrape_web.fetch_and_extract` + +### `skills/scrape-pdf/` + +``` +scrape-pdf/ +├── SKILL.md +├── references/ +│ └── pdf-limits.md # max pages, scanned PDF note +└── scripts/ + └── extract_pdf.py +``` + +- **tools:** `scrape_pdf` + +### `skills/extract-content/` + +``` +extract-content/ +├── SKILL.md +├── references/ +│ └── chunking-policy.md +└── scripts/ + └── chunk_and_index.py # ingest into MemRAGStore +``` + +- **tools:** `extract_and_index` + +### `skills/research-mind/` (orchestrator) + +``` +research-mind/ +├── SKILL.md +├── references/ +│ ├── ingest-modes.md # suggest / auto_search / direct_url +│ └── citation-format.md +└── scripts/ + ├── suggest_urls.py + ├── ingest.py + └── ask.py # CLI Q&A with citations +``` + +Frontmatter additions (parsed as optional YAML fields in extended `Skill` dataclass): + +```yaml +--- +name: research-mind +task: research +tools: + - suggest_urls + - scrape_web + - scrape_pdf + - extract_and_index + - research_answer +flags: + 
auto_search: false # skill default; overridden by agent + Gradio +--- +``` + +Extend [`libs/agent/src/agent/skills.py`](libs/agent/src/agent/skills.py) to read optional `flags:` dict without breaking existing skills. + +--- + +## 3. Agent orchestration + +### New tools in [`libs/agent/src/agent/tools_registry.py`](libs/agent/src/agent/tools_registry.py) + +| Tool | Handler | +|------|---------| +| `suggest_urls` | `url_suggest.suggest(topic, backend)` → list[str] | +| `scrape_web` | fetch + return `ExtractedDocument` | +| `scrape_pdf` | extract PDF path/bytes | +| `extract_and_index` | chunk + embed + `MemRAGStore.add_document` | +| `research_answer` | retrieve + RAG prompt + `backend.chat` → answer + citations | + +### New runner method in [`libs/agent/src/agent/runner.py`](libs/agent/src/agent/runner.py) + +```python +def run_researchmind_ingest( + *, topic: str | None, urls: list[str], files: list[Path], + auto_search: bool, session_id: str | None, + model_key: str, backend: InferenceBackend, +) -> ResearchIngestResult: ... + +def run_researchmind_chat( + *, question: str, session_id: str, + model_key: str, backend: InferenceBackend, +) -> ResearchChatResult: ... +``` + +**Ingest flow (default — Option C)** + +1. If `topic` and no URLs/files: call `suggest_urls` (local LLM returns JSON URL list) +2. Return suggested URLs to UI for **user confirmation** (Gradio checkbox group) +3. On confirm: scrape each URL / PDF / doc → `extract_and_index` +4. If `auto_search=True`: skip LLM suggest; run DuckDuckGo `search_urls(topic, n=5)` and ingest without confirmation + +**Chat flow** + +1. `retrieve(question, top_k)` from `MemRAGStore` +2. Build system prompt from `skills/research-mind/SKILL.md` body + `references/citation-format.md` +3. Inject numbered context blocks; instruct model to cite `[n]` +4. 
`TraceRecorder` logs retrieval chunk IDs + LLM I/O (Sharing is Caring badge) + +### Pydantic models in [`libs/agent/src/agent/models.py`](libs/agent/src/agent/models.py) + +- `ResearchIngestInput`, `ResearchIngestResult`, `ResearchChatInput`, `ResearchChatResult`, `Citation` + +--- + +## 4. Gradio tab: Research Agent + +New file: [`apps/gradio-space/src/gradio_space/tabs/research_mind.py`](apps/gradio-space/src/gradio_space/tabs/research_mind.py) + +Register in [`app.py`](apps/gradio-space/src/gradio_space/app.py) and [`tabs/__init__.py`](apps/gradio-space/src/gradio_space/tabs/__init__.py). + +### UI layout + +``` +Research Agent tab +├── Markdown intro (offline-after-ingest, citations) +├── Session: dropdown of past sessions + "New session" +├── Ingest section +│ ├── Textbox: topic (optional) +│ ├── Textbox: URLs (one per line, optional) +│ ├── File: PDF/DOCX upload (optional) +│ ├── Dropdown: ingest mode +│ │ ├── "Suggest URLs (confirm)" [default] +│ │ └── "Auto search & ingest" +│ ├── Button: "Discover sources" → shows CheckboxGroup of suggested URLs +│ └── Button: "Ingest selected" → status + doc count +├── Chat section +│ ├── Chatbot (history) +│ ├── Textbox: question +│ └── Button: Ask +└── Accordion: trace JSON + ingested sources table +``` + +**Handler pattern:** mirror `generate_lesson_slides()` — `ensure_model_loaded()`, `AgentRunner()`, try/except with user-visible errors, `gradio_allowed_paths()` extended for `RESEARCHMIND_DATA_DIR`. + +Update app header in `app.py` to mention ResearchMind alongside Lesson Agent. + +--- + +## 5. Offline-after-ingest guarantee + +| Phase | Network | +|-------|---------| +| Ingest (scrape/search) | May use network | +| Embed model first run | HuggingFace download once | +| Q&A / chat | **No network** — only SQLite + local LLM | + +Raw HTML/PDF snapshots saved under `RESEARCHMIND_DATA_DIR/raw/{doc_id}/` for audit and re-chunk without re-scrape. + +--- + +## 6. 
Tests + +| Location | Coverage | +|----------|----------| +| `libs/researchmind/tests/test_store.py` | SQLite CRUD, dedup hash | +| `libs/researchmind/tests/test_chunking.py` | chunk boundaries | +| `libs/researchmind/tests/test_retrieve.py` | top-k with fixture embeddings | +| `libs/agent/tests/test_research_runner.py` | mock backend; ingest + chat happy path | +| `libs/researchmind/tests/fixtures/` | small HTML snippet + 1-page PDF | + +Use offline fixtures for CI; mark optional network tests `@pytest.mark.network`. + +--- + +## 7. Docker / Space considerations + +- Add `sentence-transformers` + embedding model to Docker image **or** lazy-download on first ingest (document in README) +- `allowed_paths` must include `RESEARCHMIND_DATA_DIR` for any file previews +- No GPU is required for embeddings — MiniLM is small enough to run on CPU; the existing GPU preset is still used for chat + +--- + +## 8. Implementation order + +1. **`libs/researchmind`** core: store, chunk, embed, retrieve, citations +2. **Skills** skeleton: four folders with SKILL.md + references + script stubs calling library +3. **Agent tools + runner** methods +4. **Gradio tab** with suggest-confirm flow + auto-search dropdown +5. 
**Tests + `.env.example` + README** section under Backyard AI track + +--- + +## Key files to modify + +| File | Change | +|------|--------| +| [`pyproject.toml`](pyproject.toml) | Add `researchmind` workspace member | +| [`libs/agent/pyproject.toml`](libs/agent/pyproject.toml) | Depend on `researchmind` | +| [`apps/gradio-space/pyproject.toml`](apps/gradio-space/pyproject.toml) | Transitive via `agent` | +| [`libs/agent/src/agent/skills.py`](libs/agent/src/agent/skills.py) | Optional `flags` in frontmatter | +| [`libs/agent/src/agent/runner.py`](libs/agent/src/agent/runner.py) | `run_researchmind_*` | +| [`apps/gradio-space/src/gradio_space/app.py`](apps/gradio-space/src/gradio_space/app.py) | Third tab | +| [`.env.example`](.env.example) | ResearchMind env vars | +| [`README.md`](README.md) | ResearchMind usage blurb | + +--- + +## Future (post-MVP, not in this PR) + +- LoRA distillation on ingested corpus via [`research/finetune.py`](research/finetune.py) +- Bridge to [`research/ensemble`](research/ensemble/) for ablation experiments +- Entity extraction edges in MemRAG graph (true knowledge graph) diff --git a/.env.example b/.env.example index e7e5bae7665a0b4466ba77d2867eeaf7a5498f05..3af4af4645cb9bc1790e649c001b987fff929a8a 100644 --- a/.env.example +++ b/.env.example @@ -9,6 +9,14 @@ ALLOW_MODEL_SWITCH=false # AGENT_TRACES_DIR=outputs/traces # SKILLS_DIR=./skills +# --- ResearchMind (MemRAG + scraper) --- +# RESEARCHMIND_DATA_DIR=outputs/researchmind +# RESEARCHMIND_EMBED_MODEL=all-MiniLM-L6-v2 +# RESEARCHMIND_AUTO_SEARCH=false +# RESEARCHMIND_TOP_K=5 +# RESEARCHMIND_CHUNK_SIZE=512 +# RESEARCHMIND_CHUNK_OVERLAP=128 + # --- Legacy single-model overrides (optional; applied to ACTIVE_MODEL only) --- # INFERENCE_BACKEND=transformers # MODEL_ID=openbmb/MiniCPM5-1B diff --git a/.gitignore b/.gitignore index 7344c2ce9b537468b10cecb38b75b1724f4f507c..71fb6cfd36e09054c2def6948041df8ca12ba109 100644 --- a/.gitignore +++ b/.gitignore @@ -12,4 +12,6 @@ build/ 
outputs/traces -/results \ No newline at end of file +/results + +outputs/researchmind \ No newline at end of file diff --git a/README.md b/README.md index 9607d77a2ca1ddf74022d9d031771b09aacd1ddb..8218f8de9b9813266849e432933be68c6ea8338b 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,10 @@ cp .env.example .env # optional: edit model settings uv run --package gradio-space python -m gradio_space.app ``` -Open [http://localhost:7860](http://localhost:7860). Use the **Lesson slides** tab: enter a topic, grade, and slide count. The model loads on first generate. +Open [http://localhost:7860](http://localhost:7860). + +- **Lesson slides** — topic, grade, slide count → downloadable PowerPoint +- **ResearchMind** — scrape/index sources into MemRAG, then ask questions offline with citations ## How it works @@ -42,13 +45,21 @@ Open [http://localhost:7860](http://localhost:7860). Use the **Lesson slides** t 4. **Trace** — JSON log saved under `outputs/traces/` for the Sharing is Caring badge ```text -apps/gradio-space/ # Gradio tabs (Lesson slides + Chat debug) +apps/gradio-space/ # Gradio tabs (Lesson slides, ResearchMind, Chat debug) libs/agent/ # Skill agent runner, tools, trace recorder +libs/researchmind/ # Scraper, chunk/embed, MemRAG SQLite store, retrieval libs/inference/ # Transformers + llama.cpp backends -skills/ # SKILL.md task definitions +skills/ # SKILL.md + references/ + scripts/ per task research/ # Fine-tune, ensemble experiments, agentic evals (optional) ``` +### ResearchMind (offline after ingest) + +1. **Skills** — `skills/scrape-web`, `scrape-pdf`, `extract-content`, `research-mind` +2. **Ingest** — URL/PDF/DOCX or topic → (optional LLM URL suggest + confirm, or auto search) → chunk + embed → SQLite +3. **Q&A** — local model + retrieved chunks with `[n]` citations (no network at chat time) +4. 
**Memory** — persists under `RESEARCHMIND_DATA_DIR` (default `outputs/researchmind`) + Optional research tooling (not required for the Space): see [research/USAGE.md](research/USAGE.md). ## Environment variables @@ -59,6 +70,9 @@ Optional research tooling (not required for the Space): see [research/USAGE.md]( | `AGENT_OUTPUTS_DIR` | `/tmp/agent_outputs` | Generated `.pptx` files | | `AGENT_TRACES_DIR` | `outputs/traces` | Agent trace JSON | | `SKILLS_DIR` | `./skills` | Skill definitions root | +| `RESEARCHMIND_DATA_DIR` | `outputs/researchmind` | MemRAG DB and raw snapshots | +| `RESEARCHMIND_EMBED_MODEL` | `all-MiniLM-L6-v2` | Sentence embedding model | +| `RESEARCHMIND_AUTO_SEARCH` | `false` | Default auto DuckDuckGo ingest | See [`.env.example`](.env.example) and [`models.yaml`](models.yaml) for model presets. diff --git a/apps/gradio-space/src/gradio_space/app.py b/apps/gradio-space/src/gradio_space/app.py index 88417e1a6792379c5331fb74d589157db092a877..b88f4346c7249e0f7630e815820f210c9c643d01 100644 --- a/apps/gradio-space/src/gradio_space/app.py +++ b/apps/gradio-space/src/gradio_space/app.py @@ -3,8 +3,9 @@ import os import gradio as gr from gradio_space.model_loading import preload_active_model -from gradio_space.tabs import build_chat_tab, build_education_pptx_tab +from gradio_space.tabs import build_chat_tab, build_education_pptx_tab, build_research_mind_tab from gradio_space.tabs.education_pptx import gradio_allowed_paths +from gradio_space.tabs.research_mind import researchmind_allowed_paths from inference.config import get_app_config _app_config = get_app_config() @@ -18,12 +19,12 @@ def build_demo() -> gr.Blocks: else "Using built-in presets (models.yaml not found)." 
) - with gr.Blocks(title="Lesson Agent — Build Small Hackathon") as demo: + with gr.Blocks(title="Lesson Agent + ResearchMind — Build Small Hackathon") as demo: gr.Markdown( f""" -# Lesson Agent +# Lesson Agent + ResearchMind -Local skill-based agent for teachers — **topic in, PowerPoint out**. +Local skill-based agents — **lesson slides** and **research with MemRAG** (offline Q&A after ingest). - **Model:** `{active.key}` — {active.label} - **Backend:** `{active.backend}` @@ -36,6 +37,8 @@ Part of the [Build Small Hackathon](https://huggingface.co/build-small-hackathon with gr.Tabs(): with gr.Tab("Lesson slides"): build_education_pptx_tab() + with gr.Tab("ResearchMind"): + build_research_mind_tab() with gr.Tab("Chat (debug)"): build_chat_tab() @@ -48,7 +51,7 @@ def main() -> None: demo.launch( server_name="0.0.0.0", server_port=int(os.environ.get("PORT", "7860")), - allowed_paths=gradio_allowed_paths(), + allowed_paths=[*gradio_allowed_paths(), *researchmind_allowed_paths()], ) diff --git a/apps/gradio-space/src/gradio_space/model_loading.py b/apps/gradio-space/src/gradio_space/model_loading.py index a8709977318a7c55129487c17ed12bc4dd364de1..f20f109cc735f100ce543c99a71ea5ddd3c9994a 100644 --- a/apps/gradio-space/src/gradio_space/model_loading.py +++ b/apps/gradio-space/src/gradio_space/model_loading.py @@ -1,5 +1,6 @@ from inference.config import get_app_config, get_model_config from inference.factory import get_backend, reset_backend +from inference.response_clean import strip_reasoning_output _app_config = get_app_config() _current_model_key: str | None = None @@ -111,4 +112,5 @@ def chat(message: str, history: list, model_key: str) -> str: messages = _history_to_messages(history) messages.append({"role": "user", "content": message}) - return get_backend(model_key).chat(messages) + reply = get_backend(model_key).chat(messages) + return strip_reasoning_output(reply) diff --git a/apps/gradio-space/src/gradio_space/research_helpers.py 
b/apps/gradio-space/src/gradio_space/research_helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..7f242e78987b6149fd9953a725b50fe6b714c812 --- /dev/null +++ b/apps/gradio-space/src/gradio_space/research_helpers.py @@ -0,0 +1,196 @@ +from __future__ import annotations + +import json +from pathlib import Path + +import gradio as gr + +from agent.models import ResearchIngestResult +from agent.runner import AgentRunner +from gradio_space.model_loading import chat, ensure_model_loaded, get_active_model_key +from inference.factory import get_backend +from researchmind.ingest import IngestPipeline + + +def list_session_choices() -> list[tuple[str, str]]: + store = IngestPipeline().store + sessions = store.list_sessions() + choices: list[tuple[str, str]] = [("New session (chat only)", "")] + for s in sessions: + label = f"{s.topic or 'Untitled'} ({s.id})" + choices.append((label, s.id)) + return choices + + +def refresh_sessions(current: str): + choices = list_session_choices() + values = [c[1] for c in choices] + value = current if current in values else "" + return gr.update(choices=choices, value=value) + + +def list_doc_choices(session_id: str | None) -> list[tuple[str, str]]: + store = IngestPipeline().store + docs = store.list_documents(session_id=session_id or None) + choices: list[tuple[str, str]] = [] + for d in docs: + label = f"{d.title} ({d.source_type})" + if len(d.uri) > 60: + label += f" — {d.uri[:57]}…" + else: + label += f" — {d.uri}" + choices.append((label, d.id)) + return choices + + +def refresh_doc_choices(session_id: str, current: list[str] | None): + choices = list_doc_choices(session_id or None) + valid = {c[1] for c in choices} + selected = [doc_id for doc_id in (current or []) if doc_id in valid] + default_selected = [c[1] for c in choices] if choices and not selected else selected + return gr.update(choices=choices, value=default_selected) + + +def load_trace_json(trace_path: str) -> str: + if not trace_path: + return 
"" + if trace_path.strip().startswith("{"): + return trace_path + path = Path(trace_path) + if path.is_file(): + return path.read_text(encoding="utf-8") + return trace_path + + +def trace_summary_markdown(trace_path: str) -> str: + raw = load_trace_json(trace_path) + if not raw or not raw.strip().startswith("{"): + return raw or "_No trace yet._" + try: + data = json.loads(raw) + except json.JSONDecodeError: + return f"Trace file: `{trace_path}`" + + lines = [ + f"**Run** `{data.get('run_id', '?')}` · skill `{data.get('skill', '?')}`", + "", + ] + for step in data.get("steps", []): + if step.get("type") != "note": + continue + msg = step.get("message", "") + extra = {k: v for k, v in step.items() if k not in ("type", "message")} + detail = "" + if extra: + detail = " — " + ", ".join(f"{k}={v!r}" for k, v in extra.items()) + lines.append(f"- {msg}{detail}") + if len(lines) <= 2: + lines.append("_No notes in trace. See Trace JSON below._") + return "\n".join(lines) + + +def format_ingest_status(result: ResearchIngestResult) -> str: + lines = [result.message, ""] + if result.ingested: + lines.append("**Ingested**") + lines.extend(f"- {url}" for url in result.ingested) + lines.append("") + if result.skipped: + lines.append("**Skipped (duplicate)**") + lines.extend(f"- {url}" for url in result.skipped) + lines.append("") + if result.failures: + lines.append("**Failed**") + for failure in result.failures: + lines.append(f"- `{failure.url}` — _{failure.stage}_: {failure.reason}") + lines.append("") + lines.append("_Open the **Trace** tab for full JSON._") + return "\n".join(lines).strip() + + +def memory_summary(session_id: str) -> str: + store = IngestPipeline().store + docs = store.list_documents(session_id=session_id or None) + chunks = store.count_chunks() + if not docs: + return f"_No documents indexed yet._ Total chunks in store: **{chunks}**." 
+ scope = f"session `{session_id}`" if session_id else "all sessions" + lines = [f"**{len(docs)}** document(s) in {scope} · **{chunks}** total chunks in store\n"] + for d in docs: + lines.append(f"- **{d.title}** (`{d.source_type}`) — {d.uri}") + return "\n".join(lines) + + +def rag_scope_hint(session_id: str, doc_ids: list[str] | None) -> str: + if doc_ids: + return f"RAG scope: **{len(doc_ids)}** selected document(s)." + if session_id: + n = len(IngestPipeline().store.list_documents(session_id=session_id)) + return f"RAG scope: all **{n}** document(s) in session `{session_id}`." + return "RAG scope: **entire** indexed corpus (all sessions)." + + +def run_research_question( + question: str, + *, + session_id: str, + doc_ids: list[str] | None, + model_key: str | None = None, +) -> tuple[str, str, str]: + """Returns (answer_markdown, trace_json, trace_summary_md).""" + key = model_key or get_active_model_key() + load_error = ensure_model_loaded(key) + if load_error: + return load_error, load_error, load_error + + if not question.strip(): + return "Enter a question.", "", "" + + sid = session_id + if not sid: + sid = IngestPipeline().store.create_session().id + + runner = AgentRunner() + result = runner.run_researchmind_chat( + question=question, + session_id=sid, + doc_ids=doc_ids or None, + model_key=key, + backend=get_backend(key), + ) + trace_json = json.dumps( + { + "trace_path": result.trace_path, + "citations": [c.model_dump() for c in result.citations], + "scope": { + "session_id": sid, + "doc_ids": doc_ids or [], + }, + }, + indent=2, + ) + return ( + result.answer, + trace_json, + trace_summary_markdown(result.trace_path), + ) + + +def rag_aware_chat( + message: str, + history: list, + model_key: str, + use_rag: bool, + session_id: str, + doc_ids: list[str] | None, +) -> str: + if not use_rag: + return chat(message, history, model_key) + + answer, _, _ = run_research_question( + message, + session_id=session_id, + doc_ids=doc_ids, + model_key=model_key, + ) 
+ return answer diff --git a/apps/gradio-space/src/gradio_space/tabs/__init__.py b/apps/gradio-space/src/gradio_space/tabs/__init__.py index 4c265d25cab953bb485fefb082e6f7f72eaf43c4..cc127edcedfc7e5a8ee71c68804943f369c8c7c3 100644 --- a/apps/gradio-space/src/gradio_space/tabs/__init__.py +++ b/apps/gradio-space/src/gradio_space/tabs/__init__.py @@ -1,4 +1,5 @@ from gradio_space.tabs.chat import build_chat_tab from gradio_space.tabs.education_pptx import build_education_pptx_tab +from gradio_space.tabs.research_mind import build_research_mind_tab -__all__ = ["build_chat_tab", "build_education_pptx_tab"] +__all__ = ["build_chat_tab", "build_education_pptx_tab", "build_research_mind_tab"] diff --git a/apps/gradio-space/src/gradio_space/tabs/chat.py b/apps/gradio-space/src/gradio_space/tabs/chat.py index 325be244cd7da5ce4b2b0749b908a05388a9b53e..57ab1c3cbf844a37767858573f0306613845f5e5 100644 --- a/apps/gradio-space/src/gradio_space/tabs/chat.py +++ b/apps/gradio-space/src/gradio_space/tabs/chat.py @@ -1,6 +1,13 @@ import gradio as gr -from gradio_space.model_loading import chat, model_status +from gradio_space.model_loading import model_status +from gradio_space.research_helpers import ( + list_session_choices, + rag_aware_chat, + rag_scope_hint, + refresh_doc_choices, + refresh_sessions, +) from inference.config import get_app_config _app_config = get_app_config() @@ -11,12 +18,29 @@ def build_chat_tab() -> None: """ ### Model chat (debug) -Test the active local model with a simple chat interface. +Test the active local model. Enable **ResearchMind RAG** to answer from ingested sessions and documents with citations. 
""" ) model_key = _app_config.active_model + with gr.Row(): + use_rag = gr.Checkbox(label="Use ResearchMind RAG", value=False) + session_dd = gr.Dropdown( + label="Session", + choices=list_session_choices(), + value="", + interactive=True, + ) + refresh_sessions_btn = gr.Button("Refresh", size="sm") + + doc_dd = gr.CheckboxGroup( + label="Documents to search (empty = all docs in session, or entire corpus if no session)", + choices=[], + value=[], + ) + rag_hint = gr.Markdown(value=rag_scope_hint("", [])) + if _app_config.allow_model_switch and len(_app_config.models) > 1: model_dropdown = gr.Dropdown( choices=_app_config.model_choices(), @@ -26,19 +50,42 @@ Test the active local model with a simple chat interface. status = gr.Markdown(model_status(model_key)) model_dropdown.change(fn=model_status, inputs=model_dropdown, outputs=status) gr.ChatInterface( - fn=chat, - additional_inputs=[model_dropdown], + fn=rag_aware_chat, + additional_inputs=[model_dropdown, use_rag, session_dd, doc_dd], examples=[ - ["Hello! What can you help me with?", _app_config.active_model], - ["Explain photosynthesis in one sentence.", _app_config.active_model], + ["What do my ingested sources say about AI agents?", _app_config.active_model, True, "", []], + ["Hello! What can you help me with?", _app_config.active_model, False, "", []], ], ) else: status = gr.Markdown(model_status(model_key)) + + def _chat(message, history, use_rag_flag, sid, docs): + return rag_aware_chat(message, history, model_key, use_rag_flag, sid, docs) + gr.ChatInterface( - fn=lambda message, history: chat(message, history, model_key), + fn=_chat, + additional_inputs=[use_rag, session_dd, doc_dd], examples=[ - "Hello! What can you help me with?", - "Explain photosynthesis in one sentence.", + ["What do my ingested sources say about AI agents?", True, "", []], + ["Hello! 
def discover_sources(
    topic: str,
    ingest_mode: str,
    session_id: str,
) -> tuple[str, gr.Update, str, str, str, str, object]:
    """Find candidate sources for ``topic`` or, in auto mode, ingest them directly.

    Args:
        topic: Free-text research topic; blank input is rejected with a hint.
        ingest_mode: ``"auto"`` searches and ingests in one step; any other value
            only suggests URLs for the user to confirm.
        session_id: Currently selected session; a falsy value lets the runner
            create a new one.

    Returns:
        A 7-tuple: status markdown, update for the suggested-URL checkbox group,
        session id, trace summary markdown, trace JSON, memory summary, and the
        refreshed document choices.
    """

    def _failure(message: str) -> tuple[str, gr.Update, str, str, str, str, object]:
        # Every early exit shows the same message in the status, trace-summary and
        # trace-JSON slots and clears the suggested-URL list.
        return (
            message,
            gr.update(choices=[], value=[]),
            session_id,
            message,
            message,
            memory_summary(session_id),
            refresh_doc_choices(session_id, []),
        )

    model_key = get_active_model_key()
    load_error = ensure_model_loaded(model_key)
    if load_error:
        return _failure(load_error)

    # Guard against a missing value as well as a blank one.
    if not (topic or "").strip():
        return _failure("Enter a topic to discover sources.")

    try:
        runner = AgentRunner()
        if ingest_mode == "auto":
            result = runner.run_researchmind_ingest(
                topic=topic,
                urls=[],
                files=[],
                auto_search=True,
                session_id=session_id or None,
                model_key=model_key,
                backend=get_backend(model_key),
            )
            trace_json = load_trace_json(result.trace_path)
            return (
                format_ingest_status(result),
                # Nothing left to confirm: sources were ingested immediately.
                gr.update(choices=[], value=[]),
                result.session_id,
                trace_summary_markdown(result.trace_path),
                trace_json,
                memory_summary(result.session_id),
                refresh_doc_choices(result.session_id, []),
            )

        discover = runner.run_researchmind_discover(
            topic=topic,
            auto_search=False,
            session_id=session_id or None,
            model_key=model_key,
            backend=get_backend(model_key),
        )
        choices = discover.suggested_urls
        if not choices:
            summary = (
                "No verified URLs found. Try a more specific topic, paste URLs manually, "
                "or switch to **Auto search & ingest**."
            )
        else:
            summary = (
                f"Found **{len(choices)} verified URL(s)** via web search "
                f"(Google + fallbacks). Select sources and click **Ingest selected**."
            )
        trace_json = load_trace_json(discover.trace_path)
        return (
            summary,
            # Pre-select every suggestion; the user unticks what they do not want.
            gr.update(choices=choices, value=choices),
            discover.session_id,
            trace_summary_markdown(discover.trace_path),
            trace_json,
            memory_summary(discover.session_id),
            refresh_doc_choices(discover.session_id, []),
        )
    except Exception as exc:  # noqa: BLE001
        # Keep the traceback in the server log; the UI only shows the short message.
        logger.exception("Discover failed")
        return _failure(f"Discover error: {exc}")
def ask_question(
    question: str,
    session_id: str,
    doc_ids: list[str] | None,
    chat_history: list[dict],
) -> tuple[list[dict], str, str, str]:
    """Answer one research question and extend the chat transcript.

    Returns the updated message list, the trace JSON text, the trace summary
    and the retrieval-scope hint for the current session/document selection.
    A blank question leaves the transcript untouched; a failure is logged and
    reported as the assistant's reply.
    """
    if not question.strip():
        return chat_history or [], "Enter a question.", "", rag_scope_hint(session_id, doc_ids)

    user_turn = {"role": "user", "content": question}
    try:
        answer, trace_json, trace_summary = run_research_question(
            question,
            session_id=session_id,
            doc_ids=doc_ids,
        )
        transcript = [
            *(chat_history or []),
            user_turn,
            {"role": "assistant", "content": answer},
        ]
        return transcript, trace_json, trace_summary, rag_scope_hint(session_id, doc_ids)
    except Exception as exc:  # noqa: BLE001
        logger.exception("Research chat failed")
        failure = f"Chat error: {exc}"
        transcript = [
            *(chat_history or []),
            user_turn,
            {"role": "assistant", "content": failure},
        ]
        return transcript, failure, failure, rag_scope_hint(session_id, doc_ids)
"""ResearchMind UI — ingest, memory, trace, and corpus chat.""" + model_key = get_active_model_key() + cfg = get_config() + + gr.Markdown( + """ +### ResearchMind + +Scrape sources once, index into **MemRAG** (local SQLite + embeddings), then ask questions **offline** with citations. +""" + ) + gr.Markdown(model_status(model_key)) + gr.Markdown(f"Memory store: `{cfg.data_dir.resolve()}`") + + with gr.Row(): + session_dd = gr.Dropdown( + label="Session", + choices=list_session_choices(), + value="", + interactive=True, + ) + refresh_btn = gr.Button("Refresh sessions", size="sm") + + with gr.Tabs(): + with gr.Tab("Ingest"): + gr.Markdown( + """ +- **Suggest mode:** Google web search → verified URLs → you confirm → ingest +- **Auto search:** same search, ingests top verified URLs immediately +- **Direct:** paste URLs or upload PDF/DOCX +""" + ) + with gr.Row(): + topic = gr.Textbox( + label="Topic (optional)", + placeholder="e.g. Photosynthesis, American Revolution", + ) + ingest_mode = gr.Dropdown( + label="Ingest mode", + choices=[m[0] for m in INGEST_MODES], + value=INGEST_MODES[0][0], + ) + + urls_text = gr.Textbox( + label="URLs (one per line, optional)", + lines=3, + placeholder="https://en.wikipedia.org/wiki/...", + ) + upload_files = gr.File( + label="Upload PDF or DOCX", + file_count="multiple", + file_types=[".pdf", ".docx"], + ) + + discover_btn = gr.Button("Discover sources", variant="secondary") + url_choices = gr.CheckboxGroup(label="Suggested URLs to ingest", choices=[]) + ingest_btn = gr.Button("Ingest selected", variant="primary") + ingest_status = gr.Markdown() + + with gr.Tab("Memory"): + gr.Markdown("Indexed documents and chunk counts for the selected session.") + memory_md = gr.Markdown(value=memory_summary("")) + refresh_memory_btn = gr.Button("Refresh memory view", size="sm") + + with gr.Tab("Trace"): + trace_summary = gr.Markdown() + trace_box = gr.Textbox(label="Trace JSON", lines=14, interactive=False) + + gr.Markdown("---") + 
gr.Markdown("### Chat with your corpus") + gr.Markdown( + "Ask questions about ingested sources. Limit search to specific documents below, " + "or leave all checked to search the whole session." + ) + rag_hint = gr.Markdown(value=rag_scope_hint("", [])) + doc_dd = gr.CheckboxGroup( + label="Documents in session", + choices=[], + value=[], + ) + chatbot = gr.Chatbot(label="Research chat", height=360) + question = gr.Textbox( + label="Question", + placeholder="What do these sources say about AI agents?", + ) + ask_btn = gr.Button("Ask", variant="primary") + + refresh_btn.click(fn=refresh_sessions, inputs=[session_dd], outputs=[session_dd]) + refresh_memory_btn.click(fn=memory_summary, inputs=[session_dd], outputs=[memory_md]) + session_dd.change(fn=memory_summary, inputs=[session_dd], outputs=[memory_md]) + session_dd.change( + fn=refresh_doc_choices, + inputs=[session_dd, doc_dd], + outputs=[doc_dd], + ).then( + fn=rag_scope_hint, + inputs=[session_dd, doc_dd], + outputs=[rag_hint], + ) + doc_dd.change(fn=rag_scope_hint, inputs=[session_dd, doc_dd], outputs=[rag_hint]) + + discover_btn.click( + fn=lambda topic, mode, sid: discover_sources( + topic, + "auto" if mode == INGEST_MODES[1][0] else "suggest", + sid, + ), + inputs=[topic, ingest_mode, session_dd], + outputs=[ + ingest_status, + url_choices, + session_dd, + trace_summary, + trace_box, + memory_md, + doc_dd, + ], + ) + + ingest_btn.click( + fn=ingest_selected, + inputs=[topic, urls_text, url_choices, upload_files, session_dd], + outputs=[ingest_status, memory_md, trace_box, trace_summary, session_dd, doc_dd], + ) + + ask_btn.click( + fn=ask_question, + inputs=[question, session_dd, doc_dd, chatbot], + outputs=[chatbot, trace_box, trace_summary, rag_hint], + ) + question.submit( + fn=ask_question, + inputs=[question, session_dd, doc_dd, chatbot], + outputs=[chatbot, trace_box, trace_summary, rag_hint], + ) + + +def researchmind_allowed_paths() -> list[str]: + cfg = get_config() + root = cfg.data_dir.resolve() 
+ root.mkdir(parents=True, exist_ok=True) + return [str(root)] diff --git a/libs/agent/pyproject.toml b/libs/agent/pyproject.toml index 11c6b37acf2e37aae550dd6e549d9c18e46819f7..9eb7e9faea13bcdda530df9c91abb25a299e40f6 100644 --- a/libs/agent/pyproject.toml +++ b/libs/agent/pyproject.toml @@ -9,6 +9,7 @@ authors = [ requires-python = ">=3.12" dependencies = [ "inference", + "researchmind", "pillow>=10.0.0", "pydantic>=2.0.0", "python-docx>=1.1.0", @@ -18,6 +19,7 @@ dependencies = [ [tool.uv.sources] inference = { workspace = true } +researchmind = { workspace = true } [build-system] requires = ["uv_build>=0.8.13,<0.9.0"] diff --git a/libs/agent/src/agent/models.py b/libs/agent/src/agent/models.py index 22b1af3b88f26fe65a207f2131a3b08900dce4c9..e5d3da59eb0b401cb8b72a741f9d4b3e4d823f08 100644 --- a/libs/agent/src/agent/models.py +++ b/libs/agent/src/agent/models.py @@ -18,3 +18,55 @@ class EducationPptxInput(BaseModel): topic: str grade: str slide_count: int = Field(ge=3, le=8) + + +class Citation(BaseModel): + index: int + chunk_id: str + doc_title: str + doc_uri: str + excerpt: str + + +class ResearchIngestInput(BaseModel): + topic: str = "" + urls: list[str] = Field(default_factory=list) + auto_search: bool = False + session_id: str | None = None + + +class ResearchChatInput(BaseModel): + question: str + session_id: str + doc_ids: list[str] = Field(default_factory=list) + + +class ResearchDiscoverResult(BaseModel): + suggested_urls: list[str] + session_id: str + trace_path: str + + +class IngestFailure(BaseModel): + url: str + reason: str + stage: str = "unknown" + + +class ResearchIngestResult(BaseModel): + session_id: str + ingested: list[str] + skipped: list[str] + failures: list[IngestFailure] = Field(default_factory=list) + doc_count: int + chunk_count: int + trace_path: str + message: str + + +class ResearchChatResult(BaseModel): + answer: str + citations: list[Citation] + references_markdown: str + session_id: str + trace_path: str diff --git 
def _load_reference(skill_path: Path, rel: str) -> str:
    """Read a reference file that sits next to the skill file.

    ``rel`` is resolved against the directory containing ``skill_path``.
    Returns the file's text, or ``""`` when no such file exists.
    """
    ref = skill_path.parent / rel
    if ref.is_file():
        return ref.read_text(encoding="utf-8")
    return ""


def research_answer_system(skill_body: str, skill_path: Path) -> str:
    """Build the system prompt for citation-backed answers.

    The fixed grounding/citation rules come first, followed by the skill body
    and the optional ``references/citation-format.md`` text. Blank optional
    sections are left out so the prompt never contains empty paragraphs, and
    surrounding whitespace is trimmed so sections are separated by exactly one
    blank line.
    """
    parts = [
        "You are ResearchMind, a local research assistant.",
        "Answer ONLY from the provided context.",
        "Each context block is numbered [1], [2], … — one number per source document.",
        "Cite with those numbers only (e.g. [1]). Use at most a few citations per answer.",
        "Ignore any [n] markers inside source text; never list citation numbers in a row.",
    ]
    optional_sections = (
        skill_body,
        _load_reference(skill_path, "references/citation-format.md"),
    )
    parts.extend(section.strip() for section in optional_sections if section and section.strip())
    return "\n\n".join(parts)


def research_answer_user(question: str, context: str) -> str:
    """Build the user turn: retrieved context, the question, then answer rules."""
    return f"""Context:
{context}

Question: {question}

Write a concise answer with inline [n] citations (one index per source document).
Do not append a References section — it is added automatically.
If context is insufficient, say so."""
def run_researchmind_ingest(
    self,
    *,
    topic: str | None,
    urls: list[str],
    files: list[Path],
    auto_search: bool,
    session_id: str | None,
    model_key: str,
    backend: InferenceBackend,
) -> ResearchIngestResult:
    """Scrape, extract and index the given URLs and files into one session.

    Args:
        topic: Optional topic; used as the session topic and, with
            ``auto_search``, as the search query.
        urls: Candidate URLs; blank entries are dropped.
        files: Local files. ``.pdf`` and ``.docx`` use their extractors; any
            other suffix is read as UTF-8 text.
        auto_search: When true and no URLs or files were supplied, discover
            URLs for ``topic`` first.
        session_id: Existing session to extend; a new one is created when it is
            missing or unknown to the store.
        model_key: Identifier recorded in the trace.
        backend: Inference backend; loaded before any work starts.

    Returns:
        A ``ResearchIngestResult`` listing newly ingested sources, duplicates
        (``skipped``), per-source ``failures``, document/chunk counts, the trace
        path and a human-readable summary.
    """
    # Resolved at call time so a replacement installed on the
    # researchmind.url_validate module is honoured.
    from researchmind.extract import ExtractedDocument
    from researchmind.url_validate import validate_url

    from agent.models import IngestFailure

    skill = self._research_skill()
    pipeline = IngestPipeline()
    store = pipeline.store
    sid = self._ensure_session(store, session_id, topic=topic or "")

    trace = TraceRecorder(
        skill=skill.name,
        model=model_key,
        user_input={
            "topic": topic,
            "urls": urls,
            "files": [str(f) for f in files],
            "auto_search": auto_search,
            "session_id": sid,
        },
    )
    backend.load()

    targets = [u.strip() for u in urls if u.strip()]
    if auto_search and topic and not targets and not files:
        discover = self.run_researchmind_discover(
            topic=topic,
            auto_search=True,
            session_id=sid,
            model_key=model_key,
            backend=backend,
        )
        targets = discover.suggested_urls

    ingested: list[str] = []
    skipped: list[str] = []
    failures: list[IngestFailure] = []

    scrape_web = self._tools.get("scrape_web")
    extract_index = self._tools.get("extract_and_index")

    for url in targets:
        ok, reason, normalized = validate_url(url, check_reachable=False)
        if not ok:
            trace.log_note(f"Skipped invalid URL {url}", reason=reason, stage="validate")
            failures.append(IngestFailure(url=url, reason=reason, stage="validate"))
            continue
        try:
            doc = scrape_web.handler(normalized)
            if not (doc.text or "").strip():
                msg = "empty content after scrape"
                trace.log_note(f"Ingest failed for {url}", error=msg, stage="scrape")
                failures.append(IngestFailure(url=url, reason=msg, stage="scrape"))
                continue
            # Record the scrape before indexing so it is traced even if indexing fails.
            trace.log_tool("scrape_web", {"url": url}, doc.title)
            doc_id, is_new = extract_index.handler(doc, session_id=sid)
            trace.log_tool(
                "extract_and_index",
                {"uri": doc.uri},
                f"{doc_id} new={is_new}",
            )
            (ingested if is_new else skipped).append(url)
        except Exception as exc:  # noqa: BLE001
            trace.log_note(f"Ingest failed for {url}", error=str(exc), stage="ingest")
            failures.append(IngestFailure(url=url, reason=str(exc), stage="ingest"))

    for file_path in files:
        path = Path(file_path)
        try:
            suffix = path.suffix.lower()
            if suffix == ".pdf":
                doc = self._tools.get("scrape_pdf").handler(path)
            elif suffix == ".docx":
                doc = extract_docx(path)
            else:
                text = path.read_text(encoding="utf-8", errors="replace")
                doc = ExtractedDocument(
                    source_type="file",
                    uri=str(path.resolve()),
                    title=path.stem,
                    text=text,
                )
            doc_id, is_new = extract_index.handler(doc, session_id=sid)
            trace.log_tool("extract_and_index", {"file": str(path)}, f"{doc_id} new={is_new}")
            (ingested if is_new else skipped).append(path.name)
        except Exception as exc:  # noqa: BLE001
            # A file that could not be read or indexed is a failure, not a
            # duplicate: report it where URL failures are reported so the
            # "failed N" count in the summary is accurate.
            trace.log_note(f"Ingest failed for {path}", error=str(exc), stage="file")
            failures.append(IngestFailure(url=str(path), reason=str(exc), stage="file"))

    doc_count = len(store.list_documents(session_id=sid))
    chunk_count = store.count_chunks()
    fail_n = len(failures)
    message = (
        f"Ingested {len(ingested)} source(s), skipped/duplicate {len(skipped)}, "
        f"failed {fail_n}. Session `{sid}` has {doc_count} document(s); "
        f"{chunk_count} total chunks."
    )
    trace.log_note(message, failures=[f.model_dump() for f in failures])
    trace_path = str(trace.save())

    return ResearchIngestResult(
        session_id=sid,
        ingested=ingested,
        skipped=skipped,
        failures=failures,
        doc_count=doc_count,
        chunk_count=chunk_count,
        trace_path=trace_path,
        message=message,
    )
tools: list[str] model_hints: list[str] + flags: dict[str, Any] body: str path: Path @@ -44,12 +45,16 @@ def _parse_skill_md(path: Path) -> Skill: meta: dict[str, Any] = yaml.safe_load(match.group(1)) or {} body = match.group(2).strip() + raw_flags = meta.get("flags") or {} + flags = {str(k): v for k, v in raw_flags.items()} if isinstance(raw_flags, dict) else {} + return Skill( name=str(meta.get("name", path.parent.name)), description=str(meta.get("description", "")), task=str(meta.get("task", "")), tools=[str(t) for t in meta.get("tools", [])], model_hints=[str(m) for m in meta.get("model_hints", [])], + flags=flags, body=body, path=path, ) diff --git a/libs/agent/src/agent/tools/research_tools.py b/libs/agent/src/agent/tools/research_tools.py new file mode 100644 index 0000000000000000000000000000000000000000..7bc04697d865d6bcc396a7915200d349b8ce78f8 --- /dev/null +++ b/libs/agent/src/agent/tools/research_tools.py @@ -0,0 +1,93 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any + +from researchmind.citations import Citation, clean_model_answer, format_context_block, format_references +from researchmind.config import get_config +from researchmind.extract import ExtractedDocument +from researchmind.ingest import IngestPipeline +from researchmind.retrieve import retrieve +from researchmind.scrape_pdf import extract_pdf +from researchmind.scrape_web import fetch_and_extract +from researchmind.search_urls import search_urls +from researchmind.store import MemRAGStore +from researchmind.url_suggest import suggest_urls as llm_suggest_urls + +from agent.research_prompts import research_answer_system, research_answer_user + + +def get_store() -> MemRAGStore: + return IngestPipeline().store + + +def tool_suggest_urls(topic: str, backend: Any) -> list[str]: + return llm_suggest_urls(topic, backend) + + +def tool_scrape_web(url: str) -> ExtractedDocument: + return fetch_and_extract(url) + + +def tool_scrape_pdf(path: Path) -> 
def tool_research_answer(
    question: str,
    backend: Any,
    *,
    skill_body: str,
    skill_path: Path,
    session_id: str | None = None,
    doc_ids: list[str] | None = None,
) -> tuple[str, list[Citation], str]:
    """Retrieve context for ``question`` and ask the backend for a cited answer.

    Retrieval is limited to ``doc_ids`` when any are given, otherwise to
    ``session_id`` when one is given, otherwise it is unrestricted. Returns the
    cleaned answer, the citations built from the retrieved chunks and the
    formatted references text. When nothing is retrieved, a hint is returned
    in place of the answer, with no citations and empty references. When a
    session id is present, the question and the answer are both saved to that
    session with the cited chunk ids.
    """
    cfg = get_config()
    store = get_store()

    # A document selection takes precedence over the session scope.
    session_scope = None if (doc_ids or not session_id) else session_id
    hits = retrieve(
        question,
        store,
        config=cfg,
        session_id=session_scope,
        doc_ids=doc_ids or None,
    )

    if not hits:
        if doc_ids:
            return "No chunks for the selected document(s). Try other sources or re-ingest.", [], ""
        if session_id:
            return "No indexed sources in this session yet. Ingest URLs or files first.", [], ""
        return "No indexed sources yet. Ingest URLs or documents first.", [], ""

    context, citations = format_context_block(hits)
    conversation = [
        {"role": "system", "content": research_answer_system(skill_body, skill_path)},
        {"role": "user", "content": research_answer_user(question, context)},
    ]
    raw_reply = backend.chat(conversation, max_tokens=1024, temperature=0.3)
    answer = clean_model_answer(raw_reply)
    refs = format_references(citations)

    if session_id:
        for role, content in (("user", question), ("assistant", answer)):
            store.add_message(session_id, role, content, [c.chunk_id for c in citations])

    return answer, citations, refs
"research_answer", + "Answer a question with RAG citations from MemRAG", + tool_research_answer, + ) + self.register( + "search_urls", + "Web search for URLs on a topic (DuckDuckGo)", + tool_search_urls, + ) def register(self, name: str, description: str, handler: Callable[..., Any]) -> None: self._tools[name] = ToolSpec(name=name, description=description, handler=handler) diff --git a/libs/agent/tests/test_research_runner.py b/libs/agent/tests/test_research_runner.py new file mode 100644 index 0000000000000000000000000000000000000000..a71601c6d0ac434b4eea3e0c2a6626e906a76c48 --- /dev/null +++ b/libs/agent/tests/test_research_runner.py @@ -0,0 +1,107 @@ +from __future__ import annotations + +from pathlib import Path + +import numpy as np +import pytest + +from agent.runner import AgentRunner +from researchmind.config import ResearchMindConfig +from researchmind.extract import ExtractedDocument +from researchmind.store import MemRAGStore + + +class MockBackend: + def load(self) -> None: + return None + + def chat(self, messages, *, max_tokens=512, temperature=0.7): + user = messages[-1]["content"] + if "Topic:" in user: + return '["https://example.com/a", "https://example.com/b"]' + return "Plants use photosynthesis [1]." 
+ + def generate(self, prompt, *, max_tokens=512, temperature=0.7): + return self.chat([{"role": "user", "content": prompt}], max_tokens=max_tokens) + + +@pytest.fixture +def research_env(tmp_path, monkeypatch): + cfg = ResearchMindConfig( + data_dir=tmp_path / "rm", + embed_model="test", + auto_search=False, + top_k=2, + max_context_chunks=8, + chunk_size=50, + chunk_overlap=10, + ) + monkeypatch.setenv("RESEARCHMIND_DATA_DIR", str(cfg.data_dir)) + + def fake_embed(texts, *, model_name): + vecs = [] + for t in texts: + vecs.append(np.array([1.0, 0.0, 0.0], dtype=np.float32)) + return np.stack(vecs) if vecs else np.zeros((0, 3), dtype=np.float32) + + monkeypatch.setattr("researchmind.ingest.embed_texts", fake_embed) + monkeypatch.setattr("researchmind.retrieve.embed_texts", fake_embed) + + def fake_scrape(url: str): + return ExtractedDocument( + source_type="web", + uri=url, + title="Example", + text="Photosynthesis converts light to energy in plants.", + ) + + monkeypatch.setattr("agent.tools.research_tools.fetch_and_extract", fake_scrape) + + def fake_search(topic, *, n=5, check_reachable=True): + return [f"https://example.com/{topic.replace(' ', '-')}"] + + monkeypatch.setattr("agent.tools.research_tools.search_urls", fake_search) + + def fake_validate(url, *, check_reachable=True): + normalized = url if url.startswith("http") else f"https://{url}" + return True, "ok", normalized + + monkeypatch.setattr("researchmind.url_validate.validate_url", fake_validate) + return cfg + + +def test_discover_urls(research_env): + runner = AgentRunner() + result = runner.run_researchmind_discover( + topic="photosynthesis", + auto_search=False, + session_id=None, + model_key="test", + backend=MockBackend(), + ) + assert len(result.suggested_urls) >= 1 + assert result.session_id + + +def test_ingest_and_chat(research_env): + runner = AgentRunner() + ingest = runner.run_researchmind_ingest( + topic=None, + urls=["https://example.com/a"], + files=[], + auto_search=False, + 
session_id=None, + model_key="test", + backend=MockBackend(), + ) + assert ingest.doc_count >= 1 + assert ingest.chunk_count >= 1 + + chat = runner.run_researchmind_chat( + question="How do plants make energy?", + session_id=ingest.session_id, + model_key="test", + backend=MockBackend(), + ) + assert "photosynthesis" in chat.answer.lower() or "[1]" in chat.answer + assert chat.session_id == ingest.session_id diff --git a/libs/inference/src/inference/response_clean.py b/libs/inference/src/inference/response_clean.py new file mode 100644 index 0000000000000000000000000000000000000000..66d6c8c36bb14425a8d6157822fd1685334e4496 --- /dev/null +++ b/libs/inference/src/inference/response_clean.py @@ -0,0 +1,87 @@ +from __future__ import annotations + +import re + +_RT_OPEN = "<" + "redacted_thinking" + ">" +_RT_CLOSE = "" +_THINK_OPEN = "<" + "think" + ">" +_THINK_CLOSE = "" + +_THINK_BLOCKS = re.compile( + "|".join( + ( + re.escape(_RT_OPEN) + r".*?" + re.escape(_RT_CLOSE), + re.escape(_THINK_OPEN) + r".*?" 
+ re.escape(_THINK_CLOSE), + r".*?", + ) + ), + re.DOTALL | re.IGNORECASE, +) +_MALFORMED_THINK_OPEN = re.compile(r"^think>\s*", re.IGNORECASE) +_ANSWER_SPLITS = [ + re.compile(r"(?:Let's draft:|Draft:)\s*", re.IGNORECASE), + re.compile(r"\nSummary:\s*", re.IGNORECASE), + re.compile(r"\nAnswer:\s*", re.IGNORECASE), + re.compile(r"\n\n(?:In summary|To summarize)[,:]\s*", re.IGNORECASE), +] +_META_TAIL = re.compile( + r"\n\n(?:Now,|We need|Also,|But we|However,|The instruction|So we|" + r"That means|We must|We should|We have|We can)\b", + re.IGNORECASE, +) +_REASONING_OPENERS = ( + "we need to", + "first,", + "the user", + "let me", + "okay,", + "now, let", + "i need to", +) + + +def _normalize_extracted(text: str) -> str: + cleaned = text.strip() + cleaned = re.sub(r"^Summary:\s*", "", cleaned, flags=re.IGNORECASE) + cleaned = re.sub(r"^Answer:\s*", "", cleaned, flags=re.IGNORECASE) + return cleaned.strip() + + +def _extract_answer_from_reasoning(text: str) -> str | None: + for pattern in _ANSWER_SPLITS: + match = pattern.search(text) + if not match: + continue + rest = _normalize_extracted(text[match.end() :]) + rest = _META_TAIL.split(rest, maxsplit=1)[0].strip() + if len(rest) >= 40: + return rest + return None + + +def looks_like_reasoning_only(text: str) -> bool: + sample = text[:240].lower() + return any(sample.startswith(opener) for opener in _REASONING_OPENERS) + + +def strip_reasoning_output(text: str) -> str: + """Remove model chain-of-thought / thinking traces from user-visible replies.""" + cleaned = text.strip() + if not cleaned: + return "" + + cleaned = _THINK_BLOCKS.sub("", cleaned).strip() + + if _MALFORMED_THINK_OPEN.match(cleaned): + body = _MALFORMED_THINK_OPEN.sub("", cleaned, count=1).strip() + extracted = _extract_answer_from_reasoning(body) + if extracted: + return extracted + cleaned = body + + if looks_like_reasoning_only(cleaned): + extracted = _extract_answer_from_reasoning(cleaned) + if extracted: + return extracted + + return cleaned 
# Tags are assembled from pieces so the literal markers never appear in this file.
# The closing tags must be real closing tags: with empty strings the fixtures
# below would contain an opening tag that is never closed.
_RT_OPEN = "<" + "redacted_thinking" + ">"
_RT_CLOSE = "</" + "redacted_thinking" + ">"
_THINK_OPEN = "<" + "think" + ">"
_THINK_CLOSE = "</" + "think" + ">"


def test_strips_redacted_thinking_block():
    """A complete redacted-thinking block is removed, leaving only the answer."""
    raw = f"{_RT_OPEN}\nplanning...\n{_RT_CLOSE}\n\nThe capital of France is Paris."
    assert strip_reasoning_output(raw) == "The capital of France is Paris."


def test_strips_think_block():
    """A complete think block is removed, leaving only the answer."""
    raw = f"{_THINK_OPEN}\nplanning...\n{_THINK_CLOSE}\n\nAgents use memory [1]."
    assert strip_reasoning_output(raw) == "Agents use memory [1]."


def test_strips_malformed_think_prefix_and_extracts_summary():
    """A truncated opening tag plus self-talk is reduced to the drafted summary."""
    raw = """think> We need to summarize the document. First, identify sources.

Let's draft:

Summary: This review covers AI agent applications, evaluation, and future work [1]."""
    out = strip_reasoning_output(raw)
    assert out.startswith("This review covers")
    assert "We need to summarize" not in out


def test_preserves_normal_answer():
    """Text without any reasoning markers passes through unchanged."""
    text = "AI agents combine perception, planning, and action [1]."
    assert strip_reasoning_output(text) == text
+ +- Scrape web (httpx + trafilatura), PDF (pypdf), DOCX (python-docx) +- Chunk, embed (sentence-transformers), store in SQLite +- Top-k retrieval with graph neighbor expansion and citation formatting + +Set `RESEARCHMIND_DATA_DIR` (default `outputs/researchmind`) for the memory database and raw snapshots. diff --git a/libs/researchmind/pyproject.toml b/libs/researchmind/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..18bf42723225c31439265657127df79346a8cf33 --- /dev/null +++ b/libs/researchmind/pyproject.toml @@ -0,0 +1,25 @@ +[project] +name = "researchmind" +version = "0.1.0" +description = "Local scraper + RAG + MemRAG store for ResearchMind agent" +readme = "README.md" +authors = [ + { name = "MSGhais", email = "msghais135@gmail.com" } +] +requires-python = ">=3.12" +dependencies = [ + "inference", + "ddgs>=9.0.0", + "googlesearch-python>=1.3.0", + "httpx>=0.28.0", + "numpy>=2.0.0", + "pydantic>=2.0.0", + "pypdf>=5.0.0", + "python-docx>=1.1.0", + "sentence-transformers>=3.0.0", + "trafilatura>=2.0.0", +] + +[build-system] +requires = ["uv_build>=0.8.13,<0.9.0"] +build-backend = "uv_build" diff --git a/libs/researchmind/src/researchmind/__init__.py b/libs/researchmind/src/researchmind/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1e9684b77301d031152195832ceaa1b0fb44ba98 --- /dev/null +++ b/libs/researchmind/src/researchmind/__init__.py @@ -0,0 +1,11 @@ +from researchmind.config import get_config +from researchmind.extract import ExtractedDocument +from researchmind.ingest import IngestPipeline +from researchmind.store import MemRAGStore + +__all__ = [ + "ExtractedDocument", + "IngestPipeline", + "MemRAGStore", + "get_config", +] diff --git a/libs/researchmind/src/researchmind/chunking.py b/libs/researchmind/src/researchmind/chunking.py new file mode 100644 index 0000000000000000000000000000000000000000..13c40a9e03edaeb8b1e1ecfaa4f51f447894b8be --- /dev/null +++ 
@dataclass(frozen=True)
class TextChunk:
    """One window of whitespace-separated words cut from a document."""

    chunk_id: str
    ordinal: int
    text: str


def _approx_tokens(text: str) -> int:
    """Approximate a token count as the number of non-whitespace runs in ``text``."""
    return sum(1 for _ in re.finditer(r"\S+", text))


def chunk_text(
    text: str,
    *,
    doc_id: str,
    chunk_size: int = 512,
    chunk_overlap: int = 128,
) -> list[TextChunk]:
    """Cut ``text`` into overlapping word windows.

    Args:
        text: Source text; it is split on whitespace.
        doc_id: Prefix used in every chunk id and mixed into its digest.
        chunk_size: Maximum number of words per window.
        chunk_overlap: Number of words shared by consecutive windows. The
            window start always advances by at least one word.

    Returns:
        Chunks in document order with consecutive ordinals starting at 0;
        an empty list when ``text`` contains no words.
    """
    tokens = text.split()
    total = len(tokens)
    stride = max(1, chunk_size - chunk_overlap)

    windows: list[TextChunk] = []
    offset = 0
    position = 0
    while offset < total:
        stop = min(total, offset + chunk_size)
        body = " ".join(tokens[offset:stop]).strip()
        if body:
            # The digest covers id, ordinal and text, so the chunk id changes
            # whenever any of the three does.
            fingerprint = hashlib.sha256(f"{doc_id}:{position}:{body}".encode()).hexdigest()[:16]
            windows.append(
                TextChunk(
                    chunk_id=f"{doc_id}_{position}_{fingerprint}",
                    ordinal=position,
                    text=body,
                )
            )
            position += 1
        if stop >= total:
            # The last window already reached the end of the text.
            break
        offset += stride

    return windows
def format_context_block(chunks: list[StoredChunk]) -> tuple[str, list[Citation]]:
    """Render retrieved chunks as LLM context, one numbered entry per source URI.

    Chunks that share a ``doc_uri`` are merged under a single citation index.
    Sources are numbered from 1 in order of first appearance, and the title
    shown for a source is taken from the first chunk seen for it.

    Returns:
        ``(context, citations)``: the context string with entries separated
        by a ``---`` rule, and one ``Citation`` per source in the same order.
    """
    # Dicts preserve insertion order, so iterating later yields sources in
    # first-seen order.
    by_source: dict[str, tuple[str, list[StoredChunk]]] = {}
    for item in chunks:
        entry = by_source.get(item.doc_uri)
        if entry is None:
            by_source[item.doc_uri] = (item.doc_title, [item])
        else:
            entry[1].append(item)

    rendered: list[str] = []
    references: list[Citation] = []
    for number, (source_uri, (source_title, members)) in enumerate(by_source.items(), start=1):
        # Chunks whose text is only whitespace contribute nothing to the context.
        body = "\n\n".join(_clean_passage(m.text) for m in members if m.text.strip())
        if len(body) > _EXCERPT_LEN:
            preview = body[:_EXCERPT_LEN] + "..."
        else:
            preview = body
        references.append(
            Citation(
                index=number,
                chunk_id=members[0].id,
                doc_title=source_title,
                doc_uri=source_uri,
                excerpt=preview,
            )
        )
        rendered.append(f"[{number}] **{source_title}**\n{source_uri}\n\n{body}")

    return "\n\n---\n\n".join(rendered), references
@dataclass(frozen=True)
class ResearchMindConfig:
    """Immutable settings read from ``RESEARCHMIND_*`` environment variables."""

    data_dir: Path
    embed_model: str
    auto_search: bool
    top_k: int
    max_context_chunks: int
    chunk_size: int
    chunk_overlap: int


def get_config() -> ResearchMindConfig:
    """Build a config from the current environment, using defaults for unset variables.

    Raises:
        ValueError: If one of the integer variables is set to a non-integer string.
    """
    env = os.environ

    storage_root = Path(env.get("RESEARCHMIND_DATA_DIR", "outputs/researchmind")).expanduser()
    model_name = env.get("RESEARCHMIND_EMBED_MODEL", "all-MiniLM-L6-v2")
    # Only these three spellings (case-insensitive) switch auto search on.
    auto_flag = env.get("RESEARCHMIND_AUTO_SEARCH", "false").lower() in ("1", "true", "yes")
    result_count = int(env.get("RESEARCHMIND_TOP_K", "5"))
    context_limit = int(env.get("RESEARCHMIND_MAX_CONTEXT_CHUNKS", "8"))
    window_size = int(env.get("RESEARCHMIND_CHUNK_SIZE", "512"))
    window_overlap = int(env.get("RESEARCHMIND_CHUNK_OVERLAP", "128"))

    return ResearchMindConfig(
        data_dir=storage_root,
        embed_model=model_name,
        auto_search=auto_flag,
        top_k=result_count,
        max_context_chunks=context_limit,
        chunk_size=window_size,
        chunk_overlap=window_overlap,
    )
def embedding_to_bytes(vector: np.ndarray) -> bytes:
    """Serialize ``vector`` as raw float32 bytes (the array is cast to float32 first)."""
    as_float32 = vector.astype(np.float32)
    return as_float32.tobytes()


def bytes_to_embedding(data: bytes, dim: int) -> np.ndarray:
    """Decode raw float32 bytes into an array reshaped to ``dim``.

    Raises:
        ValueError: If the buffer does not hold exactly ``dim`` float32 values.
    """
    flat = np.frombuffer(data, dtype=np.float32)
    return flat.reshape(dim)
class IngestPipeline:
    """Chunk, embed and persist extracted documents into a ``MemRAGStore``."""

    def __init__(
        self,
        store: MemRAGStore | None = None,
        config: ResearchMindConfig | None = None,
    ) -> None:
        """Use the given store and config, creating defaults for whichever is omitted."""
        self._config = config or get_config()
        self._store = store or MemRAGStore(self._config)

    @property
    def store(self) -> MemRAGStore:
        """The store this pipeline writes to."""
        return self._store

    def ingest_document(
        self,
        doc: ExtractedDocument,
        *,
        session_id: str | None = None,
        raw_snapshot: str | None = None,
    ) -> tuple[str, bool]:
        """Index one extracted document.

        Args:
            doc: The extracted document to index.
            session_id: Optional session to associate the document with.
            raw_snapshot: Text saved as the raw snapshot; defaults to the
                first 100,000 characters of ``doc.text``.

        Returns:
            ``(doc_id, was_new)``. ``was_new`` is False when a document with
            the same content hash is already stored; in that case nothing is
            chunked, embedded or written.
        """
        content_hash = self._store.content_hash(doc.text)

        # The store deduplicates on content hash and ignores the chunks of a
        # known document, so check first and skip the embedding pass entirely.
        existing_id = self._store.document_exists(content_hash)
        if existing_id:
            return existing_id, False

        doc_id_prefix = content_hash[:12]
        chunks = chunk_text(
            doc.text,
            doc_id=doc_id_prefix,
            chunk_size=self._config.chunk_size,
            chunk_overlap=self._config.chunk_overlap,
        )
        if not chunks and doc.text.strip():
            # Safety net: the chunker produced nothing for non-blank text
            # (e.g. a non-positive chunk size), so index one leading slice.
            from researchmind.chunking import TextChunk

            chunks = [
                TextChunk(
                    chunk_id=f"{doc_id_prefix}_0",
                    ordinal=0,
                    text=doc.text[: self._config.chunk_size],
                )
            ]

        chunks_text = [c.text for c in chunks]
        embeddings = embed_texts(chunks_text, model_name=self._config.embed_model)
        chunk_tuples: list[tuple[str, int, str, np.ndarray, dict[str, Any]]] = [
            (chunk.chunk_id, chunk.ordinal, chunk.text, emb, {"source_type": doc.source_type})
            for chunk, emb in zip(chunks, embeddings, strict=True)
        ]

        return self._store.add_document(
            source_type=doc.source_type,
            uri=doc.uri,
            title=doc.title,
            text=doc.text,
            chunks=chunk_tuples,
            session_id=session_id,
            raw_snapshot=raw_snapshot or doc.text[:100_000],
        )

    def ingest_url(self, url: str, *, session_id: str | None = None) -> tuple[str, bool]:
        """Fetch ``url``, extract its text and index it; the full text is the snapshot."""
        doc = fetch_and_extract(url)
        return self.ingest_document(doc, session_id=session_id, raw_snapshot=doc.text)

    def ingest_pdf(self, path: Path, *, session_id: str | None = None) -> tuple[str, bool]:
        """Extract text from the PDF at ``path`` and index it."""
        doc = extract_pdf(path)
        return self.ingest_document(doc, session_id=session_id)

    def ingest_docx(self, path: Path, *, session_id: str | None = None) -> tuple[str, bool]:
        """Extract text from the DOCX file at ``path`` and index it."""
        doc = extract_docx(path)
        return self.ingest_document(doc, session_id=session_id)

    def ingest_path(self, path: Path, *, session_id: str | None = None) -> tuple[str, bool]:
        """Index a local file, dispatching on its lower-cased suffix.

        ``.pdf`` and ``.docx`` go to their extractors. Any other file is read
        as UTF-8 text, with undecodable bytes replaced.
        """
        suffix = path.suffix.lower()
        if suffix == ".pdf":
            return self.ingest_pdf(path, session_id=session_id)
        if suffix == ".docx":
            return self.ingest_docx(path, session_id=session_id)
        text = path.read_text(encoding="utf-8", errors="replace")
        doc = ExtractedDocument(
            source_type="file",
            uri=str(path.resolve()),
            title=path.stem,
            text=text,
            mime="text/plain",
        )
        return self.ingest_document(doc, session_id=session_id)
list[StoredChunk]: + cfg = config or get_config() + k = top_k if top_k is not None else cfg.top_k + all_chunks = store.get_chunks_with_embeddings( + session_id=session_id, + doc_ids=doc_ids, + ) + if not all_chunks: + return [] + + q_vec = embed_texts([query], model_name=cfg.embed_model)[0] + scored: list[tuple[float, StoredChunk]] = [] + for chunk, emb in all_chunks: + sim = float(np.dot(q_vec, emb)) + scored.append((sim, chunk)) + + max_chunks = cfg.max_context_chunks + scored.sort(key=lambda x: x[0], reverse=True) + selected: list[StoredChunk] = [] + seen_ids: set[str] = set() + + for _, chunk in scored[:k]: + if len(selected) >= max_chunks: + break + if chunk.id not in seen_ids: + selected.append(chunk) + seen_ids.add(chunk.id) + if expand_neighbors and len(selected) < max_chunks: + for nid in store.get_neighbor_chunk_ids(chunk.id)[:1]: + if len(selected) >= max_chunks: + break + if nid not in seen_ids: + neighbors = store.get_chunks_by_ids([nid]) + for n in neighbors: + selected.append(n) + seen_ids.add(n.id) + break + + return selected[:max_chunks] diff --git a/libs/researchmind/src/researchmind/scrape_pdf.py b/libs/researchmind/src/researchmind/scrape_pdf.py new file mode 100644 index 0000000000000000000000000000000000000000..390f35d7d4a68340a62daa247ab45195574a8eeb --- /dev/null +++ b/libs/researchmind/src/researchmind/scrape_pdf.py @@ -0,0 +1,30 @@ +from __future__ import annotations + +from pathlib import Path + +from pypdf import PdfReader + +from researchmind.extract import ExtractedDocument + + +def extract_pdf(path: Path, *, max_pages: int = 200) -> ExtractedDocument: + reader = PdfReader(str(path)) + pages: list[str] = [] + for i, page in enumerate(reader.pages[:max_pages]): + page_text = (page.extract_text() or "").strip() + if page_text: + pages.append(page_text) + + text = "\n\n".join(pages) + title = path.stem + if reader.metadata and reader.metadata.title: + title = str(reader.metadata.title) + + return ExtractedDocument( + source_type="pdf", + 
def fetch_and_extract(url: str, *, timeout: float = 30.0) -> ExtractedDocument:
    """Download ``url`` and extract its main text and title with trafilatura.

    Redirects are followed. When trafilatura extracts no text, the first
    50,000 characters of the raw HTML are stored instead. The title falls
    back to ``url`` when no metadata title is found.

    Raises:
        httpx.HTTPStatusError: If the response status is 4xx or 5xx.
    """
    request_headers = {
        "User-Agent": "ResearchMind/0.1 (local research agent; hackathon)",
    }
    with httpx.Client(follow_redirects=True, timeout=timeout, headers=request_headers) as http:
        reply = http.get(url)
        reply.raise_for_status()
        page = reply.text

    main_text = trafilatura.extract(
        page,
        url=url,
        include_comments=False,
        include_tables=True,
        output_format="txt",
    )
    meta = trafilatura.extract_metadata(page, default_url=url)

    page_title = meta.title if (meta and meta.title) else url

    body = (main_text or "").strip()
    if not body:
        # Nothing extractable: keep a bounded slice of the raw markup.
        body = page[:50_000]

    return ExtractedDocument(
        source_type="web",
        uri=url,
        title=page_title,
        text=body,
        mime="text/html",
        # The post-redirect address is kept alongside the requested one.
        metadata={"final_url": str(reply.url)},
    )
def _collect_candidates(topic: str, *, per_query: int = 4) -> list[str]:
    """Run every search query for ``topic`` and return unique normalized URLs.

    Google is tried first for each query; DuckDuckGo is used only when Google
    returns nothing. URLs that normalize to an empty string are dropped, and
    the result keeps first-seen order.
    """
    # A dict is used as an insertion-ordered set.
    ordered: dict[str, None] = {}
    for query in build_search_queries(topic):
        hits = _google_search(query, n=per_query) or _duckduckgo_search(query, n=per_query)
        for raw_url in hits:
            url = normalize_url(raw_url)
            if url:
                ordered.setdefault(url, None)
    return list(ordered)
+ """ + candidates = _collect_candidates(topic, per_query=max(n, 4)) + return filter_valid_urls(candidates, check_reachable=check_reachable, max_results=n) diff --git a/libs/researchmind/src/researchmind/store.py b/libs/researchmind/src/researchmind/store.py new file mode 100644 index 0000000000000000000000000000000000000000..cbd9c5f31938a7c1539e5f55cff66efed743f3b8 --- /dev/null +++ b/libs/researchmind/src/researchmind/store.py @@ -0,0 +1,381 @@ +from __future__ import annotations + +import hashlib +import json +import sqlite3 +import uuid +from dataclasses import dataclass +from datetime import UTC, datetime +from pathlib import Path +from typing import Any + +import numpy as np + +from researchmind.config import ResearchMindConfig, get_config +from researchmind.embeddings import bytes_to_embedding, embedding_to_bytes + + +@dataclass(frozen=True) +class StoredDocument: + id: str + source_type: str + uri: str + title: str + ingested_at: str + content_hash: str + + +@dataclass(frozen=True) +class StoredChunk: + id: str + doc_id: str + ordinal: int + text: str + doc_title: str + doc_uri: str + metadata: dict[str, Any] + + +@dataclass(frozen=True) +class SessionInfo: + id: str + topic: str + created_at: str + + +class MemRAGStore: + def __init__(self, config: ResearchMindConfig | None = None) -> None: + self._config = config or get_config() + self._config.data_dir.mkdir(parents=True, exist_ok=True) + (self._config.data_dir / "raw").mkdir(parents=True, exist_ok=True) + self._db_path = self._config.data_dir / "memory.db" + self._embed_dim: int | None = None + self._init_db() + + @property + def db_path(self) -> Path: + return self._db_path + + @property + def embed_dim(self) -> int: + if self._embed_dim is None: + row = self._conn().execute( + "SELECT dim FROM embed_meta LIMIT 1" + ).fetchone() + self._embed_dim = int(row[0]) if row else 384 + return self._embed_dim + + def _conn(self) -> sqlite3.Connection: + conn = sqlite3.connect(self._db_path) + conn.row_factory = 
sqlite3.Row + return conn + + def _init_db(self) -> None: + with self._conn() as conn: + conn.executescript( + """ + CREATE TABLE IF NOT EXISTS embed_meta ( + dim INTEGER NOT NULL + ); + CREATE TABLE IF NOT EXISTS documents ( + id TEXT PRIMARY KEY, + source_type TEXT NOT NULL, + uri TEXT NOT NULL, + title TEXT NOT NULL, + ingested_at TEXT NOT NULL, + content_hash TEXT NOT NULL UNIQUE, + session_id TEXT + ); + CREATE TABLE IF NOT EXISTS chunks ( + id TEXT PRIMARY KEY, + doc_id TEXT NOT NULL, + ordinal INTEGER NOT NULL, + text TEXT NOT NULL, + embedding_blob BLOB NOT NULL, + meta_json TEXT NOT NULL DEFAULT '{}', + FOREIGN KEY (doc_id) REFERENCES documents(id) + ); + CREATE TABLE IF NOT EXISTS edges ( + src_id TEXT NOT NULL, + dst_id TEXT NOT NULL, + rel TEXT NOT NULL, + PRIMARY KEY (src_id, dst_id, rel) + ); + CREATE TABLE IF NOT EXISTS sessions ( + id TEXT PRIMARY KEY, + topic TEXT NOT NULL, + created_at TEXT NOT NULL + ); + CREATE TABLE IF NOT EXISTS session_messages ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + session_id TEXT NOT NULL, + role TEXT NOT NULL, + content TEXT NOT NULL, + chunk_ids_json TEXT NOT NULL DEFAULT '[]', + created_at TEXT NOT NULL, + FOREIGN KEY (session_id) REFERENCES sessions(id) + ); + CREATE INDEX IF NOT EXISTS idx_chunks_doc ON chunks(doc_id); + CREATE INDEX IF NOT EXISTS idx_documents_session ON documents(session_id); + """ + ) + + def set_embed_dim(self, dim: int) -> None: + with self._conn() as conn: + conn.execute("DELETE FROM embed_meta") + conn.execute("INSERT INTO embed_meta (dim) VALUES (?)", (dim,)) + self._embed_dim = dim + + @staticmethod + def content_hash(text: str) -> str: + return hashlib.sha256(text.encode()).hexdigest() + + def create_session(self, topic: str = "") -> SessionInfo: + session_id = uuid.uuid4().hex[:12] + created_at = datetime.now(UTC).isoformat() + with self._conn() as conn: + conn.execute( + "INSERT INTO sessions (id, topic, created_at) VALUES (?, ?, ?)", + (session_id, topic, created_at), + ) + return 
SessionInfo(id=session_id, topic=topic, created_at=created_at) + + def list_sessions(self) -> list[SessionInfo]: + with self._conn() as conn: + rows = conn.execute( + "SELECT id, topic, created_at FROM sessions ORDER BY created_at DESC" + ).fetchall() + return [SessionInfo(id=r["id"], topic=r["topic"], created_at=r["created_at"]) for r in rows] + + def get_session(self, session_id: str) -> SessionInfo | None: + with self._conn() as conn: + row = conn.execute( + "SELECT id, topic, created_at FROM sessions WHERE id = ?", + (session_id,), + ).fetchone() + if not row: + return None + return SessionInfo(id=row["id"], topic=row["topic"], created_at=row["created_at"]) + + def document_exists(self, content_hash: str) -> str | None: + with self._conn() as conn: + row = conn.execute( + "SELECT id FROM documents WHERE content_hash = ?", + (content_hash,), + ).fetchone() + return row["id"] if row else None + + def add_document( + self, + *, + source_type: str, + uri: str, + title: str, + text: str, + chunks: list[tuple[str, int, str, np.ndarray, dict[str, Any]]], + session_id: str | None = None, + raw_snapshot: str | None = None, + ) -> tuple[str, bool]: + """Returns (doc_id, was_new). Skips if content_hash already indexed.""" + c_hash = self.content_hash(text) + existing = self.document_exists(c_hash) + if existing: + return existing, False + + doc_id = uuid.uuid4().hex[:12] + ingested_at = datetime.now(UTC).isoformat() + if chunks: + dim = int(chunks[0][3].shape[0]) + self.set_embed_dim(dim) + + with self._conn() as conn: + conn.execute( + """ + INSERT INTO documents (id, source_type, uri, title, ingested_at, content_hash, session_id) + VALUES (?, ?, ?, ?, ?, ?, ?) + """, + (doc_id, source_type, uri, title, ingested_at, c_hash, session_id), + ) + for chunk_id, ordinal, chunk_text, emb, meta in chunks: + conn.execute( + """ + INSERT INTO chunks (id, doc_id, ordinal, text, embedding_blob, meta_json) + VALUES (?, ?, ?, ?, ?, ?) 
+ """, + ( + chunk_id, + doc_id, + ordinal, + chunk_text, + embedding_to_bytes(emb), + json.dumps(meta), + ), + ) + conn.execute( + "INSERT OR IGNORE INTO edges (src_id, dst_id, rel) VALUES (?, ?, ?)", + (doc_id, chunk_id, "doc_has_chunk"), + ) + for i in range(len(chunks) - 1): + conn.execute( + "INSERT OR IGNORE INTO edges (src_id, dst_id, rel) VALUES (?, ?, ?)", + (chunks[i][0], chunks[i + 1][0], "chunk_next"), + ) + + if raw_snapshot is not None: + raw_dir = self._config.data_dir / "raw" / doc_id + raw_dir.mkdir(parents=True, exist_ok=True) + (raw_dir / "snapshot.txt").write_text(raw_snapshot, encoding="utf-8") + + return doc_id, True + + def list_documents(self, session_id: str | None = None) -> list[StoredDocument]: + query = "SELECT id, source_type, uri, title, ingested_at, content_hash FROM documents" + params: tuple[Any, ...] = () + if session_id: + query += " WHERE session_id = ?" + params = (session_id,) + query += " ORDER BY ingested_at DESC" + with self._conn() as conn: + rows = conn.execute(query, params).fetchall() + return [ + StoredDocument( + id=r["id"], + source_type=r["source_type"], + uri=r["uri"], + title=r["title"], + ingested_at=r["ingested_at"], + content_hash=r["content_hash"], + ) + for r in rows + ] + + def get_chunks_with_embeddings( + self, + *, + session_id: str | None = None, + doc_ids: list[str] | None = None, + ) -> list[tuple[StoredChunk, np.ndarray]]: + dim = self.embed_dim + query = """ + SELECT c.id, c.doc_id, c.ordinal, c.text, c.embedding_blob, c.meta_json, + d.title AS doc_title, d.uri AS doc_uri + FROM chunks c + JOIN documents d ON d.id = c.doc_id + WHERE 1=1 + """ + params: list[Any] = [] + if session_id: + query += " AND d.session_id = ?" + params.append(session_id) + if doc_ids: + placeholders = ",".join("?" 
* len(doc_ids)) + query += f" AND d.id IN ({placeholders})" + params.extend(doc_ids) + with self._conn() as conn: + rows = conn.execute(query, params).fetchall() + result: list[tuple[StoredChunk, np.ndarray]] = [] + for r in rows: + chunk = StoredChunk( + id=r["id"], + doc_id=r["doc_id"], + ordinal=r["ordinal"], + text=r["text"], + doc_title=r["doc_title"], + doc_uri=r["doc_uri"], + metadata=json.loads(r["meta_json"] or "{}"), + ) + emb = bytes_to_embedding(r["embedding_blob"], dim) + result.append((chunk, emb)) + return result + + def get_neighbor_chunk_ids(self, chunk_id: str) -> list[str]: + ids: list[str] = [] + with self._conn() as conn: + for row in conn.execute( + "SELECT dst_id FROM edges WHERE src_id = ? AND rel = 'chunk_next'", + (chunk_id,), + ): + ids.append(row["dst_id"]) + for row in conn.execute( + "SELECT src_id FROM edges WHERE dst_id = ? AND rel = 'chunk_next'", + (chunk_id,), + ): + ids.append(row["src_id"]) + return ids + + def get_chunks_by_ids(self, chunk_ids: list[str]) -> list[StoredChunk]: + if not chunk_ids: + return [] + placeholders = ",".join("?" for _ in chunk_ids) + with self._conn() as conn: + rows = conn.execute( + f""" + SELECT c.id, c.doc_id, c.ordinal, c.text, c.meta_json, + d.title AS doc_title, d.uri AS doc_uri + FROM chunks c + JOIN documents d ON d.id = c.doc_id + WHERE c.id IN ({placeholders}) + """, + chunk_ids, + ).fetchall() + by_id = { + r["id"]: StoredChunk( + id=r["id"], + doc_id=r["doc_id"], + ordinal=r["ordinal"], + text=r["text"], + doc_title=r["doc_title"], + doc_uri=r["doc_uri"], + metadata=json.loads(r["meta_json"] or "{}"), + ) + for r in rows + } + return [by_id[cid] for cid in chunk_ids if cid in by_id] + + def add_message( + self, + session_id: str, + role: str, + content: str, + chunk_ids: list[str] | None = None, + ) -> None: + with self._conn() as conn: + conn.execute( + """ + INSERT INTO session_messages (session_id, role, content, chunk_ids_json, created_at) + VALUES (?, ?, ?, ?, ?) 
+ """, + ( + session_id, + role, + content, + json.dumps(chunk_ids or []), + datetime.now(UTC).isoformat(), + ), + ) + + def get_messages(self, session_id: str) -> list[dict[str, Any]]: + with self._conn() as conn: + rows = conn.execute( + """ + SELECT role, content, chunk_ids_json, created_at + FROM session_messages + WHERE session_id = ? + ORDER BY id ASC + """, + (session_id,), + ).fetchall() + return [ + { + "role": r["role"], + "content": r["content"], + "chunk_ids": json.loads(r["chunk_ids_json"] or "[]"), + "created_at": r["created_at"], + } + for r in rows + ] + + def count_chunks(self) -> int: + with self._conn() as conn: + row = conn.execute("SELECT COUNT(*) AS n FROM chunks").fetchone() + return int(row["n"]) diff --git a/libs/researchmind/src/researchmind/url_suggest.py b/libs/researchmind/src/researchmind/url_suggest.py new file mode 100644 index 0000000000000000000000000000000000000000..266e34ad23394ba1b877e225657d2cdd9ccfa739 --- /dev/null +++ b/libs/researchmind/src/researchmind/url_suggest.py @@ -0,0 +1,68 @@ +from __future__ import annotations + +import json +import re +from typing import TYPE_CHECKING, Protocol + +if TYPE_CHECKING: + pass + + +class ChatBackend(Protocol): + def chat( + self, + messages: list[dict[str, str]], + *, + max_tokens: int = 512, + temperature: float = 0.7, + ) -> str: ... + + +SUGGEST_SYSTEM = """You suggest reputable web URLs for research on a topic. +Return ONLY a JSON array of 3-5 full https URLs as strings. +No markdown, no explanation. 
Example: ["https://example.com/a", "https://example.com/b"] +""" + + +def suggest_urls(topic: str, backend: ChatBackend, *, max_urls: int = 5) -> list[str]: + messages = [ + {"role": "system", "content": SUGGEST_SYSTEM}, + {"role": "user", "content": f"Topic: {topic.strip()}"}, + ] + raw = backend.chat(messages, max_tokens=512, temperature=0.2) + return _parse_url_list(raw, max_urls=max_urls) + + +def _parse_url_list(raw: str, *, max_urls: int) -> list[str]: + cleaned = raw.strip() + fence = re.search(r"```(?:json)?\s*(\[.*?\])\s*```", cleaned, re.DOTALL) + if fence: + cleaned = fence.group(1) + else: + start = cleaned.find("[") + end = cleaned.rfind("]") + if start >= 0 and end > start: + cleaned = cleaned[start : end + 1] + + try: + data = json.loads(cleaned) + except json.JSONDecodeError: + urls = re.findall(r"https?://[^\s\"'<>]+", raw) + return _dedupe_urls(urls, max_urls) + + if not isinstance(data, list): + return [] + urls = [str(u).strip() for u in data if str(u).strip().startswith("http")] + return _dedupe_urls(urls, max_urls) + + +def _dedupe_urls(urls: list[str], max_urls: int) -> list[str]: + seen: set[str] = set() + out: list[str] = [] + for u in urls: + if u not in seen: + seen.add(u) + out.append(u) + if len(out) >= max_urls: + break + return out diff --git a/libs/researchmind/src/researchmind/url_validate.py b/libs/researchmind/src/researchmind/url_validate.py new file mode 100644 index 0000000000000000000000000000000000000000..012b0099137eb2ea861815ddd82c8be4594cb79d --- /dev/null +++ b/libs/researchmind/src/researchmind/url_validate.py @@ -0,0 +1,118 @@ +from __future__ import annotations + +import re +from urllib.parse import urlparse + +import httpx + +# arXiv IDs look like 2301.00001 or 2301.00001v2 +_ARXIV_ABS = re.compile( + r"^https?://(?:www\.)?arxiv\.org/abs/(\d{4}\.\d{4,5})(?:v\d+)?/?$", + re.IGNORECASE, +) + + +def normalize_url(url: str) -> str: + cleaned = url.strip().strip("\"'<>") + if not cleaned: + return "" + if 
def is_well_formed(url: str) -> tuple[bool, str]:
    """Check a normalized URL without touching the network.

    Returns:
        ``(ok, reason)``; ``reason`` is ``"ok"`` on success and otherwise a
        short explanation of the first rule that failed.
    """
    if not url:
        return False, "empty url"
    if "..." in url or "…" in url:
        return False, "truncated placeholder"
    if " " in url:
        return False, "contains spaces"

    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        return False, f"unsupported scheme {parsed.scheme!r}"
    # Use the bare hostname rather than netloc: netloc still carries any
    # "user@" prefix and ":port" suffix, which would let "127.0.0.1:8000"
    # slip past the local-address check below.
    host = (parsed.hostname or "").lower()
    if not host or "." not in host:
        return False, "missing host"
    if host in ("localhost", "127.0.0.1"):
        return False, "local url"

    path = parsed.path or ""
    if "arxiv.org" in host and "/abs/" in path:
        if not _ARXIV_ABS.match(url):
            return False, "invalid arxiv abs url"

    if "ieeexplore.ieee.org" in host and path.rstrip("/") in ("", "/document"):
        return False, "incomplete ieee document url"

    if _is_tracking_or_junk_url(host, path, parsed.query):
        return False, "tracking or redirect link (not a content page)"

    return True, "ok"
in host and path.rstrip("/") == "/url" and "q=" in query: + return True + return False + + +def probe_url_reachable(url: str, *, timeout: float = 12.0) -> tuple[bool, str]: + headers = {"User-Agent": "ResearchMind/0.1 (url-validator)"} + try: + with httpx.Client(follow_redirects=True, timeout=timeout, headers=headers) as client: + response = client.head(url) + if response.status_code in (405, 501): + response = client.get(url) + if response.status_code >= 400: + return False, f"http {response.status_code}" + return True, "ok" + except httpx.HTTPError as exc: + return False, str(exc) + + +def validate_url(url: str, *, check_reachable: bool = True) -> tuple[bool, str, str]: + """Return (ok, reason, normalized_url).""" + normalized = normalize_url(url) + ok, reason = is_well_formed(normalized) + if not ok: + return False, reason, normalized + if check_reachable: + ok, reason = probe_url_reachable(normalized) + if not ok: + return False, reason, normalized + return True, "ok", normalized + + +def filter_valid_urls( + urls: list[str], + *, + check_reachable: bool = True, + max_results: int = 5, +) -> list[str]: + seen: set[str] = set() + valid: list[str] = [] + for raw in urls: + ok, _reason, normalized = validate_url(raw, check_reachable=check_reachable) + if ok and normalized not in seen: + seen.add(normalized) + valid.append(normalized) + if len(valid) >= max_results: + break + return valid diff --git a/libs/researchmind/tests/test_chunking.py b/libs/researchmind/tests/test_chunking.py new file mode 100644 index 0000000000000000000000000000000000000000..ca3ac2433d77fa7b0b968ae3fb92475a2241c3f1 --- /dev/null +++ b/libs/researchmind/tests/test_chunking.py @@ -0,0 +1,15 @@ +from __future__ import annotations + +from researchmind.chunking import chunk_text + + +def test_chunk_text_splits_long_document(): + words = ["word"] * 600 + text = " ".join(words) + chunks = chunk_text(text, doc_id="doc1", chunk_size=100, chunk_overlap=20) + assert len(chunks) > 1 + assert 
def _chunk(chunk_id: str, doc_uri: str, text: str) -> StoredChunk:
    """Build a StoredChunk fixture; only the id, URI and text vary per call."""
    fixed_fields = {
        "doc_id": "doc1",
        "ordinal": 0,
        "doc_title": "AI Agents Review",
        "metadata": {},
    }
    return StoredChunk(id=chunk_id, text=text, doc_uri=doc_uri, **fixed_fields)


def test_format_context_groups_chunks_by_document():
    """Two chunks sharing one URI yield one citation and a single [1] marker."""
    paper = "https://example.com/paper"
    first = _chunk("c1", paper, "First passage about agents.")
    second = _chunk("c2", paper, "Second passage about planning.")

    context, citations = format_context_block([first, second])

    assert context.count("[1]") == 1
    assert "[2]" not in context
    assert len(citations) == 1
    # Both passages must still be present under the shared marker.
    for snippet in ("First passage", "Second passage"):
        assert snippet in context


def test_format_references_one_line_per_source():
    """The reference list mentions a shared source URI exactly once."""
    same_source = [
        _chunk("c1", "https://a.test", "alpha"),
        _chunk("c2", "https://a.test", "beta"),
    ]
    _, citations = format_context_block(same_source)
    rendered = format_references(citations)
    assert rendered.count("https://a.test") == 1


def test_clean_passage_collapses_citation_runs():
    """A leading run of bracket numbers in scraped text is not kept verbatim."""
    noisy = _chunk("c1", "https://a.test", "[1] [2] [3] [4] [5] actual content")
    context, _ = format_context_block([noisy])
    assert "[1] [2] [3] [4] [5]" not in context
    assert "actual content" in context
def test_clean_model_answer_strips_thinking_block():
    """A closed reasoning block at the start of the reply is removed entirely."""
    # Both tags are assembled from pieces so that tooling which strips literal
    # reasoning tags cannot blank them out of this fixture.
    think_open = "<" + "think" + ">"
    # Previously an empty string, which meant the fixture never contained a
    # closed block and the test did not exercise what its name promises.
    # NOTE(review): confirm clean_model_answer is expected to strip the
    # closed form; its implementation is not visible from this file.
    think_close = "<" + "/think" + ">"
    raw = f"{think_open}\nplan\n{think_close}\n\nAgents use tools and memory [1]."
    cleaned = clean_model_answer(raw)
    assert cleaned == "Agents use tools and memory [1]."
def test_retrieve_filters_by_session(tmp_path, monkeypatch):
    """With ``session_id`` set, only that session's document is returned."""
    _fake_embed(monkeypatch)
    cfg = ResearchMindConfig(
        data_dir=tmp_path,
        embed_model="test",
        auto_search=False,
        top_k=2,
        max_context_chunks=8,
        chunk_size=512,
        chunk_overlap=128,
    )
    store = MemRAGStore(cfg)
    store.set_embed_dim(2)

    session_a = store.create_session(topic="a").id
    session_b = store.create_session(topic="b").id

    # (uri, title, text, chunk id, embedding, owning session)
    fixtures = [
        ("a", "Plants", "photosynthesis in plants", "c1", [1.0, 0.0], session_a),
        ("b", "Math", "fractions math", "c2", [0.0, 1.0], session_b),
    ]
    for uri, title, body, chunk_id, vector, owner in fixtures:
        embedding = np.array(vector, dtype=np.float32)
        store.add_document(
            source_type="test",
            uri=uri,
            title=title,
            text=body,
            chunks=[(chunk_id, 0, body, embedding, {})],
            session_id=owner,
        )

    # top_k=2 would admit both documents; the session filter must drop one.
    scoped = retrieve(
        "photosynthesis",
        store,
        config=cfg,
        top_k=2,
        expand_neighbors=False,
        session_id=session_a,
    )
    assert len(scoped) == 1
    assert "photosynthesis" in scoped[0].text
def _make_config(data_dir) -> ResearchMindConfig:
    """Config shared by the store tests; only the data directory varies."""
    return ResearchMindConfig(
        data_dir=data_dir,
        embed_model="test",
        auto_search=False,
        top_k=3,
        max_context_chunks=8,
        chunk_size=512,
        chunk_overlap=128,
    )


def test_store_dedup_and_chunks(tmp_path):
    """Adding identical content twice reuses the document and adds no chunk."""
    store = MemRAGStore(_make_config(tmp_path))
    vector = np.array([1.0, 0.0, 0.0], dtype=np.float32)
    payload = {
        "source_type": "test",
        "uri": "test://a",
        "title": "A",
        "text": "hello world",
        "chunks": [("c1", 0, "hello world", vector, {})],
    }

    first_id, first_is_new = store.add_document(**payload)
    assert first_is_new

    second_id, second_is_new = store.add_document(**payload)
    assert not second_is_new
    assert first_id == second_id
    assert store.count_chunks() == 1


def test_session_messages(tmp_path):
    """A message added to a session is returned by get_messages."""
    store = MemRAGStore(_make_config(tmp_path))
    session = store.create_session(topic="test topic")
    store.add_message(session.id, "user", "hi", [])

    history = store.get_messages(session.id)
    assert len(history) == 1
    assert history[0]["role"] == "user"
def test_rejects_truncated_and_bad_arxiv():
    """A malformed arXiv abs URL and an elided IEEE URL are both rejected."""
    accepted, why = is_well_formed("https://arxiv.org/abs/quantcomm/2021/10.0")
    assert not accepted
    assert "arxiv" in why

    accepted, why = is_well_formed("https://ieeexplore.ieee.org/document/...")
    assert not accepted


def test_accepts_valid_arxiv():
    """A standard arXiv abs URL passes the offline check."""
    accepted, _ = is_well_formed("https://arxiv.org/abs/2301.00001")
    assert accepted


def test_normalize_adds_scheme():
    """A scheme-less host/path is given an https:// prefix."""
    result = normalize_url("en.wikipedia.org/wiki/AI_agent")
    assert result.startswith("https://")


def test_validate_url_does_not_shadow_probe(monkeypatch):
    """Regression: check_reachable=True must not call the bool parameter."""

    def always_reachable(url, *, timeout=12.0):
        return True, "ok"

    monkeypatch.setattr("researchmind.url_validate.probe_url_reachable", always_reachable)
    outcome = validate_url(
        "https://en.wikipedia.org/wiki/Agent",
        check_reachable=True,
    )
    ok, reason, normalized = outcome
    assert ok
    assert reason == "ok"
    assert "wikipedia" in normalized


def test_rejects_bing_tracking_links():
    """Bing ad-click redirect links are rejected as tracking URLs."""
    tracker = "https://www.bing.com/aclick?id=abc&u=aHR0cHM6Ly9leGFtcGxlLmNvbQ"
    ok, reason = is_well_formed(tracker)
    assert not ok
    assert "tracking" in reason


def test_filter_valid_urls_skips_bad(monkeypatch):
    """Only URLs the (stubbed) validator accepts appear in the result."""

    def stub_validate(url, *, check_reachable=True):
        verdict = "bad" not in url
        return (True, "ok", url) if verdict else (False, "bad", url)

    monkeypatch.setattr("researchmind.url_validate.validate_url", stub_validate)
    candidates = ["https://good.example/a", "https://bad.example/b"]
    kept = filter_valid_urls(candidates, check_reachable=False, max_results=5)
    assert kept == ["https://good.example/a"]
b/pyproject.toml index e35a2013744d8b3c1801fb423f0ac1bfbb4430b5..2b2a5851b46c8da450d72ac01db773e27703fcf8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,7 @@ dependencies = [ "ensemble", "gradio-space", "inference", + "researchmind", ] [dependency-groups] @@ -46,4 +47,5 @@ agent = { workspace = true } ensemble = { workspace = true } gradio-space = { workspace = true } inference = { workspace = true } +researchmind = { workspace = true } slm-evals = { workspace = true } diff --git a/skills/extract-content/SKILL.md b/skills/extract-content/SKILL.md new file mode 100644 index 0000000000000000000000000000000000000000..edc4500791601f540d0d0d12ed085f36c9263171 --- /dev/null +++ b/skills/extract-content/SKILL.md @@ -0,0 +1,16 @@ +--- +name: extract-content +description: Chunk, embed, and index extracted text into MemRAG +task: research +tools: + - extract_and_index +--- + +## Workflow + +1. Receive an `ExtractedDocument` (from web, PDF, or DOCX scrape). +2. Call `extract_and_index` with optional `session_id`. +3. Chunks are embedded with sentence-transformers and stored in SQLite. +4. Duplicate content (same hash) is skipped. + +See `references/chunking-policy.md` for chunk size and overlap defaults. diff --git a/skills/extract-content/references/chunking-policy.md b/skills/extract-content/references/chunking-policy.md new file mode 100644 index 0000000000000000000000000000000000000000..7c7ec1aa65a75baaf4e26cb05b934dad766195b1 --- /dev/null +++ b/skills/extract-content/references/chunking-policy.md @@ -0,0 +1,9 @@ +# Chunking policy + +| Setting | Env var | Default | +|---------|---------|---------| +| Chunk size (words) | `RESEARCHMIND_CHUNK_SIZE` | 512 | +| Overlap (words) | `RESEARCHMIND_CHUNK_OVERLAP` | 128 | +| Embedding model | `RESEARCHMIND_EMBED_MODEL` | `all-MiniLM-L6-v2` | + +Chunks link via `chunk_next` edges for neighbor expansion at retrieval time. 
def main() -> int:
    """Ingest exactly one source (URL or local file) and report the result.

    Exits with argparse's usage error (status 2) when neither or both of
    ``--url`` / ``--file`` are supplied.

    Returns:
        0 after the document id, its new/deduplicated status and the store's
        chunk count have been printed.
    """
    parser = argparse.ArgumentParser(description="Chunk and index content")
    # Mutually exclusive + required: previously passing both flags silently
    # ignored --file, and the "neither" case was only detected after the
    # pipeline had already been constructed.
    source = parser.add_mutually_exclusive_group(required=True)
    source.add_argument("--url", help="Scrape and index URL")
    source.add_argument("--file", type=Path, help="Index local file")
    parser.add_argument("--session", help="Session id to tag document")
    args = parser.parse_args()

    pipeline = IngestPipeline()
    if args.url:
        doc_id, is_new = pipeline.ingest_url(args.url, session_id=args.session)
    elif args.file:
        doc_id, is_new = pipeline.ingest_path(args.file, session_id=args.session)
    else:
        # Reached only for an empty --url value; parser.error() exits.
        parser.error("Provide --url or --file")

    status = "indexed" if is_new else "deduplicated"
    print(f"Document {doc_id} ({status}), chunks in store: {pipeline.store.count_chunks()}")
    return 0
**Topic only (default):** run `search_urls` (Google + verification) → user confirms URLs → scrape → `extract_and_index`. +2. **Auto search:** when `auto_search` is true, same search pipeline ingests top verified URLs without confirmation. +3. **Direct URL / file:** scrape and index immediately. + +### Q&A (offline after ingest) + +1. Call `research_answer` with the user question and `session_id`. +2. Retrieve top-k chunks from MemRAG, expand neighbors. +3. Answer using the local model with inline `[n]` citations. +4. Append references from `references/citation-format.md`. + +See `references/ingest-modes.md` for mode details. diff --git a/skills/research-mind/references/citation-format.md b/skills/research-mind/references/citation-format.md new file mode 100644 index 0000000000000000000000000000000000000000..3180e97e9f9d9e62210e75e570062f8921fda6ca --- /dev/null +++ b/skills/research-mind/references/citation-format.md @@ -0,0 +1,6 @@ +# Citation format + +- Context uses **one number per source document**: `[1]`, `[2]`, … +- Cite inline sparingly (typically 1–3 markers per answer), not after every phrase. +- Bracket numbers inside scraped paper text are not citation indices — ignore them. +- Do not output long runs of `[1][2][3]…` or duplicate **References** lists. 
diff --git a/skills/research-mind/references/ingest-modes.md b/skills/research-mind/references/ingest-modes.md new file mode 100644 index 0000000000000000000000000000000000000000..553b6fe78e9899a5019b7eaa1419c22c0b9789a2 --- /dev/null +++ b/skills/research-mind/references/ingest-modes.md @@ -0,0 +1,9 @@ +# Ingest modes + +| Mode | `auto_search` | Behavior | +|------|---------------|----------| +| Suggest URLs (confirm) | `false` | Google search + URL verification; user checks boxes before ingest | +| Auto search & ingest | `true` | Same search pipeline; ingests verified URLs without confirmation | +| Direct URL / file | n/a | Skip discovery; ingest provided sources | + +Global default: `RESEARCHMIND_AUTO_SEARCH=false`. Gradio dropdown and skill `flags.auto_search` override per run. diff --git a/skills/research-mind/scripts/ask.py b/skills/research-mind/scripts/ask.py new file mode 100644 index 0000000000000000000000000000000000000000..167b921e989c3959571784986767c64433c9d2a4 --- /dev/null +++ b/skills/research-mind/scripts/ask.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 +"""CLI stub: Q&A requires a loaded inference backend (use Gradio/agent).""" + +from __future__ import annotations + +import argparse +import sys + +from researchmind.config import get_config +from researchmind.ingest import IngestPipeline +from researchmind.retrieve import retrieve + + +def main() -> int: + parser = argparse.ArgumentParser(description="Preview retrieval for a question") + parser.add_argument("question", help="Question to retrieve context for") + parser.add_argument("--top-k", type=int, default=None) + args = parser.parse_args() + + cfg = get_config() + store = IngestPipeline().store + chunks = retrieve(args.question, store, config=cfg, top_k=args.top_k) + if not chunks: + print("No chunks in store. 
def main() -> int:
    """Ingest every URL listed in a text file, one per line.

    Blank lines and lines starting with ``#`` are skipped. A URL that fails
    to ingest is reported on stderr and the remaining URLs are still
    processed.

    Returns:
        0 when every URL was ingested, 1 when at least one failed.
    """
    parser = argparse.ArgumentParser(description="Ingest URLs for ResearchMind")
    parser.add_argument("urls_file", type=Path, help="Text file with one URL per line")
    parser.add_argument("--session", help="Optional session id")
    args = parser.parse_args()

    pipeline = IngestPipeline()
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    content = args.urls_file.read_text(encoding="utf-8")
    stripped = (ln.strip() for ln in content.splitlines())
    urls = [ln for ln in stripped if ln and not ln.startswith("#")]

    failures = 0
    for url in urls:
        try:
            doc_id, is_new = pipeline.ingest_url(url, session_id=args.session)
        except Exception as exc:  # noqa: BLE001 - batch CLI: report and keep going
            failures += 1
            print(f"{url} -> FAILED ({exc})", file=sys.stderr)
            continue
        print(f"{url} -> {doc_id} ({'new' if is_new else 'dup'})")
    print(f"Total chunks: {pipeline.store.count_chunks()}")
    return 1 if failures else 0
description="Suggest URLs for a topic (use agent runner for full flow)" + ) + parser.add_argument("topic", help="Research topic") + args = parser.parse_args() + print( + "Use AgentRunner.run_researchmind_discover() or the Gradio Research tab " + f"to suggest URLs for: {args.topic!r}" + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skills/scrape-pdf/SKILL.md b/skills/scrape-pdf/SKILL.md new file mode 100644 index 0000000000000000000000000000000000000000..0ec24f8968693d6c8837d13d5aba2e487f89490e --- /dev/null +++ b/skills/scrape-pdf/SKILL.md @@ -0,0 +1,15 @@ +--- +name: scrape-pdf +description: Extract text from a local PDF file for indexing +task: research +tools: + - scrape_pdf +--- + +## Workflow + +1. Receive a path to a `.pdf` file (upload or local path). +2. Call `scrape_pdf` to extract text with pypdf. +3. Pass the `ExtractedDocument` to `extract_and_index`. + +See `references/pdf-limits.md` for page limits and scanned-PDF notes. diff --git a/skills/scrape-pdf/references/pdf-limits.md b/skills/scrape-pdf/references/pdf-limits.md new file mode 100644 index 0000000000000000000000000000000000000000..456993154b086f0e40d10639333b6808514cb04a --- /dev/null +++ b/skills/scrape-pdf/references/pdf-limits.md @@ -0,0 +1,5 @@ +# PDF limits + +- Default max pages: **200** per file. +- Scanned PDFs without a text layer may return little or no text (OCR not in MVP). +- Title taken from PDF metadata when available, else filename stem. 
def main() -> int:
    """Extract text from one PDF and print a short summary.

    With ``--out`` the full text is written to that file; without it the
    first 2000 characters are printed after the summary. Always returns 0.
    """
    cli = argparse.ArgumentParser(description="Extract PDF text for ResearchMind")
    cli.add_argument("path", type=Path, help="Path to PDF file")
    cli.add_argument("--out", help="Write full text to this file")
    opts = cli.parse_args()

    document = extract_pdf(opts.path)
    destination = opts.out
    if destination:
        Path(destination).write_text(document.text, encoding="utf-8")

    page_count = document.metadata.get("page_count", "?")
    print(f"Title: {document.title}")
    print(f"Pages metadata: {page_count}")
    print(f"Chars: {len(document.text)}")
    if destination:
        return 0
    # No output file requested: show a preview on stdout instead.
    print(document.text[:2000])
    return 0
diff --git a/skills/scrape-web/references/allowed-domains.md b/skills/scrape-web/references/allowed-domains.md new file mode 100644 index 0000000000000000000000000000000000000000..babc5fca572b691f3b9fe0e6786ab30edcacab37 --- /dev/null +++ b/skills/scrape-web/references/allowed-domains.md @@ -0,0 +1,6 @@ +# Web scrape policy + +- Respect `robots.txt` where practical; skip paywalled or login-only pages. +- One request per URL during ingest; no aggressive crawling. +- Use a descriptive User-Agent (`ResearchMind/0.1`). +- On HTTP errors, surface the status code to the user and do not index empty pages. diff --git a/skills/scrape-web/references/html-cleanup.md b/skills/scrape-web/references/html-cleanup.md new file mode 100644 index 0000000000000000000000000000000000000000..957847c27f631f080a07ef48dac5f3147c9d901d --- /dev/null +++ b/skills/scrape-web/references/html-cleanup.md @@ -0,0 +1,9 @@ +# HTML cleanup + +ResearchMind uses **trafilatura** to strip boilerplate and keep main article text. + +- `include_tables=true` for data-heavy pages +- `include_comments=false` +- Fallback: first 50k chars of raw HTML if extraction returns empty + +Raw snapshot saved under `RESEARCHMIND_DATA_DIR/raw/{doc_id}/snapshot.txt`. 
def main() -> int:
    """Fetch one URL and print its title, final URI and text length.

    With ``--out`` the full extracted text is written to that file; without
    it the first 2000 characters are printed. Always returns 0.
    """
    cli = argparse.ArgumentParser(description="Scrape a URL for ResearchMind")
    cli.add_argument("url", help="HTTPS URL to fetch")
    cli.add_argument("--out", help="Write full text to this file")
    opts = cli.parse_args()

    page = fetch_and_extract(opts.url)
    wants_file = bool(opts.out)
    if wants_file:
        # Imported lazily: only needed when an output file is requested.
        from pathlib import Path

        target = Path(opts.out)
        target.write_text(page.text, encoding="utf-8")

    print(f"Title: {page.title}")
    print(f"URI: {page.uri}")
    print(f"Chars: {len(page.text)}")
    if not wants_file:
        preview = page.text[:2000]
        print(preview)
    return 0
"sha256:b492c3b040153e68b9fdaff5913305aaaba5bb433d8a7f73d5cf6a64ed3cc1dd", size = 25206, upload-time = "2025-08-05T16:43:16.444Z" }, ] +[[package]] +name = "babel" +version = "2.18.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7d/b2/51899539b6ceeeb420d40ed3cd4b7a40519404f9baf3d4ac99dc413a834b/babel-2.18.0.tar.gz", hash = "sha256:b80b99a14bd085fcacfa15c9165f651fbb3406e66cc603abf11c5750937c992d", size = 9959554, upload-time = "2026-02-01T12:30:56.078Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/77/f5/21d2de20e8b8b0408f0681956ca2c69f1320a3848ac50e6e7f39c6159675/babel-2.18.0-py3-none-any.whl", hash = "sha256:e2b422b277c2b9a9630c1d7903c2a00d0830c409c59ac8cae9081c92f1aeba35", size = 10196845, upload-time = "2026-02-01T12:30:53.445Z" }, +] + +[[package]] +name = "beautifulsoup4" +version = "4.15.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "soupsieve" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/43/65/318323f98dbee45d42dff61d8f047181bc6f2268a9068cfad035a46be5af/beautifulsoup4-4.15.0.tar.gz", hash = "sha256:288e3ca7d54b06f2ac191970bc275c1939cb46d450b255bf6718b04aa37ab4f7", size = 632571, upload-time = "2026-06-07T16:44:20.453Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/c6/92fcd42f1ba33e1184263f25bfabf3d27c383410470f169e4b8163bf9c17/beautifulsoup4-4.15.0-py3-none-any.whl", hash = "sha256:d6f88de62e1d4e38ecb1077eb9724cd0eff29d2a08ca16a401e9b9e93f117cf9", size = 109924, upload-time = "2026-06-07T16:44:21.566Z" }, +] + [[package]] name = "bitsandbytes" version = "0.49.2" @@ -345,6 +370,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f5/10/56978295c14794b2c12007b07f3e41ba26acda9257457d7085b0bb3bb90c/brotli-1.2.0-cp314-cp314-win_amd64.whl", hash = "sha256:e7c0af964e0b4e3412a0ebf341ea26ec767fa0b4cf81abb5e897c9338b5ad6a3", size = 375639, upload-time = 
"2025-11-05T18:38:55.67Z" }, ] +[[package]] +name = "brotlicffi" +version = "1.2.0.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cffi" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8a/b6/017dc5f852ed9b8735af77774509271acbf1de02d238377667145fcee01d/brotlicffi-1.2.0.1.tar.gz", hash = "sha256:c20d5c596278307ad06414a6d95a892377ea274a5c6b790c2548c009385d621c", size = 478156, upload-time = "2026-03-05T19:54:11.547Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ef/f9/dfa56316837fa798eac19358351e974de8e1e2ca9475af4cb90293cd6576/brotlicffi-1.2.0.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:2c85e65913cf2b79c57a3fdd05b98d9731d9255dc0cb696b09376cc091b9cddd", size = 433046, upload-time = "2026-03-05T19:53:46.209Z" }, + { url = "https://files.pythonhosted.org/packages/4a/f5/f8f492158c76b0d940388801f04f747028971ad5774287bded5f1e53f08d/brotlicffi-1.2.0.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:535f2d05d0273408abc13fc0eebb467afac17b0ad85090c8913690d40207dac5", size = 1541126, upload-time = "2026-03-05T19:53:48.248Z" }, + { url = "https://files.pythonhosted.org/packages/3b/e1/ff87af10ac419600c63e9287a0649c673673ae6b4f2bcf48e96cb2f89f60/brotlicffi-1.2.0.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ce17eb798ca59ecec67a9bb3fd7a4304e120d1cd02953ce522d959b9a84d58ac", size = 1541983, upload-time = "2026-03-05T19:53:50.317Z" }, + { url = "https://files.pythonhosted.org/packages/47/c0/80ecd9bd45776109fab14040e478bf63e456967c9ddee2353d8330ed8de1/brotlicffi-1.2.0.1-cp314-cp314t-win32.whl", hash = "sha256:3c9544f83cb715d95d7eab3af4adbbef8b2093ad6382288a83b3a25feb1a57ec", size = 349047, upload-time = "2026-03-05T19:53:52.215Z" }, + { url = "https://files.pythonhosted.org/packages/ab/98/13e5b250236a281b6cd9e92a01ee1ae231029fa78faee932ef3766e1cb24/brotlicffi-1.2.0.1-cp314-cp314t-win_amd64.whl", 
hash = "sha256:625f8115d32ae9c0740d01ea51518437c3fbaa3e78d41cb18459f6f7ac326000", size = 385652, upload-time = "2026-03-05T19:53:53.892Z" }, + { url = "https://files.pythonhosted.org/packages/9a/9f/b98dcd4af47994cee97aebac866996a006a2e5fc1fd1e2b82a8ad95cf09c/brotlicffi-1.2.0.1-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:91ba5f0ccc040f6ff8f7efaf839f797723d03ed46acb8ae9408f99ffd2572cf4", size = 432608, upload-time = "2026-03-05T19:53:56.736Z" }, + { url = "https://files.pythonhosted.org/packages/b1/7a/ac4ee56595a061e3718a6d1ea7e921f4df156894acffb28ed88a1fd52022/brotlicffi-1.2.0.1-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:be9a670c6811af30a4bd42d7116dc5895d3b41beaa8ed8a89050447a0181f5ce", size = 1534257, upload-time = "2026-03-05T19:53:58.667Z" }, + { url = "https://files.pythonhosted.org/packages/99/39/e7410db7f6f56de57744ea52a115084ceb2735f4d44973f349bb92136586/brotlicffi-1.2.0.1-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6f3314a3476f59e5443f9f72a6dff16edc0c3463c9b318feaef04ae3e4683f5a", size = 1536838, upload-time = "2026-03-05T19:54:00.705Z" }, + { url = "https://files.pythonhosted.org/packages/a6/75/6e7977d1935fc3fbb201cbd619be8f2c7aea25d40a096967132854b34708/brotlicffi-1.2.0.1-cp38-abi3-win32.whl", hash = "sha256:82ea52e2b5d3145b6c406ebd3efb0d55db718b7ad996bd70c62cec0439de1187", size = 343337, upload-time = "2026-03-05T19:54:02.446Z" }, + { url = "https://files.pythonhosted.org/packages/d8/ef/e7e485ce5e4ba3843a0a92feb767c7b6098fd6e65ce752918074d175ae71/brotlicffi-1.2.0.1-cp38-abi3-win_amd64.whl", hash = "sha256:da2e82a08e7778b8bc539d27ca03cdd684113e81394bfaaad8d0dfc6a17ddede", size = 379026, upload-time = "2026-03-05T19:54:04.322Z" }, +] + [[package]] name = "certifi" version = "2026.5.20" @@ -354,6 +400,63 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/59/8c/57e832b7af6d7c5abe66eb3fbe3a3a32f4d11ea23a1aa7131371035be991/certifi-2026.5.20-py3-none-any.whl", hash = "sha256:3c52e209ba0a4ad7aebe60436a4ab349c39e1e602e8c134221e546902ad25897", size = 134134, upload-time = "2026-05-20T11:46:48.578Z" }, ] +[[package]] +name = "cffi" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pycparser", marker = "implementation_name != 'PyPy'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/eb/56/b1ba7935a17738ae8453301356628e8147c79dbb825bcbc73dc7401f9846/cffi-2.0.0.tar.gz", hash = "sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529", size = 523588, upload-time = "2025-09-08T23:24:04.541Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ea/47/4f61023ea636104d4f16ab488e268b93008c3d0bb76893b1b31db1f96802/cffi-2.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d02d6655b0e54f54c4ef0b94eb6be0607b70853c45ce98bd278dc7de718be5d", size = 185271, upload-time = "2025-09-08T23:22:44.795Z" }, + { url = "https://files.pythonhosted.org/packages/df/a2/781b623f57358e360d62cdd7a8c681f074a71d445418a776eef0aadb4ab4/cffi-2.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8eca2a813c1cb7ad4fb74d368c2ffbbb4789d377ee5bb8df98373c2cc0dee76c", size = 181048, upload-time = "2025-09-08T23:22:45.938Z" }, + { url = "https://files.pythonhosted.org/packages/ff/df/a4f0fbd47331ceeba3d37c2e51e9dfc9722498becbeec2bd8bc856c9538a/cffi-2.0.0-cp312-cp312-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:21d1152871b019407d8ac3985f6775c079416c282e431a4da6afe7aefd2bccbe", size = 212529, upload-time = "2025-09-08T23:22:47.349Z" }, + { url = "https://files.pythonhosted.org/packages/d5/72/12b5f8d3865bf0f87cf1404d8c374e7487dcf097a1c91c436e72e6badd83/cffi-2.0.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = 
"sha256:b21e08af67b8a103c71a250401c78d5e0893beff75e28c53c98f4de42f774062", size = 220097, upload-time = "2025-09-08T23:22:48.677Z" }, + { url = "https://files.pythonhosted.org/packages/c2/95/7a135d52a50dfa7c882ab0ac17e8dc11cec9d55d2c18dda414c051c5e69e/cffi-2.0.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:1e3a615586f05fc4065a8b22b8152f0c1b00cdbc60596d187c2a74f9e3036e4e", size = 207983, upload-time = "2025-09-08T23:22:50.06Z" }, + { url = "https://files.pythonhosted.org/packages/3a/c8/15cb9ada8895957ea171c62dc78ff3e99159ee7adb13c0123c001a2546c1/cffi-2.0.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:81afed14892743bbe14dacb9e36d9e0e504cd204e0b165062c488942b9718037", size = 206519, upload-time = "2025-09-08T23:22:51.364Z" }, + { url = "https://files.pythonhosted.org/packages/78/2d/7fa73dfa841b5ac06c7b8855cfc18622132e365f5b81d02230333ff26e9e/cffi-2.0.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3e17ed538242334bf70832644a32a7aae3d83b57567f9fd60a26257e992b79ba", size = 219572, upload-time = "2025-09-08T23:22:52.902Z" }, + { url = "https://files.pythonhosted.org/packages/07/e0/267e57e387b4ca276b90f0434ff88b2c2241ad72b16d31836adddfd6031b/cffi-2.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3925dd22fa2b7699ed2617149842d2e6adde22b262fcbfada50e3d195e4b3a94", size = 222963, upload-time = "2025-09-08T23:22:54.518Z" }, + { url = "https://files.pythonhosted.org/packages/b6/75/1f2747525e06f53efbd878f4d03bac5b859cbc11c633d0fb81432d98a795/cffi-2.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2c8f814d84194c9ea681642fd164267891702542f028a15fc97d4674b6206187", size = 221361, upload-time = "2025-09-08T23:22:55.867Z" }, + { url = "https://files.pythonhosted.org/packages/7b/2b/2b6435f76bfeb6bbf055596976da087377ede68df465419d192acf00c437/cffi-2.0.0-cp312-cp312-win32.whl", hash = "sha256:da902562c3e9c550df360bfa53c035b2f241fed6d9aef119048073680ace4a18", size = 172932, 
upload-time = "2025-09-08T23:22:57.188Z" }, + { url = "https://files.pythonhosted.org/packages/f8/ed/13bd4418627013bec4ed6e54283b1959cf6db888048c7cf4b4c3b5b36002/cffi-2.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:da68248800ad6320861f129cd9c1bf96ca849a2771a59e0344e88681905916f5", size = 183557, upload-time = "2025-09-08T23:22:58.351Z" }, + { url = "https://files.pythonhosted.org/packages/95/31/9f7f93ad2f8eff1dbc1c3656d7ca5bfd8fb52c9d786b4dcf19b2d02217fa/cffi-2.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:4671d9dd5ec934cb9a73e7ee9676f9362aba54f7f34910956b84d727b0d73fb6", size = 177762, upload-time = "2025-09-08T23:22:59.668Z" }, + { url = "https://files.pythonhosted.org/packages/4b/8d/a0a47a0c9e413a658623d014e91e74a50cdd2c423f7ccfd44086ef767f90/cffi-2.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:00bdf7acc5f795150faa6957054fbbca2439db2f775ce831222b66f192f03beb", size = 185230, upload-time = "2025-09-08T23:23:00.879Z" }, + { url = "https://files.pythonhosted.org/packages/4a/d2/a6c0296814556c68ee32009d9c2ad4f85f2707cdecfd7727951ec228005d/cffi-2.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:45d5e886156860dc35862657e1494b9bae8dfa63bf56796f2fb56e1679fc0bca", size = 181043, upload-time = "2025-09-08T23:23:02.231Z" }, + { url = "https://files.pythonhosted.org/packages/b0/1e/d22cc63332bd59b06481ceaac49d6c507598642e2230f201649058a7e704/cffi-2.0.0-cp313-cp313-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:07b271772c100085dd28b74fa0cd81c8fb1a3ba18b21e03d7c27f3436a10606b", size = 212446, upload-time = "2025-09-08T23:23:03.472Z" }, + { url = "https://files.pythonhosted.org/packages/a9/f5/a2c23eb03b61a0b8747f211eb716446c826ad66818ddc7810cc2cc19b3f2/cffi-2.0.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d48a880098c96020b02d5a1f7d9251308510ce8858940e6fa99ece33f610838b", size = 220101, upload-time = "2025-09-08T23:23:04.792Z" }, + { url = 
"https://files.pythonhosted.org/packages/f2/7f/e6647792fc5850d634695bc0e6ab4111ae88e89981d35ac269956605feba/cffi-2.0.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f93fd8e5c8c0a4aa1f424d6173f14a892044054871c771f8566e4008eaa359d2", size = 207948, upload-time = "2025-09-08T23:23:06.127Z" }, + { url = "https://files.pythonhosted.org/packages/cb/1e/a5a1bd6f1fb30f22573f76533de12a00bf274abcdc55c8edab639078abb6/cffi-2.0.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:dd4f05f54a52fb558f1ba9f528228066954fee3ebe629fc1660d874d040ae5a3", size = 206422, upload-time = "2025-09-08T23:23:07.753Z" }, + { url = "https://files.pythonhosted.org/packages/98/df/0a1755e750013a2081e863e7cd37e0cdd02664372c754e5560099eb7aa44/cffi-2.0.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c8d3b5532fc71b7a77c09192b4a5a200ea992702734a2e9279a37f2478236f26", size = 219499, upload-time = "2025-09-08T23:23:09.648Z" }, + { url = "https://files.pythonhosted.org/packages/50/e1/a969e687fcf9ea58e6e2a928ad5e2dd88cc12f6f0ab477e9971f2309b57c/cffi-2.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d9b29c1f0ae438d5ee9acb31cadee00a58c46cc9c0b2f9038c6b0b3470877a8c", size = 222928, upload-time = "2025-09-08T23:23:10.928Z" }, + { url = "https://files.pythonhosted.org/packages/36/54/0362578dd2c9e557a28ac77698ed67323ed5b9775ca9d3fe73fe191bb5d8/cffi-2.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6d50360be4546678fc1b79ffe7a66265e28667840010348dd69a314145807a1b", size = 221302, upload-time = "2025-09-08T23:23:12.42Z" }, + { url = "https://files.pythonhosted.org/packages/eb/6d/bf9bda840d5f1dfdbf0feca87fbdb64a918a69bca42cfa0ba7b137c48cb8/cffi-2.0.0-cp313-cp313-win32.whl", hash = "sha256:74a03b9698e198d47562765773b4a8309919089150a0bb17d829ad7b44b60d27", size = 172909, upload-time = "2025-09-08T23:23:14.32Z" }, + { url = 
"https://files.pythonhosted.org/packages/37/18/6519e1ee6f5a1e579e04b9ddb6f1676c17368a7aba48299c3759bbc3c8b3/cffi-2.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:19f705ada2530c1167abacb171925dd886168931e0a7b78f5bffcae5c6b5be75", size = 183402, upload-time = "2025-09-08T23:23:15.535Z" }, + { url = "https://files.pythonhosted.org/packages/cb/0e/02ceeec9a7d6ee63bb596121c2c8e9b3a9e150936f4fbef6ca1943e6137c/cffi-2.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:256f80b80ca3853f90c21b23ee78cd008713787b1b1e93eae9f3d6a7134abd91", size = 177780, upload-time = "2025-09-08T23:23:16.761Z" }, + { url = "https://files.pythonhosted.org/packages/92/c4/3ce07396253a83250ee98564f8d7e9789fab8e58858f35d07a9a2c78de9f/cffi-2.0.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:fc33c5141b55ed366cfaad382df24fe7dcbc686de5be719b207bb248e3053dc5", size = 185320, upload-time = "2025-09-08T23:23:18.087Z" }, + { url = "https://files.pythonhosted.org/packages/59/dd/27e9fa567a23931c838c6b02d0764611c62290062a6d4e8ff7863daf9730/cffi-2.0.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c654de545946e0db659b3400168c9ad31b5d29593291482c43e3564effbcee13", size = 181487, upload-time = "2025-09-08T23:23:19.622Z" }, + { url = "https://files.pythonhosted.org/packages/d6/43/0e822876f87ea8a4ef95442c3d766a06a51fc5298823f884ef87aaad168c/cffi-2.0.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:24b6f81f1983e6df8db3adc38562c83f7d4a0c36162885ec7f7b77c7dcbec97b", size = 220049, upload-time = "2025-09-08T23:23:20.853Z" }, + { url = "https://files.pythonhosted.org/packages/b4/89/76799151d9c2d2d1ead63c2429da9ea9d7aac304603de0c6e8764e6e8e70/cffi-2.0.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:12873ca6cb9b0f0d3a0da705d6086fe911591737a59f28b7936bdfed27c0d47c", size = 207793, upload-time = "2025-09-08T23:23:22.08Z" }, + { url = 
"https://files.pythonhosted.org/packages/bb/dd/3465b14bb9e24ee24cb88c9e3730f6de63111fffe513492bf8c808a3547e/cffi-2.0.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:d9b97165e8aed9272a6bb17c01e3cc5871a594a446ebedc996e2397a1c1ea8ef", size = 206300, upload-time = "2025-09-08T23:23:23.314Z" }, + { url = "https://files.pythonhosted.org/packages/47/d9/d83e293854571c877a92da46fdec39158f8d7e68da75bf73581225d28e90/cffi-2.0.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:afb8db5439b81cf9c9d0c80404b60c3cc9c3add93e114dcae767f1477cb53775", size = 219244, upload-time = "2025-09-08T23:23:24.541Z" }, + { url = "https://files.pythonhosted.org/packages/2b/0f/1f177e3683aead2bb00f7679a16451d302c436b5cbf2505f0ea8146ef59e/cffi-2.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:737fe7d37e1a1bffe70bd5754ea763a62a066dc5913ca57e957824b72a85e205", size = 222828, upload-time = "2025-09-08T23:23:26.143Z" }, + { url = "https://files.pythonhosted.org/packages/c6/0f/cafacebd4b040e3119dcb32fed8bdef8dfe94da653155f9d0b9dc660166e/cffi-2.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:38100abb9d1b1435bc4cc340bb4489635dc2f0da7456590877030c9b3d40b0c1", size = 220926, upload-time = "2025-09-08T23:23:27.873Z" }, + { url = "https://files.pythonhosted.org/packages/3e/aa/df335faa45b395396fcbc03de2dfcab242cd61a9900e914fe682a59170b1/cffi-2.0.0-cp314-cp314-win32.whl", hash = "sha256:087067fa8953339c723661eda6b54bc98c5625757ea62e95eb4898ad5e776e9f", size = 175328, upload-time = "2025-09-08T23:23:44.61Z" }, + { url = "https://files.pythonhosted.org/packages/bb/92/882c2d30831744296ce713f0feb4c1cd30f346ef747b530b5318715cc367/cffi-2.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:203a48d1fb583fc7d78a4c6655692963b860a417c0528492a6bc21f1aaefab25", size = 185650, upload-time = "2025-09-08T23:23:45.848Z" }, + { url = 
"https://files.pythonhosted.org/packages/9f/2c/98ece204b9d35a7366b5b2c6539c350313ca13932143e79dc133ba757104/cffi-2.0.0-cp314-cp314-win_arm64.whl", hash = "sha256:dbd5c7a25a7cb98f5ca55d258b103a2054f859a46ae11aaf23134f9cc0d356ad", size = 180687, upload-time = "2025-09-08T23:23:47.105Z" }, + { url = "https://files.pythonhosted.org/packages/3e/61/c768e4d548bfa607abcda77423448df8c471f25dbe64fb2ef6d555eae006/cffi-2.0.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:9a67fc9e8eb39039280526379fb3a70023d77caec1852002b4da7e8b270c4dd9", size = 188773, upload-time = "2025-09-08T23:23:29.347Z" }, + { url = "https://files.pythonhosted.org/packages/2c/ea/5f76bce7cf6fcd0ab1a1058b5af899bfbef198bea4d5686da88471ea0336/cffi-2.0.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7a66c7204d8869299919db4d5069a82f1561581af12b11b3c9f48c584eb8743d", size = 185013, upload-time = "2025-09-08T23:23:30.63Z" }, + { url = "https://files.pythonhosted.org/packages/be/b4/c56878d0d1755cf9caa54ba71e5d049479c52f9e4afc230f06822162ab2f/cffi-2.0.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7cc09976e8b56f8cebd752f7113ad07752461f48a58cbba644139015ac24954c", size = 221593, upload-time = "2025-09-08T23:23:31.91Z" }, + { url = "https://files.pythonhosted.org/packages/e0/0d/eb704606dfe8033e7128df5e90fee946bbcb64a04fcdaa97321309004000/cffi-2.0.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:92b68146a71df78564e4ef48af17551a5ddd142e5190cdf2c5624d0c3ff5b2e8", size = 209354, upload-time = "2025-09-08T23:23:33.214Z" }, + { url = "https://files.pythonhosted.org/packages/d8/19/3c435d727b368ca475fb8742ab97c9cb13a0de600ce86f62eab7fa3eea60/cffi-2.0.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b1e74d11748e7e98e2f426ab176d4ed720a64412b6a15054378afdb71e0f37dc", size = 208480, upload-time = "2025-09-08T23:23:34.495Z" }, + { url = 
"https://files.pythonhosted.org/packages/d0/44/681604464ed9541673e486521497406fadcc15b5217c3e326b061696899a/cffi-2.0.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:28a3a209b96630bca57cce802da70c266eb08c6e97e5afd61a75611ee6c64592", size = 221584, upload-time = "2025-09-08T23:23:36.096Z" }, + { url = "https://files.pythonhosted.org/packages/25/8e/342a504ff018a2825d395d44d63a767dd8ebc927ebda557fecdaca3ac33a/cffi-2.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:7553fb2090d71822f02c629afe6042c299edf91ba1bf94951165613553984512", size = 224443, upload-time = "2025-09-08T23:23:37.328Z" }, + { url = "https://files.pythonhosted.org/packages/e1/5e/b666bacbbc60fbf415ba9988324a132c9a7a0448a9a8f125074671c0f2c3/cffi-2.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6c6c373cfc5c83a975506110d17457138c8c63016b563cc9ed6e056a82f13ce4", size = 223437, upload-time = "2025-09-08T23:23:38.945Z" }, + { url = "https://files.pythonhosted.org/packages/a0/1d/ec1a60bd1a10daa292d3cd6bb0b359a81607154fb8165f3ec95fe003b85c/cffi-2.0.0-cp314-cp314t-win32.whl", hash = "sha256:1fc9ea04857caf665289b7a75923f2c6ed559b8298a1b8c49e59f7dd95c8481e", size = 180487, upload-time = "2025-09-08T23:23:40.423Z" }, + { url = "https://files.pythonhosted.org/packages/bf/41/4c1168c74fac325c0c8156f04b6749c8b6a8f405bbf91413ba088359f60d/cffi-2.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:d68b6cef7827e8641e8ef16f4494edda8b36104d79773a334beaa1e3521430f6", size = 191726, upload-time = "2025-09-08T23:23:41.742Z" }, + { url = "https://files.pythonhosted.org/packages/ae/3a/dbeec9d1ee0844c679f6bb5d6ad4e9f198b1224f4e7a32825f47f6192b0c/cffi-2.0.0-cp314-cp314t-win_arm64.whl", hash = "sha256:0a1527a803f0a659de1af2e1fd700213caba79377e27e4693648c2923da066f9", size = 184195, upload-time = "2025-09-08T23:23:43.004Z" }, +] + [[package]] name = "chardet" version = "6.0.0.post1" @@ -457,6 +560,20 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, ] +[[package]] +name = "courlan" +version = "1.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "babel" }, + { name = "tld" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/bb/16/2a771612ee0b3acaa95ac21cc7e8a3319e815d6360f8ffc5987d1ce28499/courlan-1.4.0.tar.gz", hash = "sha256:fbbac7b7fcde2195ea08e707609503c81cf39c891e8d26cdb1fed4585782d63d", size = 208997, upload-time = "2026-06-01T17:30:17.306Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1f/38/ce65091ff20a16e06d17418c4353af5f56d3190821b1a06983c79ae79274/courlan-1.4.0-py3-none-any.whl", hash = "sha256:ad1dbdefd912ca7238d4607dc855df5df097f56bac175dd662c84eed3802f49e", size = 34193, upload-time = "2026-06-01T17:30:14.984Z" }, +] + [[package]] name = "cuda-bindings" version = "13.3.1" @@ -561,6 +678,37 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/05/66/73034ad30b59f13439b75e620989dacba4c047256e358ba7c2e9ec98ea22/datasets-5.0.0-py3-none-any.whl", hash = "sha256:7dd34927a0fd7046e98aad5cb9430e699c373238a15befa7b9bf22b991a7fee6", size = 555084, upload-time = "2026-06-05T13:18:24.435Z" }, ] +[[package]] +name = "dateparser" +version = "1.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "python-dateutil" }, + { name = "pytz" }, + { name = "regex" }, + { name = "tzlocal" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/46/2d/a0ccdb78788064fa0dc901b8524e50615c42be1d78b78d646d0b28d09180/dateparser-1.4.0.tar.gz", hash = "sha256:97a21840d5ecdf7630c584f673338a5afac5dfe84f647baf4d7e8df98f9354a4", size = 321512, upload-time = "2026-03-26T09:56:10.292Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/b4/0b/3c3bb7cbe757279e693a0be6049048012f794d01f81099609ecd53b899f0/dateparser-1.4.0-py3-none-any.whl", hash = "sha256:7902b8e85d603494bf70a5a0b1decdddb2270b9c6e6b2bc8a57b93476c0df378", size = 300379, upload-time = "2026-03-26T09:56:08.409Z" }, +] + +[[package]] +name = "ddgs" +version = "9.14.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "fake-useragent" }, + { name = "httpx", extra = ["brotli", "http2", "socks"] }, + { name = "lxml" }, + { name = "primp" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/77/24/9d29eeb7dd4852c27c3673adcaf30c4dc55ced76b303c1fbb792ce7cae52/ddgs-9.14.4.tar.gz", hash = "sha256:f7b118a2b709a9e9c04a1dca6e96b98c25d4dfaca1a4b0a244d74454fcca48ef", size = 59742, upload-time = "2026-05-15T06:53:45.946Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e8/5f/32de4d99220eb559b7b1cd1c529a1856efa8097f7a3e10b6c207aa95e36c/ddgs-9.14.4-py3-none-any.whl", hash = "sha256:acb084c34bf1110c974caf7e5e5a2c1973beb4bd9e170bfd191fe5ed2d2b2d6c", size = 70638, upload-time = "2026-05-15T06:53:44.761Z" }, +] + [[package]] name = "dill" version = "0.4.1" @@ -612,6 +760,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3e/af/3e990d8d4002bbc9342adb4facd59506e653da93b2417de0fa6027cb86b1/evaluate-0.4.6-py3-none-any.whl", hash = "sha256:bca85bc294f338377b7ac2f861e21c308b11b2a285f510d7d5394d5df437db29", size = 84069, upload-time = "2025-09-18T13:06:29.265Z" }, ] +[[package]] +name = "fake-useragent" +version = "2.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/41/43/948d10bf42735709edb5ae51e23297d034086f17fc7279fef385a7acb473/fake_useragent-2.2.0.tar.gz", hash = "sha256:4e6ab6571e40cc086d788523cf9e018f618d07f9050f822ff409a4dfe17c16b2", size = 158898, upload-time = "2025-04-14T15:32:19.238Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/51/37/b3ea9cd5558ff4cb51957caca2193981c6b0ff30bd0d2630ac62505d99d0/fake_useragent-2.2.0-py3-none-any.whl", hash = "sha256:67f35ca4d847b0d298187443aaf020413746e56acd985a611908c73dba2daa24", size = 161695, upload-time = "2025-04-14T15:32:17.732Z" }, +] + [[package]] name = "fastapi" version = "0.136.3" @@ -740,6 +897,19 @@ http = [ { name = "aiohttp" }, ] +[[package]] +name = "googlesearch-python" +version = "1.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "beautifulsoup4" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/00/c8/3f76213025b77de23f11d3f87349ff9825cf3b0054f62156858af1bd94f3/googlesearch_python-1.3.0.tar.gz", hash = "sha256:c5729b1247c2a8f5c4b48ed73c4f8e9fd558ac4e09de67865479f0a33f2d97dc", size = 5191, upload-time = "2025-01-21T02:31:24.285Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/60/a6/c1fe6a46a7ac2d3b08acfe88ce3d2b12cd8351c697ee4b300bfa350b7c9a/googlesearch_python-1.3.0-py3-none-any.whl", hash = "sha256:808c4dd390dc4c6a1cfba2f5151f5ef16dceb0a200d9770b388dcd39162b4e19", size = 5563, upload-time = "2025-01-21T02:31:22.102Z" }, +] + [[package]] name = "gradio" version = "6.16.0" @@ -830,6 +1000,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" }, ] +[[package]] +name = "h2" +version = "4.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "hpack" }, + { name = "hyperframe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1d/17/afa56379f94ad0fe8defd37d6eb3f89a25404ffc71d4d848893d270325fc/h2-4.3.0.tar.gz", hash = "sha256:6c59efe4323fa18b47a632221a1888bd7fde6249819beda254aeca909f221bf1", size = 2152026, upload-time = 
"2025-08-23T18:12:19.778Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/69/b2/119f6e6dcbd96f9069ce9a2665e0146588dc9f88f29549711853645e736a/h2-4.3.0-py3-none-any.whl", hash = "sha256:c438f029a25f7945c69e0ccf0fb951dc3f73a5f6412981daee861431b70e2bdd", size = 61779, upload-time = "2025-08-23T18:12:17.779Z" }, +] + [[package]] name = "hf-gradio" version = "0.4.1" @@ -875,6 +1058,31 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/62/94/3b66b148778ee100dcfd69c2ca22b57b41b44d3063ceec934f209e9184ce/hf_xet-1.5.0-cp37-abi3-win_arm64.whl", hash = "sha256:b6c9df403040248c76d808d3e047d64db2d923bae593eb244c41e425cf6cd7be", size = 3806916, upload-time = "2026-05-06T06:18:21.7Z" }, ] +[[package]] +name = "hpack" +version = "4.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2c/48/71de9ed269fdae9c8057e5a4c0aa7402e8bb16f2c6e90b3aa53327b113f8/hpack-4.1.0.tar.gz", hash = "sha256:ec5eca154f7056aa06f196a557655c5b009b382873ac8d1e66e79e87535f1dca", size = 51276, upload-time = "2025-01-22T21:44:58.347Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/07/c6/80c95b1b2b94682a72cbdbfb85b81ae2daffa4291fbfa1b1464502ede10d/hpack-4.1.0-py3-none-any.whl", hash = "sha256:157ac792668d995c657d93111f46b4535ed114f0c9c8d672271bbec7eae1b496", size = 34357, upload-time = "2025-01-22T21:44:56.92Z" }, +] + +[[package]] +name = "htmldate" +version = "1.10.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "charset-normalizer" }, + { name = "dateparser" }, + { name = "lxml" }, + { name = "python-dateutil" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ad/1f/e7cf83e23d7b68105de8b874a8b36ba23b450d6f71388583e4ca3ce475ca/htmldate-1.10.0.tar.gz", hash = "sha256:a38df10772ab5d7dbb11896e3f6a852a8491fb1b0965465bc174e23fc2baae58", size = 44455, upload-time = "2026-06-01T17:43:53.437Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/f7/17/d3356233c826c641f940983d9479eab27faec59d49f4070bc58e80fcc021/htmldate-1.10.0-py3-none-any.whl", hash = "sha256:9211dae35ab94147c8ed9e5fc2c9287a5cf31d2394cb7857e7f5dd814eb2aad6", size = 31561, upload-time = "2026-06-01T17:43:51.797Z" }, +] + [[package]] name = "httpcore" version = "1.0.9" @@ -903,6 +1111,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, ] +[package.optional-dependencies] +brotli = [ + { name = "brotli", marker = "platform_python_implementation == 'CPython'" }, + { name = "brotlicffi", marker = "platform_python_implementation != 'CPython'" }, +] +http2 = [ + { name = "h2" }, +] +socks = [ + { name = "socksio" }, +] + [[package]] name = "huggingface-hub" version = "1.18.0" @@ -924,6 +1144,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0b/03/40a05316cb6616e5b7efd7773656441ab04b4b022c2199e79bb4622a92a3/huggingface_hub-1.18.0-py3-none-any.whl", hash = "sha256:729be4a976fb706dcc02d176bcda8a3f32bdf21a294e8f4b3dda6fbcbc9c1ab1", size = 684411, upload-time = "2026-06-05T09:26:31.48Z" }, ] +[[package]] +name = "hyperframe" +version = "6.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/02/e7/94f8232d4a74cc99514c13a9f995811485a6903d48e5d952771ef6322e30/hyperframe-6.1.0.tar.gz", hash = "sha256:f630908a00854a7adeabd6382b43923a4c4cd4b821fcb527e6ab9e15382a3b08", size = 26566, upload-time = "2025-01-22T21:41:49.302Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/48/30/47d0bf6072f7252e6521f3447ccfa40b421b6824517f82854703d0f5a98b/hyperframe-6.1.0-py3-none-any.whl", hash = "sha256:b03380493a519fce58ea5af42e4a42317bf9bd425596f7a0835ffce80f1a42e5", size = 13007, upload-time 
= "2025-01-22T21:41:47.295Z" }, +] + [[package]] name = "idna" version = "3.18" @@ -990,6 +1219,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7b/91/984aca2ec129e2757d1e4e3c81c3fcda9d0f85b74670a094cc443d9ee949/joblib-1.5.3-py3-none-any.whl", hash = "sha256:5fc3c5039fc5ca8c0276333a188bbd59d6b7ab37fe6632daa76bc7f9ec18e713", size = 309071, upload-time = "2025-12-15T08:41:44.973Z" }, ] +[[package]] +name = "justext" +version = "3.0.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "lxml", extra = ["html-clean"] }, +] +sdist = { url = "https://files.pythonhosted.org/packages/49/f3/45890c1b314f0d04e19c1c83d534e611513150939a7cf039664d9ab1e649/justext-3.0.2.tar.gz", hash = "sha256:13496a450c44c4cd5b5a75a5efcd9996066d2a189794ea99a49949685a0beb05", size = 828521, upload-time = "2025-02-25T20:21:49.934Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f2/ac/52f4e86d1924a7fc05af3aeb34488570eccc39b4af90530dd6acecdf16b5/justext-3.0.2-py2.py3-none-any.whl", hash = "sha256:62b1c562b15c3c6265e121cc070874243a443bfd53060e869393f09d6b6cc9a7", size = 837940, upload-time = "2025-02-25T20:21:44.179Z" }, +] + [[package]] name = "llama-cpp-python" version = "0.3.26" @@ -1115,6 +1356,23 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7f/2c/0f1e93c636720e8a3eb59af2bfda99d98b55891e1c53bc30c2e0e865f01b/lxml-6.1.1-cp314-cp314t-win_arm64.whl", hash = "sha256:58bb955caba94e467d2a96da17660d2d704e0675894cba21ab8a775b8621fd1c", size = 3817223, upload-time = "2026-05-19T19:22:56.823Z" }, ] +[package.optional-dependencies] +html-clean = [ + { name = "lxml-html-clean" }, +] + +[[package]] +name = "lxml-html-clean" +version = "0.4.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "lxml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0a/63/195dfdde380a84df309e3bccf4384b034b745dba43426886f7ae623b4fba/lxml_html_clean-0.4.5.tar.gz", hash = 
"sha256:e2a4c7d5beedd17cd7b484d848a0571e54baa239a4f9df5546e3acba7f990560", size = 24142, upload-time = "2026-05-20T12:17:53.574Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6a/bd/6e2b76a6c5dee10397db9c929f0c5066766ec1036046f0335b7ca7ca08b8/lxml_html_clean-0.4.5-py3-none-any.whl", hash = "sha256:c76fcadd1e5bfb9b8bafc2200d51e4e78eb0dad67f56881c21dfb6484c7e7746", size = 14573, upload-time = "2026-05-20T12:17:52.215Z" }, +] + [[package]] name = "markdown-it-py" version = "4.2.0" @@ -1825,6 +2083,44 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4b/a6/38c8e2f318bf67d338f4d629e93b0b4b9af331f455f0390ea8ce4a099b26/portalocker-3.2.0-py3-none-any.whl", hash = "sha256:3cdc5f565312224bc570c49337bd21428bba0ef363bbcf58b9ef4a9f11779968", size = 22424, upload-time = "2025-06-14T13:20:38.083Z" }, ] +[[package]] +name = "primp" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/cc/4b/7efa54f38da7de8df6b70dfed173bb41a52b740b144e4be24c1172db4209/primp-1.3.1.tar.gz", hash = "sha256:b04a5941bf9c876d011c5defaf5a25be093d56e7270b8da52c9788b9df2a829a", size = 1360029, upload-time = "2026-05-23T17:39:25.568Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7f/80/c4885a783a7493e396d89a592ba19fce63ef6bd6ad47230924a884a30ec0/primp-1.3.1-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:27b87e6370045a0c65c0e4dfdfacbfe637387d05673ce8ddcce400263f7c27f0", size = 5123967, upload-time = "2026-05-23T17:39:08.586Z" }, + { url = "https://files.pythonhosted.org/packages/58/c1/c965cc23f96a364803d44b4331f33e4465bb6f269add37e39d0ad77ffe33/primp-1.3.1-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:27a8804eb9a3f641f379ee2b443591428cf85c898816e93d04d3e7b6f229ebcb", size = 4743059, upload-time = "2026-05-23T17:39:15.536Z" }, + { url = 
"https://files.pythonhosted.org/packages/9c/99/f4248d8d833d43fd8ba78208f2f4bf7fba7d3aec8c516090a95d18d6f550/primp-1.3.1-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:862974796552a51af8e276bb19c5d5e189168ab8bad216aef7ce3726a8d3b1dd", size = 5100121, upload-time = "2026-05-23T17:39:04.64Z" }, + { url = "https://files.pythonhosted.org/packages/9b/ad/519e32e0184763e1a76c9321fdeac0bb9b30bf85746f12058feec0cc4a27/primp-1.3.1-cp310-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ceb24198994799706f4020a00173ba9c1b491aa9805b1e014d87946677bc3c5d", size = 4738042, upload-time = "2026-05-23T17:39:35.967Z" }, + { url = "https://files.pythonhosted.org/packages/dc/7b/723cb40694b47ec79a142ed8492835c0ecae9fef7acbed014f04b018d1de/primp-1.3.1-cp310-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3298b8afcf0a88ba6622bfc18e78aeb11afbb7d5afa4774f24acf7491f54a2d", size = 5001773, upload-time = "2026-05-23T17:39:03.01Z" }, + { url = "https://files.pythonhosted.org/packages/52/b8/80a2e3bdab1c51d738b82ea210a5ab93986b443c561e792e42cae296ec10/primp-1.3.1-cp310-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6b8d38c5a6d0a863274cbcae9678f265fcdcead3c20d12d152244e88f5f2186b", size = 5334228, upload-time = "2026-05-23T17:39:24.214Z" }, + { url = "https://files.pythonhosted.org/packages/19/70/c95b8054c7d1fe2d84226ec60a5f48ce6c95a08b7c8b1702d7742082f444/primp-1.3.1-cp310-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:96f831c78ddb5900873f51e294bf9bbb4bbfdac3a2f39ce4023f8c558d299332", size = 5157269, upload-time = "2026-05-23T17:38:48.142Z" }, + { url = "https://files.pythonhosted.org/packages/34/bb/9b66986b7ecf2eff987134cd94bde533142e3085d6f67531f1a369ceaaae/primp-1.3.1-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:329d0c320841f65b39d80801d8bae126732b84ec1094ca17b14fda0bda1b20ff", size = 5347438, upload-time = "2026-05-23T17:39:17.405Z" }, + { url = 
"https://files.pythonhosted.org/packages/aa/29/5d127748d06f3c6a3367f3c4974e45b98cda61cd28ea79ef91ad3fe9e093/primp-1.3.1-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:6c3c67670c38a03e9e8da45b212243d35afc8efa018317c46ecdce47f05329d1", size = 5264862, upload-time = "2026-05-23T17:39:20.625Z" }, + { url = "https://files.pythonhosted.org/packages/16/f3/1aac229425cac142c48418e2de9f70597161ea936543b5e3c9e7476e1921/primp-1.3.1-cp310-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:9409a31028a8c62a609d389554ad4f5339aad075130300cd443beef0336d7179", size = 4969889, upload-time = "2026-05-23T17:39:22.412Z" }, + { url = "https://files.pythonhosted.org/packages/38/86/a94d6e6166139c76ae42eb941328679309ca85139e8753d639657a24474c/primp-1.3.1-cp310-abi3-musllinux_1_2_i686.whl", hash = "sha256:88ca36c2bd1b7c64b96ad07ca367d2d111ac8e9670549be5f232da8bf795d21e", size = 5082679, upload-time = "2026-05-23T17:39:28.411Z" }, + { url = "https://files.pythonhosted.org/packages/cf/61/21d297db575ed660c6aaf35c9014c1874ace45d6dcb79d1a4d3d2608bffb/primp-1.3.1-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:74d13800b501aa003fb05c263d38f8d61656c83a60b2951046c0fc412bc73976", size = 5605392, upload-time = "2026-05-23T17:39:38.007Z" }, + { url = "https://files.pythonhosted.org/packages/36/d6/9262a7ebb1d980a2db0cd505bb902bb3e66acd8a1cb763a4c2921f2f6a5b/primp-1.3.1-cp310-abi3-win32.whl", hash = "sha256:09ada1752629fe89d7b128beeb59cb641f404af462e24177ba36aed1cf322299", size = 4270373, upload-time = "2026-05-23T17:38:44.98Z" }, + { url = "https://files.pythonhosted.org/packages/8f/68/f0c6a60fadff0c185aef232b951a6fa4bbb64511facc48d34734db14f16f/primp-1.3.1-cp310-abi3-win_amd64.whl", hash = "sha256:c0d1e294466cd5ec7ef173eedf8df25cbdc050138d40447a906e92b8553e7765", size = 4661498, upload-time = "2026-05-23T17:39:32.213Z" }, + { url = "https://files.pythonhosted.org/packages/7f/1d/232a52abc77384ac66b9c1741691dec3659b1207bb6c5e55c1e9b59d22f1/primp-1.3.1-cp310-abi3-win_arm64.whl", hash = 
"sha256:43304cb41cbb46f361de49faf1cbdba57f969f628c9297239c7ed8ef0cac420f", size = 4624481, upload-time = "2026-05-23T17:38:42.724Z" }, + { url = "https://files.pythonhosted.org/packages/e5/0b/34333b26c533c3122b936dad829f0a6e04f32065d39673c92b157d97aa16/primp-1.3.1-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:72249a4540d0a8965f36eb9a86cd16801d1c7e8dac2f0b0fa23a0a5a03402d36", size = 5116098, upload-time = "2026-05-23T17:38:55.219Z" }, + { url = "https://files.pythonhosted.org/packages/7a/56/7fe14708adf9a5cb5d6a15ad840a3de036cebfaf20692a5bc3b72e188a73/primp-1.3.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:db4e2eaa5707e47899eeba6026f420f9b0108a28c08d63f1826d0cab8d50f06f", size = 4736300, upload-time = "2026-05-23T17:38:51.792Z" }, + { url = "https://files.pythonhosted.org/packages/31/cb/521a8c18e8808a75450b6e91dc62cc1149c0178b7d4a8697d3f9b73fa385/primp-1.3.1-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d62e7609c98b4bc99c9cecc47f16f332fb8fe1a023002176267b0043dedad0c7", size = 5093823, upload-time = "2026-05-23T17:38:50.059Z" }, + { url = "https://files.pythonhosted.org/packages/57/84/90f776fe46aeb0e3b86df72c674c0651326dd6a61846dd86bddbabe903ac/primp-1.3.1-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3d692e912c2b25271163ba7719df0afdb733a7e7c3073c9094e9001882463543", size = 4734511, upload-time = "2026-05-23T17:39:34.166Z" }, + { url = "https://files.pythonhosted.org/packages/19/79/d9bfbc0df0394f18a98b512a65f4bcf3dd7d17bd871937127e1ce4549172/primp-1.3.1-cp314-cp314t-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0c08693517dc160a12c0f9e2565c5319173cef738893a303ff2fb28ecccbd84d", size = 4999315, upload-time = "2026-05-23T17:39:12.061Z" }, + { url = "https://files.pythonhosted.org/packages/bb/d8/5a986957ee1874d08567d7749668cd78a063048d47d6e46a874742b7fed1/primp-1.3.1-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:d134ebfa31adc619e4e48289fe3e7eebc8310141560e6a6a04269cc94893d9ab", size = 5329375, upload-time = "2026-05-23T17:39:30.307Z" }, + { url = "https://files.pythonhosted.org/packages/5c/1d/321cac9902cc3992174ed530719141a0da2e426f54f8a90b7b971571d104/primp-1.3.1-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48e27e7c0e015a6de495cf79c0c8d599ba5f69d091af31572bec2de020522d9c", size = 5140921, upload-time = "2026-05-23T17:38:53.528Z" }, + { url = "https://files.pythonhosted.org/packages/a6/19/ccbe6b67e0e91beb5c9d5cf804354225d5a3a7a9adf84fee3d6acc53febd/primp-1.3.1-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5c3d682df08c1b1f37b1f66b21fd173baebcfcb52490830b12292d8fe89b2147", size = 5344288, upload-time = "2026-05-23T17:39:18.965Z" }, + { url = "https://files.pythonhosted.org/packages/ff/e5/a735751bd11558163e83e0961fc866e4f94634df9eb24937c5f59624e393/primp-1.3.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:fabaac4280df0802377d34b869949d617a0ecf22ca7fd5f9bded3f5c981031f1", size = 5262909, upload-time = "2026-05-23T17:39:00.789Z" }, + { url = "https://files.pythonhosted.org/packages/09/8e/4e3d4520e2e751f2de825dbe2cb43f837d33a5528adc44255f9770ea125a/primp-1.3.1-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:f510e5881e0a4c4b9e7dbc03722c316d58454388b88000a0e7bf18a4b36d601e", size = 4964809, upload-time = "2026-05-23T17:38:59.023Z" }, + { url = "https://files.pythonhosted.org/packages/8f/b5/e7b7495687d07df693325a12c497a9e5185d5001b7b216f32019fa7437a0/primp-1.3.1-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:0504de2901c97903a9c369856a4b186dc90a782d8320652c142b066e697d5a1a", size = 5083654, upload-time = "2026-05-23T17:39:06.598Z" }, + { url = "https://files.pythonhosted.org/packages/f2/f9/e4652e93beb14a16cc4218cfb1ccc18eaca8ee7d93b517d614a135928ec9/primp-1.3.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6c3b24e302d95d327e873834b9423823b9c8af2abf5e0bbf57a03f3354cfe528", size = 
5594738, upload-time = "2026-05-23T17:39:27.001Z" }, + { url = "https://files.pythonhosted.org/packages/e2/49/e8c8a7bc6b741ab6f15896022eee4f906d04d7ccd15aeeb515dd04bbeb6d/primp-1.3.1-cp314-cp314t-win32.whl", hash = "sha256:4346dcef805279028bf4a54bb87dd43d0920130e25b5790689f5c96c9ba0d9e5", size = 4262615, upload-time = "2026-05-23T17:39:13.88Z" }, + { url = "https://files.pythonhosted.org/packages/a9/84/7ae4a257dec6dff329d4a8d9051d907316095c27ffc8d1ea15c359e6eeb5/primp-1.3.1-cp314-cp314t-win_amd64.whl", hash = "sha256:6c55f152a73b6d6af8ac37bdb648d8bbfd7e656f9ef40d87feb3c0d81cee930a", size = 4660584, upload-time = "2026-05-23T17:39:10.081Z" }, + { url = "https://files.pythonhosted.org/packages/99/20/10e0d96bfaeef1f0cd339ccf9bb8feb4bf798fde93198f7a96c73441080a/primp-1.3.1-cp314-cp314t-win_arm64.whl", hash = "sha256:46a529d74583d6ceba52e15bf4c678fcf24e6d669c1ce935262d5490d1b25801", size = 4623226, upload-time = "2026-05-23T17:38:57.256Z" }, +] + [[package]] name = "propcache" version = "0.5.2" @@ -1990,6 +2286,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/51/be/6f79d55816d5c22557cf27533543d5d70dfe692adfbee4b99f2760674f38/pyarrow-24.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:c91d00057f23b8d353039520dc3a6c09d8608164c692e9f59a175a42b2ae0c19", size = 28131282, upload-time = "2026-04-21T10:51:16.815Z" }, ] +[[package]] +name = "pycparser" +version = "3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1b/7d/92392ff7815c21062bea51aa7b87d45576f649f16458d78b7cf94b9ab2e6/pycparser-3.0.tar.gz", hash = "sha256:600f49d217304a5902ac3c37e1281c9fe94e4d0489de643a9504c5cdfdfc6b29", size = 103492, upload-time = "2026-01-21T14:26:51.89Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0c/c3/44f3fbbfa403ea2a7c779186dc20772604442dde72947e7d01069cbe98e3/pycparser-3.0-py3-none-any.whl", hash = "sha256:b727414169a36b7d524c1c3e31839a521725078d7b2ff038656844266160a992", size = 48172, 
upload-time = "2026-01-21T14:26:50.693Z" }, +] + [[package]] name = "pydantic" version = "2.13.4" @@ -2098,6 +2403,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176", size = 1231151, upload-time = "2026-03-29T13:29:30.038Z" }, ] +[[package]] +name = "pypdf" +version = "6.13.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/99/0a/48fe05c6bb3aa4bb4d2a4079a383d33c0dfec1edf613a642f07d8b8b5c2e/pypdf-6.13.2.tar.gz", hash = "sha256:5a96a17dbdfbf9c2ab24c0a13fa0aba182be22ba6f283098712c16fc242f509f", size = 6479250, upload-time = "2026-06-10T16:42:34.5Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cb/17/378943705992f74e451a06de3401ce68e3213763c81e44d0614559c45599/pypdf-6.13.2-py3-none-any.whl", hash = "sha256:6eeb9e57693f29d41bd01255d02660cbbb41fd7fc818a982677389a35e4f2083", size = 346555, upload-time = "2026-06-10T16:42:32.37Z" }, +] + [[package]] name = "pytablewriter" version = "1.2.1" @@ -2358,6 +2672,37 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a0/f4/c67b0b3f1b9245e8d266f0f112c500d50e5b4e83cb6f3b71b6528104182a/requests-2.34.2-py3-none-any.whl", hash = "sha256:2a0d60c172f83ac6ab31e4554906c0f3b3588d37b5cb939b1c061f4907e278e0", size = 73075, upload-time = "2026-05-14T19:25:26.443Z" }, ] +[[package]] +name = "researchmind" +version = "0.1.0" +source = { editable = "libs/researchmind" } +dependencies = [ + { name = "ddgs" }, + { name = "googlesearch-python" }, + { name = "httpx" }, + { name = "inference" }, + { name = "numpy" }, + { name = "pydantic" }, + { name = "pypdf" }, + { name = "python-docx" }, + { name = "sentence-transformers" }, + { name = "trafilatura" }, +] + +[package.metadata] +requires-dist = [ + { name = "ddgs", specifier = ">=9.0.0" }, + { name = 
"googlesearch-python", specifier = ">=1.3.0" }, + { name = "httpx", specifier = ">=0.28.0" }, + { name = "inference", editable = "libs/inference" }, + { name = "numpy", specifier = ">=2.0.0" }, + { name = "pydantic", specifier = ">=2.0.0" }, + { name = "pypdf", specifier = ">=5.0.0" }, + { name = "python-docx", specifier = ">=1.1.0" }, + { name = "sentence-transformers", specifier = ">=3.0.0" }, + { name = "trafilatura", specifier = ">=2.0.0" }, +] + [[package]] name = "rich" version = "15.0.0" @@ -2568,6 +2913,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6a/23/8146aad7d88f4fcb3a6218f41a60f6c2d4e3a72de72da1825dc7c8f7877c/semantic_version-2.10.0-py2.py3-none-any.whl", hash = "sha256:de78a3b8e0feda74cabc54aab2da702113e33ac9d9eb9d2389bcf1f58b7d9177", size = 15552, upload-time = "2022-05-26T13:35:21.206Z" }, ] +[[package]] +name = "sentence-transformers" +version = "5.5.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, + { name = "numpy" }, + { name = "scikit-learn" }, + { name = "scipy" }, + { name = "torch" }, + { name = "tqdm" }, + { name = "transformers" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/cf/d4/7ef93157485e978c016f49da05363c1e4e7237beb5343b64b5631101f0f1/sentence_transformers-5.5.1.tar.gz", hash = "sha256:02b7740dfc60bdbbcb6061625f5d97a5c1a4e2d3baac5f9391b912bb5eae2290", size = 445161, upload-time = "2026-05-20T07:37:44.465Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bf/03/ee99a6b030e7a2e056547729f8a4709dd93e13d9c6f07590f74c395c4017/sentence_transformers-5.5.1-py3-none-any.whl", hash = "sha256:4fe11d433badc5282d32f7fc08bc714216b7a5aca426f9df77a45a554756deb7", size = 588887, upload-time = "2026-05-20T07:37:43.004Z" }, +] + [[package]] name = "setuptools" version = "81.0.0" @@ -2644,6 +3008,7 @@ dependencies = [ { name = "ensemble" }, { name = "gradio-space" }, { name = "inference" }, + { name = 
"researchmind" }, ] [package.dev-dependencies] @@ -2674,6 +3039,7 @@ requires-dist = [ { name = "ensemble", editable = "research/ensemble" }, { name = "gradio-space", editable = "apps/gradio-space" }, { name = "inference", editable = "libs/inference" }, + { name = "researchmind", editable = "libs/researchmind" }, ] [package.metadata.requires-dev] @@ -2694,6 +3060,24 @@ finetune = [ ] lm-eval = [{ name = "slm-evals", extras = ["lm-eval"], editable = "research/evals" }] +[[package]] +name = "socksio" +version = "1.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f8/5c/48a7d9495be3d1c651198fd99dbb6ce190e2274d0f28b9051307bdec6b85/socksio-1.0.0.tar.gz", hash = "sha256:f88beb3da5b5c38b9890469de67d0cb0f9d494b78b106ca1845f96c10b91c4ac", size = 19055, upload-time = "2020-04-17T15:50:34.664Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/37/c3/6eeb6034408dac0fa653d126c9204ade96b819c936e136c5e8a6897eee9c/socksio-1.0.0-py3-none-any.whl", hash = "sha256:95dc1f15f9b34e8d7b16f06d74b8ccf48f609af32ab33c608d08761c5dcbb1f3", size = 12763, upload-time = "2020-04-17T15:50:31.878Z" }, +] + +[[package]] +name = "soupsieve" +version = "2.8.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/47/2c/0a5f6f8ee0d5589e48c7640213ed5175d52cf540a06725b628cc1a45d6ce/soupsieve-2.8.4.tar.gz", hash = "sha256:e121fd02e975c695e4e9e8774a5ee35d74714b59307868dcc5319ad2d9e3328e", size = 121110, upload-time = "2026-05-24T13:55:57.154Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5e/f5/0c41cb68dcae6b7de4fac4188a3a9589e21fb31df21ea3a2e888db95e6c9/soupsieve-2.8.4-py3-none-any.whl", hash = "sha256:e7e6b0769c8f51ed59acab6e994b00621096cfb1c640a7509295987388fbaf65", size = 37304, upload-time = "2026-05-24T13:55:55.406Z" }, +] + [[package]] name = "sqlitedict" version = "2.1.0" @@ -2765,6 +3149,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638, upload-time = "2025-03-13T13:49:21.846Z" }, ] +[[package]] +name = "tld" +version = "0.13.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5c/5d/76b4383ac4e5b5e254e50c09807b3e13820bed6d6c11cd540264988d6802/tld-0.13.2.tar.gz", hash = "sha256:d983fa92b9d717400742fca844e29d5e18271079c7bcfabf66d01b39b4a14345", size = 467175, upload-time = "2026-03-06T23:50:34.498Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9e/90/39a85a4b63c84213e78b3c17d22e1bf45328acf8ebb33ef93be30d0a3911/tld-0.13.2-py2.py3-none-any.whl", hash = "sha256:9b8fdbdb880e7ba65b216a4937f2c94c49a7226723783d5838fc958ac76f4e0c", size = 296743, upload-time = "2026-03-06T23:50:32.465Z" }, +] + [[package]] name = "tokenizers" version = "0.22.2" @@ -2888,6 +3281,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/47/aa/218a0eb34de1f753c83e4d0d1c8e7c4cef27f20dcb8342e024f63a80dc86/tqdm-4.68.1-py3-none-any.whl", hash = "sha256:fea4a90e4023f764914569f7802a297277c5ab1a66be5144143e142e1a4031d8", size = 78354, upload-time = "2026-06-05T17:23:13.654Z" }, ] +[[package]] +name = "trafilatura" +version = "2.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "charset-normalizer" }, + { name = "courlan" }, + { name = "htmldate" }, + { name = "justext" }, + { name = "lxml" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e8/19/24833e905df2d80e3bb67424f95febcc17709a1f61a522120bc438afca70/trafilatura-2.1.0.tar.gz", hash = "sha256:f689e2116fc89c7bc0b9a296d01dcfe2eb0b5455f8c371a77dc0db1f06a05643", size = 263876, upload-time = "2026-06-07T17:43:31.829Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/0e/78/4ad99d79aee2784f49f20fd0a29058ce4c032fe4439047924c43521cd211/trafilatura-2.1.0-py3-none-any.whl", hash = "sha256:0eded5207a806445ddebbe36eae30b9035fe6a2f233c36f6fe82663fca8b9d30", size = 134600, upload-time = "2026-06-07T17:43:28.404Z" }, +] + [[package]] name = "transformers" version = "5.10.2" @@ -2989,6 +3400,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ce/e4/dccd7f47c4b64213ac01ef921a1337ee6e30e8c6466046018326977efd95/tzdata-2026.2-py2.py3-none-any.whl", hash = "sha256:bbe9af844f658da81a5f95019480da3a89415801f6cc966806612cc7169bffe7", size = 349321, upload-time = "2026-04-24T15:22:05.876Z" }, ] +[[package]] +name = "tzlocal" +version = "5.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "tzdata", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8b/2e/c14812d3d4d9cd1773c6be938f89e5735a1f11a9f184ac3639b93cef35d5/tzlocal-5.3.1.tar.gz", hash = "sha256:cceffc7edecefea1f595541dbd6e990cb1ea3d19bf01b2809f362a03dd7921fd", size = 30761, upload-time = "2025-03-05T21:17:41.549Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c2/14/e2a54fabd4f08cd7af1c07030603c3356b74da07f7cc056e600436edfa17/tzlocal-5.3.1-py3-none-any.whl", hash = "sha256:eb1a66c3ef5847adf7a834f1be0800581b683b5608e74f86ecbcef8ab91bb85d", size = 18026, upload-time = "2025-03-05T21:17:39.857Z" }, +] + [[package]] name = "urllib3" version = "2.7.0"