Spaces:
Running
Running
MSG committed on
Commit ·
e7fd66f
1
Parent(s): 59e2c8a
Feat/research tab agent skills (#5)
Browse files* research agent plan
* init research mind config
* rag memory
* store stuff
* skills research mind
* agent libs
* search web agent
* wip app research mind
* url validate tools
* skills fix
* search url
* research wip fix
* chat rag wip
* rag wip
* citations rag chunk check
* wip test
* clean response wip
* fix clean response
This view is limited to 50 files because it contains too many changes. See raw diff
- .cursor/plans/researchmind_rag_agent_7390b536.plan.md +366 -0
- .env.example +8 -0
- .gitignore +3 -1
- README.md +17 -3
- apps/gradio-space/src/gradio_space/app.py +8 -5
- apps/gradio-space/src/gradio_space/model_loading.py +3 -1
- apps/gradio-space/src/gradio_space/research_helpers.py +196 -0
- apps/gradio-space/src/gradio_space/tabs/__init__.py +2 -1
- apps/gradio-space/src/gradio_space/tabs/chat.py +56 -9
- apps/gradio-space/src/gradio_space/tabs/research_mind.py +366 -0
- libs/agent/pyproject.toml +2 -0
- libs/agent/src/agent/models.py +52 -0
- libs/agent/src/agent/research_prompts.py +36 -0
- libs/agent/src/agent/runner.py +257 -1
- libs/agent/src/agent/skills.py +5 -0
- libs/agent/src/agent/tools/research_tools.py +93 -0
- libs/agent/src/agent/tools_registry.py +38 -1
- libs/agent/tests/test_research_runner.py +107 -0
- libs/inference/src/inference/response_clean.py +87 -0
- libs/inference/tests/test_response_clean.py +34 -0
- libs/researchmind/README.md +9 -0
- libs/researchmind/pyproject.toml +25 -0
- libs/researchmind/src/researchmind/__init__.py +11 -0
- libs/researchmind/src/researchmind/chunking.py +46 -0
- libs/researchmind/src/researchmind/citations.py +92 -0
- libs/researchmind/src/researchmind/config.py +32 -0
- libs/researchmind/src/researchmind/embeddings.py +32 -0
- libs/researchmind/src/researchmind/extract.py +36 -0
- libs/researchmind/src/researchmind/ingest.py +105 -0
- libs/researchmind/src/researchmind/retrieve.py +57 -0
- libs/researchmind/src/researchmind/scrape_pdf.py +30 -0
- libs/researchmind/src/researchmind/scrape_web.py +38 -0
- libs/researchmind/src/researchmind/search_urls.py +89 -0
- libs/researchmind/src/researchmind/store.py +381 -0
- libs/researchmind/src/researchmind/url_suggest.py +68 -0
- libs/researchmind/src/researchmind/url_validate.py +118 -0
- libs/researchmind/tests/test_chunking.py +15 -0
- libs/researchmind/tests/test_citations.py +67 -0
- libs/researchmind/tests/test_retrieve.py +95 -0
- libs/researchmind/tests/test_search_queries.py +29 -0
- libs/researchmind/tests/test_store.py +57 -0
- libs/researchmind/tests/test_url_validate.py +65 -0
- pyproject.toml +2 -0
- skills/extract-content/SKILL.md +16 -0
- skills/extract-content/references/chunking-policy.md +9 -0
- skills/extract-content/scripts/chunk_and_index.py +35 -0
- skills/research-mind/SKILL.md +30 -0
- skills/research-mind/references/citation-format.md +6 -0
- skills/research-mind/references/ingest-modes.md +9 -0
- skills/research-mind/scripts/ask.py +33 -0
.cursor/plans/researchmind_rag_agent_7390b536.plan.md
ADDED
|
@@ -0,0 +1,366 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
name: ResearchMind RAG Agent
|
| 3 |
+
overview: "Add ResearchMind: ingest skills (web/PDF/extract) with references and scripts, a persistent MemRAG store (SQLite + embeddings), an agent runner with citation-backed Q&A, and a new Gradio tab. Topic mode suggests URLs via the local model (user confirms); optional auto-search mode via app dropdown and skill flags."
|
| 4 |
+
todos:
|
| 5 |
+
- id: pkg-researchmind
|
| 6 |
+
content: "Create libs/researchmind package: MemRAGStore (SQLite), chunking, sentence-transformers embeddings, retrieve + citations"
|
| 7 |
+
status: completed
|
| 8 |
+
- id: skills-scrape-extract
|
| 9 |
+
content: Add skills/scrape-web, scrape-pdf, extract-content, research-mind with references/ and scripts/ CLIs
|
| 10 |
+
status: completed
|
| 11 |
+
- id: agent-runner
|
| 12 |
+
content: Extend SkillRegistry (flags), ToolRegistry (5 tools), AgentRunner ingest/chat with suggest_urls + auto_search boolean
|
| 13 |
+
status: completed
|
| 14 |
+
- id: gradio-tab
|
| 15 |
+
content: "Add research_mind.py tab: topic/URL/file ingest, mode dropdown, URL confirm, session chat, trace accordion"
|
| 16 |
+
status: completed
|
| 17 |
+
- id: tests-docs
|
| 18 |
+
content: Unit tests for store/retrieve/runner; update .env.example and README for ResearchMind offline Q&A
|
| 19 |
+
status: completed
|
| 20 |
+
isProject: false
|
| 21 |
+
---
|
| 22 |
+
|
| 23 |
+
# ResearchMind — Scraper + RAG + MemRAG Plan
|
| 24 |
+
|
| 25 |
+
## Goal
|
| 26 |
+
|
| 27 |
+
Ship a **Backyard AI** research agent that:
|
| 28 |
+
1. Accepts a **topic**, **URL**, or **PDF/doc** upload
|
| 29 |
+
2. **Ingests once** (scrape → extract → chunk → embed → graph persist)
|
| 30 |
+
3. Answers questions **offline** across sessions with **citations**
|
| 31 |
+
4. Uses the **active local preset** from [`models.yaml`](models.yaml) (no new training in MVP)
|
| 32 |
+
|
| 33 |
+
## Architecture
|
| 34 |
+
|
| 35 |
+
```mermaid
|
| 36 |
+
flowchart TB
|
| 37 |
+
subgraph gradio [Gradio Research Tab]
|
| 38 |
+
Input[Topic / URL / File]
|
| 39 |
+
Mode[Ingest mode dropdown]
|
| 40 |
+
Confirm[URL confirm list]
|
| 41 |
+
Chat[Research chat]
|
| 42 |
+
end
|
| 43 |
+
|
| 44 |
+
subgraph skills [skills/]
|
| 45 |
+
SW[scrape-web]
|
| 46 |
+
SP[scrape-pdf]
|
| 47 |
+
EX[extract-content]
|
| 48 |
+
RM[research-mind]
|
| 49 |
+
end
|
| 50 |
+
|
| 51 |
+
subgraph lib [libs/researchmind]
|
| 52 |
+
Ingest[IngestPipeline]
|
| 53 |
+
Store[MemRAGStore]
|
| 54 |
+
Retrieve[Retriever]
|
| 55 |
+
Cite[CitationFormatter]
|
| 56 |
+
end
|
| 57 |
+
|
| 58 |
+
subgraph agent [libs/agent]
|
| 59 |
+
Runner[AgentRunner.run_researchmind]
|
| 60 |
+
Tools[ToolRegistry]
|
| 61 |
+
Trace[TraceRecorder]
|
| 62 |
+
end
|
| 63 |
+
|
| 64 |
+
Input --> Runner
|
| 65 |
+
Mode --> Runner
|
| 66 |
+
Runner --> SW
|
| 67 |
+
Runner --> SP
|
| 68 |
+
Runner --> EX
|
| 69 |
+
SW --> Ingest
|
| 70 |
+
SP --> Ingest
|
| 71 |
+
EX --> Ingest
|
| 72 |
+
Ingest --> Store
|
| 73 |
+
Chat --> Retrieve
|
| 74 |
+
Retrieve --> Store
|
| 75 |
+
Runner --> Cite
|
| 76 |
+
Cite --> Chat
|
| 77 |
+
Runner --> Trace
|
| 78 |
+
```
|
| 79 |
+
|
| 80 |
+
**Separation of concerns**
|
| 81 |
+
- **Skills** (`skills/*/SKILL.md` + `references/` + `scripts/`) — workflow docs and thin CLIs the agent/humans can invoke
|
| 82 |
+
- **`libs/researchmind/`** — real Python library: scrape, extract, chunk, embed, SQLite MemRAG, retrieval
|
| 83 |
+
- **`libs/agent/`** — orchestration: `AgentRunner.run_researchmind()`, tool handlers, prompts with citations
|
| 84 |
+
- **`apps/gradio-space/`** — third top-level tab wired like [`education_pptx.py`](apps/gradio-space/src/gradio_space/tabs/education_pptx.py)
|
| 85 |
+
|
| 86 |
+
**Not in MVP scope:** wiring [`research/ensemble/src/ensemble/memory.py`](research/ensemble/src/ensemble/memory.py) toy `Embedder` (token-id bound, research-only). Production path uses **sentence-transformers** (`all-MiniLM-L6-v2`) for arbitrary text, fully offline after first model download.
|
| 87 |
+
|
| 88 |
+
---
|
| 89 |
+
|
| 90 |
+
## 1. New package: `libs/researchmind/`
|
| 91 |
+
|
| 92 |
+
Add workspace member in root [`pyproject.toml`](pyproject.toml) and depend from `agent` + `gradio-space`.
|
| 93 |
+
|
| 94 |
+
| Module | Responsibility |
|
| 95 |
+
|--------|----------------|
|
| 96 |
+
| `store.py` | **MemRAGStore** — SQLite at `$RESEARCHMIND_DATA_DIR/memory.db` |
|
| 97 |
+
| `ingest.py` | **IngestPipeline** — normalize → chunk → embed → graph edges |
|
| 98 |
+
| `scrape_web.py` | `httpx` + `trafilatura` fetch/clean HTML |
|
| 99 |
+
| `scrape_pdf.py` | `pypdf` text extraction; optional OCR hook stub |
|
| 100 |
+
| `extract.py` | Unified `ExtractedDocument` (title, url, mime, text, metadata) |
|
| 101 |
+
| `chunking.py` | Sliding-window chunks (~512 tokens / 128 overlap) with stable IDs |
|
| 102 |
+
| `embeddings.py` | Lazy-load `SentenceTransformer`, batch encode, L2-normalize |
|
| 103 |
+
| `retrieve.py` | Top-k cosine search + optional graph expansion (same-doc neighbors) |
|
| 104 |
+
| `citations.py` | Map chunks → `[1]` footnotes with source title/URL/page |
|
| 105 |
+
| `search_urls.py` | Optional DuckDuckGo search (`duckduckgo-search`) when `auto_search=True` |
|
| 106 |
+
| `url_suggest.py` | LLM prompt: topic → JSON list of suggested URLs (default path) |
|
| 107 |
+
|
| 108 |
+
### MemRAG graph schema (SQLite)
|
| 109 |
+
|
| 110 |
+
```
|
| 111 |
+
documents(id, source_type, uri, title, ingested_at, content_hash)
|
| 112 |
+
chunks(id, doc_id, ordinal, text, embedding_blob, meta_json)
|
| 113 |
+
edges(src_id, dst_id, rel) -- doc->chunk, chunk->next_chunk, chunk->cites
|
| 114 |
+
sessions(id, topic, created_at)
|
| 115 |
+
session_messages(session_id, role, content, chunk_ids_json)
|
| 116 |
+
```
|
| 117 |
+
|
| 118 |
+
- **Persistence** enables cross-session memory: chat loads `session_id` or creates new; retrieval searches all ingested docs unless filtered by session/topic tag
|
| 119 |
+
- **Dedup**: skip re-ingest when `content_hash` matches
|
| 120 |
+
- **Graph expansion (light MemRAG)**: when retrieving chunk `k`, also pull adjacent chunks (`chunk->next_chunk`) from same document for context window assembly
|
| 121 |
+
|
| 122 |
+
### Dependencies (add to `libs/researchmind/pyproject.toml`)
|
| 123 |
+
|
| 124 |
+
- `httpx`, `trafilatura` — web scrape
|
| 125 |
+
- `pypdf` — PDF
|
| 126 |
+
- `python-docx` — already in agent; reuse for `.docx` uploads
|
| 127 |
+
- `sentence-transformers` — offline embeddings
|
| 128 |
+
- `duckduckgo-search` — optional auto-search mode
|
| 129 |
+
- `numpy` — vector ops (or store as bytes in SQLite)
|
| 130 |
+
|
| 131 |
+
Env vars (extend [`.env.example`](.env.example)):
|
| 132 |
+
|
| 133 |
+
| Variable | Default | Purpose |
|
| 134 |
+
|----------|---------|---------|
|
| 135 |
+
| `RESEARCHMIND_DATA_DIR` | `outputs/researchmind` | DB + raw snapshots |
|
| 136 |
+
| `RESEARCHMIND_EMBED_MODEL` | `all-MiniLM-L6-v2` | Embedding model |
|
| 137 |
+
| `RESEARCHMIND_AUTO_SEARCH` | `false` | Global default for auto-search |
|
| 138 |
+
| `RESEARCHMIND_TOP_K` | `5` | Retrieval depth |
|
| 139 |
+
|
| 140 |
+
---
|
| 141 |
+
|
| 142 |
+
## 2. Skills layout (with references + scripts)
|
| 143 |
+
|
| 144 |
+
Create four skill folders under [`skills/`](skills/), mirroring Cursor skill layout but using existing [`SkillRegistry`](libs/agent/src/agent/skills.py) frontmatter (`name`, `description`, `task`, `tools`):
|
| 145 |
+
|
| 146 |
+
### `skills/scrape-web/`
|
| 147 |
+
|
| 148 |
+
```
|
| 149 |
+
scrape-web/
|
| 150 |
+
├── SKILL.md
|
| 151 |
+
├── references/
|
| 152 |
+
│   ├── allowed-domains.md # robots.txt / rate-limit notes
|
| 153 |
+
│   └── html-cleanup.md # trafilatura settings
|
| 154 |
+
└── scripts/
|
| 155 |
+
    └── scrape_url.py # CLI: python scripts/scrape_url.py <url> --out ...
|
| 156 |
+
```
|
| 157 |
+
|
| 158 |
+
- **tools:** `scrape_web`
|
| 159 |
+
- Script calls `researchmind.scrape_web.fetch_and_extract`
|
| 160 |
+
|
| 161 |
+
### `skills/scrape-pdf/`
|
| 162 |
+
|
| 163 |
+
```
|
| 164 |
+
scrape-pdf/
|
| 165 |
+
├── SKILL.md
|
| 166 |
+
├── references/
|
| 167 |
+
│   └── pdf-limits.md # max pages, scanned PDF note
|
| 168 |
+
└── scripts/
|
| 169 |
+
    └── extract_pdf.py
|
| 170 |
+
```
|
| 171 |
+
|
| 172 |
+
- **tools:** `scrape_pdf`
|
| 173 |
+
|
| 174 |
+
### `skills/extract-content/`
|
| 175 |
+
|
| 176 |
+
```
|
| 177 |
+
extract-content/
|
| 178 |
+
├── SKILL.md
|
| 179 |
+
├── references/
|
| 180 |
+
│   └── chunking-policy.md
|
| 181 |
+
└── scripts/
|
| 182 |
+
    └── chunk_and_index.py # ingest into MemRAGStore
|
| 183 |
+
```
|
| 184 |
+
|
| 185 |
+
- **tools:** `extract_and_index`
|
| 186 |
+
|
| 187 |
+
### `skills/research-mind/` (orchestrator)
|
| 188 |
+
|
| 189 |
+
```
|
| 190 |
+
research-mind/
|
| 191 |
+
├── SKILL.md
|
| 192 |
+
├── references/
|
| 193 |
+
│   ├── ingest-modes.md # suggest / auto_search / direct_url
|
| 194 |
+
│   └── citation-format.md
|
| 195 |
+
└── scripts/
|
| 196 |
+
    ├── suggest_urls.py
|
| 197 |
+
    ├── ingest.py
|
| 198 |
+
    └── ask.py # CLI Q&A with citations
|
| 199 |
+
```
|
| 200 |
+
|
| 201 |
+
Frontmatter additions (parsed as optional YAML fields in extended `Skill` dataclass):
|
| 202 |
+
|
| 203 |
+
```yaml
|
| 204 |
+
---
|
| 205 |
+
name: research-mind
|
| 206 |
+
task: research
|
| 207 |
+
tools:
|
| 208 |
+
- suggest_urls
|
| 209 |
+
- scrape_web
|
| 210 |
+
- scrape_pdf
|
| 211 |
+
- extract_and_index
|
| 212 |
+
- research_answer
|
| 213 |
+
flags:
|
| 214 |
+
auto_search: false # skill default; overridden by agent + Gradio
|
| 215 |
+
---
|
| 216 |
+
```
|
| 217 |
+
|
| 218 |
+
Extend [`libs/agent/src/agent/skills.py`](libs/agent/src/agent/skills.py) to read optional `flags:` dict without breaking existing skills.
|
| 219 |
+
|
| 220 |
+
---
|
| 221 |
+
|
| 222 |
+
## 3. Agent orchestration
|
| 223 |
+
|
| 224 |
+
### New tools in [`libs/agent/src/agent/tools_registry.py`](libs/agent/src/agent/tools_registry.py)
|
| 225 |
+
|
| 226 |
+
| Tool | Handler |
|
| 227 |
+
|------|---------|
|
| 228 |
+
| `suggest_urls` | `url_suggest.suggest(topic, backend)` → list[str] |
|
| 229 |
+
| `scrape_web` | fetch + return `ExtractedDocument` |
|
| 230 |
+
| `scrape_pdf` | extract PDF path/bytes |
|
| 231 |
+
| `extract_and_index` | chunk + embed + `MemRAGStore.add_document` |
|
| 232 |
+
| `research_answer` | retrieve + RAG prompt + `backend.chat` → answer + citations |
|
| 233 |
+
|
| 234 |
+
### New runner method in [`libs/agent/src/agent/runner.py`](libs/agent/src/agent/runner.py)
|
| 235 |
+
|
| 236 |
+
```python
|
| 237 |
+
def run_researchmind_ingest(
|
| 238 |
+
*, topic: str | None, urls: list[str], files: list[Path],
|
| 239 |
+
auto_search: bool, session_id: str | None,
|
| 240 |
+
model_key: str, backend: InferenceBackend,
|
| 241 |
+
) -> ResearchIngestResult: ...
|
| 242 |
+
|
| 243 |
+
def run_researchmind_chat(
|
| 244 |
+
*, question: str, session_id: str,
|
| 245 |
+
model_key: str, backend: InferenceBackend,
|
| 246 |
+
) -> ResearchChatResult: ...
|
| 247 |
+
```
|
| 248 |
+
|
| 249 |
+
**Ingest flow (default — Option C)**
|
| 250 |
+
|
| 251 |
+
1. If `topic` and no URLs/files: call `suggest_urls` (local LLM returns JSON URL list)
|
| 252 |
+
2. Return suggested URLs to UI for **user confirmation** (Gradio checkbox group)
|
| 253 |
+
3. On confirm: scrape each URL / PDF / doc → `extract_and_index`
|
| 254 |
+
4. If `auto_search=True`: skip LLM suggest; run DuckDuckGo `search_urls(topic, n=5)` and ingest without confirmation
|
| 255 |
+
|
| 256 |
+
**Chat flow**
|
| 257 |
+
|
| 258 |
+
1. `retrieve(question, top_k)` from `MemRAGStore`
|
| 259 |
+
2. Build system prompt from `skills/research-mind/SKILL.md` body + `references/citation-format.md`
|
| 260 |
+
3. Inject numbered context blocks; instruct model to cite `[n]`
|
| 261 |
+
4. `TraceRecorder` logs retrieval chunk IDs + LLM I/O (Sharing is Caring badge)
|
| 262 |
+
|
| 263 |
+
### Pydantic models in [`libs/agent/src/agent/models.py`](libs/agent/src/agent/models.py)
|
| 264 |
+
|
| 265 |
+
- `ResearchIngestInput`, `ResearchChatInput`, `Citation`, `ResearchChatResult`
|
| 266 |
+
|
| 267 |
+
---
|
| 268 |
+
|
| 269 |
+
## 4. Gradio tab: Research Agent
|
| 270 |
+
|
| 271 |
+
New file: [`apps/gradio-space/src/gradio_space/tabs/research_mind.py`](apps/gradio-space/src/gradio_space/tabs/research_mind.py)
|
| 272 |
+
|
| 273 |
+
Register in [`app.py`](apps/gradio-space/src/gradio_space/app.py) and [`tabs/__init__.py`](apps/gradio-space/src/gradio_space/tabs/__init__.py).
|
| 274 |
+
|
| 275 |
+
### UI layout
|
| 276 |
+
|
| 277 |
+
```
|
| 278 |
+
Research Agent tab
|
| 279 |
+
├── Markdown intro (offline-after-ingest, citations)
|
| 280 |
+
├── Session: dropdown of past sessions + "New session"
|
| 281 |
+
├── Ingest section
|
| 282 |
+
│   ├── Textbox: topic (optional)
|
| 283 |
+
│   ├── Textbox: URLs (one per line, optional)
|
| 284 |
+
│   ├── File: PDF/DOCX upload (optional)
|
| 285 |
+
│   ├── Dropdown: ingest mode
|
| 286 |
+
│   │   ├── "Suggest URLs (confirm)" [default]
|
| 287 |
+
│   │   └── "Auto search & ingest"
|
| 288 |
+
│   ├── Button: "Discover sources" → shows CheckboxGroup of suggested URLs
|
| 289 |
+
│   └── Button: "Ingest selected" → status + doc count
|
| 290 |
+
├── Chat section
|
| 291 |
+
│   ├── Chatbot (history)
|
| 292 |
+
│   ├── Textbox: question
|
| 293 |
+
│   └── Button: Ask
|
| 294 |
+
└── Accordion: trace JSON + ingested sources table
|
| 295 |
+
```
|
| 296 |
+
|
| 297 |
+
**Handler pattern:** mirror `generate_lesson_slides()` — `ensure_model_loaded()`, `AgentRunner()`, try/except with user-visible errors, `gradio_allowed_paths()` extended for `RESEARCHMIND_DATA_DIR`.
|
| 298 |
+
|
| 299 |
+
Update app header in `app.py` to mention ResearchMind alongside Lesson Agent.
|
| 300 |
+
|
| 301 |
+
---
|
| 302 |
+
|
| 303 |
+
## 5. Offline-after-ingest guarantee
|
| 304 |
+
|
| 305 |
+
| Phase | Network |
|
| 306 |
+
|-------|---------|
|
| 307 |
+
| Ingest (scrape/search) | May use network |
|
| 308 |
+
| Embed model first run | HuggingFace download once |
|
| 309 |
+
| Q&A / chat | **No network** — only SQLite + local LLM |
|
| 310 |
+
|
| 311 |
+
Raw HTML/PDF snapshots saved under `RESEARCHMIND_DATA_DIR/raw/{doc_id}/` for audit and re-chunk without re-scrape.
|
| 312 |
+
|
| 313 |
+
---
|
| 314 |
+
|
| 315 |
+
## 6. Tests
|
| 316 |
+
|
| 317 |
+
| Location | Coverage |
|
| 318 |
+
|----------|----------|
|
| 319 |
+
| `libs/researchmind/tests/test_store.py` | SQLite CRUD, dedup hash |
|
| 320 |
+
| `libs/researchmind/tests/test_chunking.py` | chunk boundaries |
|
| 321 |
+
| `libs/researchmind/tests/test_retrieve.py` | top-k with fixture embeddings |
|
| 322 |
+
| `libs/agent/tests/test_research_runner.py` | mock backend; ingest + chat happy path |
|
| 323 |
+
| `libs/researchmind/tests/fixtures/` | small HTML snippet + 1-page PDF |
|
| 324 |
+
|
| 325 |
+
Use offline fixtures for CI; mark optional network tests `@pytest.mark.network`.
|
| 326 |
+
|
| 327 |
+
---
|
| 328 |
+
|
| 329 |
+
## 7. Docker / Space considerations
|
| 330 |
+
|
| 331 |
+
- Add `sentence-transformers` + embedding model to Docker image **or** lazy-download on first ingest (document in README)
|
| 332 |
+
- `allowed_paths` must include `RESEARCHMIND_DATA_DIR` for any file previews
|
| 333 |
+
- GPU not required for embeddings on CPU (MiniLM is small); same GPU preset works for chat
|
| 334 |
+
|
| 335 |
+
---
|
| 336 |
+
|
| 337 |
+
## 8. Implementation order
|
| 338 |
+
|
| 339 |
+
1. **`libs/researchmind`** core: store, chunk, embed, retrieve, citations
|
| 340 |
+
2. **Skills** skeleton: four folders with SKILL.md + references + script stubs calling library
|
| 341 |
+
3. **Agent tools + runner** methods
|
| 342 |
+
4. **Gradio tab** with suggest-confirm flow + auto-search dropdown
|
| 343 |
+
5. **Tests + `.env.example` + README** section under Backyard AI track
|
| 344 |
+
|
| 345 |
+
---
|
| 346 |
+
|
| 347 |
+
## Key files to modify
|
| 348 |
+
|
| 349 |
+
| File | Change |
|
| 350 |
+
|------|--------|
|
| 351 |
+
| [`pyproject.toml`](pyproject.toml) | Add `researchmind` workspace member |
|
| 352 |
+
| [`libs/agent/pyproject.toml`](libs/agent/pyproject.toml) | Depend on `researchmind` |
|
| 353 |
+
| [`apps/gradio-space/pyproject.toml`](apps/gradio-space/pyproject.toml) | Transitive via `agent` |
|
| 354 |
+
| [`libs/agent/src/agent/skills.py`](libs/agent/src/agent/skills.py) | Optional `flags` in frontmatter |
|
| 355 |
+
| [`libs/agent/src/agent/runner.py`](libs/agent/src/agent/runner.py) | `run_researchmind_*` |
|
| 356 |
+
| [`apps/gradio-space/src/gradio_space/app.py`](apps/gradio-space/src/gradio_space/app.py) | Third tab |
|
| 357 |
+
| [`.env.example`](.env.example) | ResearchMind env vars |
|
| 358 |
+
| [`README.md`](README.md) | ResearchMind usage blurb |
|
| 359 |
+
|
| 360 |
+
---
|
| 361 |
+
|
| 362 |
+
## Future (post-MVP, not in this PR)
|
| 363 |
+
|
| 364 |
+
- LoRA distillation on ingested corpus via [`research/finetune.py`](research/finetune.py)
|
| 365 |
+
- Bridge to [`research/ensemble`](research/ensemble/) for ablation experiments
|
| 366 |
+
- Entity extraction edges in MemRAG graph (true knowledge graph)
|
.env.example
CHANGED
|
@@ -9,6 +9,14 @@ ALLOW_MODEL_SWITCH=false
|
|
| 9 |
# AGENT_TRACES_DIR=outputs/traces
|
| 10 |
# SKILLS_DIR=./skills
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
# --- Legacy single-model overrides (optional; applied to ACTIVE_MODEL only) ---
|
| 13 |
# INFERENCE_BACKEND=transformers
|
| 14 |
# MODEL_ID=openbmb/MiniCPM5-1B
|
|
|
|
| 9 |
# AGENT_TRACES_DIR=outputs/traces
|
| 10 |
# SKILLS_DIR=./skills
|
| 11 |
|
| 12 |
+
# --- ResearchMind (MemRAG + scraper) ---
|
| 13 |
+
# RESEARCHMIND_DATA_DIR=outputs/researchmind
|
| 14 |
+
# RESEARCHMIND_EMBED_MODEL=all-MiniLM-L6-v2
|
| 15 |
+
# RESEARCHMIND_AUTO_SEARCH=false
|
| 16 |
+
# RESEARCHMIND_TOP_K=5
|
| 17 |
+
# RESEARCHMIND_CHUNK_SIZE=512
|
| 18 |
+
# RESEARCHMIND_CHUNK_OVERLAP=128
|
| 19 |
+
|
| 20 |
# --- Legacy single-model overrides (optional; applied to ACTIVE_MODEL only) ---
|
| 21 |
# INFERENCE_BACKEND=transformers
|
| 22 |
# MODEL_ID=openbmb/MiniCPM5-1B
|
.gitignore
CHANGED
|
@@ -12,4 +12,6 @@ build/
|
|
| 12 |
|
| 13 |
outputs/traces
|
| 14 |
|
| 15 |
-
/results
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
outputs/traces
|
| 14 |
|
| 15 |
+
/results
|
| 16 |
+
|
| 17 |
+
outputs/researchmind
|
README.md
CHANGED
|
@@ -32,7 +32,10 @@ cp .env.example .env # optional: edit model settings
|
|
| 32 |
uv run --package gradio-space python -m gradio_space.app
|
| 33 |
```
|
| 34 |
|
| 35 |
-
Open [http://localhost:7860](http://localhost:7860).
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
## How it works
|
| 38 |
|
|
@@ -42,13 +45,21 @@ Open [http://localhost:7860](http://localhost:7860). Use the **Lesson slides** t
|
|
| 42 |
4. **Trace** — JSON log saved under `outputs/traces/` for the Sharing is Caring badge
|
| 43 |
|
| 44 |
```text
|
| 45 |
-
apps/gradio-space/ # Gradio tabs (Lesson slides
|
| 46 |
libs/agent/ # Skill agent runner, tools, trace recorder
|
|
|
|
| 47 |
libs/inference/ # Transformers + llama.cpp backends
|
| 48 |
-
skills/ # SKILL.md
|
| 49 |
research/ # Fine-tune, ensemble experiments, agentic evals (optional)
|
| 50 |
```
|
| 51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
Optional research tooling (not required for the Space): see [research/USAGE.md](research/USAGE.md).
|
| 53 |
|
| 54 |
## Environment variables
|
|
@@ -59,6 +70,9 @@ Optional research tooling (not required for the Space): see [research/USAGE.md](
|
|
| 59 |
| `AGENT_OUTPUTS_DIR` | `/tmp/agent_outputs` | Generated `.pptx` files |
|
| 60 |
| `AGENT_TRACES_DIR` | `outputs/traces` | Agent trace JSON |
|
| 61 |
| `SKILLS_DIR` | `./skills` | Skill definitions root |
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
See [`.env.example`](.env.example) and [`models.yaml`](models.yaml) for model presets.
|
| 64 |
|
|
|
|
| 32 |
uv run --package gradio-space python -m gradio_space.app
|
| 33 |
```
|
| 34 |
|
| 35 |
+
Open [http://localhost:7860](http://localhost:7860).
|
| 36 |
+
|
| 37 |
+
- **Lesson slides** — topic, grade, slide count → downloadable PowerPoint
|
| 38 |
+
- **Research Agent** — scrape/index sources into MemRAG, then ask questions offline with citations
|
| 39 |
|
| 40 |
## How it works
|
| 41 |
|
|
|
|
| 45 |
4. **Trace** — JSON log saved under `outputs/traces/` for the Sharing is Caring badge
|
| 46 |
|
| 47 |
```text
|
| 48 |
+
apps/gradio-space/ # Gradio tabs (Lesson slides, Research Agent, Chat debug)
|
| 49 |
libs/agent/ # Skill agent runner, tools, trace recorder
|
| 50 |
+
libs/researchmind/ # Scraper, chunk/embed, MemRAG SQLite store, retrieval
|
| 51 |
libs/inference/ # Transformers + llama.cpp backends
|
| 52 |
+
skills/ # SKILL.md + references/ + scripts/ per task
|
| 53 |
research/ # Fine-tune, ensemble experiments, agentic evals (optional)
|
| 54 |
```
|
| 55 |
|
| 56 |
+
### ResearchMind (offline after ingest)
|
| 57 |
+
|
| 58 |
+
1. **Skills** — `skills/scrape-web`, `scrape-pdf`, `extract-content`, `research-mind`
|
| 59 |
+
2. **Ingest** — URL/PDF/DOCX or topic → (optional LLM URL suggest + confirm, or auto search) → chunk + embed → SQLite
|
| 60 |
+
3. **Q&A** — local model + retrieved chunks with `[n]` citations (no network at chat time)
|
| 61 |
+
4. **Memory** — persists under `RESEARCHMIND_DATA_DIR` (default `outputs/researchmind`)
|
| 62 |
+
|
| 63 |
Optional research tooling (not required for the Space): see [research/USAGE.md](research/USAGE.md).
|
| 64 |
|
| 65 |
## Environment variables
|
|
|
|
| 70 |
| `AGENT_OUTPUTS_DIR` | `/tmp/agent_outputs` | Generated `.pptx` files |
|
| 71 |
| `AGENT_TRACES_DIR` | `outputs/traces` | Agent trace JSON |
|
| 72 |
| `SKILLS_DIR` | `./skills` | Skill definitions root |
|
| 73 |
+
| `RESEARCHMIND_DATA_DIR` | `outputs/researchmind` | MemRAG DB and raw snapshots |
|
| 74 |
+
| `RESEARCHMIND_EMBED_MODEL` | `all-MiniLM-L6-v2` | Sentence embedding model |
|
| 75 |
+
| `RESEARCHMIND_AUTO_SEARCH` | `false` | Default auto DuckDuckGo ingest |
|
| 76 |
|
| 77 |
See [`.env.example`](.env.example) and [`models.yaml`](models.yaml) for model presets.
|
| 78 |
|
apps/gradio-space/src/gradio_space/app.py
CHANGED
|
@@ -3,8 +3,9 @@ import os
|
|
| 3 |
import gradio as gr
|
| 4 |
|
| 5 |
from gradio_space.model_loading import preload_active_model
|
| 6 |
-
from gradio_space.tabs import build_chat_tab, build_education_pptx_tab
|
| 7 |
from gradio_space.tabs.education_pptx import gradio_allowed_paths
|
|
|
|
| 8 |
from inference.config import get_app_config
|
| 9 |
|
| 10 |
_app_config = get_app_config()
|
|
@@ -18,12 +19,12 @@ def build_demo() -> gr.Blocks:
|
|
| 18 |
else "Using built-in presets (models.yaml not found)."
|
| 19 |
)
|
| 20 |
|
| 21 |
-
with gr.Blocks(title="Lesson Agent — Build Small Hackathon") as demo:
|
| 22 |
gr.Markdown(
|
| 23 |
f"""
|
| 24 |
-
# Lesson Agent
|
| 25 |
|
| 26 |
-
Local skill-based
|
| 27 |
|
| 28 |
- **Model:** `{active.key}` — {active.label}
|
| 29 |
- **Backend:** `{active.backend}`
|
|
@@ -36,6 +37,8 @@ Part of the [Build Small Hackathon](https://huggingface.co/build-small-hackathon
|
|
| 36 |
with gr.Tabs():
|
| 37 |
with gr.Tab("Lesson slides"):
|
| 38 |
build_education_pptx_tab()
|
|
|
|
|
|
|
| 39 |
with gr.Tab("Chat (debug)"):
|
| 40 |
build_chat_tab()
|
| 41 |
|
|
@@ -48,7 +51,7 @@ def main() -> None:
|
|
| 48 |
demo.launch(
|
| 49 |
server_name="0.0.0.0",
|
| 50 |
server_port=int(os.environ.get("PORT", "7860")),
|
| 51 |
-
allowed_paths=gradio_allowed_paths(),
|
| 52 |
)
|
| 53 |
|
| 54 |
|
|
|
|
| 3 |
import gradio as gr
|
| 4 |
|
| 5 |
from gradio_space.model_loading import preload_active_model
|
| 6 |
+
from gradio_space.tabs import build_chat_tab, build_education_pptx_tab, build_research_mind_tab
|
| 7 |
from gradio_space.tabs.education_pptx import gradio_allowed_paths
|
| 8 |
+
from gradio_space.tabs.research_mind import researchmind_allowed_paths
|
| 9 |
from inference.config import get_app_config
|
| 10 |
|
| 11 |
_app_config = get_app_config()
|
|
|
|
| 19 |
else "Using built-in presets (models.yaml not found)."
|
| 20 |
)
|
| 21 |
|
| 22 |
+
with gr.Blocks(title="Lesson Agent + ResearchMind — Build Small Hackathon") as demo:
|
| 23 |
gr.Markdown(
|
| 24 |
f"""
|
| 25 |
+
# Lesson Agent + ResearchMind
|
| 26 |
|
| 27 |
+
Local skill-based agents — **lesson slides** and **research with MemRAG** (offline Q&A after ingest).
|
| 28 |
|
| 29 |
- **Model:** `{active.key}` — {active.label}
|
| 30 |
- **Backend:** `{active.backend}`
|
|
|
|
| 37 |
with gr.Tabs():
|
| 38 |
with gr.Tab("Lesson slides"):
|
| 39 |
build_education_pptx_tab()
|
| 40 |
+
with gr.Tab("ResearchMind"):
|
| 41 |
+
build_research_mind_tab()
|
| 42 |
with gr.Tab("Chat (debug)"):
|
| 43 |
build_chat_tab()
|
| 44 |
|
|
|
|
| 51 |
demo.launch(
|
| 52 |
server_name="0.0.0.0",
|
| 53 |
server_port=int(os.environ.get("PORT", "7860")),
|
| 54 |
+
allowed_paths=[*gradio_allowed_paths(), *researchmind_allowed_paths()],
|
| 55 |
)
|
| 56 |
|
| 57 |
|
apps/gradio-space/src/gradio_space/model_loading.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
from inference.config import get_app_config, get_model_config
|
| 2 |
from inference.factory import get_backend, reset_backend
|
|
|
|
| 3 |
|
| 4 |
_app_config = get_app_config()
|
| 5 |
_current_model_key: str | None = None
|
|
@@ -111,4 +112,5 @@ def chat(message: str, history: list, model_key: str) -> str:
|
|
| 111 |
|
| 112 |
messages = _history_to_messages(history)
|
| 113 |
messages.append({"role": "user", "content": message})
|
| 114 |
-
|
|
|
|
|
|
| 1 |
from inference.config import get_app_config, get_model_config
|
| 2 |
from inference.factory import get_backend, reset_backend
|
| 3 |
+
from inference.response_clean import strip_reasoning_output
|
| 4 |
|
| 5 |
_app_config = get_app_config()
|
| 6 |
_current_model_key: str | None = None
|
|
|
|
| 112 |
|
| 113 |
messages = _history_to_messages(history)
|
| 114 |
messages.append({"role": "user", "content": message})
|
| 115 |
+
reply = get_backend(model_key).chat(messages)
|
| 116 |
+
return strip_reasoning_output(reply)
|
apps/gradio-space/src/gradio_space/research_helpers.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
import gradio as gr
|
| 7 |
+
|
| 8 |
+
from agent.models import ResearchIngestResult
|
| 9 |
+
from agent.runner import AgentRunner
|
| 10 |
+
from gradio_space.model_loading import chat, ensure_model_loaded, get_active_model_key
|
| 11 |
+
from inference.factory import get_backend
|
| 12 |
+
from researchmind.ingest import IngestPipeline
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def list_session_choices() -> list[tuple[str, str]]:
    """Build the ``(label, value)`` pairs offered by the session dropdowns.

    The first entry is always the "new session" placeholder whose value is
    the empty string; it is followed by one entry per session returned by
    the ingest store, labelled with the session topic (or ``Untitled``) and
    its id.
    """
    known_sessions = IngestPipeline().store.list_sessions()
    placeholder: tuple[str, str] = ("New session (chat only)", "")
    return [
        placeholder,
        *((f"{s.topic or 'Untitled'} ({s.id})", s.id) for s in known_sessions),
    ]
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def refresh_sessions(current: str):
    """Return a Gradio update that reloads the session dropdown.

    The current selection is kept only if it is still one of the listed
    session values; otherwise the dropdown falls back to the empty
    ("new session") value.
    """
    options = list_session_choices()
    still_listed = any(value == current for _, value in options)
    return gr.update(choices=options, value=current if still_listed else "")
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def list_doc_choices(session_id: str | None) -> list[tuple[str, str]]:
    """Build ``(label, doc_id)`` pairs for the document checkbox groups.

    Args:
        session_id: Session whose documents are listed. An empty string or
            ``None`` is forwarded to the store as ``None``.

    Returns:
        One pair per document. The label is ``"<title> (<source_type>) —
        <uri>"``; URIs longer than 60 characters are cut to their first 57
        characters followed by an ellipsis so labels stay readable.
    """
    docs = IngestPipeline().store.list_documents(session_id=session_id or None)
    choices: list[tuple[str, str]] = []
    for doc in docs:
        # The separator and ellipsis were mis-encoded ("β", "β¦") in this
        # copy of the file; they are restored to the intended characters.
        shown_uri = f"{doc.uri[:57]}…" if len(doc.uri) > 60 else doc.uri
        choices.append((f"{doc.title} ({doc.source_type}) — {shown_uri}", doc.id))
    return choices
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def refresh_doc_choices(session_id: str, current: list[str] | None):
    """Return a Gradio update that reloads the document checkbox group.

    Previously ticked ids that are no longer offered are dropped. When
    nothing valid remains ticked, every offered document is ticked by
    default (which is an empty selection if there are no documents).
    """
    options = list_doc_choices(session_id or None)
    offered_ids = [doc_id for _, doc_id in options]
    offered_set = set(offered_ids)
    kept = [doc_id for doc_id in (current or []) if doc_id in offered_set]
    return gr.update(choices=options, value=kept or offered_ids)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def load_trace_json(trace_path: str) -> str:
    """Resolve a trace reference to the text that should be displayed.

    Args:
        trace_path: Either inline JSON (text starting with ``{`` once
            stripped), a filesystem path to a trace file, or any other text.

    Returns:
        ``""`` for an empty input, the input itself when it already looks
        like JSON, the UTF-8 contents of the file when the input names an
        existing file, and otherwise the input unchanged.
    """
    if not trace_path:
        return ""
    if trace_path.strip().startswith("{"):
        return trace_path

    path = Path(trace_path)
    try:
        is_file = path.is_file()
    except (OSError, ValueError):
        # Arbitrary text (for example a long error message) is a legal
        # argument here; probing it as a path can fail with "file name too
        # long" or similar, in which case it is simply not a file.
        is_file = False

    if is_file:
        return path.read_text(encoding="utf-8")
    return trace_path
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def trace_summary_markdown(trace_path: str) -> str:
    """Summarise a trace as a short Markdown bullet list.

    Args:
        trace_path: Inline trace JSON, a path to a trace file, or plain text
            (resolved through ``load_trace_json``).

    Returns:
        * ``"_No trace yet._"`` when nothing could be loaded;
        * the loaded text unchanged when it does not look like JSON;
        * a ``Trace file: ...`` line when the text looks like JSON but does
          not parse;
        * otherwise a header with the run id and skill, followed by one
          bullet per step of type ``"note"`` (message plus any extra keys).
    """
    raw = load_trace_json(trace_path)
    if not raw or not raw.strip().startswith("{"):
        return raw or "_No trace yet._"
    try:
        data = json.loads(raw)
    except json.JSONDecodeError:
        return f"Trace file: `{trace_path}`"

    # The "·" and "—" separators were mis-encoded in this copy of the file
    # ("Β·", "β"); they are restored here.
    lines = [
        f"**Run** `{data.get('run_id', '?')}` · skill `{data.get('skill', '?')}`",
        "",
    ]
    # Tolerate `"steps": null` and stray non-object entries instead of
    # failing the whole summary.
    for step in data.get("steps") or []:
        if not isinstance(step, dict) or step.get("type") != "note":
            continue
        msg = step.get("message", "")
        extra = {k: v for k, v in step.items() if k not in ("type", "message")}
        detail = ""
        if extra:
            detail = " — " + ", ".join(f"{k}={v!r}" for k, v in extra.items())
        lines.append(f"- {msg}{detail}")
    # Only the header and the blank line are present: no note was rendered.
    if len(lines) <= 2:
        lines.append("_No notes in trace. See Trace JSON below._")
    return "\n".join(lines)
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def format_ingest_status(result: ResearchIngestResult) -> str:
    """Render an ingest result as Markdown for the status panel.

    Args:
        result: Ingest outcome; its ``message``, ``ingested``, ``skipped``
            and ``failures`` (each with ``url``, ``stage``, ``reason``) are
            read.

    Returns:
        The result message followed by an "Ingested", "Skipped (duplicate)"
        and "Failed" section for each non-empty list, and a closing pointer
        to the Trace tab. Surrounding whitespace is stripped.
    """
    sections: list[str] = [result.message, ""]
    if result.ingested:
        sections += ["**Ingested**", *(f"- {url}" for url in result.ingested), ""]
    if result.skipped:
        sections += ["**Skipped (duplicate)**", *(f"- {url}" for url in result.skipped), ""]
    if result.failures:
        sections.append("**Failed**")
        # The dash separator was mis-encoded ("β") in this copy of the file.
        sections.extend(
            f"- `{failure.url}` — _{failure.stage}_: {failure.reason}"
            for failure in result.failures
        )
        sections.append("")
    sections.append("_Open the **Trace** tab for full JSON._")
    return "\n".join(sections).strip()
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def memory_summary(session_id: str) -> str:
    """Describe the indexed documents for the Memory tab as Markdown.

    Args:
        session_id: Session to list; an empty string lists documents across
            all sessions.

    Returns:
        A "no documents" notice when the listing is empty, otherwise a
        header with the document count and scope followed by one bullet per
        document. The chunk count shown is the store-wide total reported by
        ``count_chunks()``, not a per-session figure.
    """
    store = IngestPipeline().store
    docs = store.list_documents(session_id=session_id or None)
    chunks = store.count_chunks()
    if not docs:
        return f"_No documents indexed yet._ Total chunks in store: **{chunks}**."

    scope = f"session `{session_id}`" if session_id else "all sessions"
    # The "·" and "—" separators were mis-encoded ("Β·", "β") in this copy
    # of the file; they are restored here.
    header = f"**{len(docs)}** document(s) in {scope} · **{chunks}** total chunks in store\n"
    bullets = (f"- **{d.title}** (`{d.source_type}`) — {d.uri}" for d in docs)
    return "\n".join([header, *bullets])
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def rag_scope_hint(session_id: str, doc_ids: list[str] | None) -> str:
    """Explain in one Markdown line which documents a question will search.

    An explicit document selection wins; otherwise the scope is the whole
    session (its document count is looked up in the store), and with neither
    a selection nor a session it is the entire indexed corpus.
    """
    if doc_ids:
        return f"RAG scope: **{len(doc_ids)}** selected document(s)."
    if not session_id:
        return "RAG scope: **entire** indexed corpus (all sessions)."
    session_docs = IngestPipeline().store.list_documents(session_id=session_id)
    return f"RAG scope: all **{len(session_docs)}** document(s) in session `{session_id}`."
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def run_research_question(
    question: str,
    *,
    session_id: str,
    doc_ids: list[str] | None,
    model_key: str | None = None,
) -> tuple[str, str, str]:
    """Returns (answer_markdown, trace_json, trace_summary_md).

    Args:
        question: The user's question. A blank question is rejected before
            any model is loaded.
        session_id: Session to query; when empty a new session is created
            in the ingest store and used instead.
        doc_ids: Optional document ids to restrict retrieval to; an empty
            list is passed on as ``None``.
        model_key: Model to use; defaults to the active model key.

    Returns:
        ``(answer, trace_json, trace_summary)``. On a model load error all
        three elements carry the error text. ``trace_json`` is a JSON
        document with the trace path, the dumped citations and the scope
        that was used.
    """
    # Validate the cheap precondition first so an empty submit never
    # triggers a model load.
    if not (question or "").strip():
        return "Enter a question.", "", ""

    key = model_key or get_active_model_key()
    load_error = ensure_model_loaded(key)
    if load_error:
        return load_error, load_error, load_error

    sid = session_id or IngestPipeline().store.create_session().id

    runner = AgentRunner()
    result = runner.run_researchmind_chat(
        question=question,
        session_id=sid,
        doc_ids=doc_ids or None,
        model_key=key,
        backend=get_backend(key),
    )
    trace_json = json.dumps(
        {
            "trace_path": result.trace_path,
            "citations": [c.model_dump() for c in result.citations],
            "scope": {
                "session_id": sid,
                "doc_ids": doc_ids or [],
            },
        },
        indent=2,
    )
    return (
        result.answer,
        trace_json,
        trace_summary_markdown(result.trace_path),
    )
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def rag_aware_chat(
    message: str,
    history: list,
    model_key: str,
    use_rag: bool,
    session_id: str,
    doc_ids: list[str] | None,
) -> str:
    """Answer a chat message either with plain chat or with ResearchMind RAG.

    With ``use_rag`` off the message and history go straight to ``chat``.
    With it on, the message is answered by ``run_research_question`` for
    the given session/documents and only the answer text is returned (the
    chat history is not forwarded on that path).
    """
    if use_rag:
        answer, _trace_json, _trace_summary = run_research_question(
            message,
            session_id=session_id,
            doc_ids=doc_ids,
            model_key=model_key,
        )
        return answer
    return chat(message, history, model_key)
|
apps/gradio-space/src/gradio_space/tabs/__init__.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
from gradio_space.tabs.chat import build_chat_tab
|
| 2 |
from gradio_space.tabs.education_pptx import build_education_pptx_tab
|
|
|
|
| 3 |
|
| 4 |
-
__all__ = ["build_chat_tab", "build_education_pptx_tab"]
|
|
|
|
| 1 |
from gradio_space.tabs.chat import build_chat_tab
|
| 2 |
from gradio_space.tabs.education_pptx import build_education_pptx_tab
|
| 3 |
+
from gradio_space.tabs.research_mind import build_research_mind_tab
|
| 4 |
|
| 5 |
+
__all__ = ["build_chat_tab", "build_education_pptx_tab", "build_research_mind_tab"]
|
apps/gradio-space/src/gradio_space/tabs/chat.py
CHANGED
|
@@ -1,6 +1,13 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
|
| 3 |
-
from gradio_space.model_loading import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
from inference.config import get_app_config
|
| 5 |
|
| 6 |
_app_config = get_app_config()
|
|
@@ -11,12 +18,29 @@ def build_chat_tab() -> None:
|
|
| 11 |
"""
|
| 12 |
### Model chat (debug)
|
| 13 |
|
| 14 |
-
Test the active local model
|
| 15 |
"""
|
| 16 |
)
|
| 17 |
|
| 18 |
model_key = _app_config.active_model
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
if _app_config.allow_model_switch and len(_app_config.models) > 1:
|
| 21 |
model_dropdown = gr.Dropdown(
|
| 22 |
choices=_app_config.model_choices(),
|
|
@@ -26,19 +50,42 @@ Test the active local model with a simple chat interface.
|
|
| 26 |
status = gr.Markdown(model_status(model_key))
|
| 27 |
model_dropdown.change(fn=model_status, inputs=model_dropdown, outputs=status)
|
| 28 |
gr.ChatInterface(
|
| 29 |
-
fn=
|
| 30 |
-
additional_inputs=[model_dropdown],
|
| 31 |
examples=[
|
| 32 |
-
["
|
| 33 |
-
["
|
| 34 |
],
|
| 35 |
)
|
| 36 |
else:
|
| 37 |
status = gr.Markdown(model_status(model_key))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
gr.ChatInterface(
|
| 39 |
-
fn=
|
|
|
|
| 40 |
examples=[
|
| 41 |
-
"
|
| 42 |
-
"
|
| 43 |
],
|
| 44 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
|
| 3 |
+
from gradio_space.model_loading import model_status
|
| 4 |
+
from gradio_space.research_helpers import (
|
| 5 |
+
list_session_choices,
|
| 6 |
+
rag_aware_chat,
|
| 7 |
+
rag_scope_hint,
|
| 8 |
+
refresh_doc_choices,
|
| 9 |
+
refresh_sessions,
|
| 10 |
+
)
|
| 11 |
from inference.config import get_app_config
|
| 12 |
|
| 13 |
_app_config = get_app_config()
|
|
|
|
| 18 |
"""
|
| 19 |
### Model chat (debug)
|
| 20 |
|
| 21 |
+
Test the active local model. Enable **ResearchMind RAG** to answer from ingested sessions and documents with citations.
|
| 22 |
"""
|
| 23 |
)
|
| 24 |
|
| 25 |
model_key = _app_config.active_model
|
| 26 |
|
| 27 |
+
with gr.Row():
|
| 28 |
+
use_rag = gr.Checkbox(label="Use ResearchMind RAG", value=False)
|
| 29 |
+
session_dd = gr.Dropdown(
|
| 30 |
+
label="Session",
|
| 31 |
+
choices=list_session_choices(),
|
| 32 |
+
value="",
|
| 33 |
+
interactive=True,
|
| 34 |
+
)
|
| 35 |
+
refresh_sessions_btn = gr.Button("Refresh", size="sm")
|
| 36 |
+
|
| 37 |
+
doc_dd = gr.CheckboxGroup(
|
| 38 |
+
label="Documents to search (empty = all docs in session, or entire corpus if no session)",
|
| 39 |
+
choices=[],
|
| 40 |
+
value=[],
|
| 41 |
+
)
|
| 42 |
+
rag_hint = gr.Markdown(value=rag_scope_hint("", []))
|
| 43 |
+
|
| 44 |
if _app_config.allow_model_switch and len(_app_config.models) > 1:
|
| 45 |
model_dropdown = gr.Dropdown(
|
| 46 |
choices=_app_config.model_choices(),
|
|
|
|
| 50 |
status = gr.Markdown(model_status(model_key))
|
| 51 |
model_dropdown.change(fn=model_status, inputs=model_dropdown, outputs=status)
|
| 52 |
gr.ChatInterface(
|
| 53 |
+
fn=rag_aware_chat,
|
| 54 |
+
additional_inputs=[model_dropdown, use_rag, session_dd, doc_dd],
|
| 55 |
examples=[
|
| 56 |
+
["What do my ingested sources say about AI agents?", _app_config.active_model, True, "", []],
|
| 57 |
+
["Hello! What can you help me with?", _app_config.active_model, False, "", []],
|
| 58 |
],
|
| 59 |
)
|
| 60 |
else:
|
| 61 |
status = gr.Markdown(model_status(model_key))
|
| 62 |
+
|
| 63 |
+
def _chat(message, history, use_rag_flag, sid, docs):
|
| 64 |
+
return rag_aware_chat(message, history, model_key, use_rag_flag, sid, docs)
|
| 65 |
+
|
| 66 |
gr.ChatInterface(
|
| 67 |
+
fn=_chat,
|
| 68 |
+
additional_inputs=[use_rag, session_dd, doc_dd],
|
| 69 |
examples=[
|
| 70 |
+
["What do my ingested sources say about AI agents?", True, "", []],
|
| 71 |
+
["Hello! What can you help me with?", False, "", []],
|
| 72 |
],
|
| 73 |
)
|
| 74 |
+
|
| 75 |
+
def _update_hint(sid: str, docs: list[str] | None, rag_on: bool) -> str:
|
| 76 |
+
if not rag_on:
|
| 77 |
+
return "_Plain chat β model only, no document retrieval._"
|
| 78 |
+
return rag_scope_hint(sid, docs)
|
| 79 |
+
|
| 80 |
+
refresh_sessions_btn.click(fn=refresh_sessions, inputs=[session_dd], outputs=[session_dd])
|
| 81 |
+
session_dd.change(
|
| 82 |
+
fn=refresh_doc_choices,
|
| 83 |
+
inputs=[session_dd, doc_dd],
|
| 84 |
+
outputs=[doc_dd],
|
| 85 |
+
).then(
|
| 86 |
+
fn=_update_hint,
|
| 87 |
+
inputs=[session_dd, doc_dd, use_rag],
|
| 88 |
+
outputs=[rag_hint],
|
| 89 |
+
)
|
| 90 |
+
doc_dd.change(fn=_update_hint, inputs=[session_dd, doc_dd, use_rag], outputs=[rag_hint])
|
| 91 |
+
use_rag.change(fn=_update_hint, inputs=[session_dd, doc_dd, use_rag], outputs=[rag_hint])
|
apps/gradio-space/src/gradio_space/tabs/research_mind.py
ADDED
|
@@ -0,0 +1,366 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
import gradio as gr
|
| 7 |
+
|
| 8 |
+
from agent.runner import AgentRunner
|
| 9 |
+
from gradio_space.model_loading import ensure_model_loaded, get_active_model_key, model_status
|
| 10 |
+
from gradio_space.research_helpers import (
|
| 11 |
+
format_ingest_status,
|
| 12 |
+
list_session_choices,
|
| 13 |
+
load_trace_json,
|
| 14 |
+
memory_summary,
|
| 15 |
+
rag_scope_hint,
|
| 16 |
+
refresh_doc_choices,
|
| 17 |
+
refresh_sessions,
|
| 18 |
+
run_research_question,
|
| 19 |
+
trace_summary_markdown,
|
| 20 |
+
)
|
| 21 |
+
from inference.factory import get_backend
|
| 22 |
+
from researchmind.config import get_config
|
| 23 |
+
from researchmind.ingest import IngestPipeline
|
| 24 |
+
|
| 25 |
+
logger = logging.getLogger(__name__)
|
| 26 |
+
|
| 27 |
+
INGEST_MODES = [
|
| 28 |
+
("Suggest URLs (confirm)", "suggest"),
|
| 29 |
+
("Auto search & ingest", "auto"),
|
| 30 |
+
]
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def discover_sources(
    topic: str,
    ingest_mode: str,
    session_id: str,
) -> tuple[str, gr.Update, str, str, str, str, object]:
    """Handle the "Discover sources" button.

    Args:
        topic: Topic to search for; must not be blank.
        ingest_mode: ``"auto"`` searches and ingests immediately; any other
            value only suggests URLs for the user to confirm.
        session_id: Currently selected session id ("" for none).

    Returns:
        A 7-tuple matching the button's outputs: status Markdown, update for
        the suggested-URL checkbox group, session id, trace summary, trace
        JSON, memory summary, and update for the document checkbox group.
        On any failure the message is shown in the status and both trace
        slots, and the incoming session id is kept.
    """

    def _failure(message: str) -> tuple[str, gr.Update, str, str, str, str, object]:
        # Single definition of the error-shaped result used by every
        # early-exit path below.
        return (
            message,
            gr.update(choices=[], value=[]),
            session_id,
            message,
            message,
            memory_summary(session_id),
            refresh_doc_choices(session_id, []),
        )

    model_key = get_active_model_key()
    load_error = ensure_model_loaded(model_key)
    if load_error:
        return _failure(load_error)

    if not topic.strip():
        return _failure("Enter a topic to discover sources.")

    auto_search = ingest_mode == "auto"
    try:
        runner = AgentRunner()
        if auto_search:
            result = runner.run_researchmind_ingest(
                topic=topic,
                urls=[],
                files=[],
                auto_search=True,
                session_id=session_id or None,
                model_key=model_key,
                backend=get_backend(model_key),
            )
            trace_json = load_trace_json(result.trace_path)
            return (
                format_ingest_status(result),
                gr.update(choices=[], value=[]),
                result.session_id,
                trace_summary_markdown(result.trace_path),
                trace_json,
                memory_summary(result.session_id),
                refresh_doc_choices(result.session_id, []),
            )

        discover = runner.run_researchmind_discover(
            topic=topic,
            auto_search=False,
            session_id=session_id or None,
            model_key=model_key,
            backend=get_backend(model_key),
        )
        choices = discover.suggested_urls
        if not choices:
            summary = (
                "No verified URLs found. Try a more specific topic, paste URLs manually, "
                "or switch to **Auto search & ingest**."
            )
        else:
            summary = (
                f"Found **{len(choices)} verified URL(s)** via web search "
                f"(Google + fallbacks). Select sources and click **Ingest selected**."
            )
        trace_json = load_trace_json(discover.trace_path)
        # NOTE(review): the session id is returned as a bare value for the
        # session dropdown, whereas ingest_selected returns
        # refresh_sessions(...). If the runner created a new session here it
        # may not be among the dropdown's choices yet; confirm.
        return (
            summary,
            gr.update(choices=choices, value=choices),
            discover.session_id,
            trace_summary_markdown(discover.trace_path),
            trace_json,
            memory_summary(discover.session_id),
            refresh_doc_choices(discover.session_id, []),
        )
    except Exception as exc:  # noqa: BLE001
        # UI boundary: report the error to the user, but keep the traceback
        # in the logs like the other handlers in this module do.
        logger.exception("Discover failed")
        return _failure(f"Discover error: {exc}")
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def ingest_selected(
    topic: str,
    urls_text: str,
    selected_urls: list[str],
    upload_files: list[str] | None,
    session_id: str,
) -> tuple[str, str, str, str, object, object]:
    """Handle the "Ingest selected" button.

    Pasted URLs (one per line) and ticked suggested URLs are merged with
    duplicates removed in first-seen order; uploaded files are passed on as
    paths. Returns a 6-tuple matching the button's outputs: status Markdown,
    memory summary, trace JSON, trace summary, session dropdown update and
    document checkbox update. Failures put the message in the status and
    both trace slots and keep the incoming session id.
    """

    def _failed(message: str) -> tuple[str, str, str, str, object, object]:
        return (
            message,
            memory_summary(session_id),
            message,
            message,
            refresh_sessions(session_id),
            refresh_doc_choices(session_id, []),
        )

    model_key = get_active_model_key()
    load_error = ensure_model_loaded(model_key)
    if load_error:
        return _failed(load_error)

    typed_urls = [entry for entry in map(str.strip, urls_text.splitlines()) if entry]
    # dict.fromkeys de-duplicates while preserving order.
    all_urls = list(dict.fromkeys(typed_urls + list(selected_urls or [])))
    files = [Path(uploaded) for uploaded in (upload_files or [])]

    if not (all_urls or files):
        return _failed("Provide URLs, select suggested sources, or upload a file.")

    try:
        logger.info("Ingesting %d URL(s) and %d file(s)", len(all_urls), len(files))
        result = AgentRunner().run_researchmind_ingest(
            topic=topic or None,
            urls=all_urls,
            files=files,
            auto_search=False,
            session_id=session_id or None,
            model_key=model_key,
            backend=get_backend(model_key),
        )
        new_session = result.session_id
        trace_json = load_trace_json(result.trace_path)
        return (
            format_ingest_status(result),
            memory_summary(new_session),
            trace_json,
            trace_summary_markdown(result.trace_path),
            refresh_sessions(new_session),
            refresh_doc_choices(new_session, []),
        )
    except Exception as exc:  # noqa: BLE001
        logger.exception("Ingest failed")
        return _failed(f"**Ingest error:** {exc}")
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
def ask_question(
    question: str,
    session_id: str,
    doc_ids: list[str] | None,
    chat_history: list[dict],
) -> tuple[list[dict], str, str, str]:
    """Handle a question in the research chat.

    Returns the updated chat history (a new list with the user turn and the
    assistant turn appended), the trace JSON, the trace summary and the
    scope hint. A blank question leaves the history untouched and reports
    "Enter a question."; an exception is logged and shown as the assistant
    reply and in both trace slots.
    """
    if not question.strip():
        return chat_history or [], "Enter a question.", "", rag_scope_hint(session_id, doc_ids)

    user_turn = {"role": "user", "content": question}
    try:
        answer, trace_json, trace_summary = run_research_question(
            question,
            session_id=session_id,
            doc_ids=doc_ids,
        )
        transcript = [*(chat_history or []), user_turn]
        transcript.append({"role": "assistant", "content": answer})
        return transcript, trace_json, trace_summary, rag_scope_hint(session_id, doc_ids)
    except Exception as exc:  # noqa: BLE001
        logger.exception("Research chat failed")
        err = f"Chat error: {exc}"
        transcript = [*(chat_history or []), user_turn]
        transcript.append({"role": "assistant", "content": err})
        return transcript, err, err, rag_scope_hint(session_id, doc_ids)
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
def build_research_mind_tab() -> None:
    """ResearchMind UI — ingest, memory, trace, and corpus chat.

    Must be called inside an open Gradio Blocks/Tab context: it creates the
    components in place and wires their events. Layout, top to bottom:
    intro and model status, session selector, an Ingest/Memory/Trace tab
    group, then the corpus chat area.
    """
    model_key = get_active_model_key()
    cfg = get_config()

    gr.Markdown(
        """
### ResearchMind

Scrape sources once, index into **MemRAG** (local SQLite + embeddings), then ask questions **offline** with citations.
"""
    )
    gr.Markdown(model_status(model_key))
    gr.Markdown(f"Memory store: `{cfg.data_dir.resolve()}`")

    with gr.Row():
        session_dd = gr.Dropdown(
            label="Session",
            choices=list_session_choices(),
            value="",
            interactive=True,
        )
        refresh_btn = gr.Button("Refresh sessions", size="sm")

    with gr.Tabs():
        with gr.Tab("Ingest"):
            # The arrows below were mis-encoded ("β") in this copy of the
            # file and are restored.
            gr.Markdown(
                """
- **Suggest mode:** Google web search → verified URLs → you confirm → ingest
- **Auto search:** same search, ingests top verified URLs immediately
- **Direct:** paste URLs or upload PDF/DOCX
"""
            )
            with gr.Row():
                topic = gr.Textbox(
                    label="Topic (optional)",
                    placeholder="e.g. Photosynthesis, American Revolution",
                )
                ingest_mode = gr.Dropdown(
                    label="Ingest mode",
                    choices=[m[0] for m in INGEST_MODES],
                    value=INGEST_MODES[0][0],
                )

            urls_text = gr.Textbox(
                label="URLs (one per line, optional)",
                lines=3,
                placeholder="https://en.wikipedia.org/wiki/...",
            )
            upload_files = gr.File(
                label="Upload PDF or DOCX",
                file_count="multiple",
                file_types=[".pdf", ".docx"],
            )

            discover_btn = gr.Button("Discover sources", variant="secondary")
            url_choices = gr.CheckboxGroup(label="Suggested URLs to ingest", choices=[])
            ingest_btn = gr.Button("Ingest selected", variant="primary")
            ingest_status = gr.Markdown()

        with gr.Tab("Memory"):
            gr.Markdown("Indexed documents and chunk counts for the selected session.")
            memory_md = gr.Markdown(value=memory_summary(""))
            refresh_memory_btn = gr.Button("Refresh memory view", size="sm")

        with gr.Tab("Trace"):
            trace_summary = gr.Markdown()
            trace_box = gr.Textbox(label="Trace JSON", lines=14, interactive=False)

    gr.Markdown("---")
    gr.Markdown("### Chat with your corpus")
    gr.Markdown(
        "Ask questions about ingested sources. Limit search to specific documents below, "
        "or leave all checked to search the whole session."
    )
    rag_hint = gr.Markdown(value=rag_scope_hint("", []))
    doc_dd = gr.CheckboxGroup(
        label="Documents in session",
        choices=[],
        value=[],
    )
    chatbot = gr.Chatbot(label="Research chat", height=360)
    question = gr.Textbox(
        label="Question",
        placeholder="What do these sources say about AI agents?",
    )
    ask_btn = gr.Button("Ask", variant="primary")

    def _on_discover(topic_text: str, mode_label: str, sid: str):
        # The dropdown carries display labels; translate the "auto" label to
        # the mode string discover_sources expects, anything else is suggest.
        auto_label = INGEST_MODES[1][0]
        return discover_sources(
            topic_text,
            "auto" if mode_label == auto_label else "suggest",
            sid,
        )

    # --- event wiring -----------------------------------------------------
    refresh_btn.click(fn=refresh_sessions, inputs=[session_dd], outputs=[session_dd])
    refresh_memory_btn.click(fn=memory_summary, inputs=[session_dd], outputs=[memory_md])
    session_dd.change(fn=memory_summary, inputs=[session_dd], outputs=[memory_md])
    # Changing the session reloads the document list first, then the hint,
    # so the hint reflects the refreshed selection.
    session_dd.change(
        fn=refresh_doc_choices,
        inputs=[session_dd, doc_dd],
        outputs=[doc_dd],
    ).then(
        fn=rag_scope_hint,
        inputs=[session_dd, doc_dd],
        outputs=[rag_hint],
    )
    doc_dd.change(fn=rag_scope_hint, inputs=[session_dd, doc_dd], outputs=[rag_hint])

    discover_btn.click(
        fn=_on_discover,
        inputs=[topic, ingest_mode, session_dd],
        outputs=[
            ingest_status,
            url_choices,
            session_dd,
            trace_summary,
            trace_box,
            memory_md,
            doc_dd,
        ],
    )

    ingest_btn.click(
        fn=ingest_selected,
        inputs=[topic, urls_text, url_choices, upload_files, session_dd],
        outputs=[ingest_status, memory_md, trace_box, trace_summary, session_dd, doc_dd],
    )

    # The Ask button and pressing Enter in the question box do the same thing.
    ask_btn.click(
        fn=ask_question,
        inputs=[question, session_dd, doc_dd, chatbot],
        outputs=[chatbot, trace_box, trace_summary, rag_hint],
    )
    question.submit(
        fn=ask_question,
        inputs=[question, session_dd, doc_dd, chatbot],
        outputs=[chatbot, trace_box, trace_summary, rag_hint],
    )
|
| 360 |
+
|
| 361 |
+
|
| 362 |
+
def researchmind_allowed_paths() -> list[str]:
    """Return the ResearchMind data directory as a one-element path list.

    The directory is resolved to an absolute path and created (including
    parents) if it does not exist yet, so the returned path always exists.
    """
    data_root = get_config().data_dir.resolve()
    data_root.mkdir(parents=True, exist_ok=True)
    return [str(data_root)]
|
libs/agent/pyproject.toml
CHANGED
|
@@ -9,6 +9,7 @@ authors = [
|
|
| 9 |
requires-python = ">=3.12"
|
| 10 |
dependencies = [
|
| 11 |
"inference",
|
|
|
|
| 12 |
"pillow>=10.0.0",
|
| 13 |
"pydantic>=2.0.0",
|
| 14 |
"python-docx>=1.1.0",
|
|
@@ -18,6 +19,7 @@ dependencies = [
|
|
| 18 |
|
| 19 |
[tool.uv.sources]
|
| 20 |
inference = { workspace = true }
|
|
|
|
| 21 |
|
| 22 |
[build-system]
|
| 23 |
requires = ["uv_build>=0.8.13,<0.9.0"]
|
|
|
|
| 9 |
requires-python = ">=3.12"
|
| 10 |
dependencies = [
|
| 11 |
"inference",
|
| 12 |
+
"researchmind",
|
| 13 |
"pillow>=10.0.0",
|
| 14 |
"pydantic>=2.0.0",
|
| 15 |
"python-docx>=1.1.0",
|
|
|
|
| 19 |
|
| 20 |
[tool.uv.sources]
|
| 21 |
inference = { workspace = true }
|
| 22 |
+
researchmind = { workspace = true }
|
| 23 |
|
| 24 |
[build-system]
|
| 25 |
requires = ["uv_build>=0.8.13,<0.9.0"]
|
libs/agent/src/agent/models.py
CHANGED
|
@@ -18,3 +18,55 @@ class EducationPptxInput(BaseModel):
|
|
| 18 |
topic: str
|
| 19 |
grade: str
|
| 20 |
slide_count: int = Field(ge=3, le=8)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
topic: str
|
| 19 |
grade: str
|
| 20 |
slide_count: int = Field(ge=3, le=8)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class Citation(BaseModel):
    """A single citation: a numeric index plus the cited chunk and document."""

    # presumably the [n] number used in the answer text; confirm against the runner
    index: int
    chunk_id: str
    doc_title: str
    doc_uri: str
    # text excerpt for the citation; length/format not visible from here
    excerpt: str
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class ResearchIngestInput(BaseModel):
    """Input fields for a research ingest request; every field has a default."""

    topic: str = ""
    # default_factory gives each instance its own list
    urls: list[str] = Field(default_factory=list)
    auto_search: bool = False
    # None when no session is specified
    session_id: str | None = None
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class ResearchChatInput(BaseModel):
    """Input fields for a research chat question; question and session are required."""

    question: str
    session_id: str
    # defaults to an empty list; presumably "no document restriction" (confirm with the runner)
    doc_ids: list[str] = Field(default_factory=list)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
class ResearchDiscoverResult(BaseModel):
    """Result of the URL discovery phase."""

    suggested_urls: list[str]
    session_id: str
    trace_path: str  # stringified path of the saved trace
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class IngestFailure(BaseModel):
    """Record of a single source that could not be ingested."""

    url: str
    reason: str
    stage: str = "unknown"  # pipeline step where the failure occurred
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class ResearchIngestResult(BaseModel):
    """Summary of an ingest run."""

    session_id: str
    ingested: list[str]  # sources that were newly indexed
    skipped: list[str]  # sources that were not newly indexed
    failures: list[IngestFailure] = Field(default_factory=list)
    doc_count: int
    chunk_count: int
    trace_path: str
    message: str  # human-readable summary of the run
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class ResearchChatResult(BaseModel):
    """Result of one research chat turn."""

    answer: str
    citations: list[Citation]
    references_markdown: str
    session_id: str
    trace_path: str
|
libs/agent/src/agent/research_prompts.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def _load_reference(skill_path: Path, rel: str) -> str:
    """Return the text of a reference file stored next to a skill file.

    ``rel`` is resolved against the directory that contains ``skill_path``.
    The file is read as UTF-8. An empty string is returned when the resolved
    path is not an existing regular file.
    """
    candidate = skill_path.parent.joinpath(rel)
    return candidate.read_text(encoding="utf-8") if candidate.is_file() else ""
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def research_answer_system(skill_body: str, skill_path: Path) -> str:
    """Build the system prompt for ResearchMind answers.

    The prompt is the fixed citation rules, followed by ``skill_body`` and,
    when it exists next to the skill file, the contents of
    ``references/citation-format.md``. Sections are joined by blank lines.

    Args:
        skill_body: Instruction text of the skill, inserted verbatim.
        skill_path: Path of the skill file; used to locate the reference file.

    Returns:
        The assembled system prompt.
    """
    citation_ref = _load_reference(skill_path, "references/citation-format.md")
    parts = [
        "You are ResearchMind, a local research assistant.",
        "Answer ONLY from the provided context.",
        # The ellipsis and em dash replace mis-encoded bytes that would
        # otherwise reach the model as garbage characters.
        "Each context block is numbered [1], [2], … — one number per source document.",
        "Cite with those numbers only (e.g. [1]). Use at most a few citations per answer.",
        "Ignore any [n] markers inside source text; never list citation numbers in a row.",
        skill_body,
    ]
    if citation_ref:
        parts.append(citation_ref)
    return "\n\n".join(parts)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def research_answer_user(question: str, context: str) -> str:
    """Build the user message for a research answer.

    Args:
        question: The user's question, inserted verbatim.
        context: The formatted context block, inserted verbatim.

    Returns:
        A prompt containing the context, the question and the answer
        instructions (inline ``[n]`` citations, no References section).
    """
    # The em dash below replaces a mis-encoded character in the instruction text.
    return f"""Context:
{context}

Question: {question}

Write a concise answer with inline [n] citations (one index per source document).
Do not append a References section — it is added automatically.
If context is insufficient, say so."""
|
libs/agent/src/agent/runner.py
CHANGED
|
@@ -3,11 +3,23 @@ from __future__ import annotations
|
|
| 3 |
import json
|
| 4 |
import re
|
| 5 |
from dataclasses import dataclass
|
|
|
|
| 6 |
from typing import Any
|
| 7 |
|
| 8 |
from inference.base import InferenceBackend
|
|
|
|
|
|
|
| 9 |
|
| 10 |
-
from agent.models import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
from agent.preview import outline_to_html, render_slide_images
|
| 12 |
from agent.prompts import (
|
| 13 |
education_outline_repair,
|
|
@@ -21,6 +33,7 @@ from agent.tools_registry import ToolRegistry
|
|
| 21 |
from agent.trace import TraceRecorder
|
| 22 |
|
| 23 |
EDUCATION_PPTX_SKILL = "education-pptx"
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
@dataclass
|
|
@@ -225,3 +238,246 @@ class AgentRunner:
|
|
| 225 |
if start >= 0 and end > start:
|
| 226 |
cleaned = cleaned[start : end + 1]
|
| 227 |
return json.loads(cleaned)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
import json
|
| 4 |
import re
|
| 5 |
from dataclasses import dataclass
|
| 6 |
+
from pathlib import Path
|
| 7 |
from typing import Any
|
| 8 |
|
| 9 |
from inference.base import InferenceBackend
|
| 10 |
+
from researchmind.extract import extract_docx
|
| 11 |
+
from researchmind.ingest import IngestPipeline
|
| 12 |
|
| 13 |
+
from agent.models import (
|
| 14 |
+
Citation,
|
| 15 |
+
EducationPptxInput,
|
| 16 |
+
ResearchChatInput,
|
| 17 |
+
ResearchChatResult,
|
| 18 |
+
ResearchDiscoverResult,
|
| 19 |
+
ResearchIngestResult,
|
| 20 |
+
SlideOutline,
|
| 21 |
+
SlideSpec,
|
| 22 |
+
)
|
| 23 |
from agent.preview import outline_to_html, render_slide_images
|
| 24 |
from agent.prompts import (
|
| 25 |
education_outline_repair,
|
|
|
|
| 33 |
from agent.trace import TraceRecorder
|
| 34 |
|
| 35 |
EDUCATION_PPTX_SKILL = "education-pptx"
|
| 36 |
+
RESEARCH_MIND_SKILL = "research-mind"
|
| 37 |
|
| 38 |
|
| 39 |
@dataclass
|
|
|
|
| 238 |
if start >= 0 and end > start:
|
| 239 |
cleaned = cleaned[start : end + 1]
|
| 240 |
return json.loads(cleaned)
|
| 241 |
+
|
| 242 |
+
def _research_skill(self) -> Any:
    """Look up the skill registered under ``RESEARCH_MIND_SKILL``."""
    skill = self._skills.get(RESEARCH_MIND_SKILL)
    return skill
|
| 244 |
+
|
| 245 |
+
def _ensure_session(
|
| 246 |
+
self,
|
| 247 |
+
store: Any,
|
| 248 |
+
session_id: str | None,
|
| 249 |
+
topic: str = "",
|
| 250 |
+
) -> str:
|
| 251 |
+
if session_id and store.get_session(session_id):
|
| 252 |
+
return session_id
|
| 253 |
+
return store.create_session(topic=topic).id
|
| 254 |
+
|
| 255 |
+
def run_researchmind_discover(
    self,
    *,
    topic: str,
    auto_search: bool,
    session_id: str | None,
    model_key: str,
    backend: InferenceBackend,
) -> ResearchDiscoverResult:
    """Find candidate URLs for ``topic`` and record the run in a trace.

    The ``search_urls`` tool is always called (asking for 8 results). When it
    returns nothing, the ``suggest_urls`` tool asks the LLM and its output is
    filtered down to at most 5 reachable URLs. ``auto_search`` is only written
    to the trace; it does not gate the search here.

    Returns:
        The URLs found, the session id (existing or newly created) and the
        path of the saved trace.
    """
    research_skill = self._research_skill()
    session = self._ensure_session(IngestPipeline().store, session_id, topic=topic)

    recorder = TraceRecorder(
        skill=research_skill.name,
        model=model_key,
        user_input={"topic": topic, "auto_search": auto_search, "phase": "discover"},
    )
    backend.load()

    found = self._tools.get("search_urls").handler(topic, n=8)
    recorder.log_tool(
        "search_urls",
        {"topic": topic, "n": 8, "queries": "google+ddg"},
        json.dumps(found),
    )

    if not found:
        # Fallback: let the LLM propose URLs, then keep only the valid ones.
        suggester = self._tools.get("suggest_urls")
        from researchmind.url_validate import filter_valid_urls

        proposed = suggester.handler(topic, backend)
        found = filter_valid_urls(proposed, check_reachable=True, max_results=5)
        recorder.log_tool("suggest_urls", {"topic": topic, "fallback": True}, json.dumps(found))

    saved_trace = str(recorder.save())
    return ResearchDiscoverResult(
        suggested_urls=found,
        session_id=session,
        trace_path=saved_trace,
    )
|
| 297 |
+
|
| 298 |
+
def run_researchmind_ingest(
    self,
    *,
    topic: str | None,
    urls: list[str],
    files: list[Path],
    auto_search: bool,
    session_id: str | None,
    model_key: str,
    backend: InferenceBackend,
) -> ResearchIngestResult:
    """Ingest URLs and local files into a ResearchMind session.

    URLs are validated with ``check_reachable=False``, scraped and indexed.
    Files are extracted by suffix: ``.pdf`` via the ``scrape_pdf`` tool,
    ``.docx`` via ``extract_docx``, anything else is read as UTF-8 text with
    undecodable bytes replaced. When ``auto_search`` is set, a topic is given
    and neither URLs nor files were supplied, URLs are first discovered with
    ``run_researchmind_discover``.

    Errors for an individual source are caught and recorded in ``failures``;
    they do not abort the run. This applies to files as well as URLs, so the
    "failed" count in the summary message covers both.

    Returns:
        A ``ResearchIngestResult`` with the ingested/skipped labels, the
        failures, the session's document count, the store's total chunk
        count, the trace path and a summary message.
    """
    skill = self._research_skill()
    pipeline = IngestPipeline()
    store = pipeline.store
    sid = self._ensure_session(store, session_id, topic=topic or "")

    trace = TraceRecorder(
        skill=skill.name,
        model=model_key,
        user_input={
            "topic": topic,
            "urls": urls,
            "files": [str(f) for f in files],
            "auto_search": auto_search,
            "session_id": sid,
        },
    )
    backend.load()

    targets = [u.strip() for u in urls if u.strip()]
    if auto_search and topic and not targets and not files:
        discover = self.run_researchmind_discover(
            topic=topic,
            auto_search=True,
            session_id=sid,
            model_key=model_key,
            backend=backend,
        )
        targets = discover.suggested_urls

    from agent.models import IngestFailure

    ingested: list[str] = []
    skipped: list[str] = []
    failures: list[IngestFailure] = []

    scrape_web = self._tools.get("scrape_web")
    extract_index = self._tools.get("extract_and_index")

    # Imported at call time so the name is resolved on every run.
    from researchmind.url_validate import validate_url

    for url in targets:
        ok, reason, normalized = validate_url(url, check_reachable=False)
        if not ok:
            trace.log_note(f"Skipped invalid URL {url}", reason=reason, stage="validate")
            failures.append(IngestFailure(url=url, reason=reason, stage="validate"))
            continue
        try:
            doc = scrape_web.handler(normalized)
            if not (doc.text or "").strip():
                msg = "empty content after scrape"
                trace.log_note(f"Ingest failed for {url}", error=msg, stage="scrape")
                failures.append(IngestFailure(url=url, reason=msg, stage="scrape"))
                continue
            doc_id, is_new = extract_index.handler(doc, session_id=sid)
            trace.log_tool("scrape_web", {"url": url}, doc.title)
            trace.log_tool(
                "extract_and_index",
                {"uri": doc.uri},
                f"{doc_id} new={is_new}",
            )
            (ingested if is_new else skipped).append(url)
        except Exception as exc:  # noqa: BLE001
            trace.log_note(f"Ingest failed for {url}", error=str(exc), stage="ingest")
            failures.append(IngestFailure(url=url, reason=str(exc), stage="ingest"))

    for file_path in files:
        path = Path(file_path)
        try:
            if path.suffix.lower() == ".pdf":
                doc = self._tools.get("scrape_pdf").handler(path)
            elif path.suffix.lower() == ".docx":
                doc = extract_docx(path)
            else:
                text = path.read_text(encoding="utf-8", errors="replace")
                from researchmind.extract import ExtractedDocument

                doc = ExtractedDocument(
                    source_type="file",
                    uri=str(path.resolve()),
                    title=path.stem,
                    text=text,
                )
            doc_id, is_new = extract_index.handler(doc, session_id=sid)
            trace.log_tool("extract_and_index", {"file": str(path)}, f"{doc_id} new={is_new}")
            label = path.name
            (ingested if is_new else skipped).append(label)
        except Exception as exc:  # noqa: BLE001
            # A failed file is a failure, not a skip: record it like a failed URL
            # so the summary counts and the ``failures`` list stay accurate.
            trace.log_note(f"Ingest failed for {path}", error=str(exc), stage="ingest")
            failures.append(IngestFailure(url=str(path), reason=str(exc), stage="ingest"))

    doc_count = len(store.list_documents(session_id=sid))
    chunk_count = store.count_chunks()
    fail_n = len(failures)
    message = (
        f"Ingested {len(ingested)} source(s), skipped/duplicate {len(skipped)}, "
        f"failed {fail_n}. Session `{sid}` has {doc_count} document(s); "
        f"{chunk_count} total chunks."
    )
    trace.log_note(message, failures=[f.model_dump() for f in failures])
    trace_path = str(trace.save())

    return ResearchIngestResult(
        session_id=sid,
        ingested=ingested,
        skipped=skipped,
        failures=failures,
        doc_count=doc_count,
        chunk_count=chunk_count,
        trace_path=trace_path,
        message=message,
    )
|
| 420 |
+
|
| 421 |
+
def run_researchmind_chat(
    self,
    *,
    question: str,
    session_id: str,
    model_key: str,
    backend: InferenceBackend,
    doc_ids: list[str] | None = None,
) -> ResearchChatResult:
    """Answer ``question`` through the ``research_answer`` tool.

    The question is stripped of surrounding whitespace before use. The tool's
    reference block, when non-empty, is appended to the answer after a blank
    line. The turn is recorded in a trace that is saved before returning.

    Returns:
        The combined answer, the citations converted to ``agent.models``
        ``Citation`` objects, the reference markdown, the session id and the
        trace path.
    """
    research_skill = self._research_skill()
    request = ResearchChatInput(
        question=question.strip(),
        session_id=session_id,
        doc_ids=doc_ids or [],
    )

    recorder = TraceRecorder(
        skill=research_skill.name,
        model=model_key,
        user_input=request.model_dump(),
    )
    backend.load()

    answer_handler = self._tools.get("research_answer").handler
    raw_answer, citations, refs = answer_handler(
        request.question,
        backend,
        skill_body=research_skill.body,
        skill_path=research_skill.path,
        session_id=request.session_id,
        # An empty filter is passed on as None.
        doc_ids=request.doc_ids or None,
    )
    recorder.log_llm(request.question, raw_answer)
    recorder.log_note(
        "citations",
        count=len(citations),
        session_id=request.session_id,
        doc_ids=request.doc_ids,
    )

    answer_text = f"{raw_answer}\n\n{refs}" if refs else raw_answer
    saved_trace = str(recorder.save())

    converted = []
    for item in citations:
        converted.append(
            Citation(
                index=item.index,
                chunk_id=item.chunk_id,
                doc_title=item.doc_title,
                doc_uri=item.doc_uri,
                excerpt=item.excerpt,
            )
        )

    return ResearchChatResult(
        answer=answer_text,
        citations=converted,
        references_markdown=refs,
        session_id=request.session_id,
        trace_path=saved_trace,
    )
|
libs/agent/src/agent/skills.py
CHANGED
|
@@ -15,6 +15,7 @@ class Skill:
|
|
| 15 |
task: str
|
| 16 |
tools: list[str]
|
| 17 |
model_hints: list[str]
|
|
|
|
| 18 |
body: str
|
| 19 |
path: Path
|
| 20 |
|
|
@@ -44,12 +45,16 @@ def _parse_skill_md(path: Path) -> Skill:
|
|
| 44 |
meta: dict[str, Any] = yaml.safe_load(match.group(1)) or {}
|
| 45 |
body = match.group(2).strip()
|
| 46 |
|
|
|
|
|
|
|
|
|
|
| 47 |
return Skill(
|
| 48 |
name=str(meta.get("name", path.parent.name)),
|
| 49 |
description=str(meta.get("description", "")),
|
| 50 |
task=str(meta.get("task", "")),
|
| 51 |
tools=[str(t) for t in meta.get("tools", [])],
|
| 52 |
model_hints=[str(m) for m in meta.get("model_hints", [])],
|
|
|
|
| 53 |
body=body,
|
| 54 |
path=path,
|
| 55 |
)
|
|
|
|
| 15 |
task: str
|
| 16 |
tools: list[str]
|
| 17 |
model_hints: list[str]
|
| 18 |
+
flags: dict[str, Any]
|
| 19 |
body: str
|
| 20 |
path: Path
|
| 21 |
|
|
|
|
| 45 |
meta: dict[str, Any] = yaml.safe_load(match.group(1)) or {}
|
| 46 |
body = match.group(2).strip()
|
| 47 |
|
| 48 |
+
raw_flags = meta.get("flags") or {}
|
| 49 |
+
flags = {str(k): v for k, v in raw_flags.items()} if isinstance(raw_flags, dict) else {}
|
| 50 |
+
|
| 51 |
return Skill(
|
| 52 |
name=str(meta.get("name", path.parent.name)),
|
| 53 |
description=str(meta.get("description", "")),
|
| 54 |
task=str(meta.get("task", "")),
|
| 55 |
tools=[str(t) for t in meta.get("tools", [])],
|
| 56 |
model_hints=[str(m) for m in meta.get("model_hints", [])],
|
| 57 |
+
flags=flags,
|
| 58 |
body=body,
|
| 59 |
path=path,
|
| 60 |
)
|
libs/agent/src/agent/tools/research_tools.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Any
|
| 5 |
+
|
| 6 |
+
from researchmind.citations import Citation, clean_model_answer, format_context_block, format_references
|
| 7 |
+
from researchmind.config import get_config
|
| 8 |
+
from researchmind.extract import ExtractedDocument
|
| 9 |
+
from researchmind.ingest import IngestPipeline
|
| 10 |
+
from researchmind.retrieve import retrieve
|
| 11 |
+
from researchmind.scrape_pdf import extract_pdf
|
| 12 |
+
from researchmind.scrape_web import fetch_and_extract
|
| 13 |
+
from researchmind.search_urls import search_urls
|
| 14 |
+
from researchmind.store import MemRAGStore
|
| 15 |
+
from researchmind.url_suggest import suggest_urls as llm_suggest_urls
|
| 16 |
+
|
| 17 |
+
from agent.research_prompts import research_answer_system, research_answer_user
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def get_store() -> MemRAGStore:
    """Return the store owned by a newly constructed ``IngestPipeline``."""
    pipeline = IngestPipeline()
    return pipeline.store
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def tool_suggest_urls(topic: str, backend: Any) -> list[str]:
    """Tool wrapper: delegate to ``researchmind.url_suggest.suggest_urls``."""
    suggestions = llm_suggest_urls(topic, backend)
    return suggestions
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def tool_scrape_web(url: str) -> ExtractedDocument:
    """Tool wrapper: delegate to ``fetch_and_extract`` for ``url``."""
    document = fetch_and_extract(url)
    return document
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def tool_scrape_pdf(path: Path) -> ExtractedDocument:
    """Tool wrapper: delegate to ``extract_pdf`` for the file at ``path``."""
    document = extract_pdf(path)
    return document
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def tool_extract_and_index(
    doc: ExtractedDocument,
    *,
    session_id: str | None = None,
) -> tuple[str, bool]:
    """Tool wrapper: ingest ``doc`` through a newly constructed ``IngestPipeline``.

    Returns whatever ``ingest_document`` returns; the runner unpacks it as
    ``(doc_id, is_new)``.
    """
    return IngestPipeline().ingest_document(doc, session_id=session_id)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def tool_research_answer(
    question: str,
    backend: Any,
    *,
    skill_body: str,
    skill_path: Path,
    session_id: str | None = None,
    doc_ids: list[str] | None = None,
) -> tuple[str, list[Citation], str]:
    """Retrieve context for ``question`` and ask the backend for a cited answer.

    Retrieval is scoped to ``doc_ids`` when given, otherwise to
    ``session_id`` when given, otherwise unscoped. When nothing is retrieved,
    a hint message is returned with no citations and the backend is not
    called. On success, the question and the cleaned answer are stored as
    session messages if ``session_id`` is set.

    Returns:
        ``(answer, citations, references)`` where ``references`` is the
        output of ``format_references``.
    """
    config = get_config()
    store = get_store()

    if doc_ids:
        # An explicit document filter replaces the session scope.
        session_scope, doc_scope = None, doc_ids
    else:
        session_scope, doc_scope = (session_id or None), None

    chunks = retrieve(
        question,
        store,
        config=config,
        session_id=session_scope,
        doc_ids=doc_scope,
    )

    if not chunks:
        if doc_ids:
            return "No chunks for the selected document(s). Try other sources or re-ingest.", [], ""
        if session_id:
            return "No indexed sources in this session yet. Ingest URLs or files first.", [], ""
        return "No indexed sources yet. Ingest URLs or documents first.", [], ""

    context, citations = format_context_block(chunks)
    prompt = [
        {"role": "system", "content": research_answer_system(skill_body, skill_path)},
        {"role": "user", "content": research_answer_user(question, context)},
    ]
    reply = backend.chat(prompt, max_tokens=1024, temperature=0.3)
    answer = clean_model_answer(reply)
    refs = format_references(citations)

    if session_id:
        for role, content in (("user", question), ("assistant", answer)):
            # A separate id list is built for each stored message.
            store.add_message(session_id, role, content, [c.chunk_id for c in citations])

    return answer, citations, refs
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def tool_search_urls(topic: str, *, n: int = 5, check_reachable: bool = True) -> list[str]:
    """Tool wrapper: delegate to ``search_urls`` with the same options."""
    results = search_urls(topic, n=n, check_reachable=check_reachable)
    return results
|
libs/agent/src/agent/tools_registry.py
CHANGED
|
@@ -6,7 +6,14 @@ from typing import Any
|
|
| 6 |
|
| 7 |
from agent.models import SlideOutline
|
| 8 |
from agent.tools.pptx import create_pptx
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
@dataclass(frozen=True)
|
| 12 |
class ToolSpec:
|
|
@@ -23,6 +30,36 @@ class ToolRegistry:
|
|
| 23 |
"Create a PowerPoint file from a validated SlideOutline",
|
| 24 |
self._handle_create_pptx,
|
| 25 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
def register(self, name: str, description: str, handler: Callable[..., Any]) -> None:
|
| 28 |
self._tools[name] = ToolSpec(name=name, description=description, handler=handler)
|
|
|
|
| 6 |
|
| 7 |
from agent.models import SlideOutline
|
| 8 |
from agent.tools.pptx import create_pptx
|
| 9 |
+
from agent.tools.research_tools import (
|
| 10 |
+
tool_extract_and_index,
|
| 11 |
+
tool_research_answer,
|
| 12 |
+
tool_scrape_pdf,
|
| 13 |
+
tool_scrape_web,
|
| 14 |
+
tool_search_urls,
|
| 15 |
+
tool_suggest_urls,
|
| 16 |
+
)
|
| 17 |
|
| 18 |
@dataclass(frozen=True)
|
| 19 |
class ToolSpec:
|
|
|
|
| 30 |
"Create a PowerPoint file from a validated SlideOutline",
|
| 31 |
self._handle_create_pptx,
|
| 32 |
)
|
| 33 |
+
self.register(
|
| 34 |
+
"suggest_urls",
|
| 35 |
+
"Suggest research URLs for a topic using the local LLM",
|
| 36 |
+
tool_suggest_urls,
|
| 37 |
+
)
|
| 38 |
+
self.register(
|
| 39 |
+
"scrape_web",
|
| 40 |
+
"Fetch and extract text from a web URL",
|
| 41 |
+
tool_scrape_web,
|
| 42 |
+
)
|
| 43 |
+
self.register(
|
| 44 |
+
"scrape_pdf",
|
| 45 |
+
"Extract text from a PDF file path",
|
| 46 |
+
tool_scrape_pdf,
|
| 47 |
+
)
|
| 48 |
+
self.register(
|
| 49 |
+
"extract_and_index",
|
| 50 |
+
"Chunk, embed, and index an ExtractedDocument into MemRAG",
|
| 51 |
+
tool_extract_and_index,
|
| 52 |
+
)
|
| 53 |
+
self.register(
|
| 54 |
+
"research_answer",
|
| 55 |
+
"Answer a question with RAG citations from MemRAG",
|
| 56 |
+
tool_research_answer,
|
| 57 |
+
)
|
| 58 |
+
self.register(
|
| 59 |
+
"search_urls",
|
| 60 |
+
"Web search for URLs on a topic (DuckDuckGo)",
|
| 61 |
+
tool_search_urls,
|
| 62 |
+
)
|
| 63 |
|
| 64 |
def register(self, name: str, description: str, handler: Callable[..., Any]) -> None:
|
| 65 |
self._tools[name] = ToolSpec(name=name, description=description, handler=handler)
|
libs/agent/tests/test_research_runner.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
import numpy as np
|
| 6 |
+
import pytest
|
| 7 |
+
|
| 8 |
+
from agent.runner import AgentRunner
|
| 9 |
+
from researchmind.config import ResearchMindConfig
|
| 10 |
+
from researchmind.extract import ExtractedDocument
|
| 11 |
+
from researchmind.store import MemRAGStore
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class MockBackend:
    """Test double for an inference backend; returns canned strings."""

    def load(self) -> None:
        """Do nothing; there is no model to load."""
        return None

    def chat(self, messages, *, max_tokens=512, temperature=0.7):
        """Return a canned reply chosen from the last message's content."""
        user = messages[-1]["content"]
        # A prompt containing "Topic:" gets a JSON list of URLs
        # (presumably the URL-suggestion prompt — confirm against url_suggest).
        if "Topic:" in user:
            return '["https://example.com/a", "https://example.com/b"]'
        return "Plants use photosynthesis [1]."

    def generate(self, prompt, *, max_tokens=512, temperature=0.7):
        """Wrap ``prompt`` as a single user message and delegate to ``chat``."""
        return self.chat([{"role": "user", "content": prompt}], max_tokens=max_tokens)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
@pytest.fixture
def research_env(tmp_path, monkeypatch):
    """Point ResearchMind at a temp data dir and stub out embedding and network calls.

    Returns the ``ResearchMindConfig`` whose ``data_dir`` is exported through
    ``RESEARCHMIND_DATA_DIR``.
    """
    config = ResearchMindConfig(
        data_dir=tmp_path / "rm",
        embed_model="test",
        auto_search=False,
        top_k=2,
        max_context_chunks=8,
        chunk_size=50,
        chunk_overlap=10,
    )
    monkeypatch.setenv("RESEARCHMIND_DATA_DIR", str(config.data_dir))

    def fake_embed(texts, *, model_name):
        # Every text maps to the same unit vector.
        rows = [np.array([1.0, 0.0, 0.0], dtype=np.float32) for _ in texts]
        if not rows:
            return np.zeros((0, 3), dtype=np.float32)
        return np.stack(rows)

    def fake_scrape(url: str):
        return ExtractedDocument(
            source_type="web",
            uri=url,
            title="Example",
            text="Photosynthesis converts light to energy in plants.",
        )

    def fake_search(topic, *, n=5, check_reachable=True):
        return [f"https://example.com/{topic.replace(' ', '-')}"]

    def fake_validate(url, *, check_reachable=True):
        if url.startswith("http"):
            normalized = url
        else:
            normalized = f"https://{url}"
        return True, "ok", normalized

    replacements = (
        ("researchmind.ingest.embed_texts", fake_embed),
        ("researchmind.retrieve.embed_texts", fake_embed),
        ("agent.tools.research_tools.fetch_and_extract", fake_scrape),
        ("agent.tools.research_tools.search_urls", fake_search),
        ("researchmind.url_validate.validate_url", fake_validate),
    )
    for target, stub in replacements:
        monkeypatch.setattr(target, stub)
    return config
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def test_discover_urls(research_env):
    """Discovery returns at least one URL and a session id for a new session."""
    runner = AgentRunner()
    result = runner.run_researchmind_discover(
        topic="photosynthesis",
        auto_search=False,
        session_id=None,
        model_key="test",
        backend=MockBackend(),
    )
    assert len(result.suggested_urls) >= 1
    assert result.session_id
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def test_ingest_and_chat(research_env):
    """Ingest one URL, then chat in the same session and check the answer."""
    runner = AgentRunner()
    ingest = runner.run_researchmind_ingest(
        topic=None,
        urls=["https://example.com/a"],
        files=[],
        auto_search=False,
        session_id=None,
        model_key="test",
        backend=MockBackend(),
    )
    assert ingest.doc_count >= 1
    assert ingest.chunk_count >= 1

    # Reuse the session created by the ingest call.
    chat = runner.run_researchmind_chat(
        question="How do plants make energy?",
        session_id=ingest.session_id,
        model_key="test",
        backend=MockBackend(),
    )
    assert "photosynthesis" in chat.answer.lower() or "[1]" in chat.answer
    assert chat.session_id == ingest.session_id
|
libs/inference/src/inference/response_clean.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
|
| 5 |
+
# Reasoning tag literals, assembled from pieces
# (presumably so the literal tags never appear verbatim in this file — confirm).
_RT_OPEN = "<" + "redacted_thinking" + ">"
_RT_CLOSE = "</" + "redacted_thinking" + ">"
_THINK_OPEN = "<" + "think" + ">"
_THINK_CLOSE = "</" + "think" + ">"

# Matches a complete open/close reasoning block of any of the three tag kinds.
# Non-greedy, case-insensitive, and "." spans newlines.
_THINK_BLOCKS = re.compile(
    "|".join(
        (
            re.escape(_RT_OPEN) + r".*?" + re.escape(_RT_CLOSE),
            re.escape(_THINK_OPEN) + r".*?" + re.escape(_THINK_CLOSE),
            r"<thinking>.*?</thinking>",
        )
    ),
    re.DOTALL | re.IGNORECASE,
)
# Matches a broken opening tag ("think>" without the "<") at the very start of the text.
_MALFORMED_THINK_OPEN = re.compile(r"^think>\s*", re.IGNORECASE)
# Markers after which the final answer is expected to begin; tried in this order.
_ANSWER_SPLITS = [
    re.compile(r"(?:Let's draft:|Draft:)\s*", re.IGNORECASE),
    re.compile(r"\nSummary:\s*", re.IGNORECASE),
    re.compile(r"\nAnswer:\s*", re.IGNORECASE),
    re.compile(r"\n\n(?:In summary|To summarize)[,:]\s*", re.IGNORECASE),
]
# Paragraph openers treated as the start of trailing meta-commentary.
_META_TAIL = re.compile(
    r"\n\n(?:Now,|We need|Also,|But we|However,|The instruction|So we|"
    r"That means|We must|We should|We have|We can)\b",
    re.IGNORECASE,
)
# Lower-case prefixes that mark a reply as starting with reasoning rather than an answer.
_REASONING_OPENERS = (
    "we need to",
    "first,",
    "the user",
    "let me",
    "okay,",
    "now, let",
    "i need to",
)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def _normalize_extracted(text: str) -> str:
    """Trim whitespace and drop a leading ``Summary:`` and then ``Answer:`` label.

    The labels are matched case-insensitively, only at the start of the text,
    and in that order.
    """
    result = text.strip()
    for label in (r"^Summary:\s*", r"^Answer:\s*"):
        result = re.sub(label, "", result, flags=re.IGNORECASE)
    return result.strip()
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def _extract_answer_from_reasoning(text: str) -> str | None:
    """Pull the final answer out of text that mixes reasoning and answer.

    Each pattern in ``_ANSWER_SPLITS`` is tried in order. The text after the
    first match of a pattern is normalized and cut at the first ``_META_TAIL``
    match. The first candidate of at least 40 characters is returned; ``None``
    when no pattern yields one.
    """
    for splitter in _ANSWER_SPLITS:
        if (hit := splitter.search(text)) is None:
            continue
        tail = _normalize_extracted(text[hit.end() :])
        answer, *_ = _META_TAIL.split(tail, maxsplit=1)
        answer = answer.strip()
        # Shorter candidates are rejected and the next pattern is tried.
        if len(answer) >= 40:
            return answer
    return None
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def looks_like_reasoning_only(text: str) -> bool:
    """Return True when the lower-cased text starts with a ``_REASONING_OPENERS`` prefix."""
    head = text[:240].lower()
    # str.startswith accepts the tuple of openers directly.
    return head.startswith(_REASONING_OPENERS)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def strip_reasoning_output(text: str) -> str:
    """Remove model chain-of-thought / thinking traces from user-visible replies.

    Complete reasoning blocks are deleted first. If the remainder starts with
    a malformed ``think>`` prefix, the prefix is dropped and an answer is
    extracted from what follows when possible. Text that still opens like
    reasoning gets one more extraction attempt. When no answer can be
    extracted, the remaining text is returned as is.
    """
    visible = text.strip()
    if not visible:
        return ""

    visible = _THINK_BLOCKS.sub("", visible).strip()

    prefix = _MALFORMED_THINK_OPEN.match(visible)
    if prefix:
        # The pattern is anchored at the start, so slicing off the match
        # equals substituting it once.
        visible = visible[prefix.end() :].strip()
        answer = _extract_answer_from_reasoning(visible)
        if answer:
            return answer

    if looks_like_reasoning_only(visible):
        answer = _extract_answer_from_reasoning(visible)
        if answer:
            return answer

    return visible
|
libs/inference/tests/test_response_clean.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from inference.response_clean import strip_reasoning_output
|
| 4 |
+
|
| 5 |
+
_RT_OPEN = "<" + "redacted_thinking" + ">"
|
| 6 |
+
_RT_CLOSE = "</" + "redacted_thinking" + ">"
|
| 7 |
+
_THINK_OPEN = "<" + "think" + ">"
|
| 8 |
+
_THINK_CLOSE = "</" + "think" + ">"
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def test_strips_redacted_thinking_block():
    """A well-formed redacted-thinking block is removed, leaving the answer."""
    answer = "The capital of France is Paris."
    wrapped = f"{_RT_OPEN}\nplanning...\n{_RT_CLOSE}\n\n{answer}"
    assert strip_reasoning_output(wrapped) == answer
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def test_strips_think_block():
    """A well-formed think block is removed, leaving the cited answer."""
    answer = "Agents use memory [1]."
    wrapped = f"{_THINK_OPEN}\nplanning...\n{_THINK_CLOSE}\n\n{answer}"
    assert strip_reasoning_output(wrapped) == answer
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def test_strips_malformed_think_prefix_and_extracts_summary():
    """A bare ``think>`` prefix is dropped and the text after ``Summary:`` is kept."""
    paragraphs = [
        "think> We need to summarize the document. First, identify sources.",
        "Let's draft:",
        "Summary: This review covers AI agent applications, evaluation, and future work [1].",
    ]
    cleaned = strip_reasoning_output("\n\n".join(paragraphs))
    assert cleaned.startswith("This review covers")
    assert "We need to summarize" not in cleaned
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def test_preserves_normal_answer():
    """An answer with no reasoning markers passes through unchanged."""
    plain = "AI agents combine perception, planning, and action [1]."
    assert strip_reasoning_output(plain) == plain
|
libs/researchmind/README.md
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# researchmind
|
| 2 |
+
|
| 3 |
+
Local ingest, MemRAG persistence, and retrieval for the ResearchMind agent.
|
| 4 |
+
|
| 5 |
+
- Scrape web (httpx + trafilatura), PDF (pypdf), DOCX (python-docx)
|
| 6 |
+
- Chunk, embed (sentence-transformers), store in SQLite
|
| 7 |
+
- Top-k retrieval with graph neighbor expansion and citation formatting
|
| 8 |
+
|
| 9 |
+
Set `RESEARCHMIND_DATA_DIR` (default `outputs/researchmind`) for the memory database and raw snapshots.
|
libs/researchmind/pyproject.toml
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "researchmind"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "Local scraper + RAG + MemRAG store for ResearchMind agent"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
authors = [
|
| 7 |
+
{ name = "MSGhais", email = "msghais135@gmail.com" }
|
| 8 |
+
]
|
| 9 |
+
requires-python = ">=3.12"
|
| 10 |
+
dependencies = [
|
| 11 |
+
"inference",
|
| 12 |
+
"ddgs>=9.0.0",
|
| 13 |
+
"googlesearch-python>=1.3.0",
|
| 14 |
+
"httpx>=0.28.0",
|
| 15 |
+
"numpy>=2.0.0",
|
| 16 |
+
"pydantic>=2.0.0",
|
| 17 |
+
"pypdf>=5.0.0",
|
| 18 |
+
"python-docx>=1.1.0",
|
| 19 |
+
"sentence-transformers>=3.0.0",
|
| 20 |
+
"trafilatura>=2.0.0",
|
| 21 |
+
]
|
| 22 |
+
|
| 23 |
+
[build-system]
|
| 24 |
+
requires = ["uv_build>=0.8.13,<0.9.0"]
|
| 25 |
+
build-backend = "uv_build"
|
libs/researchmind/src/researchmind/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from researchmind.config import get_config
|
| 2 |
+
from researchmind.extract import ExtractedDocument
|
| 3 |
+
from researchmind.ingest import IngestPipeline
|
| 4 |
+
from researchmind.store import MemRAGStore
|
| 5 |
+
|
| 6 |
+
__all__ = [
|
| 7 |
+
"ExtractedDocument",
|
| 8 |
+
"IngestPipeline",
|
| 9 |
+
"MemRAGStore",
|
| 10 |
+
"get_config",
|
| 11 |
+
]
|
libs/researchmind/src/researchmind/chunking.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import hashlib
|
| 4 |
+
import re
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
@dataclass(frozen=True)
class TextChunk:
    """One word-window slice of a document, as produced by ``chunk_text``."""

    # Identifier; ``chunk_text`` builds it as "<doc_id>_<ordinal>_<digest>".
    chunk_id: str
    # Zero-based position of this chunk within its document.
    ordinal: int
    # The chunk's words joined by single spaces.
    text: str
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def _approx_tokens(text: str) -> int:
|
| 16 |
+
return len(re.findall(r"\S+", text))
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def chunk_text(
    text: str,
    *,
    doc_id: str,
    chunk_size: int = 512,
    chunk_overlap: int = 128,
) -> list[TextChunk]:
    """Split *text* into overlapping windows of whitespace-separated words.

    Args:
        text: Source text; it is split on whitespace.
        doc_id: Prefix used in each chunk id and mixed into its digest.
        chunk_size: Maximum number of words per chunk. A non-positive value
            yields no chunks.
        chunk_overlap: Words shared between consecutive chunks. Negative
            values are treated as 0 so that no words are skipped between
            windows; values >= ``chunk_size`` advance one word at a time.

    Returns:
        Chunks in document order, with ids of the form
        ``"<doc_id>_<ordinal>_<first 16 hex chars of a SHA-256 digest>"``.
        Empty when *text* contains no words.
    """
    words = text.split()
    if not words or chunk_size <= 0:
        # Nothing can be emitted; return right away instead of walking
        # every word only to produce empty windows.
        return []

    # A negative overlap would make the step larger than the window and
    # silently drop the words that fall between two windows.
    overlap = max(0, chunk_overlap)
    step = max(1, chunk_size - overlap)

    chunks: list[TextChunk] = []
    for ordinal, start in enumerate(range(0, len(words), step)):
        end = start + chunk_size
        piece = " ".join(words[start:end])
        digest = hashlib.sha256(f"{doc_id}:{ordinal}:{piece}".encode()).hexdigest()[:16]
        chunks.append(
            TextChunk(chunk_id=f"{doc_id}_{ordinal}_{digest}", ordinal=ordinal, text=piece)
        )
        if end >= len(words):
            # This window reached the end of the text; a further window
            # would only repeat the overlap.
            break

    return chunks
|
libs/researchmind/src/researchmind/citations.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
from dataclasses import dataclass
|
| 5 |
+
|
| 6 |
+
from inference.response_clean import looks_like_reasoning_only, strip_reasoning_output
|
| 7 |
+
|
| 8 |
+
from researchmind.store import StoredChunk
|
| 9 |
+
|
| 10 |
+
_EXCERPT_LEN = 400
|
| 11 |
+
_PASSAGE_LEN = 700
|
| 12 |
+
_CITATION_RUN = re.compile(r"(?:\[\d{1,4}\]\s*){3,}")
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@dataclass(frozen=True)
class Citation:
    """One numbered source reference, as built by ``format_context_block``."""

    # 1-based citation number used as the "[n]" marker.
    index: int
    # Id of the first retrieved chunk belonging to the cited document.
    chunk_id: str
    # Title of the cited document.
    doc_title: str
    # URI of the cited document; citations are grouped by this value.
    doc_uri: str
    # Leading part of the document's merged passages, truncated for display.
    excerpt: str
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def _clean_passage(text: str) -> str:
    """Collapse long runs of in-text [n] markers from scraped papers.

    Every ``_CITATION_RUN`` match is replaced by a single ``[…]`` marker,
    all whitespace runs are collapsed to one space, and the result is cut
    to ``_PASSAGE_LEN`` characters with a trailing ellipsis when longer.
    """
    # The ellipsis literals were mis-encoded ("β¦") in this view of the file;
    # they are restored to the intended U+2026 character.
    cleaned = _CITATION_RUN.sub("[…] ", text)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    if len(cleaned) > _PASSAGE_LEN:
        return cleaned[:_PASSAGE_LEN] + "…"
    return cleaned
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def format_context_block(chunks: list[StoredChunk]) -> tuple[str, list[Citation]]:
    """Build LLM context with one citation index per source document.

    Chunks are grouped by ``doc_uri`` in order of first appearance; the
    title of the first chunk seen for a URI is used for the whole group.

    Returns:
        ``(context, citations)`` where *context* is the per-document blocks
        joined by a ``---`` separator line and *citations* holds one entry
        per document, numbered from 1.
    """
    # dicts preserve insertion order, so documents keep first-seen order.
    by_uri: dict[str, tuple[str, list[StoredChunk]]] = {}
    for chunk in chunks:
        entry = by_uri.get(chunk.doc_uri)
        if entry is None:
            by_uri[chunk.doc_uri] = (chunk.doc_title, [chunk])
        else:
            entry[1].append(chunk)

    citations: list[Citation] = []
    blocks: list[str] = []
    for number, (uri, (title, members)) in enumerate(by_uri.items(), start=1):
        # Chunks whose text is blank contribute no passage.
        merged = "\n\n".join(_clean_passage(m.text) for m in members if m.text.strip())
        suffix = "..." if len(merged) > _EXCERPT_LEN else ""
        citations.append(
            Citation(
                index=number,
                chunk_id=members[0].id,
                doc_title=title,
                doc_uri=uri,
                excerpt=merged[:_EXCERPT_LEN] + suffix,
            )
        )
        blocks.append(f"[{number}] **{title}**\n{uri}\n\n{merged}")

    return "\n\n---\n\n".join(blocks), citations
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def format_references(citations: list[Citation]) -> str:
    """Render *citations* as a Markdown reference list.

    Returns an empty string when there are no citations; otherwise a
    ``**References**`` heading followed by one ``- [n] title — uri`` bullet
    per citation, joined by newlines.
    """
    if not citations:
        return ""
    # The separator was mis-encoded ("β") in this view of the file; an em
    # dash is assumed to be the intended character. TODO confirm against
    # the original source (it may have been an en dash).
    bullets = [f"- [{c.index}] {c.doc_title} — {c.doc_uri}" for c in citations]
    return "\n".join(["**References**", *bullets])
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def clean_model_answer(answer: str) -> str:
    """Remove thinking traces, duplicate references, and citation spam from model output.

    After reasoning traces are stripped, everything from a model-written
    references section onwards is cut, runs matched by ``_CITATION_RUN``
    are deleted, and runs of three or more newlines are collapsed to two.
    If nothing usable remains, or the remainder still looks like reasoning,
    a fixed fallback message is returned instead.
    """
    text = strip_reasoning_output(answer)

    # Cut at the bold heading first, then at a plain "References" line.
    for heading in ("**References**", "\nReferences\n"):
        if heading in text:
            text = text.split(heading, maxsplit=1)[0].rstrip()

    text = _CITATION_RUN.sub("", text)
    text = re.sub(r"\n{3,}", "\n\n", text).strip()

    if text and not looks_like_reasoning_only(text):
        return text
    return (
        "The model returned planning text without a final answer. "
        "Try asking again or switch to a non-reasoning model preset."
    )
|
libs/researchmind/src/researchmind/config.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
from dataclasses import dataclass
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
@dataclass(frozen=True)
class ResearchMindConfig:
    """Immutable settings for ResearchMind; ``get_config`` fills them from the environment."""

    # Base data directory (RESEARCHMIND_DATA_DIR).
    data_dir: Path
    # Embedding model name (RESEARCHMIND_EMBED_MODEL).
    embed_model: str
    # Auto-search flag (RESEARCHMIND_AUTO_SEARCH).
    auto_search: bool
    # Number of top-scoring chunks to consider (RESEARCHMIND_TOP_K).
    top_k: int
    # Upper bound on chunks returned as context (RESEARCHMIND_MAX_CONTEXT_CHUNKS).
    max_context_chunks: int
    # Chunk size (RESEARCHMIND_CHUNK_SIZE).
    chunk_size: int
    # Chunk overlap (RESEARCHMIND_CHUNK_OVERLAP).
    chunk_overlap: int
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def get_config() -> ResearchMindConfig:
    """Build a ``ResearchMindConfig`` from ``RESEARCHMIND_*`` environment variables.

    Unset variables fall back to the defaults written below. Integer
    settings are parsed with ``int()``, so a non-numeric value raises
    ``ValueError``. The auto-search flag is true only for ``1``, ``true``
    or ``yes`` (case-insensitive).
    """
    env = os.environ
    raw_dir = env.get("RESEARCHMIND_DATA_DIR", "outputs/researchmind")
    auto_flag = env.get("RESEARCHMIND_AUTO_SEARCH", "false").lower()
    return ResearchMindConfig(
        data_dir=Path(raw_dir).expanduser(),
        embed_model=env.get("RESEARCHMIND_EMBED_MODEL", "all-MiniLM-L6-v2"),
        auto_search=auto_flag in ("1", "true", "yes"),
        top_k=int(env.get("RESEARCHMIND_TOP_K", "5")),
        max_context_chunks=int(env.get("RESEARCHMIND_MAX_CONTEXT_CHUNKS", "8")),
        chunk_size=int(env.get("RESEARCHMIND_CHUNK_SIZE", "512")),
        chunk_overlap=int(env.get("RESEARCHMIND_CHUNK_OVERLAP", "128")),
    )
|
libs/researchmind/src/researchmind/embeddings.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
_embedder = None
|
| 6 |
+
_embedder_model_name: str | None = None
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def get_embedder(model_name: str):
    """Return a ``SentenceTransformer`` for *model_name*, cached at module level.

    Only one model is cached at a time; asking for a different name loads
    that model and replaces the cached one. ``sentence_transformers`` is
    imported only when a model actually has to be loaded.
    """
    global _embedder, _embedder_model_name
    if _embedder is not None and _embedder_model_name == model_name:
        return _embedder

    from sentence_transformers import SentenceTransformer

    _embedder = SentenceTransformer(model_name)
    _embedder_model_name = model_name
    return _embedder
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def embed_texts(texts: list[str], *, model_name: str) -> np.ndarray:
    """Encode *texts* into a float32 array of normalized embeddings.

    An empty input returns an array of shape ``(0, 0)`` without loading
    any model.
    """
    if texts:
        encoder = get_embedder(model_name)
        encoded = encoder.encode(texts, normalize_embeddings=True, show_progress_bar=False)
        return np.asarray(encoded, dtype=np.float32)
    return np.zeros((0, 0), dtype=np.float32)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def embedding_to_bytes(vector: np.ndarray) -> bytes:
    """Serialize *vector* as raw float32 bytes."""
    as_float32 = vector.astype(np.float32, copy=False)
    return as_float32.tobytes()
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def bytes_to_embedding(data: bytes, dim: int) -> np.ndarray:
    """Decode raw float32 bytes into a 1-D array of length *dim*.

    The reshape raises ``ValueError`` when *data* does not hold exactly
    *dim* float32 values.
    """
    flat = np.frombuffer(data, dtype=np.float32)
    return flat.reshape(dim)
|
libs/researchmind/src/researchmind/extract.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
from pydantic import BaseModel, Field
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
# Normalized result of extracting one source (web page, PDF, DOCX or plain
# file) before it is chunked and stored.
class ExtractedDocument(BaseModel):
    # Source kind tag; this package uses "web", "pdf", "docx" and "file".
    source_type: str
    # Where the content came from: the URL, or the resolved file path.
    uri: str
    # Display title of the document.
    title: str
    # Extracted plain text.
    text: str
    # MIME type of the original source.
    mime: str = "text/plain"
    # Extra string key/value details supplied by the extractor.
    metadata: dict[str, str] = Field(default_factory=dict)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def extract_docx(path: Path) -> ExtractedDocument:
    """Read a DOCX file into an ``ExtractedDocument``.

    The text is every non-blank paragraph joined by blank lines, falling
    back to the file name when the document has no text. The title is the
    first non-blank paragraph whose style name contains ``"Heading"``,
    otherwise the file stem.
    """
    from docx import Document

    doc = Document(path)
    body = "\n\n".join(p.text.strip() for p in doc.paragraphs if p.text.strip())

    first_heading = next(
        (
            para.text.strip()
            for para in doc.paragraphs
            if para.style
            and para.style.name
            and "Heading" in para.style.name
            and para.text.strip()
        ),
        None,
    )

    return ExtractedDocument(
        source_type="docx",
        uri=str(path.resolve()),
        title=path.stem if first_heading is None else first_heading,
        text=body or path.name,
        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        metadata={"filename": path.name},
    )
|
libs/researchmind/src/researchmind/ingest.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Any
|
| 5 |
+
|
| 6 |
+
import numpy as np
|
| 7 |
+
|
| 8 |
+
from researchmind.chunking import chunk_text
|
| 9 |
+
from researchmind.config import ResearchMindConfig, get_config
|
| 10 |
+
from researchmind.embeddings import embed_texts
|
| 11 |
+
from researchmind.extract import ExtractedDocument, extract_docx
|
| 12 |
+
from researchmind.scrape_pdf import extract_pdf
|
| 13 |
+
from researchmind.scrape_web import fetch_and_extract
|
| 14 |
+
from researchmind.store import MemRAGStore
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class IngestPipeline:
    """Chunk, embed, and persist extracted documents into a ``MemRAGStore``."""

    def __init__(
        self,
        store: MemRAGStore | None = None,
        config: ResearchMindConfig | None = None,
    ) -> None:
        """Use the given store and config, creating defaults for whichever is missing."""
        # The config is resolved first because the default store is built from it.
        self._config = config if config else get_config()
        self._store = store if store else MemRAGStore(self._config)

    @property
    def store(self) -> MemRAGStore:
        """The store this pipeline writes to."""
        return self._store

    def ingest_document(
        self,
        doc: ExtractedDocument,
        *,
        session_id: str | None = None,
        raw_snapshot: str | None = None,
    ) -> tuple[str, bool]:
        """Chunk and embed *doc*, then hand it to the store.

        Args:
            doc: The extracted document to persist.
            session_id: Optional session to associate the document with.
            raw_snapshot: Raw content to store alongside; defaults to the
                first 100,000 characters of the document text.

        Returns:
            Whatever ``MemRAGStore.add_document`` returns.
        """
        prefix = self._store.content_hash(doc.text)[:12]
        size = self._config.chunk_size
        pieces = chunk_text(
            doc.text,
            doc_id=prefix,
            chunk_size=size,
            chunk_overlap=self._config.chunk_overlap,
        )
        if not pieces and doc.text.strip():
            # Chunking produced nothing for non-blank text: store one
            # truncated chunk so the document is still represented.
            from researchmind.chunking import TextChunk

            pieces = [TextChunk(chunk_id=f"{prefix}_0", ordinal=0, text=doc.text[:size])]

        vectors = embed_texts([p.text for p in pieces], model_name=self._config.embed_model)
        rows: list[tuple[str, int, str, np.ndarray, dict[str, Any]]] = [
            (piece.chunk_id, piece.ordinal, piece.text, vector, {"source_type": doc.source_type})
            for piece, vector in zip(pieces, vectors, strict=True)
        ]

        snapshot = raw_snapshot or doc.text[:100_000]
        return self._store.add_document(
            source_type=doc.source_type,
            uri=doc.uri,
            title=doc.title,
            text=doc.text,
            chunks=rows,
            session_id=session_id,
            raw_snapshot=snapshot,
        )

    def ingest_url(self, url: str, *, session_id: str | None = None) -> tuple[str, bool]:
        """Fetch *url*, extract its text, and ingest it with the full text as snapshot."""
        page = fetch_and_extract(url)
        return self.ingest_document(page, session_id=session_id, raw_snapshot=page.text)

    def ingest_pdf(self, path: Path, *, session_id: str | None = None) -> tuple[str, bool]:
        """Extract a PDF file and ingest it."""
        return self.ingest_document(extract_pdf(path), session_id=session_id)

    def ingest_docx(self, path: Path, *, session_id: str | None = None) -> tuple[str, bool]:
        """Extract a DOCX file and ingest it."""
        return self.ingest_document(extract_docx(path), session_id=session_id)

    def ingest_path(self, path: Path, *, session_id: str | None = None) -> tuple[str, bool]:
        """Ingest a local file, choosing the extractor by file suffix.

        ``.pdf`` and ``.docx`` go to their dedicated extractors; any other
        file is read as UTF-8 text with undecodable bytes replaced.
        """
        by_suffix = {".pdf": self.ingest_pdf, ".docx": self.ingest_docx}
        handler = by_suffix.get(path.suffix.lower())
        if handler is not None:
            return handler(path, session_id=session_id)

        contents = path.read_text(encoding="utf-8", errors="replace")
        plain = ExtractedDocument(
            source_type="file",
            uri=str(path.resolve()),
            title=path.stem,
            text=contents,
            mime="text/plain",
        )
        return self.ingest_document(plain, session_id=session_id)
|
libs/researchmind/src/researchmind/retrieve.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
from researchmind.config import ResearchMindConfig, get_config
|
| 6 |
+
from researchmind.embeddings import embed_texts
|
| 7 |
+
from researchmind.store import MemRAGStore, StoredChunk
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def retrieve(
    query: str,
    store: MemRAGStore,
    *,
    config: ResearchMindConfig | None = None,
    top_k: int | None = None,
    expand_neighbors: bool = True,
    session_id: str | None = None,
    doc_ids: list[str] | None = None,
) -> list[StoredChunk]:
    """Return the stored chunks most similar to *query*.

    Candidate chunks (optionally restricted by *session_id* / *doc_ids*)
    are scored by dot product between their embedding and the query
    embedding. The best ``top_k`` are taken in score order; with
    *expand_neighbors*, each is followed by at most one neighbor chunk
    reported by the store. The result never exceeds
    ``config.max_context_chunks`` and is empty when the store has no
    candidates.
    """
    settings = config or get_config()
    limit = settings.top_k if top_k is None else top_k

    candidates = store.get_chunks_with_embeddings(
        session_id=session_id,
        doc_ids=doc_ids,
    )
    if not candidates:
        return []

    query_vec = embed_texts([query], model_name=settings.embed_model)[0]
    ranked = sorted(
        [(float(np.dot(query_vec, emb)), chunk) for chunk, emb in candidates],
        key=lambda pair: pair[0],
        reverse=True,
    )

    budget = settings.max_context_chunks
    picked: list[StoredChunk] = []
    picked_ids: set[str] = set()

    for _score, chunk in ranked[:limit]:
        if len(picked) >= budget:
            break
        if chunk.id not in picked_ids:
            picked.append(chunk)
            picked_ids.add(chunk.id)
        if not expand_neighbors or len(picked) >= budget:
            continue
        # Only the first neighbor id reported by the store is considered.
        for neighbor_id in store.get_neighbor_chunk_ids(chunk.id)[:1]:
            if len(picked) >= budget:
                break
            if neighbor_id in picked_ids:
                continue
            # Take at most one chunk from the lookup.
            for neighbor in store.get_chunks_by_ids([neighbor_id]):
                picked.append(neighbor)
                picked_ids.add(neighbor.id)
                break

    return picked[:budget]
|
libs/researchmind/src/researchmind/scrape_pdf.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
from pypdf import PdfReader
|
| 6 |
+
|
| 7 |
+
from researchmind.extract import ExtractedDocument
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def extract_pdf(path: Path, *, max_pages: int = 200) -> ExtractedDocument:
    """Read up to *max_pages* pages of a PDF into an ``ExtractedDocument``.

    Pages with no extractable text are skipped; the rest are joined by
    blank lines, falling back to the file name when nothing was extracted.
    The title comes from the PDF metadata when present, otherwise the file
    stem. The ``page_count`` metadata entry is the number of pages
    considered, i.e. capped at *max_pages*.
    """
    reader = PdfReader(str(path))
    page_texts = [
        stripped
        for page in reader.pages[:max_pages]
        if (stripped := (page.extract_text() or "").strip())
    ]

    info = reader.metadata
    title = str(info.title) if info and info.title else path.stem
    considered = min(len(reader.pages), max_pages)

    return ExtractedDocument(
        source_type="pdf",
        uri=str(path.resolve()),
        title=title,
        text="\n\n".join(page_texts) or path.name,
        mime="application/pdf",
        metadata={"page_count": str(considered)},
    )
|
libs/researchmind/src/researchmind/scrape_web.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import httpx
|
| 4 |
+
import trafilatura
|
| 5 |
+
|
| 6 |
+
from researchmind.extract import ExtractedDocument
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def fetch_and_extract(url: str, *, timeout: float = 30.0) -> ExtractedDocument:
    """Download *url* and extract its main text with trafilatura.

    Redirects are followed. A non-success HTTP status propagates the
    exception raised by ``response.raise_for_status()``. When trafilatura
    extracts nothing, the first 50,000 characters of the raw HTML are used
    as the text. The title falls back to *url* when the page metadata has
    none. The post-redirect URL is recorded under ``metadata["final_url"]``,
    while ``uri`` stays the requested URL.
    """
    client_options = {
        "follow_redirects": True,
        "timeout": timeout,
        "headers": {"User-Agent": "ResearchMind/0.1 (local research agent; hackathon)"},
    }
    with httpx.Client(**client_options) as client:
        response = client.get(url)
        response.raise_for_status()
        html = response.text

    main_text = trafilatura.extract(
        html,
        url=url,
        include_comments=False,
        include_tables=True,
        output_format="txt",
    )
    page_meta = trafilatura.extract_metadata(html, default_url=url)

    title = url
    if page_meta and page_meta.title:
        title = page_meta.title

    # Fall back to raw HTML so the document is never stored empty.
    text = (main_text or "").strip() or html[:50_000]

    return ExtractedDocument(
        source_type="web",
        uri=url,
        title=title,
        text=text,
        mime="text/html",
        metadata={"final_url": str(response.url)},
    )
|
libs/researchmind/src/researchmind/search_urls.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
|
| 5 |
+
from researchmind.url_validate import filter_valid_urls, normalize_url
|
| 6 |
+
|
| 7 |
+
logger = logging.getLogger(__name__)
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def build_search_queries(topic: str) -> list[str]:
    """Craft Google-friendly queries for a research topic.

    Returns five query variants for the trimmed topic, or an empty list
    when the topic is blank.
    """
    subject = topic.strip()
    if not subject:
        return []
    templates = (
        "{t} site:wikipedia.org",
        '"{t}" introduction overview',
        "{t} tutorial guide site:.edu OR site:.gov",
        "{t} research paper site:arxiv.org",
        "what is {t}",
    )
    return [template.format(t=subject) for template in templates]
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def _google_search(query: str, *, n: int) -> list[str]:
    """Return result URLs for *query* from the ``googlesearch`` package.

    Best effort: any failure, including a missing package, is logged as a
    warning and whatever was collected so far is returned.
    """
    found: list[str] = []
    try:
        from googlesearch import search

        for hit in search(query, num_results=n, lang="en", timeout=15):
            if isinstance(hit, str):
                found.append(hit)
                continue
            # Non-string results are expected to expose the link as an attribute.
            link = getattr(hit, "url", None) or getattr(hit, "link", None)
            if link:
                found.append(str(link))
    except Exception as exc:  # noqa: BLE001
        logger.warning("Google search failed for %r: %s", query, exc)
    return found
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def _duckduckgo_search(query: str, *, n: int) -> list[str]:
    """Return result URLs for *query* from DuckDuckGo.

    ``ddgs`` is preferred, with ``duckduckgo_search`` as the fallback
    import. Best effort: any failure is logged as a warning and whatever
    was collected so far is returned.
    """
    found: list[str] = []
    try:
        try:
            from ddgs import DDGS
        except ImportError:
            from duckduckgo_search import DDGS

        hits = DDGS().text(query, max_results=n)
        if hits is not None:
            for hit in hits:
                if isinstance(hit, dict):
                    link = hit.get("href") or hit.get("link")
                    if link:
                        found.append(str(link))
    except Exception as exc:  # noqa: BLE001
        logger.warning("DuckDuckGo search failed for %r: %s", query, exc)
    return found
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def _collect_candidates(topic: str, *, per_query: int = 4) -> list[str]:
    """Gather de-duplicated, normalized URLs for every query built from *topic*.

    Google is tried first for each query; DuckDuckGo is used only when
    Google returns nothing. URLs keep first-seen order.
    """
    ordered: list[str] = []
    known: set[str] = set()
    for query in build_search_queries(topic):
        hits = _google_search(query, n=per_query) or _duckduckgo_search(query, n=per_query)
        for raw_url in hits:
            clean = normalize_url(raw_url)
            if not clean or clean in known:
                continue
            known.add(clean)
            ordered.append(clean)
    return ordered
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def search_urls(
    topic: str,
    *,
    n: int = 5,
    check_reachable: bool = True,
) -> list[str]:
    """
    Search the web (Google first, DuckDuckGo fallback) and return verified URLs.

    At least four results are requested per query, and the candidates are
    then passed through ``filter_valid_urls`` with ``max_results=n``.
    """
    per_query = max(n, 4)
    pool = _collect_candidates(topic, per_query=per_query)
    return filter_valid_urls(pool, check_reachable=check_reachable, max_results=n)
|
libs/researchmind/src/researchmind/store.py
ADDED
|
@@ -0,0 +1,381 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import hashlib
|
| 4 |
+
import json
|
| 5 |
+
import sqlite3
|
| 6 |
+
import uuid
|
| 7 |
+
from dataclasses import dataclass
|
| 8 |
+
from datetime import UTC, datetime
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from typing import Any
|
| 11 |
+
|
| 12 |
+
import numpy as np
|
| 13 |
+
|
| 14 |
+
from researchmind.config import ResearchMindConfig, get_config
|
| 15 |
+
from researchmind.embeddings import bytes_to_embedding, embedding_to_bytes
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
@dataclass(frozen=True)
class StoredDocument:
    """Immutable record whose fields match columns of the ``documents`` table."""

    # Primary key of the document.
    id: str
    # Source kind tag for the document.
    source_type: str
    # Location the document was loaded from.
    uri: str
    # Display title.
    title: str
    # Ingestion timestamp, stored as text.
    ingested_at: str
    # Hash of the content; the table declares this column UNIQUE.
    content_hash: str
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
@dataclass(frozen=True)
|
| 29 |
+
class StoredChunk:
|
| 30 |
+
id: str
|
| 31 |
+
doc_id: str
|
| 32 |
+
ordinal: int
|
| 33 |
+
text: str
|
| 34 |
+
doc_title: str
|
| 35 |
+
doc_uri: str
|
| 36 |
+
metadata: dict[str, Any]
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
@dataclass(frozen=True)
|
| 40 |
+
class SessionInfo:
|
| 41 |
+
id: str
|
| 42 |
+
topic: str
|
| 43 |
+
created_at: str
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class MemRAGStore:
|
| 47 |
+
def __init__(self, config: ResearchMindConfig | None = None) -> None:
|
| 48 |
+
self._config = config or get_config()
|
| 49 |
+
self._config.data_dir.mkdir(parents=True, exist_ok=True)
|
| 50 |
+
(self._config.data_dir / "raw").mkdir(parents=True, exist_ok=True)
|
| 51 |
+
self._db_path = self._config.data_dir / "memory.db"
|
| 52 |
+
self._embed_dim: int | None = None
|
| 53 |
+
self._init_db()
|
| 54 |
+
|
| 55 |
+
@property
|
| 56 |
+
def db_path(self) -> Path:
|
| 57 |
+
return self._db_path
|
| 58 |
+
|
| 59 |
+
@property
|
| 60 |
+
def embed_dim(self) -> int:
|
| 61 |
+
if self._embed_dim is None:
|
| 62 |
+
row = self._conn().execute(
|
| 63 |
+
"SELECT dim FROM embed_meta LIMIT 1"
|
| 64 |
+
).fetchone()
|
| 65 |
+
self._embed_dim = int(row[0]) if row else 384
|
| 66 |
+
return self._embed_dim
|
| 67 |
+
|
| 68 |
+
def _conn(self) -> sqlite3.Connection:
|
| 69 |
+
conn = sqlite3.connect(self._db_path)
|
| 70 |
+
conn.row_factory = sqlite3.Row
|
| 71 |
+
return conn
|
| 72 |
+
|
| 73 |
+
def _init_db(self) -> None:
|
| 74 |
+
with self._conn() as conn:
|
| 75 |
+
conn.executescript(
|
| 76 |
+
"""
|
| 77 |
+
CREATE TABLE IF NOT EXISTS embed_meta (
|
| 78 |
+
dim INTEGER NOT NULL
|
| 79 |
+
);
|
| 80 |
+
CREATE TABLE IF NOT EXISTS documents (
|
| 81 |
+
id TEXT PRIMARY KEY,
|
| 82 |
+
source_type TEXT NOT NULL,
|
| 83 |
+
uri TEXT NOT NULL,
|
| 84 |
+
title TEXT NOT NULL,
|
| 85 |
+
ingested_at TEXT NOT NULL,
|
| 86 |
+
content_hash TEXT NOT NULL UNIQUE,
|
| 87 |
+
session_id TEXT
|
| 88 |
+
);
|
| 89 |
+
CREATE TABLE IF NOT EXISTS chunks (
|
| 90 |
+
id TEXT PRIMARY KEY,
|
| 91 |
+
doc_id TEXT NOT NULL,
|
| 92 |
+
ordinal INTEGER NOT NULL,
|
| 93 |
+
text TEXT NOT NULL,
|
| 94 |
+
embedding_blob BLOB NOT NULL,
|
| 95 |
+
meta_json TEXT NOT NULL DEFAULT '{}',
|
| 96 |
+
FOREIGN KEY (doc_id) REFERENCES documents(id)
|
| 97 |
+
);
|
| 98 |
+
CREATE TABLE IF NOT EXISTS edges (
|
| 99 |
+
src_id TEXT NOT NULL,
|
| 100 |
+
dst_id TEXT NOT NULL,
|
| 101 |
+
rel TEXT NOT NULL,
|
| 102 |
+
PRIMARY KEY (src_id, dst_id, rel)
|
| 103 |
+
);
|
| 104 |
+
CREATE TABLE IF NOT EXISTS sessions (
|
| 105 |
+
id TEXT PRIMARY KEY,
|
| 106 |
+
topic TEXT NOT NULL,
|
| 107 |
+
created_at TEXT NOT NULL
|
| 108 |
+
);
|
| 109 |
+
CREATE TABLE IF NOT EXISTS session_messages (
|
| 110 |
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
| 111 |
+
session_id TEXT NOT NULL,
|
| 112 |
+
role TEXT NOT NULL,
|
| 113 |
+
content TEXT NOT NULL,
|
| 114 |
+
chunk_ids_json TEXT NOT NULL DEFAULT '[]',
|
| 115 |
+
created_at TEXT NOT NULL,
|
| 116 |
+
FOREIGN KEY (session_id) REFERENCES sessions(id)
|
| 117 |
+
);
|
| 118 |
+
CREATE INDEX IF NOT EXISTS idx_chunks_doc ON chunks(doc_id);
|
| 119 |
+
CREATE INDEX IF NOT EXISTS idx_documents_session ON documents(session_id);
|
| 120 |
+
"""
|
| 121 |
+
)
|
| 122 |
+
|
| 123 |
+
def set_embed_dim(self, dim: int) -> None:
|
| 124 |
+
with self._conn() as conn:
|
| 125 |
+
conn.execute("DELETE FROM embed_meta")
|
| 126 |
+
conn.execute("INSERT INTO embed_meta (dim) VALUES (?)", (dim,))
|
| 127 |
+
self._embed_dim = dim
|
| 128 |
+
|
| 129 |
+
@staticmethod
|
| 130 |
+
def content_hash(text: str) -> str:
|
| 131 |
+
return hashlib.sha256(text.encode()).hexdigest()
|
| 132 |
+
|
| 133 |
+
def create_session(self, topic: str = "") -> SessionInfo:
|
| 134 |
+
session_id = uuid.uuid4().hex[:12]
|
| 135 |
+
created_at = datetime.now(UTC).isoformat()
|
| 136 |
+
with self._conn() as conn:
|
| 137 |
+
conn.execute(
|
| 138 |
+
"INSERT INTO sessions (id, topic, created_at) VALUES (?, ?, ?)",
|
| 139 |
+
(session_id, topic, created_at),
|
| 140 |
+
)
|
| 141 |
+
return SessionInfo(id=session_id, topic=topic, created_at=created_at)
|
| 142 |
+
|
| 143 |
+
def list_sessions(self) -> list[SessionInfo]:
|
| 144 |
+
with self._conn() as conn:
|
| 145 |
+
rows = conn.execute(
|
| 146 |
+
"SELECT id, topic, created_at FROM sessions ORDER BY created_at DESC"
|
| 147 |
+
).fetchall()
|
| 148 |
+
return [SessionInfo(id=r["id"], topic=r["topic"], created_at=r["created_at"]) for r in rows]
|
| 149 |
+
|
| 150 |
+
def get_session(self, session_id: str) -> SessionInfo | None:
|
| 151 |
+
with self._conn() as conn:
|
| 152 |
+
row = conn.execute(
|
| 153 |
+
"SELECT id, topic, created_at FROM sessions WHERE id = ?",
|
| 154 |
+
(session_id,),
|
| 155 |
+
).fetchone()
|
| 156 |
+
if not row:
|
| 157 |
+
return None
|
| 158 |
+
return SessionInfo(id=row["id"], topic=row["topic"], created_at=row["created_at"])
|
| 159 |
+
|
| 160 |
+
def document_exists(self, content_hash: str) -> str | None:
|
| 161 |
+
with self._conn() as conn:
|
| 162 |
+
row = conn.execute(
|
| 163 |
+
"SELECT id FROM documents WHERE content_hash = ?",
|
| 164 |
+
(content_hash,),
|
| 165 |
+
).fetchone()
|
| 166 |
+
return row["id"] if row else None
|
| 167 |
+
|
| 168 |
+
def add_document(
|
| 169 |
+
self,
|
| 170 |
+
*,
|
| 171 |
+
source_type: str,
|
| 172 |
+
uri: str,
|
| 173 |
+
title: str,
|
| 174 |
+
text: str,
|
| 175 |
+
chunks: list[tuple[str, int, str, np.ndarray, dict[str, Any]]],
|
| 176 |
+
session_id: str | None = None,
|
| 177 |
+
raw_snapshot: str | None = None,
|
| 178 |
+
) -> tuple[str, bool]:
|
| 179 |
+
"""Returns (doc_id, was_new). Skips if content_hash already indexed."""
|
| 180 |
+
c_hash = self.content_hash(text)
|
| 181 |
+
existing = self.document_exists(c_hash)
|
| 182 |
+
if existing:
|
| 183 |
+
return existing, False
|
| 184 |
+
|
| 185 |
+
doc_id = uuid.uuid4().hex[:12]
|
| 186 |
+
ingested_at = datetime.now(UTC).isoformat()
|
| 187 |
+
if chunks:
|
| 188 |
+
dim = int(chunks[0][3].shape[0])
|
| 189 |
+
self.set_embed_dim(dim)
|
| 190 |
+
|
| 191 |
+
with self._conn() as conn:
|
| 192 |
+
conn.execute(
|
| 193 |
+
"""
|
| 194 |
+
INSERT INTO documents (id, source_type, uri, title, ingested_at, content_hash, session_id)
|
| 195 |
+
VALUES (?, ?, ?, ?, ?, ?, ?)
|
| 196 |
+
""",
|
| 197 |
+
(doc_id, source_type, uri, title, ingested_at, c_hash, session_id),
|
| 198 |
+
)
|
| 199 |
+
for chunk_id, ordinal, chunk_text, emb, meta in chunks:
|
| 200 |
+
conn.execute(
|
| 201 |
+
"""
|
| 202 |
+
INSERT INTO chunks (id, doc_id, ordinal, text, embedding_blob, meta_json)
|
| 203 |
+
VALUES (?, ?, ?, ?, ?, ?)
|
| 204 |
+
""",
|
| 205 |
+
(
|
| 206 |
+
chunk_id,
|
| 207 |
+
doc_id,
|
| 208 |
+
ordinal,
|
| 209 |
+
chunk_text,
|
| 210 |
+
embedding_to_bytes(emb),
|
| 211 |
+
json.dumps(meta),
|
| 212 |
+
),
|
| 213 |
+
)
|
| 214 |
+
conn.execute(
|
| 215 |
+
"INSERT OR IGNORE INTO edges (src_id, dst_id, rel) VALUES (?, ?, ?)",
|
| 216 |
+
(doc_id, chunk_id, "doc_has_chunk"),
|
| 217 |
+
)
|
| 218 |
+
for i in range(len(chunks) - 1):
|
| 219 |
+
conn.execute(
|
| 220 |
+
"INSERT OR IGNORE INTO edges (src_id, dst_id, rel) VALUES (?, ?, ?)",
|
| 221 |
+
(chunks[i][0], chunks[i + 1][0], "chunk_next"),
|
| 222 |
+
)
|
| 223 |
+
|
| 224 |
+
if raw_snapshot is not None:
|
| 225 |
+
raw_dir = self._config.data_dir / "raw" / doc_id
|
| 226 |
+
raw_dir.mkdir(parents=True, exist_ok=True)
|
| 227 |
+
(raw_dir / "snapshot.txt").write_text(raw_snapshot, encoding="utf-8")
|
| 228 |
+
|
| 229 |
+
return doc_id, True
|
| 230 |
+
|
| 231 |
+
def list_documents(self, session_id: str | None = None) -> list[StoredDocument]:
|
| 232 |
+
query = "SELECT id, source_type, uri, title, ingested_at, content_hash FROM documents"
|
| 233 |
+
params: tuple[Any, ...] = ()
|
| 234 |
+
if session_id:
|
| 235 |
+
query += " WHERE session_id = ?"
|
| 236 |
+
params = (session_id,)
|
| 237 |
+
query += " ORDER BY ingested_at DESC"
|
| 238 |
+
with self._conn() as conn:
|
| 239 |
+
rows = conn.execute(query, params).fetchall()
|
| 240 |
+
return [
|
| 241 |
+
StoredDocument(
|
| 242 |
+
id=r["id"],
|
| 243 |
+
source_type=r["source_type"],
|
| 244 |
+
uri=r["uri"],
|
| 245 |
+
title=r["title"],
|
| 246 |
+
ingested_at=r["ingested_at"],
|
| 247 |
+
content_hash=r["content_hash"],
|
| 248 |
+
)
|
| 249 |
+
for r in rows
|
| 250 |
+
]
|
| 251 |
+
|
| 252 |
+
def get_chunks_with_embeddings(
|
| 253 |
+
self,
|
| 254 |
+
*,
|
| 255 |
+
session_id: str | None = None,
|
| 256 |
+
doc_ids: list[str] | None = None,
|
| 257 |
+
) -> list[tuple[StoredChunk, np.ndarray]]:
|
| 258 |
+
dim = self.embed_dim
|
| 259 |
+
query = """
|
| 260 |
+
SELECT c.id, c.doc_id, c.ordinal, c.text, c.embedding_blob, c.meta_json,
|
| 261 |
+
d.title AS doc_title, d.uri AS doc_uri
|
| 262 |
+
FROM chunks c
|
| 263 |
+
JOIN documents d ON d.id = c.doc_id
|
| 264 |
+
WHERE 1=1
|
| 265 |
+
"""
|
| 266 |
+
params: list[Any] = []
|
| 267 |
+
if session_id:
|
| 268 |
+
query += " AND d.session_id = ?"
|
| 269 |
+
params.append(session_id)
|
| 270 |
+
if doc_ids:
|
| 271 |
+
placeholders = ",".join("?" * len(doc_ids))
|
| 272 |
+
query += f" AND d.id IN ({placeholders})"
|
| 273 |
+
params.extend(doc_ids)
|
| 274 |
+
with self._conn() as conn:
|
| 275 |
+
rows = conn.execute(query, params).fetchall()
|
| 276 |
+
result: list[tuple[StoredChunk, np.ndarray]] = []
|
| 277 |
+
for r in rows:
|
| 278 |
+
chunk = StoredChunk(
|
| 279 |
+
id=r["id"],
|
| 280 |
+
doc_id=r["doc_id"],
|
| 281 |
+
ordinal=r["ordinal"],
|
| 282 |
+
text=r["text"],
|
| 283 |
+
doc_title=r["doc_title"],
|
| 284 |
+
doc_uri=r["doc_uri"],
|
| 285 |
+
metadata=json.loads(r["meta_json"] or "{}"),
|
| 286 |
+
)
|
| 287 |
+
emb = bytes_to_embedding(r["embedding_blob"], dim)
|
| 288 |
+
result.append((chunk, emb))
|
| 289 |
+
return result
|
| 290 |
+
|
| 291 |
+
def get_neighbor_chunk_ids(self, chunk_id: str) -> list[str]:
|
| 292 |
+
ids: list[str] = []
|
| 293 |
+
with self._conn() as conn:
|
| 294 |
+
for row in conn.execute(
|
| 295 |
+
"SELECT dst_id FROM edges WHERE src_id = ? AND rel = 'chunk_next'",
|
| 296 |
+
(chunk_id,),
|
| 297 |
+
):
|
| 298 |
+
ids.append(row["dst_id"])
|
| 299 |
+
for row in conn.execute(
|
| 300 |
+
"SELECT src_id FROM edges WHERE dst_id = ? AND rel = 'chunk_next'",
|
| 301 |
+
(chunk_id,),
|
| 302 |
+
):
|
| 303 |
+
ids.append(row["src_id"])
|
| 304 |
+
return ids
|
| 305 |
+
|
| 306 |
+
def get_chunks_by_ids(self, chunk_ids: list[str]) -> list[StoredChunk]:
|
| 307 |
+
if not chunk_ids:
|
| 308 |
+
return []
|
| 309 |
+
placeholders = ",".join("?" for _ in chunk_ids)
|
| 310 |
+
with self._conn() as conn:
|
| 311 |
+
rows = conn.execute(
|
| 312 |
+
f"""
|
| 313 |
+
SELECT c.id, c.doc_id, c.ordinal, c.text, c.meta_json,
|
| 314 |
+
d.title AS doc_title, d.uri AS doc_uri
|
| 315 |
+
FROM chunks c
|
| 316 |
+
JOIN documents d ON d.id = c.doc_id
|
| 317 |
+
WHERE c.id IN ({placeholders})
|
| 318 |
+
""",
|
| 319 |
+
chunk_ids,
|
| 320 |
+
).fetchall()
|
| 321 |
+
by_id = {
|
| 322 |
+
r["id"]: StoredChunk(
|
| 323 |
+
id=r["id"],
|
| 324 |
+
doc_id=r["doc_id"],
|
| 325 |
+
ordinal=r["ordinal"],
|
| 326 |
+
text=r["text"],
|
| 327 |
+
doc_title=r["doc_title"],
|
| 328 |
+
doc_uri=r["doc_uri"],
|
| 329 |
+
metadata=json.loads(r["meta_json"] or "{}"),
|
| 330 |
+
)
|
| 331 |
+
for r in rows
|
| 332 |
+
}
|
| 333 |
+
return [by_id[cid] for cid in chunk_ids if cid in by_id]
|
| 334 |
+
|
| 335 |
+
def add_message(
|
| 336 |
+
self,
|
| 337 |
+
session_id: str,
|
| 338 |
+
role: str,
|
| 339 |
+
content: str,
|
| 340 |
+
chunk_ids: list[str] | None = None,
|
| 341 |
+
) -> None:
|
| 342 |
+
with self._conn() as conn:
|
| 343 |
+
conn.execute(
|
| 344 |
+
"""
|
| 345 |
+
INSERT INTO session_messages (session_id, role, content, chunk_ids_json, created_at)
|
| 346 |
+
VALUES (?, ?, ?, ?, ?)
|
| 347 |
+
""",
|
| 348 |
+
(
|
| 349 |
+
session_id,
|
| 350 |
+
role,
|
| 351 |
+
content,
|
| 352 |
+
json.dumps(chunk_ids or []),
|
| 353 |
+
datetime.now(UTC).isoformat(),
|
| 354 |
+
),
|
| 355 |
+
)
|
| 356 |
+
|
| 357 |
+
def get_messages(self, session_id: str) -> list[dict[str, Any]]:
|
| 358 |
+
with self._conn() as conn:
|
| 359 |
+
rows = conn.execute(
|
| 360 |
+
"""
|
| 361 |
+
SELECT role, content, chunk_ids_json, created_at
|
| 362 |
+
FROM session_messages
|
| 363 |
+
WHERE session_id = ?
|
| 364 |
+
ORDER BY id ASC
|
| 365 |
+
""",
|
| 366 |
+
(session_id,),
|
| 367 |
+
).fetchall()
|
| 368 |
+
return [
|
| 369 |
+
{
|
| 370 |
+
"role": r["role"],
|
| 371 |
+
"content": r["content"],
|
| 372 |
+
"chunk_ids": json.loads(r["chunk_ids_json"] or "[]"),
|
| 373 |
+
"created_at": r["created_at"],
|
| 374 |
+
}
|
| 375 |
+
for r in rows
|
| 376 |
+
]
|
| 377 |
+
|
| 378 |
+
def count_chunks(self) -> int:
|
| 379 |
+
with self._conn() as conn:
|
| 380 |
+
row = conn.execute("SELECT COUNT(*) AS n FROM chunks").fetchone()
|
| 381 |
+
return int(row["n"])
|
libs/researchmind/src/researchmind/url_suggest.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import re
|
| 5 |
+
from typing import TYPE_CHECKING, Protocol
|
| 6 |
+
|
| 7 |
+
if TYPE_CHECKING:
|
| 8 |
+
pass
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class ChatBackend(Protocol):
    """Structural type for any object exposing a compatible ``chat`` method."""

    def chat(
        self,
        messages: list[dict[str, str]],
        *,
        max_tokens: int = 512,
        temperature: float = 0.7,
    ) -> str:
        """Answer the chat *messages* and return the reply as a string."""
        ...
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
SUGGEST_SYSTEM = """You suggest reputable web URLs for research on a topic.
|
| 22 |
+
Return ONLY a JSON array of 3-5 full https URLs as strings.
|
| 23 |
+
No markdown, no explanation. Example: ["https://example.com/a", "https://example.com/b"]
|
| 24 |
+
"""
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def suggest_urls(topic: str, backend: ChatBackend, *, max_urls: int = 5) -> list[str]:
    """Ask *backend* for URLs about *topic* and return the parsed list.

    The reply is parsed by ``_parse_url_list`` with the *max_urls* limit.
    """
    system_turn = {"role": "system", "content": SUGGEST_SYSTEM}
    user_turn = {"role": "user", "content": f"Topic: {topic.strip()}"}
    reply = backend.chat([system_turn, user_turn], max_tokens=512, temperature=0.2)
    return _parse_url_list(reply, max_urls=max_urls)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def _parse_url_list(raw: str, *, max_urls: int) -> list[str]:
    """Extract up to *max_urls* distinct URLs from a model reply.

    The reply is expected to be a JSON array of strings, optionally wrapped
    in a Markdown code fence or surrounded by other text. If no JSON array
    can be decoded, URLs are scraped from the raw text with a regex.

    Returns:
        De-duplicated URLs in first-seen order, or ``[]`` when the decoded
        JSON is not a list.
    """
    cleaned = raw.strip()
    fence = re.search(r"```(?:json)?\s*(\[.*?\])\s*```", cleaned, re.DOTALL)
    if fence:
        cleaned = fence.group(1)
    else:
        # No fence: take the outermost [...] span, if there is one.
        start = cleaned.find("[")
        end = cleaned.rfind("]")
        if start >= 0 and end > start:
            cleaned = cleaned[start : end + 1]

    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError:
        scraped = re.findall(r"https?://[^\s\"'<>]+", raw)
        # The regex is greedy, so prose punctuation ("url," / "url]") is
        # captured with the URL; trim it before de-duplicating.
        return _dedupe_urls([_strip_trailing_punctuation(u) for u in scraped], max_urls)

    if not isinstance(data, list):
        return []
    urls = [
        item.strip()
        for item in data
        if isinstance(item, str) and item.strip().startswith(("http://", "https://"))
    ]
    return _dedupe_urls(urls, max_urls)


def _strip_trailing_punctuation(url: str) -> str:
    """Drop sentence punctuation and unbalanced ``)`` from the end of *url*."""
    trailing = ".,;:!?]}"
    trimmed = url.rstrip(trailing)
    # Keep a closing paren only if it pairs with one inside the URL,
    # e.g. ".../Agent_(software)".
    while trimmed.endswith(")") and trimmed.count(")") > trimmed.count("("):
        trimmed = trimmed[:-1].rstrip(trailing)
    return trimmed


def _dedupe_urls(urls: list[str], max_urls: int) -> list[str]:
    """Return the first *max_urls* distinct items of *urls*, order preserved.

    A non-positive *max_urls* yields an empty list.
    """
    if max_urls <= 0:
        return []
    # dict keys are unique and keep insertion order.
    return list(dict.fromkeys(urls))[:max_urls]
|
libs/researchmind/src/researchmind/url_validate.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
from urllib.parse import urlparse
|
| 5 |
+
|
| 6 |
+
import httpx
|
| 7 |
+
|
| 8 |
+
# arXiv abstract URLs carry either a new-style ID (2301.00001) or a legacy
# archive-prefixed ID (hep-th/9901001, math.GT/0309136), each optionally
# followed by a version suffix such as v2. Group 1 captures the bare ID.
_ARXIV_ABS = re.compile(
    r"^https?://(?:www\.)?arxiv\.org/abs/"
    r"(\d{4}\.\d{4,5}|[a-z\-]+(?:\.[a-z]{2})?/\d{7})"
    r"(?:v\d+)?/?$",
    re.IGNORECASE,
)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def normalize_url(url: str) -> str:
    """Return a cleaned form of *url*, or ``""`` when nothing usable remains.

    Surrounding whitespace, quotes and angle brackets are removed,
    protocol-relative (``//host``) and scheme-less input gets ``https``, and
    the fragment and trailing slashes are dropped. A URL that already has a
    scheme keeps it (lower-cased by ``urlparse``), so ``is_well_formed`` can
    reject unsupported schemes explicitly.
    """
    cleaned = url.strip().strip("\"'<>")
    if not cleaned:
        return ""
    if cleaned.startswith("//"):
        cleaned = "https:" + cleaned
    elif not re.match(r"[A-Za-z][A-Za-z0-9+.\-]*://", cleaned):
        # No "scheme://" prefix at all (case-insensitive): assume https.
        cleaned = "https://" + cleaned
    try:
        parsed = urlparse(cleaned)
    except ValueError:
        # urlparse raises on malformed bracketed hosts such as "http://[bad".
        return ""
    if not parsed.netloc:
        return ""
    return parsed.geturl().split("#")[0].rstrip("/")
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def is_well_formed(url: str) -> tuple[bool, str]:
    """Check *url* syntactically, without any network access.

    Returns:
        ``(True, "ok")`` when the URL passes every check, otherwise
        ``(False, reason)`` with a short description of the first failure.
    """
    # Function-scope import: ipaddress is not among the module's imports.
    import ipaddress

    if not url:
        return False, "empty url"
    # "\u2026" is the single-character ellipsis; both forms mark a truncated URL.
    if "..." in url or "\u2026" in url:
        return False, "truncated placeholder"
    if " " in url:
        return False, "contains spaces"

    try:
        parsed = urlparse(url)
    except ValueError:
        # Raised for malformed bracketed hosts such as "http://[bad".
        return False, "malformed url"
    if parsed.scheme not in ("http", "https"):
        return False, f"unsupported scheme {parsed.scheme!r}"
    # hostname excludes port and userinfo, so "127.0.0.1:8000" cannot slip
    # past the local-address check below.
    host = (parsed.hostname or "").lower()
    if not host or "." not in host:
        return False, "missing host"
    try:
        address = ipaddress.ip_address(host)
    except ValueError:
        address = None  # an ordinary DNS name
    if host == "localhost" or (
        address is not None
        and (
            address.is_loopback
            or address.is_private
            or address.is_link_local
            or address.is_unspecified
        )
    ):
        return False, "local url"

    path = parsed.path or ""
    if "arxiv.org" in host and "/abs/" in path:
        if not _ARXIV_ABS.match(url):
            return False, "invalid arxiv abs url"

    if "ieeexplore.ieee.org" in host and path.rstrip("/") in ("", "/document"):
        return False, "incomplete ieee document url"

    if _is_tracking_or_junk_url(host, path, parsed.query):
        return False, "tracking or redirect link (not a content page)"

    return True, "ok"
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def _is_tracking_or_junk_url(host: str, path: str, query: str) -> bool:
    """Reject ad/click trackers and other non-content URLs from search results.

    Args:
        host: Lower-cased host of the URL.
        path: URL path.
        query: Raw query string, without the leading ``?``.
    """
    # Ad-serving hosts are junk whatever the path. "googleadservices.com"
    # does not contain "google.", so it needs its own test.
    ad_hosts = ("googleadservices", "doubleclick.net", "googlesyndication.com")
    if any(marker in host for marker in ad_hosts):
        return True
    if "bing.com" in host and "/aclick" in path:
        return True
    if host.endswith("bing.com") and path.startswith("/ck/"):
        return True
    if "google." in host and "/aclk" in path:
        return True
    # Search result redirect wrappers, not stable content URLs
    if "google." in host and path.rstrip("/") == "/url" and "q=" in query:
        return True
    return False
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def probe_url_reachable(url: str, *, timeout: float = 12.0) -> tuple[bool, str]:
    """Check over HTTP that *url* answers with a non-error status.

    A HEAD request is sent first, following redirects. If the server answers
    405 or 501, a GET is tried, streamed so that only the status line and
    headers are read rather than the whole body.

    Args:
        url: Absolute URL to probe.
        timeout: Timeout in seconds passed to the httpx client.

    Returns:
        ``(True, "ok")`` for a status below 400, otherwise ``(False, reason)``
        with either ``"http <status>"`` or the transport error.
    """
    headers = {"User-Agent": "ResearchMind/0.1 (url-validator)"}
    try:
        with httpx.Client(follow_redirects=True, timeout=timeout, headers=headers) as client:
            status = client.head(url).status_code
            if status in (405, 501):
                with client.stream("GET", url) as response:
                    status = response.status_code
    except (httpx.HTTPError, httpx.InvalidURL) as exc:
        # InvalidURL is not an HTTPError subclass, so it is listed separately.
        # Some errors have an empty message; fall back to the class name.
        return False, str(exc) or type(exc).__name__
    if status >= 400:
        return False, f"http {status}"
    return True, "ok"
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def validate_url(url: str, *, check_reachable: bool = True) -> tuple[bool, str, str]:
    """Normalize *url*, then check its syntax and optionally its reachability.

    Returns:
        ``(ok, reason, normalized_url)``; ``reason`` is ``"ok"`` on success,
        otherwise the message of the first failing check.
    """
    normalized = normalize_url(url)
    passed, why = is_well_formed(normalized)
    # Probe the network only for URLs that already passed the syntax check.
    if passed and check_reachable:
        passed, why = probe_url_reachable(normalized)
    if not passed:
        return False, why, normalized
    return True, "ok", normalized
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def filter_valid_urls(
    urls: list[str],
    *,
    check_reachable: bool = True,
    max_results: int = 5,
) -> list[str]:
    """Return up to *max_results* distinct, valid, normalized URLs.

    Args:
        urls: Candidate URLs, in priority order.
        check_reachable: Passed through to ``validate_url``.
        max_results: Maximum number of URLs to return; a non-positive value
            returns ``[]`` without validating anything.
    """
    if max_results <= 0:
        return []
    attempted: set[str] = set()
    seen: set[str] = set()
    valid: list[str] = []
    for raw in urls:
        # Skip exact repeats so the same URL is not validated (and possibly
        # probed over the network) twice.
        if raw in attempted:
            continue
        attempted.add(raw)
        ok, _reason, normalized = validate_url(raw, check_reachable=check_reachable)
        if ok and normalized not in seen:
            seen.add(normalized)
            valid.append(normalized)
            if len(valid) >= max_results:
                break
    return valid
|
libs/researchmind/tests/test_chunking.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from researchmind.chunking import chunk_text
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def test_chunk_text_splits_long_document():
    """600 words with chunk_size=100 give more than one chunk, the first at ordinal 0."""
    long_text = " ".join("word" for _ in range(600))
    pieces = chunk_text(long_text, doc_id="doc1", chunk_size=100, chunk_overlap=20)
    assert len(pieces) > 1
    assert pieces[0].ordinal == 0
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def test_chunk_text_empty():
    """Empty text produces no chunks."""
    result = chunk_text("", doc_id="x")
    assert result == []
|
libs/researchmind/tests/test_citations.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from researchmind.citations import (
|
| 4 |
+
clean_model_answer,
|
| 5 |
+
format_context_block,
|
| 6 |
+
format_references,
|
| 7 |
+
)
|
| 8 |
+
from researchmind.store import StoredChunk
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def _chunk(chunk_id: str, doc_uri: str, text: str) -> StoredChunk:
    """Build a StoredChunk for document "doc1" with fixed title, ordinal 0 and no metadata."""
    fixed_fields = {
        "doc_id": "doc1",
        "ordinal": 0,
        "doc_title": "AI Agents Review",
        "metadata": {},
    }
    return StoredChunk(id=chunk_id, text=text, doc_uri=doc_uri, **fixed_fields)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def test_format_context_groups_chunks_by_document():
    """Two chunks from one URI share a single [1] marker and one citation."""
    uri = "https://example.com/paper"
    passages = ("First passage about agents.", "Second passage about planning.")
    chunks = [_chunk(f"c{i}", uri, body) for i, body in enumerate(passages, start=1)]
    context, citations = format_context_block(chunks)
    assert context.count("[1]") == 1
    assert "[2]" not in context
    assert len(citations) == 1
    assert "First passage" in context
    assert "Second passage" in context
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def test_format_references_one_line_per_source():
    """A source cited by two chunks appears once in the reference list."""
    source = "https://a.test"
    chunks = [_chunk("c1", source, "alpha"), _chunk("c2", source, "beta")]
    _, citations = format_context_block(chunks)
    refs = format_references(citations)
    assert refs.count("https://a.test") == 1
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def test_clean_passage_collapses_citation_runs():
    """A run of bracketed markers in the chunk text is not copied verbatim into the context."""
    noisy = _chunk("c1", "https://a.test", "[1] [2] [3] [4] [5] actual content")
    context, _ = format_context_block([noisy])
    assert "[1] [2] [3] [4] [5]" not in context
    assert "actual content" in context
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def test_clean_model_answer_strips_reference_spam():
    """The References section and the stacked citation markers are removed; the prose stays."""
    cleaned = clean_model_answer(
        "Summary here [1][2][3][4][5].\n\n**References**\n- [1] dup"
    )
    assert "**References**" not in cleaned
    assert "[1][2][3]" not in cleaned
    assert "Summary here" in cleaned
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def test_clean_model_answer_strips_thinking_block():
    """A leading think-tagged block is dropped, leaving only the answer."""
    # The tags are assembled at runtime so they never appear literally here.
    tag = "think"
    think_open = f"<{tag}>"
    think_close = f"</{tag}>"
    raw = "\n".join([think_open, "plan", think_close, "", "Agents use tools and memory [1]."])
    assert clean_model_answer(raw) == "Agents use tools and memory [1]."
|
libs/researchmind/tests/test_retrieve.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
from researchmind.config import ResearchMindConfig
|
| 6 |
+
from researchmind.retrieve import retrieve
|
| 7 |
+
from researchmind.store import MemRAGStore
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def _fake_embed(monkeypatch):
    """Patch ``researchmind.retrieve.embed_texts`` with a 2-D keyword embedder.

    Texts mentioning "photosynthesis" map to [1, 0]; all others to [0, 1].
    """
    on_topic = (1.0, 0.0)
    off_topic = (0.0, 1.0)

    def fake_embed_texts(texts, *, model_name):
        vectors = [
            np.array(on_topic if "photosynthesis" in t.lower() else off_topic, dtype=np.float32)
            for t in texts
        ]
        return np.stack(vectors)

    monkeypatch.setattr("researchmind.retrieve.embed_texts", fake_embed_texts)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def test_retrieve_ranks_by_similarity(tmp_path, monkeypatch):
    """With top_k=1 the chunk whose embedding matches the query is the only hit."""
    _fake_embed(monkeypatch)
    cfg = ResearchMindConfig(
        data_dir=tmp_path,
        embed_model="test",
        auto_search=False,
        top_k=1,
        max_context_chunks=8,
        chunk_size=512,
        chunk_overlap=128,
    )
    store = MemRAGStore(cfg)
    store.set_embed_dim(2)
    # (uri, title, text, chunk id, embedding) for each fixture document.
    fixtures = [
        ("a", "A", "photosynthesis in plants", "c1", [1.0, 0.0]),
        ("b", "B", "fractions math", "c2", [0.0, 1.0]),
    ]
    for uri, title, body, chunk_id, vector in fixtures:
        store.add_document(
            source_type="test",
            uri=uri,
            title=title,
            text=body,
            chunks=[(chunk_id, 0, body, np.array(vector, dtype=np.float32), {})],
        )
    hits = retrieve("photosynthesis", store, config=cfg, top_k=1, expand_neighbors=False)
    assert len(hits) == 1
    assert "photosynthesis" in hits[0].text
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def test_retrieve_filters_by_session(tmp_path, monkeypatch):
    """Passing session_id restricts the hits to that session's documents."""
    _fake_embed(monkeypatch)
    cfg = ResearchMindConfig(
        data_dir=tmp_path,
        embed_model="test",
        auto_search=False,
        top_k=2,
        max_context_chunks=8,
        chunk_size=512,
        chunk_overlap=128,
    )
    store = MemRAGStore(cfg)
    store.set_embed_dim(2)
    sid_a = store.create_session(topic="a").id
    sid_b = store.create_session(topic="b").id
    # (uri, title, text, chunk id, embedding, owning session)
    fixtures = [
        ("a", "Plants", "photosynthesis in plants", "c1", [1.0, 0.0], sid_a),
        ("b", "Math", "fractions math", "c2", [0.0, 1.0], sid_b),
    ]
    for uri, title, body, chunk_id, vector, owner in fixtures:
        store.add_document(
            source_type="test",
            uri=uri,
            title=title,
            text=body,
            chunks=[(chunk_id, 0, body, np.array(vector, dtype=np.float32), {})],
            session_id=owner,
        )
    scoped = retrieve(
        "photosynthesis",
        store,
        config=cfg,
        top_k=2,
        expand_neighbors=False,
        session_id=sid_a,
    )
    assert len(scoped) == 1
    assert "photosynthesis" in scoped[0].text
|
libs/researchmind/tests/test_search_queries.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from researchmind.search_urls import build_search_queries, search_urls
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def test_build_search_queries_includes_wikipedia_and_arxiv():
    """The generated queries mention wikipedia, arxiv and the topic itself."""
    joined = " ".join(build_search_queries("AI agent")).lower()
    for expected in ("wikipedia", "arxiv", "ai agent"):
        assert expected in joined
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def test_search_urls_uses_validated_results(monkeypatch):
    """search_urls returns only the candidates kept by filter_valid_urls."""

    def fake_candidates(topic, per_query=4):
        return [
            "https://en.wikipedia.org/wiki/Intelligent_agent",
            "https://arxiv.org/abs/quantcomm/2021/10.0",
        ]

    def fake_filter(urls, *, check_reachable=True, max_results=5):
        kept = [u for u in urls if "wikipedia" in u]
        return kept[:max_results]

    monkeypatch.setattr("researchmind.search_urls._collect_candidates", fake_candidates)
    monkeypatch.setattr("researchmind.search_urls.filter_valid_urls", fake_filter)
    out = search_urls("AI agent", n=3, check_reachable=False)
    assert len(out) == 1
    assert "wikipedia" in out[0]
|
libs/researchmind/tests/test_store.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
from researchmind.config import ResearchMindConfig
|
| 6 |
+
from researchmind.store import MemRAGStore
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def test_store_dedup_and_chunks(tmp_path):
    """Adding the same text twice returns the same id and stores its chunk once."""
    cfg = ResearchMindConfig(
        data_dir=tmp_path,
        embed_model="test",
        auto_search=False,
        top_k=3,
        max_context_chunks=8,
        chunk_size=512,
        chunk_overlap=128,
    )
    store = MemRAGStore(cfg)
    emb = np.array([1.0, 0.0, 0.0], dtype=np.float32)
    document = {
        "source_type": "test",
        "uri": "test://a",
        "title": "A",
        "text": "hello world",
        "chunks": [("c1", 0, "hello world", emb, {})],
    }
    doc_id, is_new = store.add_document(**document)
    assert is_new
    doc_id2, is_new2 = store.add_document(**document)
    assert not is_new2
    assert doc_id == doc_id2
    assert store.count_chunks() == 1
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def test_session_messages(tmp_path):
    """A message added to a session is returned by get_messages with its role."""
    cfg = ResearchMindConfig(
        data_dir=tmp_path,
        embed_model="test",
        auto_search=False,
        top_k=3,
        max_context_chunks=8,
        chunk_size=512,
        chunk_overlap=128,
    )
    store = MemRAGStore(cfg)
    session_id = store.create_session(topic="test topic").id
    store.add_message(session_id, "user", "hi", [])
    msgs = store.get_messages(session_id)
    assert len(msgs) == 1
    first = msgs[0]
    assert first["role"] == "user"
|
libs/researchmind/tests/test_url_validate.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from researchmind.url_validate import (
|
| 4 |
+
filter_valid_urls,
|
| 5 |
+
is_well_formed,
|
| 6 |
+
normalize_url,
|
| 7 |
+
validate_url,
|
| 8 |
+
)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def test_rejects_truncated_and_bad_arxiv():
    """``is_well_formed`` rejects a malformed arXiv id and an elided URL."""
    arxiv_ok, arxiv_reason = is_well_formed(
        "https://arxiv.org/abs/quantcomm/2021/10.0"
    )
    assert not arxiv_ok
    # The rejection reason is expected to mention arXiv.
    assert "arxiv" in arxiv_reason

    # Second case: only the verdict is checked, not the reason text.
    elided_ok, _ = is_well_formed("https://ieeexplore.ieee.org/document/...")
    assert not elided_ok
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def test_accepts_valid_arxiv():
    """A ``YYMM.NNNNN``-style arXiv abstract URL passes ``is_well_formed``."""
    verdict, _reason = is_well_formed("https://arxiv.org/abs/2301.00001")
    assert verdict
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def test_normalize_adds_scheme():
    """``normalize_url`` returns an ``https://`` URL for a scheme-less input."""
    normalized = normalize_url("en.wikipedia.org/wiki/AI_agent")
    assert normalized.startswith("https://")
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def test_validate_url_does_not_shadow_probe(monkeypatch):
    """Regression: check_reachable=True must not call the bool parameter."""
    # Replace the network probe with a stub that always reports success,
    # keeping the same keyword-only ``timeout`` parameter.
    monkeypatch.setattr(
        "researchmind.url_validate.probe_url_reachable",
        lambda url, *, timeout=12.0: (True, "ok"),
    )

    result = validate_url(
        "https://en.wikipedia.org/wiki/Agent",
        check_reachable=True,
    )
    ok, reason, normalized = result

    assert ok
    assert reason == "ok"
    assert "wikipedia" in normalized
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def test_rejects_bing_tracking_links():
    """A Bing ``aclick`` redirect URL is rejected with a "tracking" reason."""
    tracking_link = "https://www.bing.com/aclick?id=abc&u=aHR0cHM6Ly9leGFtcGxlLmNvbQ"
    accepted, why = is_well_formed(tracking_link)
    assert not accepted
    assert "tracking" in why
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def test_filter_valid_urls_skips_bad(monkeypatch):
    """``filter_valid_urls`` keeps only the URLs that ``validate_url`` accepts."""

    def fake_validate(url, *, check_reachable=True):
        # Any URL containing "bad" is rejected; everything else passes.
        verdict = "bad" if "bad" in url else "ok"
        return verdict == "ok", verdict, url

    monkeypatch.setattr("researchmind.url_validate.validate_url", fake_validate)

    candidates = ["https://good.example/a", "https://bad.example/b"]
    kept = filter_valid_urls(
        candidates,
        check_reachable=False,
        max_results=5,
    )
    assert kept == ["https://good.example/a"]
|
pyproject.toml
CHANGED
|
@@ -9,6 +9,7 @@ dependencies = [
|
|
| 9 |
"ensemble",
|
| 10 |
"gradio-space",
|
| 11 |
"inference",
|
|
|
|
| 12 |
]
|
| 13 |
|
| 14 |
[dependency-groups]
|
|
@@ -46,4 +47,5 @@ agent = { workspace = true }
|
|
| 46 |
ensemble = { workspace = true }
|
| 47 |
gradio-space = { workspace = true }
|
| 48 |
inference = { workspace = true }
|
|
|
|
| 49 |
slm-evals = { workspace = true }
|
|
|
|
| 9 |
"ensemble",
|
| 10 |
"gradio-space",
|
| 11 |
"inference",
|
| 12 |
+
"researchmind",
|
| 13 |
]
|
| 14 |
|
| 15 |
[dependency-groups]
|
|
|
|
| 47 |
ensemble = { workspace = true }
|
| 48 |
gradio-space = { workspace = true }
|
| 49 |
inference = { workspace = true }
|
| 50 |
+
researchmind = { workspace = true }
|
| 51 |
slm-evals = { workspace = true }
|
skills/extract-content/SKILL.md
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
name: extract-content
|
| 3 |
+
description: Chunk, embed, and index extracted text into MemRAG
|
| 4 |
+
task: research
|
| 5 |
+
tools:
|
| 6 |
+
- extract_and_index
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
## Workflow
|
| 10 |
+
|
| 11 |
+
1. Receive an `ExtractedDocument` (from web, PDF, or DOCX scrape).
|
| 12 |
+
2. Call `extract_and_index` with optional `session_id`.
|
| 13 |
+
3. Chunks are embedded with sentence-transformers and stored in SQLite.
|
| 14 |
+
4. Duplicate content (same hash) is skipped.
|
| 15 |
+
|
| 16 |
+
See `references/chunking-policy.md` for chunk size and overlap defaults.
|
skills/extract-content/references/chunking-policy.md
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Chunking policy
|
| 2 |
+
|
| 3 |
+
| Setting | Env var | Default |
|
| 4 |
+
|---------|---------|---------|
|
| 5 |
+
| Chunk size (words) | `RESEARCHMIND_CHUNK_SIZE` | 512 |
|
| 6 |
+
| Overlap (words) | `RESEARCHMIND_CHUNK_OVERLAP` | 128 |
|
| 7 |
+
| Embedding model | `RESEARCHMIND_EMBED_MODEL` | `all-MiniLM-L6-v2` |
|
| 8 |
+
|
| 9 |
+
Chunks link via `chunk_next` edges for neighbor expansion at retrieval time.
|
skills/extract-content/scripts/chunk_and_index.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""CLI: ingest a text file or URL into MemRAG."""
|
| 3 |
+
|
| 4 |
+
from __future__ import annotations
|
| 5 |
+
|
| 6 |
+
import argparse
|
| 7 |
+
import sys
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
from researchmind.extract import ExtractedDocument
|
| 11 |
+
from researchmind.ingest import IngestPipeline
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def main() -> int:
    """Ingest one source (a URL or a local file) and report the outcome.

    Command-line options:
        --url      URL to scrape and index.
        --file     Local file to index.
        --session  Optional session id passed to the ingest call.

    ``--url`` and ``--file`` are mutually exclusive; exactly one must be
    given. Usage errors exit through ``argparse`` (status 2) before the
    ingest pipeline is constructed.

    Returns:
        0 after the document has been ingested (or recognised as a duplicate).
    """
    parser = argparse.ArgumentParser(description="Chunk and index content")
    # Passing both sources used to silently ignore --file; reject it instead.
    source = parser.add_mutually_exclusive_group()
    source.add_argument("--url", help="Scrape and index URL")
    source.add_argument("--file", type=Path, help="Index local file")
    parser.add_argument("--session", help="Session id to tag document")
    args = parser.parse_args()

    # Validate before building the pipeline so a usage error has no side effects.
    if not (args.url or args.file):
        parser.error("Provide --url or --file")

    pipeline = IngestPipeline()
    if args.url:
        doc_id, is_new = pipeline.ingest_url(args.url, session_id=args.session)
    else:
        doc_id, is_new = pipeline.ingest_path(args.file, session_id=args.session)

    status = "indexed" if is_new else "deduplicated"
    print(f"Document {doc_id} ({status}), chunks in store: {pipeline.store.count_chunks()}")
    return 0
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
if __name__ == "__main__":
|
| 35 |
+
sys.exit(main())
|
skills/research-mind/SKILL.md
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
name: research-mind
|
| 3 |
+
description: Local research agent — scrape, index, and answer with citations
|
| 4 |
+
task: research
|
| 5 |
+
tools:
|
| 6 |
+
- suggest_urls
|
| 7 |
+
- scrape_web
|
| 8 |
+
- scrape_pdf
|
| 9 |
+
- extract_and_index
|
| 10 |
+
- research_answer
|
| 11 |
+
flags:
|
| 12 |
+
auto_search: false
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
+
## Workflow
|
| 16 |
+
|
| 17 |
+
### Ingest
|
| 18 |
+
|
| 19 |
+
1. **Topic only (default):** run `search_urls` (Google + verification) → user confirms URLs → scrape → `extract_and_index`.
|
| 20 |
+
2. **Auto search:** when `auto_search` is true, same search pipeline ingests top verified URLs without confirmation.
|
| 21 |
+
3. **Direct URL / file:** scrape and index immediately.
|
| 22 |
+
|
| 23 |
+
### Q&A (offline after ingest)
|
| 24 |
+
|
| 25 |
+
1. Call `research_answer` with the user question and `session_id`.
|
| 26 |
+
2. Retrieve top-k chunks from MemRAG, expand neighbors.
|
| 27 |
+
3. Answer using the local model with inline `[n]` citations.
|
| 28 |
+
4. Append references from `references/citation-format.md`.
|
| 29 |
+
|
| 30 |
+
See `references/ingest-modes.md` for mode details.
|
skills/research-mind/references/citation-format.md
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Citation format
|
| 2 |
+
|
| 3 |
+
- Context uses **one number per source document**: `[1]`, `[2]`, …
|
| 4 |
+
- Cite inline sparingly (typically 1–3 markers per answer), not after every phrase.
|
| 5 |
+
- Bracket numbers inside scraped paper text are not citation indices — ignore them.
|
| 6 |
+
- Do not output long runs of `[1][2][3]…` or duplicate **References** lists.
|
skills/research-mind/references/ingest-modes.md
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Ingest modes
|
| 2 |
+
|
| 3 |
+
| Mode | `auto_search` | Behavior |
|
| 4 |
+
|------|---------------|----------|
|
| 5 |
+
| Suggest URLs (confirm) | `false` | Google search + URL verification; user checks boxes before ingest |
|
| 6 |
+
| Auto search & ingest | `true` | Same search pipeline; ingests verified URLs without confirmation |
|
| 7 |
+
| Direct URL / file | n/a | Skip discovery; ingest provided sources |
|
| 8 |
+
|
| 9 |
+
Global default: `RESEARCHMIND_AUTO_SEARCH=false`. Gradio dropdown and skill `flags.auto_search` override per run.
|
skills/research-mind/scripts/ask.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""CLI stub: Q&A requires a loaded inference backend (use Gradio/agent)."""
|
| 3 |
+
|
| 4 |
+
from __future__ import annotations
|
| 5 |
+
|
| 6 |
+
import argparse
|
| 7 |
+
import sys
|
| 8 |
+
|
| 9 |
+
from researchmind.config import get_config
|
| 10 |
+
from researchmind.ingest import IngestPipeline
|
| 11 |
+
from researchmind.retrieve import retrieve
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def main() -> int:
    """Print a preview of the chunks retrieved for a question.

    Command-line arguments:
        question  Question to retrieve context for (positional, required).
        --top-k   Optional integer forwarded to ``retrieve`` as ``top_k``.

    Returns:
        1 when ``retrieve`` yields no chunks, otherwise 0.
    """
    parser = argparse.ArgumentParser(description="Preview retrieval for a question")
    parser.add_argument("question", help="Question to retrieve context for")
    parser.add_argument("--top-k", type=int, default=None)
    args = parser.parse_args()

    cfg = get_config()
    store = IngestPipeline().store
    chunks = retrieve(args.question, store, config=cfg, top_k=args.top_k)
    if not chunks:
        print("No chunks in store. Ingest sources first.")
        return 1

    preview_chars = 500
    for i, c in enumerate(chunks, 1):
        # Only show the ellipsis when the text was actually cut short.
        suffix = "..." if len(c.text) > preview_chars else ""
        print(f"\n--- [{i}] {c.doc_title} ---\n{c.text[:preview_chars]}{suffix}")
    print("\nUse AgentRunner.run_researchmind_chat() for a full cited answer.")
    return 0
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
if __name__ == "__main__":
|
| 33 |
+
sys.exit(main())
|