lfoppiano committed
Commit 916dea4 · verified · 1 Parent(s): 290feff

Upload folder using huggingface_hub
.devcontainer/devcontainer.json ADDED
@@ -0,0 +1,33 @@
{
  "name": "Python 3",
  // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
  "image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye",
  "customizations": {
    "codespaces": {
      "openFiles": [
        "README.md",
        "streamlit_app.py"
      ]
    },
    "vscode": {
      "settings": {},
      "extensions": [
        "ms-python.python",
        "ms-python.vscode-pylance"
      ]
    }
  },
  "updateContentCommand": "[ -f packages.txt ] && sudo apt update && sudo apt upgrade -y && sudo xargs apt install -y <packages.txt; [ -f requirements.txt ] && pip3 install --user -r requirements.txt; pip3 install --user streamlit; echo '✅ Packages installed and Requirements met'",
  "postAttachCommand": {
    "server": "streamlit run streamlit_app.py --server.enableCORS false --server.enableXsrfProtection false"
  },
  "portsAttributes": {
    "8501": {
      "label": "Application",
      "onAutoForward": "openPreview"
    }
  },
  "forwardPorts": [
    8501
  ]
}
.env.example ADDED
@@ -0,0 +1,18 @@
PHI_URL=....
QWEN_URL=...

EMBEDS_URL=...
DEFAULT_MODEL=microsoft/Phi-4-mini-instruct
DEFAULT_EMBEDDING=intfloat/multilingual-e5-large-instruct-modal

API_KEY=...
EMBEDS_API_KEY=...

GROBID_URL=...
GROBID_QUANTITIES_URL=...


QWEN_URL=...
GROBID_MATERIALS_URL=...
API_KEY=...
EMBEDS_API_KEY=...
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ docs/images/screenshot1.png filter=lfs diff=lfs merge=lfs -text
+ docs/images/screenshot2.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,10 @@
.idea
.env
.env.docker
**/**/.chroma
resources/db
build
dist
__pycache__
document_qa/__pycache__
document_qa_engine.egg-info/
.streamlit/config.toml ADDED
@@ -0,0 +1,8 @@
[logger]
level = "info"

[browser]
gatherUsageStats = true

[ui]
hideTopBar = true
CHANGELOG.md ADDED
@@ -0,0 +1,151 @@
# Changelog

All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [0.4.2] - 2024-08-23

### Fixed
+ Removed an invalid promptlayer dependency that had slipped into the build

## [0.4.1] - 2024-08-23

### Added
+ Scroll to the first relevant context passage; if the most relevant passage is at the end, the viewer scrolls to the end of the document
+ Added Mistral NEMO as the default model

### Changed
+ Rearranged the interface to gain more space
+ Updated libraries to their latest versions

### Fixed
+ Fixed the buggy chat message sequence
+ Updated the PDF viewer to the latest version

## [0.4.0] - 2024-06-24

### Added
+ Added selection of embedding functions
+ Added text selection from the PDF viewer (provided by https://github.com/lfoppiano/streamlit-pdf-viewer)
+ Added an experimental feature for calculating a coefficient that relates the question to the embedding database
+ Added the data availability statement to the searchable text

### Changed
+ Removed the obsolete and non-working zephyr and mistral v0.1 models
+ Refactored the underlying library to make it easier to maintain
+ Removed the native PDF viewer
+ Updated langchain and streamlit to the latest versions
+ Removed conversational memory, which was causing more problems than it brought benefits
+ Rearranged the interface to gain more space

### Fixed
+ Updated or removed models that were not working
+ Fixed problems with langchain and other libraries

## [0.3.4] - 2023-12-26

### Added

+ Added gpt4 and gpt4-turbo

### Changed

+ Improved UI: replaced combo boxes with a dropdown box

### Fixed

+ Fixed dependencies when installing as a library

## [0.3.3] - 2023-12-14

### Added

+ Added experimental PDF rendering in the page

### Fixed

+ Fixed the GrobidProcessors API implementation

## [0.3.2] - 2023-12-01

### Fixed

+ Removed memory when using Zephyr-7b-beta, which hallucinates easily

## [0.3.1] - 2023-11-22

### Added

+ Include biblio in embeddings by @lfoppiano in #21

### Fixed

+ Fix conversational memory by @lfoppiano in #20

## [0.3.0] - 2023-11-18

### Added

+ Add zephyr-7b by @lfoppiano in #15
+ Add conversational memory in #18

## [0.2.1] - 2023-11-01

### Fixed

+ Fix env variables by @lfoppiano in #9

## [0.2.0] - 2023-10-31

### Added

+ Selection of the chunk size on which embeddings are created
+ Mistral model, usable freely via the Hugging Face free API

### Changed

+ Improved documentation, adding a privacy statement
+ Moved settings to the sidebar
+ Disabled NER extraction by default, allowing users to activate it
+ Read the API key from the environment variables and, if present, avoid asking the user for it
+ Avoid changing the model after an update

## [0.1.3] - 2023-10-30

### Fixed

+ ChromaDb accumulating information even when new papers were uploaded

## [0.1.2] - 2023-10-26

### Fixed

+ Docker build

## [0.1.1] - 2023-10-26

### Fixed

+ GitHub Actions build
+ Dependencies of langchain and chromadb

## [0.1.0] - 2023-10-26

### Added

+ PyPI package
+ Docker package release

## [0.0.1] - 2023-10-26

### Added

+ Kick-off application
+ Support for GPT-3.5
+ Support for Mistral + SentenceTransformer
+ Streamlit application
+ Docker image
+ PyPI package

<!-- markdownlint-disable-file MD024 MD033 -->
Dockerfile CHANGED
@@ -1,11 +1,30 @@
- FROM lfoppiano/document-insights-qa:latest-develop
- USER root
+ FROM python:3.12-slim
+
  WORKDIR /app
- RUN mkdir -m 777 -p /app/.cache
- RUN mkdir -m 777 -p /.cache
-
- COPY --chown=lfoppiano config.toml .streamlit/config.toml
+
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     curl \
+     git \
+     && rm -rf /var/lib/apt/lists/*
+
+ COPY requirements.txt .
+
+ RUN pip3 install -r requirements.txt
+
+ COPY .streamlit ./.streamlit
+ COPY document_qa ./document_qa
+ COPY streamlit_app.py .
+
+ # extract version
+ COPY .git ./.git
+ RUN git rev-parse --short HEAD > revision.txt
+ RUN rm -rf ./.git
+
+ EXPOSE 8501
+
+ HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health

  ENV PYTHONPATH "${PYTHONPATH}:."

- ENTRYPOINT ["streamlit", "run", "streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
+ ENTRYPOINT ["streamlit", "run", "streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
LICENSE ADDED
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
README.md CHANGED
@@ -1,12 +1,116 @@
  ---
- title: Scientific Document Insights Q/A - Develop
- emoji: 📊
- colorFrom: blue
- colorTo: yellow
- sdk: docker
+ title: Scientific Document Insights Q/A
+ emoji: 📝
+ colorFrom: yellow
+ colorTo: pink
+ sdk: streamlit
+ sdk_version: 1.37.1
+ app_file: streamlit_app.py
  pinned: false
  license: apache-2.0
  app_port: 8501
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # DocumentIQA: Scientific Document Insights Q/A
+
+ **Work in progress** :construction_worker:
+
+ <img src="https://github.com/lfoppiano/document-qa/assets/15426/f0a04a86-96b3-406e-8303-904b93f00015" width=300 align="right" />
+
+ https://lfoppiano-document-qa.hf.space/
+
+ **NOTE**: The LLM API is kindly provided by [Modal.com](https://www.modal.com), which offers $30/month of computing credits. When these run out, the app will stop answering. 😅
+
+ ## Introduction
+
+ Question/answering on scientific documents using LLMs. The tool can be customized to use different types of LLM APIs.
+ The Streamlit application demonstrates a RAG (Retrieval-Augmented Generation) implementation on scientific documents.
+ Unlike most similar projects, we focus on scientific articles and extract text from structured documents.
+ We target only the full text, using [Grobid](https://github.com/kermitt2/grobid), which provides cleaner results than raw PDF-to-text conversion (the approach most other solutions rely on).
+
+ Additionally, this frontend visualises named entities in LLM responses, extracting <span style="color:yellow">physical quantities and measurements</span> (with [grobid-quantities](https://github.com/kermitt2/grobid-quantities)) and <span style="color:blue">materials</span> mentions (with [grobid-superconductors](https://github.com/lfoppiano/grobid-superconductors)).
+
+ (The image on the right was generated with https://huggingface.co/spaces/stabilityai/stable-diffusion)
+
+ [<img src="https://img.youtube.com/vi/M4UaYs5WKGs/hqdefault.jpg" height="300" align="right" />](https://www.youtube.com/embed/M4UaYs5WKGs)
+
+ ## Getting started
+
+ - Upload a scientific article as a PDF document. You will see a spinner or loading indicator while processing is in progress.
+ - Once the spinner disappears, you can proceed to ask your questions.
+
+ ![screenshot2.png](docs%2Fimages%2Fscreenshot2.png)
+
+ ## Documentation
+
+ **For full technical documentation** of the `document-qa-engine` library, see **[`docs/README.md`](docs/README.md)**.
+
+ ### Embedding selection
+ The latest version allows selecting both embedding functions and LLMs. There are some limitations: OpenAI embeddings cannot be used with open-source models, and vice versa.
+
+ ### Context size
+ Allows changing the number of blocks from the original document that are considered when responding.
+ The default size of each block is 250 tokens (which can be changed before uploading the first document).
+ With default settings, each question uses around 1000 tokens.
+
+ **NOTE**: if the chat answers something like "the information is not provided in the given context", **changing the context size will likely help**.
+
+ ### Chunk size
+ When uploaded, each document is split into blocks of a fixed size (250 tokens by default).
+ This setting allows users to modify the size of such blocks.
+ Smaller blocks yield a smaller, more precise context drawn from the document.
+ Larger blocks yield a larger context that is less tightly focused on the question.
+
+ ### Query mode
+ Indicates whether the question is sent to the LLM or to the vector storage.
+ - **LLM** (default) enables question/answering on the document content.
+ - **Embeddings**: the response consists of the raw text from the document related to the question (based on the embeddings). This mode helps diagnose why answers are sometimes unsatisfying or incomplete.
+ - **Question coefficient** (experimental): provides a coefficient indicating how close the question is to the retrieved context.
+
+ ### NER (Named Entity Recognition)
+ This feature is specifically crafted for people working with scientific documents in materials science.
+ It runs NER on the LLM response to identify materials mentions and properties (quantities, measurements).
+ This feature leverages the external [grobid-quantities](https://github.com/kermitt2/grobid-quantities) and [grobid-superconductors](https://github.com/lfoppiano/grobid-superconductors) services.
+
+ ### Troubleshooting
+ Error: `streamlit: Your system has an unsupported version of sqlite3. Chroma requires sqlite3 >= 3.35.0`.
+ Here is the [solution on Linux](https://stackoverflow.com/questions/76958817/streamlit-your-system-has-an-unsupported-version-of-sqlite3-chroma-requires-sq).
+ For more information, see the [details](https://docs.trychroma.com/troubleshooting#sqlite) on the Chroma website.
+
+ ## Disclaimer on Data, Security, and Privacy ⚠️
+
+ Please read carefully:
+
+ - Avoid uploading sensitive data. We temporarily store text from the uploaded PDF documents only for processing your request, and we disclaim any responsibility for subsequent use or handling of the submitted data by third-party LLMs.
+ - Mistral and Zephyr are free to use and do not require an API key, but since we leverage the free API entry point, there is no guarantee that all requests will go through. Use at your own risk.
+ - We do not assume responsibility for how the data is utilized by the LLM API endpoints.
+
+ ## Development notes
+
+ To release a new version:
+
+ - `bump-my-version bump patch`
+ - `git push --tags`
+
+ To use Docker:
+
+ - `docker run lfoppiano/document-insights-qa:{latest_version}`
+ - `docker run lfoppiano/document-insights-qa:latest-develop` for the latest development version
+
+ To install the library from PyPI:
+
+ - `pip install document-qa-engine`
+
+ ## Acknowledgement
+
+ The project was initiated at the [National Institute for Materials Science](https://www.nims.go.jp) (NIMS) in Japan.
+ Currently, development is possible thanks to [ScienciLAB](https://www.sciencialab.com).
+ The project received contributions from [Guillaume Lambard](https://github.com/GLambard) and the [Lambard-ML-Team](https://github.com/Lambard-ML-Team), [Pedro Ortiz Suarez](https://github.com/pjox), and [Tomoya Mato](https://github.com/t29mato).
+ Thanks also to [Patrice Lopez](https://www.science-miner.com), the author of [Grobid](https://github.com/kermitt2/grobid).
docs/README.md ADDED
@@ -0,0 +1,249 @@
# 📝 document-qa-engine documentation

> **License**: Apache 2.0 · **PyPI**: `pip install document-qa-engine`

A Python library and Streamlit application for **Question/Answering on scientific PDF documents** using Retrieval-Augmented Generation (RAG). It uses [GROBID](https://github.com/kermitt2/grobid) for structured text extraction, [ChromaDB](https://www.trychroma.com/) for vector storage, and any OpenAI-compatible LLM for answering.

## Overview

Most PDF Q/A tools feed raw extracted text to an LLM, which is noisy and loses document structure. **document-qa-engine** takes a different approach:

1. **Structured extraction**: sends the PDF to a GROBID server, which returns TEI-XML with separate sections (title, abstract, body paragraphs, figures, back matter) and precise bounding-box coordinates for every paragraph.
2. **Smart chunking**: paragraphs can be kept as-is or merged into larger chunks using token-aware merging, while preserving coordinate metadata (see the sketch after this list).
3. **Vector embeddings**: each chunk is embedded (via a remote API or local model) and stored in an in-memory ChromaDB collection.
4. **Retrieval + LLM answering**: user questions are embedded, the most similar chunks are retrieved, and an LLM generates an answer from that context.
5. **PDF highlighting**: the Streamlit frontend highlights the exact PDF regions the LLM used, with a color gradient (orange = most relevant, blue = least relevant).
6. **NER post-processing** *(optional)*: LLM responses are scanned for physical quantities (via grobid-quantities) and materials mentions (via grobid-superconductors), then annotated inline.
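For orientation, a merged chunk has the shape sketched below. The field names come from `TextMerger.merge_passages` in `document_qa/document_qa_engine.py`; the values are illustrative only.

```python
# Shape of one merged chunk as produced by TextMerger.merge_passages.
# Field names are taken from the library; the values are made up.
chunk = {
    "text": "We synthesized MgB2 thin films ...",  # concatenated paragraph texts
    "coordinates": "1,54.0,337.1,240.3,9.5;1,54.0,349.2,238.7,9.5",  # GROBID boxes, ";"-joined
    "type": "aggregated chunks",
    "section": "mixed",
    "subSection": "mixed",
}
```
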
## Installation

### Option 1: PyPI (library only)

```bash
pip install document-qa-engine
```

### Option 2: From source (full app)

```bash
git clone https://github.com/lfoppiano/document-qa.git
cd document-qa
pip install -r requirements.txt
```

### Option 3: Docker

```bash
# Latest stable release
docker run -p 8501:8501 lfoppiano/document-insights-qa:latest

# Latest development build
docker run -p 8501:8501 lfoppiano/document-insights-qa:latest-develop
```

### Prerequisites

You need access to:

| Service | Required? | Purpose |
|---------|-----------|---------|
| **GROBID server** | ✅ Yes | Parses PDFs into structured text |
| **Embedding API** | ✅ Yes | Converts text to vectors |
| **LLM API** (OpenAI-compatible) | ✅ Yes | Answers questions |
| **grobid-quantities** | ❌ Optional | NER for measurements |
| **grobid-superconductors** | ❌ Optional | NER for materials |

## Configuration

All configuration is through environment variables. Create a `.env` file in the project root:

```env
# ── LLM Endpoints ────────────────────────────────────────
# Each key in API_MODELS maps a model name to its base URL.
PHI_URL=http://localhost:1234/v1              # Phi-4-mini-instruct endpoint
QWEN_URL=http://localhost:1234/v1             # Qwen3-0.6B endpoint
API_KEY=your-llm-api-key                      # Auth key for LLM APIs

# ── Embedding Endpoint ───────────────────────────────────
EMBEDS_URL=http://127.0.0.1:1234/v1           # Embedding service URL
EMBEDS_API_KEY=your-embedding-api-key         # Auth key for embedding API

# ── Defaults ─────────────────────────────────────────────
DEFAULT_MODEL=microsoft/Phi-4-mini-instruct
DEFAULT_EMBEDDING=intfloat/multilingual-e5-large-instruct-modal

# ── GROBID Services ──────────────────────────────────────
GROBID_URL=https://your-grobid-url
GROBID_QUANTITIES_URL=https://your-grobid-quantities-url/
GROBID_MATERIALS_URL=https://your-grobid-superconductors-url/
```

### Variable Reference

| Variable | Description |
|----------|-------------|
| `PHI_URL` | Base URL for the Phi-4-mini-instruct vLLM server (OpenAI-compatible) |
| `QWEN_URL` | Base URL for the Qwen3-0.6B vLLM server (OpenAI-compatible) |
| `API_KEY` | Bearer token for authenticating with the LLM endpoints |
| `EMBEDS_URL` | Base URL for the embedding service (must expose an `/embeddings` endpoint; see the probe sketch below) |
| `EMBEDS_API_KEY` | Bearer token for authenticating with the embedding service |
| `DEFAULT_MODEL` | Model name pre-selected in the UI dropdown |
| `DEFAULT_EMBEDDING` | Embedding name pre-selected in the UI dropdown |
| `GROBID_URL` | Full URL to a running GROBID server |
| `GROBID_QUANTITIES_URL` | URL to a grobid-quantities server (for measurement NER) |
| `GROBID_MATERIALS_URL` | URL to a grobid-superconductors server (for materials NER) |
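To sanity-check the embedding endpoint, the sketch below mirrors the request shape the bundled `ModalEmbeddings` client sends (an OpenAI-compatible `POST {EMBEDS_URL}/embeddings`); the URL and key are placeholders for your own values.

```python
# Minimal sketch: probe the embedding service the way the client does.
import requests

EMBEDS_URL = "http://127.0.0.1:1234/v1"    # placeholder: use your EMBEDS_URL
EMBEDS_API_KEY = "your-embedding-api-key"  # placeholder: use your EMBEDS_API_KEY

resp = requests.post(
    f"{EMBEDS_URL}/embeddings",
    json={"model": "intfloat/multilingual-e5-large-instruct", "input": ["hello world"]},
    headers={"Authorization": f"Bearer {EMBEDS_API_KEY}"},
)
resp.raise_for_status()
print(len(resp.json()["data"][0]["embedding"]))  # embedding dimensionality
```
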
---

## Quick Start — Streamlit App

```bash
# 1. Set up environment
cp .env.example .env  # Edit with your endpoints

# 2. Run the app
streamlit run streamlit_app.py
```

Then open `http://localhost:8501`, upload a PDF, and ask questions.

---

## Quick Start — As a Python Library

```python
from langchain_openai import ChatOpenAI
from document_qa.custom_embeddings import ModalEmbeddings
from document_qa.document_qa_engine import DocumentQAEngine, DataStorage

# 1. Set up the LLM
llm = ChatOpenAI(
    model="microsoft/Phi-4-mini-instruct",
    temperature=0.0,
    base_url="http://localhost:1234/v1",
    api_key="your-api-key"
)

# 2. Set up embeddings
embeddings = ModalEmbeddings(
    url="http://localhost:1234/v1",
    model_name="intfloat/multilingual-e5-large-instruct",
    api_key="your-embedding-key"
)

# 3. Create the storage and engine
storage = DataStorage(embeddings)
engine = DocumentQAEngine(
    llm=llm,
    data_storage=storage,
    grobid_url="https://lfoppiano-grobid.hf.space/"
)

# 4. Load a PDF (creates in-memory embeddings)
doc_id = engine.create_memory_embeddings(
    pdf_path="path/to/paper.pdf",
    chunk_size=500  # tokens per chunk (-1 = keep paragraphs)
)

# 5. Ask a question
_, answer, coordinates = engine.query_document(
    query="What is the main contribution of this paper?",
    doc_id=doc_id,
    context_size=10  # number of chunks to use as context
)
print(answer)

# 6. Or just retrieve relevant passages (no LLM)
passages, coordinates = engine.query_storage(
    query="What materials were studied?",
    doc_id=doc_id,
    context_size=5
)
for p in passages:
    print(p)
```

## Streamlit App Features

### Query Modes

| Mode | What It Does | When to Use |
|------|-------------|-------------|
| **LLM Q/A** | Retrieves context → sends to LLM → returns a natural language answer | Default — for asking questions |
| **Embeddings** | Returns the raw text passages most similar to your question | Debugging — to see what context the LLM would receive |
| **Question Coefficient** | Computes `min_similarity - mean_similarity` as a quality estimate | Experimental — to predict answer reliability (sketched below) |
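As a concrete reading of that formula, here is a minimal sketch. It assumes `similarities` holds the similarity scores of the chunks retrieved for a question; the function name is ours, not the library's.

```python
# Hypothetical sketch of the experimental question coefficient:
# min(similarities) - mean(similarities). A more negative value suggests
# the retrieved chunks vary widely in how well they match the question.
def question_coefficient(similarities: list[float]) -> float:
    return min(similarities) - sum(similarities) / len(similarities)

print(question_coefficient([0.91, 0.80, 0.42]))  # ≈ -0.29 with these illustrative scores
```
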
### Settings

| Setting | Default | Description |
|---------|---------|-------------|
| Chunk size | `-1` (paragraphs) | Token count per text chunk. `-1` keeps GROBID paragraphs intact (see the token-count sketch below). |
| Context size | `10` (paragraphs) / `4` (chunks) | Number of chunks sent to the LLM as context |
| Scroll to context | Off | Auto-scroll the PDF viewer to the most relevant passage |
| NER processing | Off | Run grobid-quantities + grobid-superconductors on LLM responses |
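To get a feel for what a given chunk size means, token counts can be checked with tiktoken, which is what the library's `TextMerger` uses internally (the `gpt2` encoding is its default; the sample sentence is ours).

```python
# Minimal sketch: count tokens the way the chunk merger does.
import tiktoken

enc = tiktoken.get_encoding("gpt2")  # TextMerger's default encoding
paragraph = "We measured the critical temperature of MgB2 thin films under strain."
print(len(enc.encode(paragraph)))  # tokens this paragraph contributes to a chunk
```
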
### PDF Annotations

After each query, the PDF viewer highlights the passages used as context:
- **Orange** (warm) = most relevant passage
- **Blue** (cold) = least relevant passage
- **Dotted border** = the single most relevant passage

## Troubleshooting

### SQLite version error

```
streamlit: Your system has an unsupported version of sqlite3.
Chroma requires sqlite3 >= 3.35.0.
```

**Linux fix**: See [this StackOverflow answer](https://stackoverflow.com/questions/76958817/streamlit-your-system-has-an-unsupported-version-of-sqlite3-chroma-requires-sq).
**More info**: [Chroma troubleshooting docs](https://docs.trychroma.com/troubleshooting#sqlite).
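The workaround described in that answer boils down to installing `pysqlite3-binary` (`pip install pysqlite3-binary`) and aliasing it before anything imports Chroma; a sketch of the commonly used shim:

```python
# Put this at the very top of streamlit_app.py, before chromadb is imported.
# It swaps the stdlib sqlite3 module for the newer pysqlite3-binary build.
__import__("pysqlite3")
import sys

sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
```
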
### "The information is not provided in the given context"

The LLM couldn't find the answer in the retrieved passages. Try:
1. **Increase context size** — use the sidebar slider to retrieve more passages
2. **Decrease chunk size** — smaller chunks may match more precisely
3. **Use Embeddings mode** — switch to "Embeddings" query mode to see which passages are being retrieved and verify they contain the answer

### MissingSchema error on embeddings

```
requests.exceptions.MissingSchema: Invalid URL
```

Ensure `EMBEDS_URL` in your `.env` starts with `https://` or `http://`. Example:
```env
EMBEDS_URL=https://your-modal-endpoint.modal.run/v1
```

### GROBID connection errors

Make sure your GROBID server is running and accessible:
```bash
curl https://grobid.hf.space/api/isalive
```

If using a local GROBID instance:
```bash
docker run --rm -p 8070:8070 lfoppiano/grobid:0.8.0
# Then set GROBID_URL=http://localhost:8070
```

### Embedding API returning empty results

- Verify the API is running: `curl {EMBEDS_URL}/embeddings`
- Check that `EMBEDS_API_KEY` matches the server's expected key
- Ensure the URL does **not** have a trailing `/embeddings` (the client appends it automatically)

---
docs/images/screenshot1.png ADDED

Git LFS Details

  • SHA256: cf082a5479180a7699c1799775e9f24b92cb2c43fbaaa2c3c83d4f85e26a3565
  • Pointer size: 131 Bytes
  • Size of remote file: 275 kB
docs/images/screenshot2.png ADDED

Git LFS Details

  • SHA256: 1b624732c58ce0d5f1a7ef67cd4893f70fc2d9a7dcdec44b2dbcb76a245e89f6
  • Pointer size: 131 Bytes
  • Size of remote file: 155 kB
document_qa/custom_embeddings.py ADDED
@@ -0,0 +1,107 @@
"""Custom LangChain-compatible embedding client.

Provides :class:`ModalEmbeddings`, a drop-in ``Embeddings`` implementation
that calls any service exposing an ``/embeddings`` endpoint (OpenAI,
vLLM, Modal, LM Studio, etc.).
"""

from typing import List

import requests
from langchain_core.embeddings import Embeddings


class ModalEmbeddings(Embeddings):
    """LangChain ``Embeddings`` backed by an OpenAI-compatible HTTP API.

    The service must expose a ``POST /embeddings`` endpoint that accepts
    ``{"model": "…", "input": ["…"]}`` and returns the standard OpenAI
    response shape.

    Args:
        url: Base URL of the embedding service (e.g. ``"http://localhost:1234/v1"``).
        model_name: Model identifier (e.g. ``"intfloat/multilingual-e5-large-instruct"``).
        api_key: Optional bearer token for authenticated endpoints.
    """

    def __init__(self, url: str, model_name: str, api_key: str = None):
        self.url = url
        self.model_name = model_name
        self.api_key = api_key

    def embed(self, text: List[str]) -> List[List[float]]:
        """Embed a list of texts via the configured API.

        Newlines are replaced with spaces before sending, since most
        embedding models treat them as noise.

        Args:
            text: Strings to embed.

        Returns:
            list[list[float]]: One embedding vector per input string.

        Raises:
            requests.HTTPError: If the API returns a non-2xx status.
        """
        # Newlines degrade embedding quality for most models
        cleaned_text = [t.replace("\n", " ") for t in text]

        headers = {
            "Content-Type": "application/json"
        }

        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        response = requests.post(
            f"{self.url}/embeddings",
            json={
                "model": self.model_name,
                "input": cleaned_text
            },
            headers=headers
        )

        response.raise_for_status()

        data = response.json()["data"]
        return [item["embedding"] for item in data]

    def embed_documents(self, text: List[str]) -> List[List[float]]:
        """Embed multiple documents (LangChain interface).

        Args:
            text: Document strings to embed.

        Returns:
            list[list[float]]: One embedding vector per document.
        """
        return self.embed(text)

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string (LangChain interface).

        Args:
            text: The query string.

        Returns:
            list[float]: The embedding vector for *text*.
        """
        return self.embed([text])[0]

    def get_model_name(self) -> str:
        """Return the model identifier used for embedding requests."""
        return self.model_name


if __name__ == "__main__":
    embeds = ModalEmbeddings(
        url="https://lfoppiano--intfloat-multilingual-e5-large-instruct-embed-5da184.modal.run/",
        model_name="intfloat/multilingual-e5-large-instruct"
    )

    print(embeds.embed(
        ["We are surrounded by stupid kids",
         "We are interested in the future of AI"]
    ))
document_qa/deployment/modal_embeddings.py ADDED
@@ -0,0 +1,117 @@
import os
from typing import Annotated, List

from fastapi import Request, HTTPException, Form

import modal
import torch
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel

image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install(
        "transformers",
        "huggingface_hub[hf_transfer]==0.26.2",
        "flashinfer-python==0.2.0.post2",  # pinning, very unstable
        "fastapi[standard]",
        extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # faster model transfers
)

MODELS_DIR = "/llamas"
MODEL_NAME = "intfloat/multilingual-e5-large-instruct"
MODEL_REVISION = "84344a23ee1820ac951bc365f1e91d094a911763"

hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)

app = modal.App("intfloat-multilingual-e5-large-instruct-embeddings")


def get_device():
    return torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def load_model():
    print("Loading model...")
    device = get_device()
    print(f"Using device: {device}")

    tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large-instruct')
    model = AutoModel.from_pretrained('intfloat/multilingual-e5-large-instruct').to(device)
    print("Model loaded successfully.")

    return tokenizer, model, device


N_GPU = 1
MINUTES = 60  # seconds
VLLM_PORT = 8000


def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]


@app.function(
    image=image,
    gpu=f"L40S:{N_GPU}",
    # gpu=f"A10G:{N_GPU}",
    # how long should we stay up with no requests?
    scaledown_window=3 * MINUTES,
    volumes={
        "/root/.cache/huggingface": hf_cache_vol,
        "/root/.cache/vllm": vllm_cache_vol,
    },
    secrets=[modal.Secret.from_name("document-qa-embedding-key")]
)
@modal.concurrent(
    max_inputs=5
)  # how many requests can one replica handle? tune carefully!
@modal.fastapi_endpoint(method="POST")
def embed(request: Request, text: Annotated[str, Form()]):
    api_key = request.headers.get("x-api-key")
    expected_key = os.environ["API_KEY"]

    if api_key != expected_key:
        raise HTTPException(status_code=401, detail="Unauthorized")

    texts = [t for t in text.split("\n") if t.strip()]
    if not texts:
        return []

    tokenizer, model, device = load_model()
    model.eval()

    print(f"Start embedding {len(texts)} texts")
    try:
        with torch.no_grad():
            # Move inputs to the same device as model
            batch_dict = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
            batch_dict = {k: v.to(device) for k, v in batch_dict.items()}

            # Forward pass
            outputs = model(**batch_dict)

            # Process embeddings
            embeddings = average_pool(
                outputs.last_hidden_state,
                batch_dict['attention_mask']
            )
            embeddings = F.normalize(embeddings, p=2, dim=1)

            # Move to CPU and convert to list for serialization
            embeddings = embeddings.cpu().numpy().tolist()

        print("Finished embedding texts.")
        return embeddings

    except RuntimeError as e:
        print(f"Error during embedding: {str(e)}")
        if "CUDA out of memory" in str(e):
            print("CUDA out of memory error. Try reducing batch size or using a smaller model.")
        raise
document_qa/deployment/modal_inference_phi.py ADDED
@@ -0,0 +1,76 @@
import os

import modal

vllm_image = (
    modal.Image.debian_slim(python_version="3.10")
    .pip_install(
        "vllm",
        "huggingface_hub[hf_transfer]==0.26.2",
        "flashinfer-python==0.2.0.post2",  # pinning, very unstable
        extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # faster model transfers
)

MODELS_DIR = "/llamas"
MODEL_NAME = "microsoft/Phi-4-mini-instruct"
MODEL_REVISION = "c0fb9e74abda11b496b7907a9c6c9009a7a0488f"

FAST_BOOT = True

hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)


app = modal.App("phi-4-mini-instruct-qa-vllm")

N_GPU = 1
MINUTES = 60  # seconds
VLLM_PORT = 8000


@app.function(
    image=vllm_image,
    # gpu=f"L40S:{N_GPU}",
    gpu=f"A10G:{N_GPU}",
    # how long should we stay up with no requests?
    scaledown_window=5 * MINUTES,
    volumes={
        "/root/.cache/huggingface": hf_cache_vol,
        "/root/.cache/vllm": vllm_cache_vol,
    },
    secrets=[modal.Secret.from_name("document-qa-api-key")]
)
@modal.concurrent(
    max_inputs=5
)  # how many requests can one replica handle? tune carefully!
@modal.web_server(port=VLLM_PORT, startup_timeout=5 * MINUTES)
def serve():
    import subprocess

    cmd = [
        "vllm",
        "serve",
        "--uvicorn-log-level=info",
        MODEL_NAME,
        "--revision",
        MODEL_REVISION,
        "--max-model-len",
        "32768",
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
        "--api-key",
        os.environ["API_KEY"],
    ]

    # enforce-eager disables both Torch compilation and CUDA graph capture
    # default is no-enforce-eager; see the --compilation-config flag for tighter control
    cmd += ["--enforce-eager" if FAST_BOOT else "--no-enforce-eager"]

    # assume multiple GPUs are for splitting up large matrix multiplications
    cmd += ["--tensor-parallel-size", str(N_GPU)]

    subprocess.Popen(" ".join(cmd), shell=True)
document_qa/deployment/modal_inference_qwen.py ADDED
@@ -0,0 +1,71 @@
import os

import modal

vllm_image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install(
        "vllm",
        "transformers>=4.51.0",
        "huggingface_hub[hf_transfer]>=0.26.2",
        "flashinfer-python==0.2.0.post2",  # pinning, very unstable
        extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # faster model transfers
)

MODELS_DIR = "/llamas"
MODEL_NAME = "Qwen/Qwen3-0.6B"
MODEL_REVISION = "e6de91484c29aa9480d55605af694f39b081c455"

hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)


app = modal.App("gwen-0.6b-qa-vllm")

N_GPU = 1
MINUTES = 60  # seconds
VLLM_PORT = 8000


@app.function(
    image=vllm_image,
    # gpu=f"L40S:{N_GPU}",
    gpu=f"A10G:{N_GPU}",
    # how long should we stay up with no requests?
    scaledown_window=5 * MINUTES,
    volumes={
        "/root/.cache/huggingface": hf_cache_vol,
        "/root/.cache/vllm": vllm_cache_vol,
    },
    secrets=[modal.Secret.from_name("document-qa-api-key")]
)
@modal.concurrent(
    max_inputs=5
)  # how many requests can one replica handle? tune carefully!
@modal.web_server(port=VLLM_PORT, startup_timeout=5 * MINUTES)
def serve():
    import subprocess

    cmd = [
        "vllm",
        "serve",
        "--uvicorn-log-level=info",
        MODEL_NAME,
        "--revision",
        MODEL_REVISION,
        "--enable-reasoning",
        "--reasoning-parser",
        "deepseek_r1",
        "--max-model-len",
        "32768",
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
        "--api-key",
        os.environ["API_KEY"],
    ]

    subprocess.Popen(" ".join(cmd), shell=True)
document_qa/document_qa_engine.py ADDED
@@ -0,0 +1,676 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Core Q/A engine for scientific PDF documents.
2
+
3
+ This module provides the main classes for building a Retrieval-Augmented
4
+ Generation (RAG) pipeline over scientific PDFs.
5
+ """
6
+
7
+ import copy
8
+ import os
9
+ from pathlib import Path
10
+ from typing import Union, Any, List
11
+
12
+ import tiktoken
13
+ from langchain.chains import create_extraction_chain
14
+ from langchain.chains.combine_documents import create_stuff_documents_chain
15
+ from langchain.chains.question_answering import stuff_prompt, refine_prompts, map_reduce_prompt, \
16
+ map_rerank_prompt
17
+ from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
18
+ from langchain.retrievers import MultiQueryRetriever
19
+ from langchain.schema import Document
20
+ from langchain_community.vectorstores.chroma import Chroma
21
+ from langchain_core.vectorstores import VectorStore
22
+ from tqdm import tqdm
23
+
24
+ from document_qa.grobid_processors import GrobidProcessor
25
+ from document_qa.langchain import ChromaAdvancedRetrieval
26
+
27
+
28
+ class TextMerger:
29
+ """Token-aware text merger that preserves PDF coordinate metadata.
30
+
31
+ Unlike LangChain's ``RecursiveTextSplitter``, this merger keeps the
32
+ bounding-box coordinates extracted by GROBID so that downstream
33
+ consumers (e.g. the PDF viewer) can highlight the exact regions.
34
+
35
+ Args:
36
+ model_name: A tiktoken model name (e.g. ``"gpt-4"``). When given,
37
+ the tokenizer for that model is used.
38
+ encoding_name: A tiktoken encoding name (default ``"gpt2"``).
39
+ Ignored when *model_name* is provided.
40
+ """
41
+
42
+ def __init__(self, model_name=None, encoding_name="gpt2"):
43
+ if model_name is not None:
44
+ self.enc = tiktoken.encoding_for_model(model_name)
45
+ else:
46
+ self.enc = tiktoken.get_encoding(encoding_name)
47
+
48
+ def encode(self, text, allowed_special=set(), disallowed_special="all"):
49
+ """Tokenize *text* and return a list of token IDs.
50
+
51
+ Thin wrapper around ``tiktoken.Encoding.encode`` that exposes the
52
+ same special-token controls.
53
+
54
+ Args:
55
+ text: The string to tokenize.
56
+ allowed_special: Set of special tokens allowed in *text*.
57
+ disallowed_special: Special-token handling policy.
58
+
59
+ Returns:
60
+ list[int]: Token IDs produced by the configured tokenizer.
61
+ """
62
+ return self.enc.encode(
63
+ text,
64
+ allowed_special=allowed_special,
65
+ disallowed_special=disallowed_special,
66
+ )
67
+
68
+ def merge_passages(self, passages, chunk_size, tolerance=0.2):
69
+ """Merge consecutive passages into chunks of approximately *chunk_size* tokens.
70
+
71
+ Args:
72
+ passages: List of dicts, each with ``"text"`` (str) and
73
+ ``"coordinates"`` (str) keys — as returned by
74
+ method:`GrobidProcessor.process_structure`.
75
+ chunk_size: Target number of tokens per merged chunk.
76
+ tolerance: Fraction of *chunk_size* allowed as overflow
77
+ (default ``0.2``).
78
+
79
+ Returns:
80
+ list[dict]: Merged passages. Each dict has:
81
+
82
+ - ``"text"`` — concatenated paragraph texts.
83
+ - ``"coordinates"`` — semicolon-joined coordinate strings.
84
+ - ``"type"`` — always ``"aggregated chunks"``.
85
+ - ``"section"`` / ``"subSection"`` — always ``"mixed"``.
86
+ """
87
+ new_passages = []
88
+ new_coordinates = []
89
+ current_texts = []
90
+ current_coordinates = []
91
+ for idx, passage in enumerate(passages):
92
+ text = passage['text']
93
+ coordinates = passage['coordinates']
94
+ current_texts.append(text)
95
+ current_coordinates.append(coordinates)
96
+
97
+ accumulated_text = " ".join(current_texts)
98
+
99
+ encoded_accumulated_text = self.encode(accumulated_text)
100
+
101
+ if len(encoded_accumulated_text) > chunk_size + chunk_size * tolerance:
102
+ if len(current_texts) > 1:
103
+ new_passages.append(current_texts[:-1])
104
+ new_coordinates.append(current_coordinates[:-1])
105
+ current_texts = [current_texts[-1]]
106
+ current_coordinates = [current_coordinates[-1]]
107
+ else:
108
+ new_passages.append(current_texts)
109
+ new_coordinates.append(current_coordinates)
110
+ current_texts = []
111
+ current_coordinates = []
112
+
113
+ elif chunk_size <= len(encoded_accumulated_text) < chunk_size + chunk_size * tolerance:
114
+ new_passages.append(current_texts)
115
+ new_coordinates.append(current_coordinates)
116
+ current_texts = []
117
+ current_coordinates = []
118
+
119
+ if len(current_texts) > 0:
120
+ new_passages.append(current_texts)
121
+ new_coordinates.append(current_coordinates)
122
+
123
+ new_passages_struct = []
124
+ for i, passages in enumerate(new_passages):
125
+ text = " ".join(passages)
126
+ coordinates = ";".join(new_coordinates[i])
127
+
128
+ new_passages_struct.append(
129
+ {
130
+ "text": text,
131
+ "coordinates": coordinates,
132
+ "type": "aggregated chunks",
133
+ "section": "mixed",
134
+ "subSection": "mixed"
135
+ }
136
+ )
137
+
138
+ return new_passages_struct
139
+
140
+
141
+ class BaseRetrieval:
142
+ """Abstract base for retrieval backends.
143
+ """
144
+
145
+ def __init__(
146
+ self,
147
+ persist_directory: Path,
148
+ embedding_function
149
+ ):
150
+ self.embedding_function = embedding_function
151
+ self.persist_directory = persist_directory
152
+
153
+
154
+ class NER_Retrival(VectorStore):
155
+ """
156
+ This class implements retrieval based on NER models.
157
+ It is an alternative to embedding-based retrieval that relies on extracted entities.
158
+ """
159
+ pass
160
+
161
+
162
+ engines = {
163
+ 'chroma': ChromaAdvancedRetrieval,
164
+ 'ner': NER_Retrival
165
+ }
166
+
167
+
168
+ class DataStorage:
169
+ """Manages per-document vector-store collections.
170
+
171
+ Each uploaded PDF gets its own ChromaDB collection,
172
+ keyed by a document ID (typically an MD5 hash). Collections can live
173
+ in memory or be persisted to disk.
174
+
175
+ Args:
176
+ embedding_function: A LangChain-compatible ``Embeddings`` instance.
177
+ root_path: Optional directory for persisted embeddings.
178
+ engine: The vector-store class to use.
179
+
180
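+ Example (a sketch; ``embeddings`` stands for a LangChain
+ ``Embeddings`` instance you provide):
+
+ >>> storage = DataStorage(embedding_function=embeddings)  # doctest: +SKIP
+ >>> storage.embed_document("DOC-MD5", ["chunk one"], [{"section": "body"}])  # doctest: +SKIP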
+ """
181
+
182
+ embeddings_dict = {}
183
+ embeddings_map_from_md5 = {}
184
+ embeddings_map_to_md5 = {}
185
+
186
+ def __init__(
187
+ self,
188
+ embedding_function,
189
+ root_path: Path = None,
190
+ engine=ChromaAdvancedRetrieval,
191
+ ) -> None:
192
+ self.root_path = root_path
193
+ self.engine = engine
194
+ self.embedding_function = embedding_function
195
+
196
+ if root_path is not None:
197
+ self.embeddings_root_path = root_path
198
+ if not os.path.exists(root_path):
199
+ os.makedirs(root_path)
200
+ else:
201
+ self.load_embeddings(self.embeddings_root_path)
202
+
203
+ def load_embeddings(self, embeddings_root_path: Union[str, Path]) -> None:
204
+ """
205
+ Load all persisted vector stores from a single root directory.
206
+ The root directory is expected to contain one data store per document, each in its own subdirectory.
207
+ """
208
+
209
+ embeddings_directories = [f for f in os.scandir(embeddings_root_path) if f.is_dir()]
210
+
211
+ if len(embeddings_directories) == 0:
212
+ print("No available embeddings")
213
+ return
214
+
215
+ for embedding_document_dir in embeddings_directories:
216
+ self.embeddings_dict[embedding_document_dir.name] = self.engine(
217
+ persist_directory=embedding_document_dir.path,
218
+ embedding_function=self.embedding_function
219
+ )
220
+
221
+ filename_list = list(Path(embedding_document_dir).glob('*.storage_filename'))
222
+ if filename_list:
223
+ filename = filename_list[0].name.replace(".storage_filename", "")
224
+ self.embeddings_map_from_md5[embedding_document_dir.name] = filename
225
+ self.embeddings_map_to_md5[filename] = embedding_document_dir.name
226
+
227
+ print("Embedding loaded: ", len(self.embeddings_dict.keys()))
228
+
229
+ def get_loaded_embeddings_ids(self):
230
+ """Return the document IDs (MD5 hashes) of all loaded collections."""
231
+ return list(self.embeddings_dict.keys())
232
+
233
+ def get_md5_from_filename(self, filename):
234
+ """Look up the MD5 document ID for a given original *filename*."""
235
+ return self.embeddings_map_to_md5[filename]
236
+
237
+ def get_filename_from_md5(self, md5):
238
+ """Look up the original filename for a given *md5* document ID."""
239
+ return self.embeddings_map_from_md5[md5]
240
+
241
+ def embed_document(self, doc_id, texts, metadatas):
242
+ """Create (or replace) an in-memory vector collection for a document.
243
+
244
+ Args:
245
+ doc_id: Unique identifier for the document.
246
+ texts: List of text chunks to embed.
247
+ metadatas: List of metadata dicts (one per chunk).
248
+ """
249
+ if doc_id not in self.embeddings_dict.keys():
250
+ self.embeddings_dict[doc_id] = self.engine.from_texts(
251
+ texts,
252
+ embedding=self.embedding_function,
253
+ metadatas=metadatas,
254
+ collection_name=doc_id)
255
+ else:
256
+ # Workaround Chroma (?) breaking change
257
+ self.embeddings_dict[doc_id].delete_collection()
258
+ self.embeddings_dict[doc_id] = self.engine.from_texts(
259
+ texts,
260
+ embedding=self.embedding_function,
261
+ metadatas=metadatas,
262
+ collection_name=doc_id)
263
+
264
+ self.embeddings_root_path = None
265
+
266
+
267
+ class DocumentQAEngine:
268
+ """End-to-end RAG engine for scientific PDF documents.
269
+
270
+ Orchestrates the full pipeline:
271
+
272
+ 1. **PDF parsing** via a GROBID server (structured text + coordinates).
273
+ 2. **Chunking** — paragraphs kept as-is or merged with :class:`TextMerger`.
274
+ 3. **Embedding and storage** — chunks are embedded and stored.
275
+ 4. **Retrieval + LLM** — relevant chunks are retrieved and fed to an LLM
276
+ to produce an answer.
277
+
278
+ Args:
279
+ llm: A LangChain chat model (e.g. ``ChatOpenAI``).
280
+ data_storage: A :class:`DataStorage` instance for managing embeddings.
281
+ grobid_url: URL of the GROBID server.
282
+ memory: Optional ``ConversationBufferMemory`` for multi-turn context.
283
+
284
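+ Example (a sketch; the model name, ``storage``, and URL are
+ hypothetical):
+
+ >>> from langchain_openai import ChatOpenAI  # doctest: +SKIP
+ >>> engine = DocumentQAEngine(  # doctest: +SKIP
+ ...     llm=ChatOpenAI(model="gpt-4o-mini"),
+ ...     data_storage=storage,
+ ...     grobid_url="https://grobid.example.com",
+ ... )
+ >>> doc_id = engine.create_memory_embeddings("paper.pdf")  # doctest: +SKIP
+ >>> _, answer, coordinates = engine.query_document("What is measured?", doc_id)  # doctest: +SKIP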
+ """
285
+
286
+ llm = None
287
+ qa_chain_type = None
288
+
289
+ default_prompts = {
290
+ 'stuff': stuff_prompt,
291
+ 'refine': refine_prompts,
292
+ "map_reduce": map_reduce_prompt,
293
+ "map_rerank": map_rerank_prompt
294
+ }
295
+
296
+ def __init__(self,
297
+ llm,
298
+ data_storage: DataStorage,
299
+ grobid_url=None,
300
+ memory=None
301
+ ):
302
+
303
+ self.llm = llm
304
+ self.memory = memory
305
+ self.chain = create_stuff_documents_chain(llm, self.default_prompts['stuff'].PROMPT)
306
+ self.text_merger = TextMerger()
307
+ self.data_storage = data_storage
308
+
309
+ if grobid_url:
310
+ self.grobid_processor = GrobidProcessor(grobid_url)
311
+
312
+ def query_document(
313
+ self,
314
+ query: str,
315
+ doc_id,
316
+ output_parser=None,
317
+ context_size=4,
318
+ extraction_schema=None,
319
+ verbose=False
320
+ ) -> tuple[Any, str, list]:
321
+ """Ask a question and get an LLM-generated answer.
322
+
323
+ Retrieves the most relevant chunks from the vector store, feeds
324
+ them as context to the LLM, and returns the response.
325
+
326
+ Args:
327
+ query: The natural-language question.
328
+ doc_id: Document identifier returned by :meth:`create_memory_embeddings`.
329
+ output_parser: Optional LangChain output parser. If provided the
330
+ raw LLM response is re-processed into structured output.
331
+ context_size: Number of chunks to retrieve as context (default 4).
332
+ extraction_schema: Optional extraction schema.
333
+ verbose: Print debug information.
334
+
335
+ Returns:
336
+ tuple: ``(parsed_output | None, raw_text_response, coordinates)``
337
+
338
+ - *parsed_output* — structured data if a parser/schema was given,
339
+ otherwise ``None``.
340
+ - *raw_text_response* — the LLM's raw text answer.
341
+ - *coordinates* — list of lists of coordinate strings for each
342
+ retrieved chunk (for PDF highlighting).
343
+ """
344
+ # self.load_embeddings(self.embeddings_root_path)
345
+
346
+ if verbose:
347
+ print(query)
348
+
349
+ response, coordinates = self._run_query(doc_id, query, context_size=context_size)
350
+ response = response['output_text'] if 'output_text' in response else response
351
+
352
+ if verbose:
353
+ print(doc_id, "->", response)
354
+
355
+ if output_parser:
356
+ try:
357
+ return self._parse_json(response, output_parser), response, coordinates
358
+ except Exception as oe:
359
+ print("Failing to parse the response", oe)
360
+ return None, response, coordinates
361
+ elif extraction_schema:
362
+ try:
363
+ chain = create_extraction_chain(extraction_schema, self.llm)
364
+ parsed = chain.run(response)
365
+ return parsed, response, coordinates
366
+ except Exception as oe:
367
+ print("Failing to parse the response", oe)
368
+ return None, response, coordinates
369
+ else:
370
+ return None, response, coordinates
371
+
372
+ def query_storage(self, query: str, doc_id, context_size=4) -> tuple[list, list]:
373
+ """Retrieve relevant text passages without calling the LLM.
374
+
375
+ Useful for debugging which chunks would be used as context, or for
376
+ building custom pipelines on top of the retrieval step.
377
+
378
+ Args:
379
+ query: The natural-language question.
380
+ doc_id: Document identifier.
381
+ context_size: Number of chunks to retrieve (default 4).
382
+
383
+ Returns:
384
+ tuple: ``(texts, coordinates)``
385
+
386
+ - *texts* — list of passage strings.
387
+ - *coordinates* — list of lists of coordinate strings.
388
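+
+ Example (a sketch, reusing the hypothetical ``engine`` above):
+
+ >>> texts, coordinates = engine.query_storage("Which material is studied?", doc_id)  # doctest: +SKIP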
+ """
389
+ documents, coordinates = self._get_context(doc_id, query, context_size)
390
+
391
+ context_as_text = [doc.page_content for doc in documents]
392
+ return context_as_text, coordinates
393
+
394
+ def query_storage_and_embeddings(self, query: str, doc_id, context_size=4) -> List[Document]:
395
+ """Retrieve passages with their similarity scores and raw embeddings.
396
+
397
+ Each returned ``Document`` has extra metadata keys:
398
+
399
+ - ``__similarity`` — cosine distance to the query.
400
+ - ``__embeddings`` — the chunk's embedding vector.
401
+
402
+ Args:
403
+ query: The natural-language question.
404
+ doc_id: Document identifier.
405
+ context_size: Number of chunks to retrieve (default 4).
406
+
407
+ Returns:
408
+ list[Document]: Retrieved documents enriched with similarity and
409
+ embedding metadata.
410
+ """
411
+ db = self.data_storage.embeddings_dict[doc_id]
412
+ retriever = db.as_retriever(
413
+ search_kwargs={"k": context_size},
414
+ search_type="similarity_with_embeddings"
415
+ )
416
+ relevant_documents = retriever.invoke(query)
417
+
418
+ return relevant_documents
419
+
420
+ def analyse_query(self, query, doc_id, context_size=4):
421
+ """Compute a relevance coefficient for *query* against *doc_id*.
422
+
423
+ The coefficient is ``min_similarity - mean_similarity`` over the
424
+ top-k retrieved chunks. A value close to zero suggests the
425
+ question matches multiple passages equally well.
426
+
427
+ Args:
428
+ query: The natural-language question.
429
+ doc_id: Document identifier.
430
+ context_size: Number of chunks to consider (default 4).
431
+
432
+ Returns:
433
+ tuple: ``(summary_string, coordinates)``
434
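+
+ Worked example: for similarities ``[0.82, 0.80, 0.79, 0.78]`` the
+ coefficient is ``0.78 - 0.7975 = -0.0175``, i.e. close to zero, so
+ the query is not strongly anchored to a single passage.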
+ """
435
+ db = self.data_storage.embeddings_dict[doc_id]
436
+ # retriever = db.as_retriever(
437
+ # search_kwargs={"k": context_size, 'score_threshold': 0.0},
438
+ # search_type="similarity_score_threshold"
439
+ # )
440
+ retriever = db.as_retriever(search_kwargs={"k": context_size}, search_type="similarity_with_embeddings")
441
+ relevant_documents = retriever.invoke(query)
442
+ relevant_document_coordinates = [
443
+ doc.metadata['coordinates'].split(";") if 'coordinates' in doc.metadata else []
444
+ for doc in relevant_documents
+ ]
445
+ all_documents = db.get(include=['documents', 'metadatas', 'embeddings'])
446
+ # all_documents_embeddings = all_documents["embeddings"]
447
+ # query_embedding = db._embedding_function.embed_query(query)
448
+
449
+ # distance_evaluator = load_evaluator("pairwise_embedding_distance",
450
+ # embeddings=db._embedding_function,
451
+ # distance_metric=EmbeddingDistance.EUCLIDEAN)
452
+
453
+ # distance_evaluator.evaluate_string_pairs(query=query_embedding, documents="")
454
+
455
+ similarities = [doc.metadata['__similarity'] for doc in relevant_documents]
456
+ min_similarity = min(similarities)
457
+ mean_similarity = sum(similarities) / len(similarities)
458
+ coefficient = min_similarity - mean_similarity
459
+
460
+ return f"Coefficient: {coefficient}, (Min similarity {min_similarity}, Mean similarity: {mean_similarity})", relevant_document_coordinates
461
+
462
+ def _parse_json(self, response, output_parser):
463
+ system_message = "You are an useful assistant expert in materials science, physics, and chemistry " \
464
+ "that can process text and transform it to JSON."
465
+ human_message = """Transform the text between three double quotes in JSON.\n\n\n\n
466
+ {format_instructions}\n\nText: \"\"\"{text}\"\"\""""
467
+
468
+ system_message_prompt = SystemMessagePromptTemplate.from_template(system_message)
469
+ human_message_prompt = HumanMessagePromptTemplate.from_template(human_message)
470
+
471
+ prompt_template = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])
472
+
473
+ results = self.llm(
474
+ prompt_template.format_prompt(
475
+ text=response,
476
+ format_instructions=output_parser.get_format_instructions()
477
+ ).to_messages()
478
+ )
479
+ parsed_output = output_parser.parse(results.content)
480
+
481
+ return parsed_output
482
+
483
+ def _run_query(self, doc_id, query, context_size=4) -> tuple[Any, list]:
484
+ relevant_documents, relevant_document_coordinates = self._get_context(doc_id, query, context_size)
485
+ response = self.chain.invoke({"context": relevant_documents, "question": query})
486
+ return response, relevant_document_coordinates
487
+
488
+ def _get_context(self, doc_id, query, context_size=4) -> tuple[List[Document], list]:
489
+ db = self.data_storage.embeddings_dict[doc_id]
490
+ retriever = db.as_retriever(search_kwargs={"k": context_size})
491
+ relevant_documents = retriever.invoke(query)
492
+ relevant_document_coordinates = [
493
+ doc.metadata['coordinates'].split(";") if 'coordinates' in doc.metadata else []
494
+ for doc in
495
+ relevant_documents
496
+ ]
497
+ if self.memory and len(self.memory.buffer_as_messages) > 0:
498
+ relevant_documents.append(
499
+ Document(
500
+ page_content="""Following, the previous question and answers. Use these information only when in the question there are unspecified references:\n{}\n\n""".format(
501
+ self.memory.buffer_as_str))
502
+ )
503
+ return relevant_documents, relevant_document_coordinates
504
+
505
+ def get_full_context_by_document(self, doc_id):
506
+ """
507
+ Return the full context from the document
508
+ """
509
+ db = self.data_storage.embeddings_dict[doc_id]
510
+ docs = db.get()
511
+ return docs['documents']
512
+
513
+ def _get_context_multiquery(self, doc_id, query, context_size=4):
514
+ db = self.data_storage.embeddings_dict[doc_id].as_retriever(search_kwargs={"k": context_size})
515
+ multi_query_retriever = MultiQueryRetriever.from_llm(retriever=db, llm=self.llm)
516
+ relevant_documents = multi_query_retriever.invoke(query)
517
+ return relevant_documents
518
+
519
+ def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1, verbose=False):
520
+ """Extract and chunk text from a PDF via GROBID.
521
+
522
+ Sends the PDF to the configured GROBID server, parses the returned
523
+ TEI-XML into passages with coordinate metadata, and optionally
524
+ merges passages into larger token-based chunks.
525
+
526
+ Args:
527
+ pdf_file_path: Path to the PDF file on disk.
528
+ chunk_size: Target tokens per chunk. ``-1`` (default) keeps
529
+ GROBID paragraphs as-is; a positive value merges them.
530
+ perc_overlap: Reserved for future overlap support.
531
+ verbose: Print debug information.
532
+
533
+ Returns:
534
+ tuple: ``(texts, metadatas, ids)``
535
+
536
+ - *texts* — list of passage strings.
537
+ - *metadatas* — list of metadata dicts (coordinates, section, …).
538
+ - *ids* — list of integer chunk IDs.
539
+
540
+ Raises:
541
+ AttributeError: If ``grobid_url`` was not provided at init time.
542
+ """
543
+ if verbose:
544
+ print("File", pdf_file_path)
545
+ filename = Path(pdf_file_path).stem
546
+ coordinates = True # if chunk_size == -1 else False
547
+ structure = self.grobid_processor.process_structure(pdf_file_path, coordinates=coordinates)
548
+
549
+ biblio = structure['biblio']
550
+ biblio['filename'] = filename.replace(" ", "_")
551
+
552
+ if verbose:
553
+ print("Generating embeddings for:", hash, ", filename: ", filename)
554
+
555
+ texts = []
556
+ metadatas = []
557
+ ids = []
558
+
559
+ if chunk_size > 0:
560
+ new_passages = self.text_merger.merge_passages(structure['passages'], chunk_size=chunk_size)
561
+ else:
562
+ new_passages = structure['passages']
563
+
564
+ for passage in new_passages:
565
+ biblio_copy = copy.copy(biblio)
566
+ if len(str.strip(passage['text'])) > 0:
567
+ texts.append(passage['text'])
568
+
569
+ biblio_copy['type'] = passage['type']
570
+ biblio_copy['section'] = passage['section']
571
+ biblio_copy['subSection'] = passage['subSection']
572
+ biblio_copy['coordinates'] = passage['coordinates']
573
+ metadatas.append(biblio_copy)
574
+
575
+ # ids.append(passage['passage_id'])
576
+
577
+ ids = list(range(len(new_passages)))
578
+
579
+ return texts, metadatas, ids
580
+
581
+ def create_memory_embeddings(
582
+ self,
583
+ pdf_path,
584
+ doc_id=None,
585
+ chunk_size=500,
586
+ perc_overlap=0.1
587
+ ):
588
+ """Parse a PDF and create an in-memory vector collection.
589
+
590
+ This is the main entry-point for ingesting a new document. It
591
+ calls GROBID, chunks the text, embeds it, and stores everything in `data_storage`.
592
+
593
+ Args:
594
+ pdf_path: Path to the PDF file.
595
+ doc_id: Optional explicit document ID. When ``None``, the
596
+ MD5 hash extracted by GROBID is used.
597
+ chunk_size: Token count per chunk (default 500). Use ``-1``
598
+ to keep GROBID paragraphs intact.
599
+ perc_overlap: Reserved for future overlap support.
600
+
601
+ Returns:
602
+ str: The document ID.
603
+ """
604
+ texts, metadata, ids = self.get_text_from_document(
605
+ pdf_path,
606
+ chunk_size=chunk_size,
607
+ perc_overlap=perc_overlap)
608
+ if doc_id:
609
+ hash = doc_id
610
+ else:
611
+ hash = metadata[0]['hash'] if len(metadata) > 0 and 'hash' in metadata[0] else ""
612
+
613
+ self.data_storage.embed_document(hash, texts, metadata)
614
+
615
+ return hash
616
+
617
+ def create_embeddings(
618
+ self,
619
+ pdfs_dir_path: Path,
620
+ chunk_size=500,
621
+ perc_overlap=0.1,
622
+ include_biblio=False
623
+ ):
624
+ """Batch-process a directory of PDFs and persist their embeddings.
625
+
626
+ Walks *pdfs_dir_path*, processes each ``.pdf`` file through GROBID,
627
+ creates embeddings, and persists the resulting ChromaDB collection
628
+ to a subdirectory named after the file's MD5.
629
+
630
+ Args:
631
+ pdfs_dir_path: Directory containing PDF files.
632
+ chunk_size: Token count per chunk (default 500).
633
+ perc_overlap: Reserved for future overlap support.
634
+ include_biblio: Reserved flag (currently unused).
635
+ """
636
+ input_files = []
637
+ for root, dirs, files in os.walk(pdfs_dir_path, followlinks=False):
638
+ for file_ in files:
639
+ if not (file_.lower().endswith(".pdf")):
640
+ continue
641
+ input_files.append(os.path.join(root, file_))
642
+
643
+ for input_file in tqdm(input_files, total=len(input_files), unit='document',
644
+ desc="Grobid + embeddings processing"):
645
+
646
+ md5 = self.calculate_md5(input_file)
647
+ data_path = os.path.join(self.data_storage.embeddings_root_path, md5)
648
+
649
+ if os.path.exists(data_path):
650
+ print(data_path, "exists. Skipping it ")
651
+ continue
652
+ # include = ["biblio"] if include_biblio else []
653
+ texts, metadata, ids = self.get_text_from_document(
654
+ input_file,
655
+ chunk_size=chunk_size,
656
+ perc_overlap=perc_overlap)
657
+ filename = metadata[0]['filename']
658
+
659
+ vector_db_document = Chroma.from_texts(texts,
660
+ metadatas=metadata,
661
+ embedding=self.data_storage.embedding_function,
662
+ persist_directory=data_path)
663
+ vector_db_document.persist()
664
+
665
+ with open(os.path.join(data_path, filename + ".storage_filename"), 'w') as fo:
666
+ fo.write("")
667
+
668
+ @staticmethod
669
+ def calculate_md5(input_file: Union[Path, str]):
670
+ """Return the uppercase hex MD5 digest of *input_file*."""
671
+
672
+ import hashlib
673
+ md5_hash = hashlib.md5()
674
+ with open(input_file, 'rb') as fi:
675
+ md5_hash.update(fi.read())
676
+ return md5_hash.hexdigest().upper()
document_qa/grobid_processors.py ADDED
@@ -0,0 +1,999 @@
1
+ """GROBID-based processors for scientific text extraction.
2
+
3
+ This module provides processors that interact with GROBID services to:
4
+
5
+ - **Extract structured text** from scientific PDFs (:class:`GrobidProcessor`)
6
+ — parses TEI-XML into passages with section labels and PDF coordinates.
7
+ - **Annotate physical quantities** (:class:`GrobidQuantitiesProcessor`)
8
+ — identifies measurements via the grobid-quantities service.
9
+ - **Annotate materials** (:class:`GrobidMaterialsProcessor`)
10
+ — identifies material mentions via grobid-superconductors.
11
+ - **Aggregate NER results** (:class:`GrobidAggregationProcessor`)
12
+ — combines quantity and material annotations with overlap pruning.
13
+
14
+ """
15
+
16
+ import re
17
+ from collections import OrderedDict
18
+ from html import escape
19
+ from pathlib import Path
20
+
21
+ import dateparser
22
+ import grobid_tei_xml
23
+ from bs4 import BeautifulSoup
24
+ from grobid_client.grobid_client import GrobidClient
25
+
26
+
27
+ def get_span_start(type, title=None):
28
+ """Return an opening ``<span>`` tag for an annotation of the given *type*."""
29
+ title_ = ' title="' + title + '"' if title is not None else ""
30
+ return '<span class="label ' + type + '"' + title_ + '>'
31
+
32
+
33
+ def get_span_end():
34
+ return '</span>'
35
+
36
+
37
+ def get_rs_start(type):
38
+ return '<rs type="' + type + '">'
39
+
40
+
41
+ def get_rs_end():
42
+ return '</rs>'
43
+
44
+
45
+ def has_space_between_value_and_unit(quantity):
46
+ return quantity['offsetEnd'] < quantity['rawUnit']['offsetStart']
47
+
48
+
49
+ def decorate_text_with_annotations(text, spans, tag="span"):
50
+ """Wrap recognised entity spans in markup tags.
51
+
52
+ Produces either HTML (``<span class="label …">``) or TEI-XML
53
+ (``<rs type="…">``) depending on *tag*.
54
+
55
+ Args:
56
+ text: The original plain-text string.
57
+ spans: List of span dicts with at least ``offset_start``,
58
+ ``offset_end``, and ``type`` keys.
59
+ tag: ``"span"`` (default) for HTML output, ``"rs"`` for XML.
60
+
61
+ Returns:
62
+ str: The text with inline annotation markup.
63
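+
+ Example:
+
+ >>> spans = [{"offset_start": 0, "offset_end": 4, "type": "<material>"}]
+ >>> decorate_text_with_annotations("MgB2 is a superconductor", spans)
+ '<span class="label material">MgB2</span> is a superconductor'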
+ """
64
+ sorted_spans = list(sorted(spans, key=lambda item: item['offset_start']))
65
+ annotated_text = ""
66
+ start = 0
67
+ for span in sorted_spans:
68
+ type = span['type'].replace("<", "").replace(">", "")
69
+ if 'unit_type' in span and span['unit_type'] is not None:
70
+ type = span['unit_type'].replace(" ", "_")
71
+ annotated_text += escape(text[start: span['offset_start']])
72
+ title = span['quantified'] if 'quantified' in span else None
73
+ annotated_text += get_span_start(type, title) if tag == "span" else get_rs_start(type)
74
+ annotated_text += escape(text[span['offset_start']: span['offset_end']])
75
+ annotated_text += get_span_end() if tag == "span" else get_rs_end()
76
+
77
+ start = span['offset_end']
78
+ annotated_text += escape(text[start: len(text)])
79
+ return annotated_text
80
+
81
+
82
+ def get_parsed_value_type(quantity):
83
+ if 'parsedValue' in quantity and 'structure' in quantity['parsedValue']:
84
+ return quantity['parsedValue']['structure']['type']
85
+
86
+
87
+ class BaseProcessor(object):
88
+ """Shared post-processing logic for all GROBID-derived processors.
89
+
90
+ Fixes common character-encoding artefacts produced by PDF extraction
91
+ (e.g. ``À`` → ``-``, ``¼`` → ``=``). All processor subclasses
92
+ inherit :meth:`post_process` from here.
93
+ """
94
+
95
+ patterns = [
96
+ r'\d+e\d+'
97
+ ]
98
+
99
+ def post_process(self, text):
100
+ """Clean encoding artefacts and normalise special characters.
101
+
102
+ Args:
103
+ text: Raw extracted text from GROBID.
104
+
105
+ Returns:
106
+ str: Cleaned text.
107
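+
+ Example:
+
+ >>> BaseProcessor().post_process("Tc ¼ 39 K")
+ 'Tc = 39 K'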
+ """
108
+ output = text.replace('À', '-')
109
+ output = output.replace('¼', '=')
110
+ output = output.replace('þ', '+')
111
+ output = output.replace('Â', 'x')
112
+ output = output.replace('$', '~')
113
+ output = output.replace('−', '-')
114
+ output = output.replace('–', '-')
115
+
116
+ for pattern in self.patterns:
117
+ output = re.sub(pattern, lambda match: match.group().replace('e', '-'), output)
118
+
119
+ return output
120
+
121
+
122
+ class GrobidProcessor(BaseProcessor):
123
+ """Extract structured text and coordinates from PDFs via GROBID.
124
+
125
+ Sends a PDF to a running GROBID server, parses the returned TEI-XML,
126
+ and produces a list of passage dicts with text content, section labels,
127
+ and bounding-box coordinates for each paragraph.
128
+
129
+ Args:
130
+ grobid_url: Full URL of the GROBID server
131
+ (e.g. ``"https://grobid.example.com"``).
132
+ ping_server: If ``True`` (default), verify the server is alive
133
+ on init.
134
+
135
+ Raises:
136
+ ServerUnavailableException: If *ping_server* is ``True`` and the
137
+ GROBID server does not respond.
138
+
139
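+ Example (a sketch; the server URL is hypothetical):
+
+ >>> processor = GrobidProcessor("https://grobid.example.com")  # doctest: +SKIP
+ >>> doc = processor.process_structure("paper.pdf", coordinates=True)  # doctest: +SKIP
+ >>> doc["passages"][0]["section"]  # doctest: +SKIP
+ '<header>'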
+ """
140
+
141
+ def __init__(self, grobid_url, ping_server=True):
142
+ grobid_client = GrobidClient(
143
+ grobid_server=grobid_url,
144
+ batch_size=5,
145
+ coordinates=["p", "title", "persName"],
146
+ sleep_time=5,
147
+ timeout=60,
148
+ check_server=ping_server
149
+ )
150
+ self.grobid_client = grobid_client
151
+
152
+ def process_structure(self, input_path, coordinates=False):
153
+ """Send a PDF to GROBID and return structured content.
154
+
155
+ Args:
156
+ input_path: Path to the PDF file.
157
+ coordinates: If ``True``, include bounding-box coordinate
158
+ strings in each passage (needed for PDF highlighting).
159
+
160
+ Returns:
161
+ dict or None: A dict with keys:
162
+
163
+ - ``"biblio"`` — bibliographic metadata (title, authors, DOI, …).
164
+ - ``"passages"`` — list of passage dicts, each containing
165
+ ``text``, ``type``, ``section``, ``subSection``,
166
+ ``passage_id``, and ``coordinates``.
167
+ - ``"filename"`` — stem of the PDF filename.
168
+
169
+ Returns ``None`` if GROBID returns a non-200 status.
170
+ """
171
+ pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
172
+ input_path,
173
+ consolidate_header=True,
174
+ consolidate_citations=False,
175
+ segment_sentences=False,
176
+ tei_coordinates=coordinates,
177
+ include_raw_citations=False,
178
+ include_raw_affiliations=False,
179
+ generateIDs=True)
180
+
181
+ if status != 200:
182
+ return
183
+
184
+ document_object = self.parse_grobid_xml(text, coordinates=coordinates)
185
+ document_object['filename'] = Path(pdf_file).stem.replace(".tei", "")
186
+
187
+ return document_object
188
+
189
+ def process_single(self, input_file):
190
+ doc = self.process_structure(input_file)
191
+
192
+ for paragraph in doc['passages']:
193
+ entities = self.process_single_text(paragraph['text'])
194
+ paragraph['spans'] = entities
195
+
196
+ return doc
197
+
198
+ def parse_grobid_xml(self, text, coordinates=False):
199
+ """Parse GROBID TEI-XML into a structured passage dict.
200
+
201
+ Extracts title, abstract, body paragraphs, back-matter, and
202
+ figure descriptions from the XML, post-processes encoding
203
+ artefacts, and attaches coordinate metadata.
204
+
205
+ Args:
206
+ text: Raw TEI-XML string returned by GROBID.
207
+ coordinates: Whether to extract ``coords`` attributes.
208
+
209
+ Returns:
210
+ dict: ``{"biblio": {…}, "passages": […]}``
211
+ """
212
+ output_data = OrderedDict()
213
+
214
+ doc_biblio = grobid_tei_xml.parse_document_xml(text)
215
+ biblio = {
216
+ "doi": doc_biblio.header.doi if doc_biblio.header.doi is not None else "",
217
+ "authors": ", ".join([author.full_name for author in doc_biblio.header.authors]),
218
+ "title": doc_biblio.header.title,
219
+ "hash": doc_biblio.pdf_md5
220
+ }
221
+ try:
222
+ year = dateparser.parse(doc_biblio.header.date).year
223
+ biblio["publication_year"] = year
224
+ except Exception:
225
+ pass
226
+
227
+ output_data['biblio'] = biblio
228
+ passages = []
229
+ output_data['passages'] = passages
230
+ passage_type = "paragraph"
231
+
232
+ soup = BeautifulSoup(text, 'xml')
233
+ blocks_header = get_xml_nodes_header(soup, use_paragraphs=True)
234
+
235
+ # passages.append({
236
+ # "text": f"authors: {biblio['authors']}",
237
+ # "type": passage_type,
238
+ # "section": "<header>",
239
+ # "subSection": "<authors>",
240
+ # "passage_id": "hauthors",
241
+ # "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
242
+ # blocks_header['authors']])
243
+ # })
244
+
245
+ passages.append({
246
+ "text": self.post_process(" ".join([node.text for node in blocks_header['title']])),
247
+ "type": passage_type,
248
+ "section": "<header>",
249
+ "subSection": "<title>",
250
+ "passage_id": "htitle",
251
+ "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
252
+ blocks_header['title']])
253
+ })
254
+
255
+ passages.append({
256
+ "text": self.post_process(
257
+ ''.join(text for node in blocks_header['abstract'] for text in node.find_all(text=True) if
258
+ text.parent.name != "ref" or (
259
+ text.parent.name == "ref" and text.parent.attrs[
260
+ 'type'] != 'bibr'))),
261
+ "type": passage_type,
262
+ "section": "<header>",
263
+ "subSection": "<abstract>",
264
+ "passage_id": "habstract",
265
+ "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
266
+ blocks_header['abstract']])
267
+ })
268
+
269
+ text_blocks_body = get_xml_nodes_body(soup, verbose=False, use_paragraphs=True)
270
+ text_blocks_body.extend(get_xml_nodes_back(soup, verbose=False, use_paragraphs=True))
271
+
272
+ use_paragraphs = True
273
+ if not use_paragraphs:
274
+ passages.extend([
275
+ {
276
+ "text": self.post_process(''.join(text for text in sentence.find_all(text=True) if
277
+ text.parent.name != "ref" or (
278
+ text.parent.name == "ref" and text.parent.attrs[
279
+ 'type'] != 'bibr'))),
280
+ "type": passage_type,
281
+ "section": "<body>",
282
+ "subSection": "<paragraph>",
283
+ "passage_id": str(paragraph_id),
284
+ "coordinates": paragraph['coords'] if coordinates and sentence.has_attr('coords') else ""
285
+ }
286
+ for paragraph_id, paragraph in enumerate(text_blocks_body) for
287
+ sentence_id, sentence in enumerate(paragraph)
288
+ ])
289
+ else:
290
+ passages.extend([
291
+ {
292
+ "text": self.post_process(''.join(text for text in paragraph.find_all(text=True) if
293
+ text.parent.name != "ref" or (
294
+ text.parent.name == "ref" and text.parent.attrs[
295
+ 'type'] != 'bibr'))),
296
+ "type": passage_type,
297
+ "section": "<body>",
298
+ "subSection": "<paragraph>",
299
+ "passage_id": str(paragraph_id),
300
+ "coordinates": paragraph['coords'] if coordinates and paragraph.has_attr('coords') else ""
301
+ }
302
+ for paragraph_id, paragraph in enumerate(text_blocks_body)
303
+ ])
304
+
305
+ text_blocks_figures = get_xml_nodes_figures(soup, verbose=False)
306
+
307
+ if not use_paragraphs:
308
+ passages.extend([
309
+ {
310
+ "text": self.post_process(''.join(text for text in sentence.find_all(text=True) if
311
+ text.parent.name != "ref" or (
312
+ text.parent.name == "ref" and text.parent.attrs[
313
+ 'type'] != 'bibr'))),
314
+ "type": passage_type,
315
+ "section": "<body>",
316
+ "subSection": "<figure>",
317
+ "passage_id": str(paragraph_id) + str(sentence_id),
318
+ "coordinates": sentence['coords'] if coordinates and 'coords' in sentence else ""
319
+ }
320
+ for paragraph_id, paragraph in enumerate(text_blocks_figures) for
321
+ sentence_id, sentence in enumerate(paragraph)
322
+ ])
323
+ else:
324
+ passages.extend([
325
+ {
326
+ "text": self.post_process(''.join(text for text in paragraph.find_all(text=True) if
327
+ text.parent.name != "ref" or (
328
+ text.parent.name == "ref" and text.parent.attrs[
329
+ 'type'] != 'bibr'))),
330
+ "type": passage_type,
331
+ "section": "<body>",
332
+ "subSection": "<figure>",
333
+ "passage_id": str(paragraph_id),
334
+ "coordinates": paragraph['coords'] if coordinates and paragraph.has_attr('coords') else ""
335
+ }
336
+ for paragraph_id, paragraph in enumerate(text_blocks_figures)
337
+ ])
338
+
339
+ return output_data
340
+
341
+
342
+ class GrobidQuantitiesProcessor(BaseProcessor):
343
+ """NER processor for physical quantities (measurements, units).
344
+
345
+ Wraps the `grobid-quantities <https://github.com/kermitt2/grobid-quantities>`_
346
+ service to identify and normalise measurements in text.
347
+
348
+ Args:
349
+ grobid_quantities_client: A configured quantities API client.
350
+ """
351
+
352
+ def __init__(self, grobid_quantities_client):
353
+ self.grobid_quantities_client = grobid_quantities_client
354
+
355
+ def process(self, text) -> list:
356
+ """Extract quantity spans from *text*.
357
+
358
+ Args:
359
+ text: Plain text to analyse.
360
+
361
+ Returns:
362
+ list[dict]: Span dicts with ``offset_start``, ``offset_end``,
363
+ ``type`` (``"property"``), and optional ``unit_type`` /
364
+ ``quantified`` keys.
365
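+
+ Example (a sketch; assumes a running grobid-quantities service):
+
+ >>> spans = processor.process("The sample was heated to 300 K.")  # doctest: +SKIP
+ >>> spans[0]["type"]  # doctest: +SKIP
+ 'property'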
+ """
366
+ status, result = self.grobid_quantities_client.process_text(text.strip())
367
+
368
+ if status != 200:
369
+ result = {}
370
+
371
+ spans = []
372
+
373
+ if 'measurements' in result:
374
+ found_measurements = self.parse_measurements_output(result)
375
+
376
+ for m in found_measurements:
377
+ item = {
378
+ "text": text[m['offset_start']:m['offset_end']],
379
+ 'offset_start': m['offset_start'],
380
+ 'offset_end': m['offset_end']
381
+ }
382
+
383
+ if 'raw' in m and m['raw'] != item['text']:
384
+ item['text'] = m['raw']
385
+
386
+ if 'quantified_substance' in m:
387
+ item['quantified'] = m['quantified_substance']
388
+
389
+ if 'type' in m:
390
+ item["unit_type"] = m['type']
391
+
392
+ item['type'] = 'property'
393
+ # if 'raw_value' in m:
394
+ # item['raw_value'] = m['raw_value']
395
+
396
+ spans.append(item)
397
+
398
+ return spans
399
+
400
+ @staticmethod
401
+ def parse_measurements_output(result):
402
+ measurements_output = []
403
+
404
+ for measurement in result['measurements']:
405
+ type = measurement['type']
406
+ measurement_output_object = {}
407
+ quantity_type = None
408
+ has_unit = False
409
+ parsed_value_type = None
410
+
411
+ if 'quantified' in measurement:
412
+ if 'normalizedName' in measurement['quantified']:
413
+ quantified_substance = measurement['quantified']['normalizedName']
414
+ measurement_output_object["quantified_substance"] = quantified_substance
415
+
416
+ if 'measurementOffsets' in measurement:
417
+ measurement_output_object["offset_start"] = measurement["measurementOffsets"]['start']
418
+ measurement_output_object["offset_end"] = measurement["measurementOffsets"]['end']
419
+ else:
420
+ # If there are no offsets we skip the measurement
421
+ continue
422
+
423
+ # if 'measurementRaw' in measurement:
424
+ # measurement_output_object['raw_value'] = measurement['measurementRaw']
425
+
426
+ if type == 'value':
427
+ quantity = measurement['quantity']
428
+
429
+ parsed_value = GrobidQuantitiesProcessor.get_parsed(quantity)
430
+ if parsed_value:
431
+ measurement_output_object['parsed'] = parsed_value
432
+
433
+ normalized_value = GrobidQuantitiesProcessor.get_normalized(quantity)
434
+ if normalized_value:
435
+ measurement_output_object['normalized'] = normalized_value
436
+
437
+ raw_value = GrobidQuantitiesProcessor.get_raw(quantity)
438
+ if raw_value:
439
+ measurement_output_object['raw'] = raw_value
440
+
441
+ if 'type' in quantity:
442
+ quantity_type = quantity['type']
443
+
444
+ if 'rawUnit' in quantity:
445
+ has_unit = True
446
+
447
+ parsed_value_type = get_parsed_value_type(quantity)
448
+
449
+ elif type == 'interval':
450
+ if 'quantityMost' in measurement:
451
+ quantityMost = measurement['quantityMost']
452
+ if 'type' in quantityMost:
453
+ quantity_type = quantityMost['type']
454
+
455
+ if 'rawUnit' in quantityMost:
456
+ has_unit = True
457
+
458
+ parsed_value_type = get_parsed_value_type(quantityMost)
459
+
460
+ if 'quantityLeast' in measurement:
461
+ quantityLeast = measurement['quantityLeast']
462
+
463
+ if 'type' in quantityLeast:
464
+ quantity_type = quantityLeast['type']
465
+
466
+ if 'rawUnit' in quantityLeast:
467
+ has_unit = True
468
+
469
+ parsed_value_type = get_parsed_value_type(quantityLeast)
470
+
471
+ elif type == 'listc':
472
+ quantities = measurement['quantities']
473
+
474
+ if 'type' in quantities[0]:
475
+ quantity_type = quantities[0]['type']
476
+
477
+ if 'rawUnit' in quantities[0]:
478
+ has_unit = True
479
+
480
+ parsed_value_type = get_parsed_value_type(quantities[0])
481
+
482
+ if quantity_type is not None or has_unit:
483
+ measurement_output_object['type'] = quantity_type
484
+
485
+ if parsed_value_type is None or parsed_value_type not in ['ALPHABETIC', 'TIME']:
486
+ measurements_output.append(measurement_output_object)
487
+
488
+ return measurements_output
489
+
490
+ @staticmethod
491
+ def get_parsed(quantity):
492
+ parsed_value = parsed_unit = None
493
+ if 'parsedValue' in quantity and 'parsed' in quantity['parsedValue']:
494
+ parsed_value = quantity['parsedValue']['parsed']
495
+ if 'parsedUnit' in quantity and 'name' in quantity['parsedUnit']:
496
+ parsed_unit = quantity['parsedUnit']['name']
497
+
498
+ if parsed_value and parsed_unit:
499
+ if has_space_between_value_and_unit(quantity):
500
+ return str(parsed_value) + " " + str(parsed_unit)
501
+ else:
502
+ return str(parsed_value) + " " + str(parsed_unit)
503
+
504
+ @staticmethod
505
+ def get_normalized(quantity):
506
+ normalized_value = normalized_unit = None
507
+ if 'normalizedQuantity' in quantity:
508
+ normalized_value = quantity['normalizedQuantity']
509
+ if 'normalizedUnit' in quantity and 'name' in quantity['normalizedUnit']:
510
+ normalized_unit = quantity['normalizedUnit']['name']
511
+
512
+ if normalized_value and normalized_unit:
513
+ if has_space_between_value_and_unit(quantity):
514
+ return str(normalized_value) + " " + str(normalized_unit)
515
+ else:
516
+ return str(normalized_value) + str(normalized_unit)
517
+
518
+ @staticmethod
519
+ def get_raw(quantity):
520
+ raw_value = raw_unit = None
521
+ if 'rawValue' in quantity:
522
+ raw_value = quantity['rawValue']
523
+ if 'rawUnit' in quantity and 'name' in quantity['rawUnit']:
524
+ raw_unit = quantity['rawUnit']['name']
525
+
526
+ if raw_value and raw_unit:
527
+ if has_space_between_value_and_unit(quantity):
528
+ return str(raw_value) + " " + str(raw_unit)
529
+ else:
530
+ return str(raw_value) + str(raw_unit)
531
+
532
+
533
+ class GrobidMaterialsProcessor(BaseProcessor):
534
+ """NER processor for material mentions (chemical compounds, etc.).
535
+
536
+ Wraps the `grobid-superconductors <https://github.com/lfoppiano/grobid-superconductors>`_
537
+ service.
538
+
539
+ Args:
540
+ grobid_superconductors_client: A configured
541
+ :class:`~document_qa.ner_client_generic.NERClientGeneric` instance.
542
+ """
543
+
544
+ def __init__(self, grobid_superconductors_client):
545
+ self.grobid_superconductors_client = grobid_superconductors_client
546
+
547
+ def process(self, text):
548
+ """Extract material-mention spans from *text*.
549
+
550
+ Args:
551
+ text: Plain text to analyse.
552
+
553
+ Returns:
554
+ list[dict]: Span dicts with ``offset_start``, ``offset_end``,
555
+ ``type`` (``"material"``), and optional ``formula`` keys.
556
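+
+ Example (a sketch; assumes a running grobid-superconductors service):
+
+ >>> spans = processor.process("MgB2 is superconducting below 39 K.")  # doctest: +SKIP
+ >>> spans[0]["type"]  # doctest: +SKIP
+ 'material'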
+ """
557
+ preprocessed_text = text.strip()
558
+ status, result = self.grobid_superconductors_client.process_text(preprocessed_text,
559
+ "processText_disable_linking")
560
+
561
+ if status != 200:
562
+ result = {}
563
+
564
+ spans = []
565
+
566
+ if 'passages' in result:
567
+ materials = self.parse_superconductors_output(result, preprocessed_text)
568
+
569
+ for m in materials:
570
+ item = {"text": preprocessed_text[m['offset_start']:m['offset_end']]}
571
+
572
+ item['offset_start'] = m['offset_start']
573
+ item['offset_end'] = m['offset_end']
574
+
575
+ if 'formula' in m:
576
+ item["formula"] = m['formula']
577
+
578
+ item['type'] = 'material'
579
+ item['raw_value'] = m['text']
580
+
581
+ spans.append(item)
582
+
583
+ return spans
584
+
585
+ def parse_materials(self, text):
586
+ status, result = self.grobid_superconductors_client.process_texts(text.strip(), "parseMaterials")
587
+
588
+ if status != 200:
589
+ result = []
590
+
591
+ results = []
592
+ for position_material in result:
593
+ compositions = []
594
+ for material in position_material:
595
+ if 'resolvedFormulas' in material:
596
+ for resolved_formula in material['resolvedFormulas']:
597
+ if 'formulaComposition' in resolved_formula:
598
+ compositions.append(resolved_formula['formulaComposition'])
599
+ elif 'formula' in material:
600
+ if 'formulaComposition' in material['formula']:
601
+ compositions.append(material['formula']['formulaComposition'])
602
+ results.append(compositions)
603
+
604
+ return results
605
+
606
+ def parse_material(self, text):
607
+ status, result = self.grobid_superconductors_client.process_text(text.strip(), "parseMaterial")
608
+
609
+ if status != 200:
610
+ result = []
611
+
612
+ compositions = self.output_info(result)
613
+
614
+ return compositions
615
+
616
+ def output_info(self, result):
617
+ compositions = []
618
+ for material in result:
619
+ if 'resolvedFormulas' in material:
620
+ for resolved_formula in material['resolvedFormulas']:
621
+ if 'formulaComposition' in resolved_formula:
622
+ compositions.append(resolved_formula['formulaComposition'])
623
+ elif 'formula' in material:
624
+ if 'formulaComposition' in material['formula']:
625
+ compositions.append(material['formula']['formulaComposition'])
626
+ if 'name' in material:
627
+ compositions.append(material['name'])
628
+ return compositions
629
+
630
+ @staticmethod
631
+ def parse_superconductors_output(result, original_text):
632
+ materials = []
633
+
634
+ for passage in result['passages']:
635
+ sentence_offset = original_text.index(passage['text'])
636
+ if 'spans' in passage:
637
+ spans = passage['spans']
638
+ for material_span in filter(lambda s: s['type'] == '<material>', spans):
639
+ text_ = material_span['text']
640
+
641
+ base_material_information = {
642
+ "text": text_,
643
+ "offset_start": sentence_offset + material_span['offset_start'],
644
+ 'offset_end': sentence_offset + material_span['offset_end']
645
+ }
646
+
647
+ materials.append(base_material_information)
648
+
649
+ return materials
650
+
651
+
652
+ class GrobidAggregationProcessor(GrobidQuantitiesProcessor, GrobidMaterialsProcessor):
653
+ """Combined NER processor that merges quantity and material annotations.
654
+
655
+ Runs both :class:`GrobidQuantitiesProcessor` and
656
+ :class:`GrobidMaterialsProcessor`, then prunes overlapping spans so
657
+ that the output is clean and non-overlapping.
658
+
659
+ Args:
660
+ grobid_quantities_client: Optional quantities API client.
661
+ grobid_superconductors_client: Optional materials NER client.
662
+
663
+ Either or both clients may be ``None``; only the provided services
664
+ will be called.
665
+ """
666
+
667
+ def __init__(self, grobid_quantities_client=None, grobid_superconductors_client=None):
668
+ self.gqp = GrobidQuantitiesProcessor(grobid_quantities_client) if grobid_quantities_client else None
669
+ self.gmp = GrobidMaterialsProcessor(grobid_superconductors_client) if grobid_superconductors_client else None
672
+
673
+ def process_single_text(self, text):
674
+ """Run both NER services on *text* and return merged, deduplicated spans.
675
+
676
+ Args:
677
+ text: Plain text to process.
678
+
679
+ Returns:
680
+ list[dict]: Non-overlapping span dicts sorted by offset.
681
+ """
682
+ extracted_quantities_spans = self.process_properties(text)
683
+ extracted_materials_spans = self.process_materials(text)
684
+ all_entities = extracted_quantities_spans + extracted_materials_spans
685
+ entities = self.prune_overlapping_annotations(all_entities)
686
+ return entities
687
+
688
+ def process_properties(self, text):
689
+ if self.gqp:
690
+ return self.gqp.process(text)
691
+ else:
692
+ return []
693
+
694
+ def process_materials(self, text):
695
+ if self.gmp:
696
+ return self.gmp.process(text)
697
+ else:
698
+ return []
699
+
700
+ @staticmethod
701
+ def box_to_dict(box, color=None, type=None, border=None):
702
+ """Convert a GROBID coordinate list into an annotation dict.
703
+
704
+ Args:
705
+ box: List or tuple of ``[page, x, y, width, height]``.
706
+ color: Optional hex colour string for the annotation.
707
+ type: Optional annotation type label.
708
+ border: Optional border style (e.g. ``"dotted"``).
709
+
710
+ Returns:
711
+ dict: Annotation dict suitable for ``streamlit-pdf-viewer``,
712
+ or empty dict if *box* is invalid.
713
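+
+ Example:
+
+ >>> GrobidAggregationProcessor.box_to_dict([1, 53.2, 110.4, 431.0, 14.1], color="#FF0000")
+ {'page': 1, 'x': 53.2, 'y': 110.4, 'width': 431.0, 'height': 14.1, 'color': '#FF0000'}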
+ """
714
+
715
+ if box is None or box == "" or len(box) < 5:
716
+ return {}
717
+
718
+ item = {"page": box[0], "x": box[1], "y": box[2], "width": box[3], "height": box[4]}
719
+ if color:
720
+ item['color'] = color
721
+
722
+ if type:
723
+ item['type'] = type
724
+
725
+ if border:
726
+ item['border'] = border
727
+
728
+ return item
729
+
730
+ @staticmethod
731
+ def prune_overlapping_annotations(entities: list) -> list:
732
+ """Remove overlapping spans, keeping the most informative one.
733
+
734
+ When two spans overlap, the longer span is preferred. Adjacent
735
+ spans of the same type may be merged (e.g. a split decimal number).
736
+
737
+ Args:
738
+ entities: List of span dicts with ``offset_start``,
739
+ ``offset_end``, ``type``, and ``text`` keys.
740
+
741
+ Returns:
742
+ list[dict]: Pruned, non-overlapping spans sorted by offset.
743
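+
+ Example: when two spans cover exactly the same offsets, the
+ material span wins the tie:
+
+ >>> a = {"offset_start": 0, "offset_end": 4, "type": "material", "text": "MgB2"}
+ >>> b = {"offset_start": 0, "offset_end": 4, "type": "property", "text": "MgB2"}
+ >>> [e["type"] for e in GrobidAggregationProcessor.prune_overlapping_annotations([a, b])]
+ ['material']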
+ """
744
+ # Sorting by offsets
745
+ sorted_entities = sorted(entities, key=lambda d: d['offset_start'])
746
+
747
+ if len(entities) <= 1:
748
+ return sorted_entities
749
+
750
+ to_be_removed = []
751
+
752
+ previous = None
753
+ first = True
754
+
755
+ for current in sorted_entities:
756
+ if first:
757
+ first = False
758
+ previous = current
759
+ continue
760
+
761
+ if previous['offset_start'] < current['offset_start'] \
762
+ and previous['offset_end'] < current['offset_end'] \
763
+ and (previous['offset_end'] < current['offset_start'] \
764
+ and not (previous['text'] == "-" and current['text'][0].isdigit())):
765
+ previous = current
766
+ continue
767
+
768
+ if previous['offset_end'] < current['offset_end']:
769
+ if current['type'] == previous['type']:
770
+ # Type is the same
771
+ if current['offset_start'] == previous['offset_end']:
772
+ if current['type'] == 'property':
773
+ if current['text'].startswith("."):
774
+ print(
775
+ f"Merging. {current['text']} <{current['type']}> with {previous['text']} <{previous['type']}>")
776
+ # current entity starts with a ".", suspiciously looks like a truncated value
777
+ to_be_removed.append(previous)
778
+ current['text'] = previous['text'] + current['text']
779
+ current['raw_value'] = current['text']
780
+ current['offset_start'] = previous['offset_start']
781
+ elif previous['text'].endswith(".") and current['text'][0].isdigit():
782
+ print(
783
+ f"Merging. {current['text']} <{current['type']}> with {previous['text']} <{previous['type']}>")
784
+ # previous entity ends with ".", current entity starts with a number
785
+ to_be_removed.append(previous)
786
+ current['text'] = previous['text'] + current['text']
787
+ current['raw_value'] = current['text']
788
+ current['offset_start'] = previous['offset_start']
789
+ elif previous['text'].startswith("-"):
790
+ print(
791
+ f"Merging. {current['text']} <{current['type']}> with {previous['text']} <{previous['type']}>")
792
+ # previous starts with a `-`, sherlock this is another truncated value
793
+ current['text'] = previous['text'] + current['text']
794
+ current['raw_value'] = current['text']
795
+ current['offset_start'] = previous['offset_start']
796
+ to_be_removed.append(previous)
797
+ else:
798
+ print("Other cases to be considered: ", previous, current)
799
+ else:
800
+ if current['text'].startswith("-"):
801
+ print(
802
+ f"Merging. {current['text']} <{current['type']}> with {previous['text']} <{previous['type']}>")
803
+ # current starts with a `-`, sherlock this is another truncated value
804
+ current['text'] = previous['text'] + current['text']
805
+ current['raw_value'] = current['text']
806
+ current['offset_start'] = previous['offset_start']
807
+ to_be_removed.append(previous)
808
+ else:
809
+ print("Other cases to be considered: ", previous, current)
810
+
811
+ elif previous['text'] == "-" and current['text'][0].isdigit():
812
+ print(
813
+ f"Merging. {current['text']} <{current['type']}> with {previous['text']} <{previous['type']}>")
814
+ # previous is exactly "-" and current starts with a digit, sherlock this is another truncated value
815
+ current['text'] = previous['text'] + " " * (current['offset_start'] - previous['offset_end']) + \
816
+ current['text']
817
+ current['raw_value'] = current['text']
818
+ current['offset_start'] = previous['offset_start']
819
+ to_be_removed.append(previous)
820
+ else:
821
+ print(
822
+ f"Overlapping. {current['text']} <{current['type']}> with {previous['text']} <{previous['type']}>")
823
+
824
+ # take the largest one
825
+ if len(previous['text']) > len(current['text']):
826
+ to_be_removed.append(current)
827
+ elif len(previous['text']) < len(current['text']):
828
+ to_be_removed.append(previous)
829
+ else:
830
+ to_be_removed.append(previous)
831
+ elif current['type'] != previous['type']:
832
+ print(
833
+ f"Overlapping. {current['text']} <{current['type']}> with {previous['text']} <{previous['type']}>")
834
+
835
+ if len(previous['text']) > len(current['text']):
836
+ to_be_removed.append(current)
837
+ elif len(previous['text']) < len(current['text']):
838
+ to_be_removed.append(previous)
839
+ else:
840
+ if current['type'] == "material":
841
+ to_be_removed.append(previous)
842
+ else:
843
+ to_be_removed.append(current)
844
+ previous = current
845
+
846
+ elif previous['offset_end'] > current['offset_end']:
847
+ to_be_removed.append(current)
848
+ # the previous span extends past the current one, so we keep the previous and discard the current
849
+ else:
850
+ if current['type'] == "material":
851
+ to_be_removed.append(previous)
852
+ else:
853
+ to_be_removed.append(current)
854
+ previous = current
855
+
856
+ new_sorted_entities = [e for e in sorted_entities if e not in to_be_removed]
857
+
858
+ return new_sorted_entities
859
+
860
+
861
+ class XmlProcessor(BaseProcessor):
862
+ def __init__(self):
863
+ super().__init__()
864
+
865
+ def process_structure(self, input_file):
866
+ text = ""
867
+ with open(input_file, encoding='utf-8') as fi:
868
+ text = fi.read()
869
+
870
+ output_data = self.parse_xml(text)
871
+ output_data['filename'] = Path(input_file).stem.replace(".tei", "")
872
+
873
+ return output_data
874
+
875
+ # def process_single(self, input_file):
876
+ # doc = self.process_structure(input_file)
877
+ #
878
+ # for paragraph in doc['passages']:
879
+ # entities = self.process_single_text(paragraph['text'])
880
+ # paragraph['spans'] = entities
881
+ #
882
+ # return doc
883
+
884
+ def process(self, text):
885
+ output_data = OrderedDict()
886
+ soup = BeautifulSoup(text, 'xml')
887
+ text_blocks_children = get_children_list_supermat(soup, verbose=False)
888
+
889
+ passages = []
890
+ output_data['passages'] = passages
891
+ passages.extend([
892
+ {
893
+ "text": self.post_process(''.join(text for text in sentence.find_all(text=True) if
894
+ text.parent.name != "ref" or (
895
+ text.parent.name == "ref" and text.parent.attrs[
896
+ 'type'] != 'bibr'))),
897
+ "type": "paragraph",
898
+ "section": "<body>",
899
+ "subSection": "<paragraph>",
900
+ "passage_id": str(paragraph_id) + str(sentence_id)
901
+ }
902
+ for paragraph_id, paragraph in enumerate(text_blocks_children) for
903
+ sentence_id, sentence in enumerate(paragraph)
904
+ ])
905
+
906
+ return output_data
907
+
908
+
909
+ def get_children_list_supermat(soup, use_paragraphs=False, verbose=False):
910
+ children = []
911
+
912
+ child_name = "p" if use_paragraphs else "s"
913
+ for child in soup.tei.children:
914
+ if child.name == 'teiHeader':
915
+ pass
916
+ children.append(child.find_all("title"))
917
+ children.extend([subchild.find_all(child_name) for subchild in child.find_all("abstract")])
918
+ children.extend([subchild.find_all(child_name) for subchild in child.find_all("ab", {"type": "keywords"})])
919
+ elif child.name == 'text':
920
+ children.extend([subchild.find_all(child_name) for subchild in child.find_all("body")])
921
+
922
+ if verbose:
923
+ print(str(children))
924
+
925
+ return children
926
+
927
+
928
+ def get_children_list_grobid(soup: object, use_paragraphs: bool = True, verbose: bool = False) -> list:
929
+ children = []
930
+
931
+ child_name = "p" if use_paragraphs else "s"
932
+ for child in soup.TEI.children:
933
+ if child.name == 'teiHeader':
934
+ pass
935
+ # children.extend(child.find_all("title", attrs={"level": "a"}, limit=1))
936
+ # children.extend([subchild.find_all(child_name) for subchild in child.find_all("abstract")])
937
+ elif child.name == 'text':
938
+ children.extend([subchild.find_all(child_name) for subchild in child.find_all("body")])
939
+ children.extend([subchild.find_all("figDesc") for subchild in child.find_all("body")])
940
+
941
+ if verbose:
942
+ print(str(children))
943
+
944
+ return children
945
+
946
+
947
+ def get_xml_nodes_header(soup: object, use_paragraphs: bool = True) -> dict:
948
+ sub_tag = "p" if use_paragraphs else "s"
949
+
950
+ header_elements = {
951
+ "authors": [persNameNode for persNameNode in soup.teiHeader.find_all("persName")],
952
+ "abstract": [p_in_abstract for abstractNodes in soup.teiHeader.find_all("abstract") for p_in_abstract in
953
+ abstractNodes.find_all(sub_tag)],
954
+ "title": [soup.teiHeader.fileDesc.title]
955
+ }
956
+
957
+ return header_elements
958
+
959
+
960
+ def get_xml_nodes_body(soup: object, use_paragraphs: bool = True, verbose: bool = False) -> list:
961
+ nodes = []
962
+ tag_name = "p" if use_paragraphs else "s"
963
+ for child in soup.TEI.children:
964
+ if child.name == 'text':
965
+ # nodes.extend([subchild.find_all(tag_name) for subchild in child.find_all("body")])
966
+ nodes.extend(
967
+ [subsubchild for subchild in child.find_all("body") for subsubchild in subchild.find_all(tag_name)])
968
+
969
+ if verbose:
970
+ print(str(nodes))
971
+
972
+ return nodes
973
+
974
+
975
+ def get_xml_nodes_back(soup: object, use_paragraphs: bool = True, verbose: bool = False) -> list:
976
+ nodes = []
977
+ tag_name = "p" if use_paragraphs else "s"
978
+ for child in soup.TEI.children:
979
+ if child.name == 'text':
980
+ nodes.extend(
981
+ [subsubchild for subchild in child.find_all("back") for subsubchild in subchild.find_all(tag_name)])
982
+
983
+ if verbose:
984
+ print(str(nodes))
985
+
986
+ return nodes
987
+
988
+
989
+ def get_xml_nodes_figures(soup: object, verbose: bool = False) -> list:
990
+ children = []
991
+ for child in soup.TEI.children:
992
+ if child.name == 'text':
993
+ children.extend(
994
+ [subchild for subchilds in child.find_all("body") for subchild in subchilds.find_all("figDesc")])
995
+
996
+ if verbose:
997
+ print(str(children))
998
+
999
+ return children
document_qa/langchain.py ADDED
@@ -0,0 +1,222 @@
1
+ """LangChain vector store extensions for document-qa.
2
+
3
+ Extends ChromaDB with support for returning similarity scores **and**
4
+ raw embedding vectors alongside retrieved documents. This enables
5
+ the Streamlit frontend to compute relevance gradients and the
6
+ ``question_coefficient`` analysis mode.
7
+
8
+ """
9
+
10
+ from typing import Any, Optional, List, Dict, Tuple, ClassVar, Collection
11
+
12
+ from langchain.schema import Document
13
+ from langchain_community.vectorstores.chroma import Chroma, DEFAULT_K
14
+ from langchain_core.callbacks import CallbackManagerForRetrieverRun
15
+ from langchain_core.utils import xor_args
16
+ from langchain_core.vectorstores import VectorStore, VectorStoreRetriever
17
+
18
+
19
+ class AdvancedVectorStoreRetriever(VectorStoreRetriever):
20
+ """Retriever that can enrich documents with similarity scores and embeddings.
21
+
22
+ Extends LangChain's ``VectorStoreRetriever`` with a
23
+ ``"similarity_with_embeddings"`` search type. When used, each
24
+ returned document's ``metadata`` dict gains ``__similarity`` (float)
25
+ and ``__embeddings`` (list[float]) keys.
26
+ """
27
+
28
+ allowed_search_types: ClassVar[Collection[str]] = (
29
+ "similarity",
30
+ "similarity_score_threshold",
31
+ "mmr",
32
+ "similarity_with_embeddings"
33
+ )
34
+
35
+ def _get_relevant_documents(
36
+ self, query: str, *, run_manager: CallbackManagerForRetrieverRun
37
+ ) -> List[Document]:
38
+ """Fetch relevant documents for the configured search type.
39
+
40
+ Supports all standard search types plus
41
+ ``"similarity_with_embeddings"`` which attaches score and
42
+ embedding vector metadata to each document.
43
+
44
+ Args:
45
+ query: The search query string.
46
+ run_manager: LangChain callback manager.
47
+
48
+ Returns:
49
+ list[Document]: Retrieved documents, optionally enriched
50
+ with similarity scores and embeddings.
51
+ """
52
+
53
+ if self.search_type == "similarity_with_embeddings":
54
+ docs_scores_and_embeddings = (
55
+ self.vectorstore.advanced_similarity_search(
56
+ query, **self.search_kwargs
57
+ )
58
+ )
59
+
60
+ for doc, score, embeddings in docs_scores_and_embeddings:
61
+ if '__embeddings' not in doc.metadata.keys():
62
+ doc.metadata['__embeddings'] = embeddings
63
+ if '__similarity' not in doc.metadata.keys():
64
+ doc.metadata['__similarity'] = score
65
+
66
+ docs = [doc for doc, _, _ in docs_scores_and_embeddings]
67
+ elif self.search_type == "similarity_score_threshold":
68
+ docs_and_similarities = (
69
+ self.vectorstore.similarity_search_with_relevance_scores(
70
+ query, **self.search_kwargs
71
+ )
72
+ )
73
+ for doc, similarity in docs_and_similarities:
74
+ if '__similarity' not in doc.metadata.keys():
75
+ doc.metadata['__similarity'] = similarity
76
+
77
+ docs = [doc for doc, _ in docs_and_similarities]
78
+ else:
79
+ docs = super()._get_relevant_documents(query, run_manager=run_manager)
80
+
81
+ return docs
82
+
83
+
84
+ class AdvancedVectorStore(VectorStore):
85
+ """
86
+ Extension of LangChain's VectorStore that returns a custom retriever
87
+ supporting advanced search features.
88
+ """
89
+
90
+ def as_retriever(self, **kwargs: Any) -> AdvancedVectorStoreRetriever:
91
+ """Create a retriever supporting ``similarity_with_embeddings``.
92
+
93
+ Accepts the same keyword arguments as the base ``as_retriever``.
94
+ """
95
+ tags = kwargs.pop("tags", None) or []
96
+ tags.extend(self._get_retriever_tags())
97
+ return AdvancedVectorStoreRetriever(vectorstore=self, **kwargs, tags=tags)
98
+
99
+
100
+ class ChromaAdvancedRetrieval(Chroma, AdvancedVectorStore):
101
+ """Chroma vector store with support for embeddings + similarity scores.
102
+
103
+ Extends the standard LangChain ``Chroma`` store with
104
+ `advanced_similarity_search` which returns ``(Document, score,
105
+ embedding)`` triples.
106
+ """
107
+
108
+ def __init__(self, **kwargs):
109
+ super().__init__(**kwargs)
110
+
111
+ @xor_args(("query_texts", "query_embeddings"))
112
+ def __query_collection(
113
+ self,
114
+ query_texts: Optional[List[str]] = None,
115
+ query_embeddings: Optional[List[List[float]]] = None,
116
+ n_results: int = 4,
117
+ where: Optional[Dict[str, str]] = None,
118
+ where_document: Optional[Dict[str, str]] = None,
119
+ **kwargs: Any,
120
+ ) -> List[Document]:
121
+ """Query the chroma collection."""
122
+ try:
123
+ import chromadb # noqa: F401
124
+ except ImportError:
125
+ raise ValueError(
126
+ "Could not import chromadb python package. "
127
+ "Please install it with `pip install chromadb`."
128
+ )
129
+ return self._collection.query(
130
+ query_texts=query_texts,
131
+ query_embeddings=query_embeddings,
132
+ n_results=n_results,
133
+ where=where,
134
+ where_document=where_document,
135
+ **kwargs,
136
+ )
137
+
138
+ def advanced_similarity_search(
139
+ self,
140
+ query: str,
141
+ k: int = DEFAULT_K,
142
+ filter: Optional[Dict[str, str]] = None,
143
+ **kwargs: Any,
144
+ ) -> List[Tuple[Document, float, List[float]]]:
145
+ """Return documents, similarity scores, and embeddings for *query*.
146
+
147
+ Args:
148
+ query: The search query.
149
+ k: Number of results to return.
150
+ filter: Optional Chroma metadata filter.
151
+
152
+ Returns:
153
+ list[tuple[Document, float, list[float]]]: Triples of
154
+ (document, distance, embedding_vector).
155
+ """
156
+ docs_scores_and_embeddings = self.similarity_search_with_scores_and_embeddings(query, k, filter=filter)
157
+ return docs_scores_and_embeddings
158
+
159
+ def similarity_search_with_scores_and_embeddings(
160
+ self,
161
+ query: str,
162
+ k: int = DEFAULT_K,
163
+ filter: Optional[Dict[str, str]] = None,
164
+ where_document: Optional[Dict[str, str]] = None,
165
+ **kwargs: Any,
166
+ ) -> List[Tuple[Document, float, List[float]]]:
167
+ """Low-level search returning docs with scores and embeddings.
168
+
169
+ Queries the Chroma collection requesting ``distances`` and
170
+ ``embeddings`` in addition to the usual documents and metadata.
171
+
172
+ Args:
173
+ query: The search query.
174
+ k: Number of results.
175
+ filter: Optional metadata filter.
176
+ where_document: Optional document-content filter.
177
+
178
+ Returns:
179
+ list[tuple[Document, float, list[float]]]: Triples of
180
+ (document, distance, embedding_vector).
181
+ """
182
+
183
+ if self._embedding_function is None:
184
+ results = self.__query_collection(
185
+ query_texts=[query],
186
+ n_results=k,
187
+ where=filter,
188
+ where_document=where_document,
189
+ include=['metadatas', 'documents', 'embeddings', 'distances']
190
+ )
191
+ else:
192
+ query_embedding = self._embedding_function.embed_query(query)
193
+ results = self.__query_collection(
194
+ query_embeddings=[query_embedding],
195
+ n_results=k,
196
+ where=filter,
197
+ where_document=where_document,
198
+ include=['metadatas', 'documents', 'embeddings', 'distances']
199
+ )
200
+
201
+ return _results_to_docs_scores_and_embeddings(results)
202
+
203
+
204
+ def _results_to_docs_scores_and_embeddings(results: Any) -> List[Tuple[Document, float, List[float]]]:
205
+ """Unpack raw Chroma query results into ``(Document, score, embedding)`` tuples.
206
+
207
+ Args:
208
+ results: Dict returned by ``Collection.query()`` with
209
+ ``include=['documents', 'metadatas', 'distances', 'embeddings']``.
210
+
211
+ Returns:
212
+ list[tuple[Document, float, list[float]]]: One tuple per result.
213
+ """
214
+ return [
215
+ (Document(page_content=result[0], metadata=result[1] or {}), result[2], result[3])
216
+ for result in zip(
217
+ results["documents"][0],
218
+ results["metadatas"][0],
219
+ results["distances"][0],
220
+ results["embeddings"][0],
221
+ )
222
+ ]
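A minimal usage sketch of the pieces above (the two-passage corpus and embedding model are placeholders, not part of the repository): build a `ChromaAdvancedRetrieval` store, request the `similarity_with_embeddings` search type, and read the scores back from each document's metadata.

```python
from langchain_huggingface import HuggingFaceEmbeddings

from document_qa.langchain import ChromaAdvancedRetrieval

# Hypothetical corpus, embedded with an off-the-shelf sentence-transformers model.
store = ChromaAdvancedRetrieval.from_texts(
    ["MgB2 becomes superconducting below 39 K.", "The sample was annealed at 600 C."],
    embedding=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
)

retriever = store.as_retriever(search_type="similarity_with_embeddings", search_kwargs={"k": 2})
for doc in retriever.invoke("What is the critical temperature?"):
    # '__similarity' holds the Chroma distance (smaller = closer);
    # '__embeddings' holds the raw vector of the retrieved passage.
    print(round(doc.metadata["__similarity"], 3), doc.page_content)
```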
document_qa/ner_client_generic.py ADDED
@@ -0,0 +1,461 @@
+ import os
+ import time
+
+ import yaml
+
+ '''
+ This client is a generic client for any Grobid application and its sub-modules.
+ At the moment, it supports only single document processing.
+
+ Source: https://github.com/kermitt2/grobid-client-python
+ '''
+
+ """ Generic API Client """
+ from copy import deepcopy
+ import json
+ import requests
+
+ try:
+     from urlparse import urljoin
+ except ImportError:
+     from urllib.parse import urljoin
+
+
+ class ApiClient(object):
+     """ Client to interact with a generic REST API.
+
+     Subclasses should implement functionality in accordance with the
+     provided service methods, i.e. ``get``, ``post``, ``put`` and ``delete``.
+     """
+
+     accept_type = 'application/xml'
+     api_base = None
+
+     def __init__(
+             self,
+             base_url,
+             username=None,
+             api_key=None,
+             status_endpoint=None,
+             timeout=60
+     ):
+         """ Initialise client.
+
+         Args:
+             base_url (str): The base URL to the service being used.
+             username (str): The username to authenticate with.
+             api_key (str): The API key to authenticate with.
+             timeout (int): Maximum time before timing out.
+         """
+         self.base_url = base_url
+         self.username = username
+         self.api_key = api_key
+         self.status_endpoint = urljoin(self.base_url, status_endpoint)
+         self.timeout = timeout
+
+     @staticmethod
+     def encode(request, data):
+         """ Add request content data to request body, set Content-type header.
+
+         Should be overridden by subclasses if not using JSON encoding.
+
+         Args:
+             request (HTTPRequest): The request object.
+             data (dict, None): Data to be encoded.
+
+         Returns:
+             HTTPRequest: The request object.
+         """
+         if data is None:
+             return request
+
+         request.add_header('Content-Type', 'application/json')
+         request.extracted_data = json.dumps(data)
+
+         return request
+
+     @staticmethod
+     def decode(response):
+         """ Decode the returned data in the response.
+
+         Should be overridden by subclasses if something other than JSON is
+         expected.
+
+         Args:
+             response (HTTPResponse): The response object.
+
+         Returns:
+             dict or None.
+         """
+         try:
+             return response.json()
+         except ValueError as e:
+             return str(e)
+
+     def get_credentials(self):
+         """ Returns parameters to be added to authenticate the request.
+
+         This lives on its own to make it easier to re-implement it if needed.
+
+         Returns:
+             dict: A dictionary containing the credentials.
+         """
+         return {"username": self.username, "api_key": self.api_key}
+
+     def call_api(
+             self,
+             method,
+             url,
+             headers=None,
+             params=None,
+             data=None,
+             files=None,
+             timeout=None,
+     ):
+         """ Call the API.
+
+         This returns an object containing data, with error details if
+         applicable.
+
+         Args:
+             method (str): The HTTP method to use.
+             url (str): Resource location relative to the base URL.
+             headers (dict or None): Extra request headers to set.
+             params (dict or None): Query-string parameters.
+             data (dict or None): Request body contents for POST or PUT requests.
+             files (dict or None): Files to be passed to the request.
+             timeout (int): Maximum time before timing out.
+
+         Returns:
+             ResultParser or ErrorParser.
+         """
+         headers = deepcopy(headers) or {}
+         headers['Accept'] = self.accept_type if 'Accept' not in headers else headers['Accept']
+         params = deepcopy(params) or {}
+         data = data or {}
+         files = files or {}
+         # if self.username is not None and self.api_key is not None:
+         #     params.update(self.get_credentials())
+         r = requests.request(
+             method,
+             url,
+             headers=headers,
+             params=params,
+             files=files,
+             data=data,
+             timeout=timeout,
+         )
+
+         return r, r.status_code
+
+     def get(self, url, params=None, **kwargs):
+         """ Call the API with a GET request.
+
+         Args:
+             url (str): Resource location relative to the base URL.
+             params (dict or None): Query-string parameters.
+
+         Returns:
+             ResultParser or ErrorParser.
+         """
+         return self.call_api(
+             "GET",
+             url,
+             params=params,
+             **kwargs
+         )
+
+     def delete(self, url, params=None, **kwargs):
+         """ Call the API with a DELETE request.
+
+         Args:
+             url (str): Resource location relative to the base URL.
+             params (dict or None): Query-string parameters.
+
+         Returns:
+             ResultParser or ErrorParser.
+         """
+         return self.call_api(
+             "DELETE",
+             url,
+             params=params,
+             **kwargs
+         )
+
+     def put(self, url, params=None, data=None, files=None, **kwargs):
+         """ Call the API with a PUT request.
+
+         Args:
+             url (str): Resource location relative to the base URL.
+             params (dict or None): Query-string parameters.
+             data (dict or None): Request body contents.
+             files (dict or None): Files to be passed to the request.
+
+         Returns:
+             An instance of ResultParser or ErrorParser.
+         """
+         return self.call_api(
+             "PUT",
+             url,
+             params=params,
+             data=data,
+             files=files,
+             **kwargs
+         )
+
+     def post(self, url, params=None, data=None, files=None, **kwargs):
+         """ Call the API with a POST request.
+
+         Args:
+             url (str): Resource location relative to the base URL.
+             params (dict or None): Query-string parameters.
+             data (dict or None): Request body contents.
+             files (dict or None): Files to be passed to the request.
+
+         Returns:
+             An instance of ResultParser or ErrorParser.
+         """
+         return self.call_api(
+             method="POST",
+             url=url,
+             params=params,
+             data=data,
+             files=files,
+             **kwargs
+         )
+
+     def service_status(self, **kwargs):
+         """ Call the API to get the status of the service.
+
+         Returns:
+             An instance of ResultParser or ErrorParser.
+         """
+         return self.call_api(
+             'GET',
+             self.status_endpoint,
+             params={'format': 'json'},
+             **kwargs
+         )
+
+
+ class NERClientGeneric(ApiClient):
+
+     def __init__(self, config_path=None, ping=False):
+         self.config = None
+         if config_path is not None:
+             self.config = self._load_yaml_config_from_file(path=config_path)
+             super().__init__(self.config['grobid']['server'])
+
+             if ping:
+                 result = self.ping_service()
+                 if not result:
+                     raise Exception("Grobid is down.")
+
+         os.environ['NO_PROXY'] = "nims.go.jp"
+
+     @staticmethod
+     def _load_json_config_from_file(path='./config.json'):
+         """
+         Load the JSON configuration.
+         """
+         config = {}
+         with open(path, 'r') as fp:
+             config = json.load(fp)
+
+         return config
+
+     @staticmethod
+     def _load_yaml_config_from_file(path='./config.yaml'):
+         """
+         Load the YAML configuration.
+         """
+         config = {}
+         try:
+             with open(path, 'r') as the_file:
+                 raw_configuration = the_file.read()
+
+             config = yaml.safe_load(raw_configuration)
+         except Exception as e:
+             print("Configuration could not be loaded: ", str(e))
+             exit(1)
+
+         return config
+
+     def set_config(self, config, ping=False):
+         self.config = config
+         if ping:
+             try:
+                 result = self.ping_service()
+                 if not result:
+                     raise Exception("Grobid is down.")
+             except Exception as e:
+                 raise Exception("Grobid is down or other problems were encountered. ", e)
+
+     def ping_service(self):
+         # test if the server is up and running...
+         ping_url = self.get_url("ping")
+
+         r = requests.get(ping_url)
+         status = r.status_code
+
+         if status != 200:
+             print('GROBID server does not appear to be up and running: ' + str(status))
+             return False
+         else:
+             print("GROBID server is up and running")
+             return True
+
+     def get_url(self, action):
+         grobid_config = self.config['grobid']
+         base_url = grobid_config['server']
+         action_url = base_url + grobid_config['url_mapping'][action]
+
+         return action_url
+
+     def process_texts(self, input, method_name='superconductors', params={}, headers={"Accept": "application/json"}):
+
+         files = {
+             'texts': input
+         }
+
+         the_url = self.get_url(method_name)
+         params, the_url = self.get_params_from_url(the_url)
+
+         res, status = self.post(
+             url=the_url,
+             files=files,
+             data=params,
+             headers=headers
+         )
+
+         if status == 503:
+             time.sleep(self.config['sleep_time'])
+             return self.process_texts(input, method_name, params, headers)
+         elif status != 200:
+             print('Processing failed with error ' + str(status))
+             return status, None
+         else:
+             return status, json.loads(res.text)
+
+     def process_text(self, input, method_name='superconductors', params={}, headers={"Accept": "application/json"}):
+
+         files = {
+             'text': input
+         }
+
+         the_url = self.get_url(method_name)
+         params, the_url = self.get_params_from_url(the_url)
+
+         res, status = self.post(
+             url=the_url,
+             files=files,
+             data=params,
+             headers=headers
+         )
+
+         if status == 503:
+             time.sleep(self.config['sleep_time'])
+             return self.process_text(input, method_name, params, headers)
+         elif status != 200:
+             print('Processing failed with error ' + str(status))
+             return status, None
+         else:
+             return status, json.loads(res.text)
+
+     def process_pdf(self,
+                     form_data: dict,
+                     method_name='superconductors',
+                     params={},
+                     headers={"Accept": "application/json"}
+                     ):
+
+         the_url = self.get_url(method_name)
+         params, the_url = self.get_params_from_url(the_url)
+
+         res, status = self.post(
+             url=the_url,
+             files=form_data,
+             data=params,
+             headers=headers
+         )
+
+         if status == 503:
+             time.sleep(self.config['sleep_time'])
+             return self.process_pdf(form_data, method_name, params, headers)
+         elif status != 200:
+             print('Processing failed with error ' + str(status))
+         else:
+             return res.text
+
+     def process_pdfs(self, pdf_files, params={}):
+         pass
+
+     # NOTE: this definition shadows the form_data variant of process_pdf
+     # above; only this file-path variant is available on instances.
+     def process_pdf(
+             self,
+             pdf_file,
+             method_name,
+             params={},
+             headers={"Accept": "application/json"},
+             verbose=False,
+             retry=None
+     ):
+
+         files = {
+             'input': (
+                 pdf_file,
+                 open(pdf_file, 'rb'),
+                 'application/pdf',
+                 {'Expires': '0'}
+             )
+         }
+
+         the_url = self.get_url(method_name)
+
+         params, the_url = self.get_params_from_url(the_url)
+
+         res, status = self.post(
+             url=the_url,
+             files=files,
+             data=params,
+             headers=headers
+         )
+
+         if status == 503 or status == 429:
+             if retry is None:
+                 retry = self.config['max_retry'] - 1
+             else:
+                 if retry - 1 == 0:
+                     if verbose:
+                         print("Retries exhausted. Aborting request.")
+                     return None, status
+                 else:
+                     retry -= 1
+
+             sleep_time = self.config['sleep_time']
+             if verbose:
+                 print("Server is saturated, waiting", sleep_time, "seconds and trying again.")
+             time.sleep(sleep_time)
+             return self.process_pdf(pdf_file, method_name, params, headers, verbose=verbose, retry=retry)
+         elif status == 204:
+             # print('No content returned. Moving on. ')
+             return None, status
+         elif status != 200:
+             desc = None
+             if res.content:
+                 c = json.loads(res.text)
+                 desc = c['description'] if 'description' in c else None
+             return desc, status
+         else:
+             return res.text, status
+
+     def get_params_from_url(self, the_url):
+         """
+         Extract predefined query parameters embedded in the URL and return
+         them together with the URL stripped of its query string.
+         """
+         params = {}
+         if "?" in the_url:
+             split = the_url.split("?")
+             the_url = split[0]
+             params = split[1]
+
+             params = {param.split("=")[0]: param.split("=")[1] for param in params.split("&")}
+         return params, the_url
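A minimal configuration sketch for this client (server URL and endpoint mapping are placeholder values, not from the repository): endpoints are resolved through `config['grobid']['url_mapping']`, so a text-processing call reduces to a keyed lookup plus a POST.

```python
from document_qa.ner_client_generic import NERClientGeneric

# Hypothetical configuration; in the app these values come from the environment.
config = {
    'grobid': {
        'server': "http://localhost:8072",
        'sleep_time': 5,
        'timeout': 60,
        'url_mapping': {
            'superconductors': "/service/process/text",
        }
    }
}

client = NERClientGeneric()
client.set_config(config)

# POSTs the text to server + url_mapping['superconductors']; returns (status, parsed JSON).
status, entities = client.process_text("MgB2 superconducts below 39 K.", method_name='superconductors')
```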
pyproject.toml ADDED
@@ -0,0 +1,41 @@
+ [build-system]
+ requires = ["setuptools", "setuptools-scm"]
+ build-backend = "setuptools.build_meta"
+
+ [tool.bumpversion]
+ current_version = "0.5.1"
+ commit = "true"
+ tag = "true"
+ tag_name = "v{new_version}"
+
+ #[[tool.bumpversion.files]]
+ #filename = "version.txt"
+ #search = "{current_version}"
+ #replace = "{new_version}"
+
+ [project]
+ name = "document-qa-engine"
+ license = { file = "LICENSE" }
+ authors = [
+     { name = "Luca Foppiano", email = "lucanoro@duck.com" },
+ ]
+ maintainers = [
+     { name = "Luca Foppiano", email = "lucanoro@duck.com" }
+ ]
+ description = "Scientific Document Insight Q/A"
+ readme = "README.md"
+
+ dynamic = ['version', "dependencies"]
+
+ [tool.setuptools]
+ license-files = []
+
+ [tool.setuptools.dynamic]
+ dependencies = {file = ["requirements.txt"]}
+
+ [tool.setuptools_scm]
+
+ [project.urls]
+ Homepage = "https://document-insights.streamlit.app"
+ Repository = "https://github.com/lfoppiano/document-qa"
+ Changelog = "https://github.com/lfoppiano/document-qa/blob/main/CHANGELOG.md"
pytest.ini ADDED
@@ -0,0 +1,2 @@
+ [pytest]
+ testpaths = tests
requirements.txt ADDED
@@ -0,0 +1,34 @@
+ # Grobid
+ grobid-quantities-client==0.4.0
+ grobid-client-python==0.0.9
+ grobid-tei-xml==0.1.3
+
+ # Utils
+ tqdm==4.66.3
+ pyyaml==6.0.1
+ pytest==8.1.1
+ streamlit==1.45.1
+ lxml==5.2.1
+ beautifulsoup4==4.12.3
+ python-dotenv==1.0.1
+ watchdog==4.0.0
+ dateparser==1.2.0
+ requests>=2.31.0
+ numpy==1.26.4
+
+ # LLM
+ chromadb==0.4.24
+ tiktoken==0.9.0
+ openai==1.82.0
+ langchain==0.3.25
+ langchain-core==0.3.61
+ langchain-openai==0.3.18
+ langchain-huggingface==0.2.0
+ langchain-community==0.3.21
+ typing-inspect==0.9.0
+ typing_extensions==4.12.2
+ pydantic==2.10.6
+ sentence-transformers==2.6.1
+ streamlit-pdf-viewer==0.0.25
+ umap-learn==0.5.6
+ plotly==5.20.0
streamlit_app.py ADDED
@@ -0,0 +1,490 @@
+ """Streamlit frontend for the Document Q/A system.
+
+ This module implements the web UI for uploading scientific PDFs,
+ asking questions via an LLM-powered RAG pipeline, and viewing
+ highlighted PDF passages. It is the main entry-point when running::
+
+     streamlit run streamlit_app.py
+
+ Configuration is loaded from environment variables (see ``.env.example``).
+ """
+
+ import os
+ import re
+ from hashlib import blake2b
+ from tempfile import NamedTemporaryFile
+
+ import dotenv
+ from grobid_quantities.quantities import QuantitiesAPI
+ from langchain.memory import ConversationBufferMemory
+ from langchain_openai import ChatOpenAI
+ from streamlit_pdf_viewer import pdf_viewer
+
+ from document_qa.custom_embeddings import ModalEmbeddings
+ from document_qa.ner_client_generic import NERClientGeneric
+
+ dotenv.load_dotenv(override=True)
+
+ import streamlit as st
+ from document_qa.document_qa_engine import DocumentQAEngine, DataStorage
+ from document_qa.grobid_processors import GrobidAggregationProcessor, decorate_text_with_annotations
+
+ API_MODELS = {
+     "microsoft/Phi-4-mini-instruct": os.environ["PHI_URL"],
+     "Qwen/Qwen3-0.6B": os.environ["QWEN_URL"]
+ }
+
+ API_EMBEDDINGS = {
+     'intfloat/multilingual-e5-large-instruct-modal': os.environ['EMBEDS_URL']
+ }
+
+ if 'rqa' not in st.session_state:
+     st.session_state['rqa'] = {}
+
+ if 'model' not in st.session_state:
+     st.session_state['model'] = None
+
+ if 'api_keys' not in st.session_state:
+     st.session_state['api_keys'] = {}
+
+ if 'doc_id' not in st.session_state:
+     st.session_state['doc_id'] = None
+
+ if 'loaded_embeddings' not in st.session_state:
+     st.session_state['loaded_embeddings'] = None
+
+ if 'hash' not in st.session_state:
+     st.session_state['hash'] = None
+
+ if 'git_rev' not in st.session_state:
+     st.session_state['git_rev'] = "unknown"
+     if os.path.exists("revision.txt"):
+         with open("revision.txt", 'r') as fr:
+             from_file = fr.read()
+         st.session_state['git_rev'] = from_file if len(from_file) > 0 else "unknown"
+
+ if "messages" not in st.session_state:
+     st.session_state.messages = []
+
+ if 'ner_processing' not in st.session_state:
+     st.session_state['ner_processing'] = False
+
+ if 'uploaded' not in st.session_state:
+     st.session_state['uploaded'] = False
+
+ if 'memory' not in st.session_state:
+     st.session_state['memory'] = None
+
+ if 'binary' not in st.session_state:
+     st.session_state['binary'] = None
+
+ if 'annotations' not in st.session_state:
+     st.session_state['annotations'] = None
+
+ if 'should_show_annotations' not in st.session_state:
+     st.session_state['should_show_annotations'] = True
+
+ if 'pdf' not in st.session_state:
+     st.session_state['pdf'] = None
+
+ if 'embeddings' not in st.session_state:
+     st.session_state['embeddings'] = None
+
+ if 'scroll_to_first_annotation' not in st.session_state:
+     st.session_state['scroll_to_first_annotation'] = False
+
+ st.set_page_config(
+     page_title="Scientific Document Insights Q/A",
+     page_icon="📝",
+     initial_sidebar_state="expanded",
+     layout="wide",
+     menu_items={
+         'Get Help': 'https://github.com/lfoppiano/document-qa',
+         'Report a bug': "https://github.com/lfoppiano/document-qa/issues",
+         'About': "Upload a scientific article in PDF, ask questions, get insights."
+     }
+ )
+
+ st.markdown(
+     """
+     <style>
+     .block-container {
+         padding-top: 3rem;
+         padding-bottom: 1rem;
+         padding-left: 1rem;
+         padding-right: 1rem;
+     }
+     </style>
+     """,
+     unsafe_allow_html=True
+ )
+
+
+ def new_file():
+     """Reset session state when a new file is uploaded.
+
+     Clears previous embeddings, annotations, and conversation memory
+     so the pipeline starts fresh for the new document.
+     """
+     st.session_state['loaded_embeddings'] = None
+     st.session_state['doc_id'] = None
+     st.session_state['uploaded'] = True
+     st.session_state['annotations'] = []
+     if st.session_state['memory']:
+         st.session_state['memory'].clear()
+
+
+ def clear_memory():
+     """Clear the conversation buffer memory (chat history)."""
+     st.session_state['memory'].clear()
+
+
+ # @st.cache_resource
+ def init_qa(model_name, embeddings_name):
+     """Initialise the Q/A engine with the selected LLM and embedding models.
+
+     Args:
+         model_name: Key from ``API_MODELS`` selecting the LLM.
+         embeddings_name: Key from ``API_EMBEDDINGS`` selecting the
+             embedding model.
+
+     Returns:
+         DocumentQAEngine: Ready-to-use engine instance.
+     """
+     st.session_state['memory'] = ConversationBufferMemory(
+         memory_key="chat_history",
+         return_messages=True
+     )
+     chat = ChatOpenAI(
+         model=model_name,
+         temperature=0.0,
+         base_url=API_MODELS[model_name],
+         api_key=os.environ.get('API_KEY')
+     )
+
+     embeddings = ModalEmbeddings(
+         url=API_EMBEDDINGS[embeddings_name],
+         model_name=embeddings_name,
+         api_key=os.environ.get('EMBEDS_API_KEY')
+     )
+
+     storage = DataStorage(embeddings)
+     return DocumentQAEngine(chat, storage, grobid_url=os.environ['GROBID_URL'], memory=st.session_state['memory'])
+
+
+ @st.cache_resource
+ def init_ner():
+     """Initialise the NER aggregation processor (quantities + materials).
+
+     Uses ``GROBID_QUANTITIES_URL`` and ``GROBID_MATERIALS_URL`` from
+     the environment. Results are cached across Streamlit reruns.
+
+     Returns:
+         GrobidAggregationProcessor: Configured processor instance.
+     """
+     quantities_client = QuantitiesAPI(os.environ['GROBID_QUANTITIES_URL'], check_server=True)
+
+     materials_client = NERClientGeneric(ping=True)
+     config_materials = {
+         'grobid': {
+             "server": os.environ['GROBID_MATERIALS_URL'],
+             'sleep_time': 5,
+             'timeout': 60,
+             'url_mapping': {
+                 'processText_disable_linking': "/service/process/text?disableLinking=True",
+                 # 'processText_disable_linking': "/service/process/text"
+             }
+         }
+     }
+
+     materials_client.set_config(config_materials)
+
+     gqa = GrobidAggregationProcessor(grobid_quantities_client=quantities_client,
+                                      grobid_superconductors_client=materials_client)
+     return gqa
+
+
+ gqa = init_ner()
+
+
+ def get_file_hash(fname):
+     """Compute a BLAKE2b hex digest for the file at *fname*.
+
+     Used to generate deterministic document IDs from file content.
+     """
+     file_hash = blake2b()
+     with open(fname, "rb") as f:
+         for chunk in iter(lambda: f.read(4096), b""):
+             file_hash.update(chunk)
+     return file_hash.hexdigest()
+
+
+ def play_old_messages(container):
+     """Re-render previous chat messages into *container*.
+
+     Called on Streamlit reruns to restore the visible conversation
+     history from ``st.session_state['messages']``.
+     """
+     if st.session_state['messages']:
+         for message in st.session_state['messages']:
+             if message['role'] == 'user':
+                 container.chat_message("user").markdown(message['content'])
+             elif message['role'] == 'assistant':
+                 if message['mode'] == "llm":
+                     container.chat_message("assistant").markdown(message['content'], unsafe_allow_html=True)
+                 else:
+                     container.chat_message("assistant").write(message['content'])
+
+
+ # is_api_key_provided = st.session_state['api_key']
+
+ with st.sidebar:
+     st.title("📝 Document Q/A")
+     st.markdown("Upload a scientific article in PDF, ask questions, get insights.")
+     st.markdown(
+         ":warning: [Usage disclaimer](https://github.com/lfoppiano/document-qa?tab=readme-ov-file#disclaimer-on-data-security-and-privacy-%EF%B8%8F) :warning: ")
+     st.markdown("The LLM and embeddings are powered by [Modal.com](https://modal.com/)")
+
+     st.divider()
+     st.session_state['model'] = model = st.selectbox(
+         "Model:",
+         options=API_MODELS.keys(),
+         index=(list(API_MODELS.keys())).index(
+             os.environ["DEFAULT_MODEL"]) if "DEFAULT_MODEL" in os.environ and os.environ["DEFAULT_MODEL"] else 0,
+         placeholder="Select model",
+         help="Select the LLM model",
+         disabled=st.session_state['doc_id'] is not None or st.session_state['uploaded']
+     )
+
+     st.session_state['embeddings'] = embedding_name = st.selectbox(
+         "Embeddings:",
+         options=API_EMBEDDINGS.keys(),
+         index=(list(API_EMBEDDINGS.keys())).index(
+             os.environ["DEFAULT_EMBEDDING"]) if "DEFAULT_EMBEDDING" in os.environ and os.environ[
+             "DEFAULT_EMBEDDING"] else 0,
+         placeholder="Select embedding",
+         help="Select the embedding function",
+         disabled=st.session_state['doc_id'] is not None or st.session_state['uploaded']
+     )
+
+     api_key = os.environ['API_KEY']
+
+     if model not in st.session_state['rqa'] or model not in st.session_state['api_keys']:
+         with st.spinner("Preparing environment"):
+             st.session_state['rqa'][model] = init_qa(model, st.session_state['embeddings'])
+             st.session_state['api_keys'][model] = api_key
+
+ left_column, right_column = st.columns([5, 4])
+ right_column = right_column.container(border=True)
+ left_column = left_column.container(border=True)
+
+ with right_column:
+     uploaded_file = st.file_uploader(
+         "Upload a scientific article",
+         type="pdf",
+         on_change=new_file,
+         disabled=st.session_state['model'] is not None and st.session_state['model'] not in
+                  st.session_state['api_keys'],
+         help="The full text is extracted using Grobid."
+     )
+
+     placeholder = st.empty()
+     messages = st.container(height=300)
+
+ question = st.chat_input(
+     "Ask something about the article",
+     # placeholder="Can you give me a short summary?",
+     disabled=not uploaded_file
+ )
+
+ query_modes = {
+     "llm": "LLM Q/A",
+     "embeddings": "Embeddings",
+     "question_coefficient": "Question coefficient"
+ }
+
+ with st.sidebar:
+     st.header("Settings")
+     mode = st.radio(
+         "Query mode",
+         ("llm", "embeddings", "question_coefficient"),
+         disabled=not uploaded_file,
+         index=0,
+         horizontal=True,
+         format_func=lambda x: query_modes[x],
+         help="LLM answers the question; Embeddings shows the paragraphs of the paper "
+              "most relevant to the question; Question coefficient attempts to estimate "
+              "how effectively the question can be answered."
+     )
+     st.session_state['scroll_to_first_annotation'] = st.checkbox(
+         "Scroll to context",
+         help='The PDF viewer will automatically scroll to the first relevant passage in the document.'
+     )
+     st.session_state['ner_processing'] = st.checkbox(
+         "Identify materials and properties.",
+         help='The LLM responses undergo post-processing to extract physical quantities, measurements, and materials mentions.'
+     )
+
+     # Add a checkbox for showing annotations
+     # st.session_state['show_annotations'] = st.checkbox("Show annotations", value=True)
+     # st.session_state['should_show_annotations'] = st.checkbox("Show annotations", value=True)
+
+     chunk_size = st.slider("Text chunks size", -1, 2000, value=-1,
+                            help="Size of the chunks into which the document is split. "
+                                 "-1: use paragraphs; > 0: paragraphs are aggregated into chunks of the given size.",
+                            disabled=uploaded_file is not None)
+     if chunk_size == -1:
+         context_size = st.slider("Context size (paragraphs)", 3, 20, value=10,
+                                  help="Number of paragraphs to consider when answering a question",
+                                  disabled=not uploaded_file)
+     else:
+         context_size = st.slider("Context size (chunks)", 3, 10, value=4,
+                                  help="Number of chunks to consider when answering a question",
+                                  disabled=not uploaded_file)
+
+     st.divider()
+
+     st.header("Documentation")
+     st.markdown("https://github.com/lfoppiano/document-qa")
+     st.markdown(
+         """Upload a scientific article as PDF document. Once the spinner stops, you can proceed to ask your questions.""")
+
+     if st.session_state['git_rev'] != "unknown":
+         st.markdown("**Revision number**: [" + st.session_state[
+             'git_rev'] + "](https://github.com/lfoppiano/document-qa/commit/" + st.session_state['git_rev'] + ")")
+
+ if uploaded_file and not st.session_state.loaded_embeddings:
+     if model not in st.session_state['api_keys']:
+         st.error("Before uploading a document, you must enter the API key.")
+         st.stop()
+
+     with left_column:
+         with st.spinner('Reading file, calling Grobid, and creating in-memory embeddings...'):
+             binary = uploaded_file.getvalue()
+             tmp_file = NamedTemporaryFile()
+             tmp_file.write(bytearray(binary))
+             tmp_file.flush()
+             st.session_state['binary'] = binary
+
+             st.session_state['doc_id'] = st.session_state['rqa'][model].create_memory_embeddings(
+                 tmp_file.name,
+                 chunk_size=chunk_size,
+                 perc_overlap=0.1
+             )
+             st.session_state['loaded_embeddings'] = True
+             st.session_state.messages = []
+
+
+ def rgb_to_hex(rgb):
+     """Convert an ``(R, G, B)`` tuple to a ``#rrggbb`` hex string."""
+     return "#{:02x}{:02x}{:02x}".format(*rgb)
+
+
+ def generate_color_gradient(num_elements):
+     """Generate a warm-to-cold hex colour gradient for annotation ranking.
+
+     The first colour (most relevant passage) is orange; the last (least
+     relevant) is blue. Intermediate colours are linearly interpolated.
+
+     Args:
+         num_elements: Number of gradient stops to produce.
+
+     Returns:
+         list[str]: Hex colour strings, e.g. ``['#ffa500', …, '#0000ff']``.
+     """
+     # Define warm and cold colors in RGB format
+     warm_color = (255, 165, 0)  # Orange
+     cold_color = (0, 0, 255)  # Blue
+
+     # Generate a linear gradient of colors
+     color_gradient = [
+         rgb_to_hex(tuple(int(warm * (1 - i / num_elements) + cold * (i / num_elements)) for warm, cold in
+                          zip(warm_color, cold_color)))
+         for i in range(num_elements)
+     ]
+
+     return color_gradient
+
+
+ with right_column:
+     if st.session_state.loaded_embeddings and question and len(question) > 0 and st.session_state.doc_id:
+         st.session_state.messages.append({"role": "user", "mode": mode, "content": question})
+
+         for message in st.session_state.messages:
+             # with messages.chat_message(message["role"]):
+             if message['mode'] == "llm":
+                 messages.chat_message(message["role"]).markdown(message["content"], unsafe_allow_html=True)
+             elif message['mode'] == "embeddings":
+                 messages.chat_message(message["role"]).write(message["content"])
+             elif message['mode'] == "question_coefficient":
+                 messages.chat_message(message["role"]).markdown(message["content"], unsafe_allow_html=True)
+         if model not in st.session_state['rqa']:
+             st.error("The API key for " + model + " is missing. Please add it before sending any query.")
+             st.stop()
+
+         text_response = None
+         if mode == "embeddings":
+             with placeholder:
+                 with st.spinner("Fetching the relevant context..."):
+                     text_response, coordinates = st.session_state['rqa'][model].query_storage(
+                         question,
+                         st.session_state.doc_id,
+                         context_size=context_size
+                     )
+         elif mode == "llm":
+             with placeholder:
+                 with st.spinner("Generating LLM response..."):
+                     _, text_response, coordinates = st.session_state['rqa'][model].query_document(
+                         question,
+                         st.session_state.doc_id,
+                         context_size=context_size
+                     )
+
+         elif mode == "question_coefficient":
+             with st.spinner("Estimating question/context relevancy..."):
+                 text_response, coordinates = st.session_state['rqa'][model].analyse_query(
+                     question,
+                     st.session_state.doc_id,
+                     context_size=context_size
+                 )
+
+         annotations = [[GrobidAggregationProcessor.box_to_dict([cs for cs in c.split(",")]) for c in coord_doc]
+                        for coord_doc in coordinates]
+         gradients = generate_color_gradient(len(annotations))
+         for i, color in enumerate(gradients):
+             for annotation in annotations[i]:
+                 annotation['color'] = color
+                 if i == 0:
+                     annotation['border'] = "dotted"
+
+         st.session_state['annotations'] = [annotation for annotation_doc in annotations for annotation in
+                                            annotation_doc]
+
+         if not text_response:
+             st.error("Something went wrong. Please report the issue on GitHub, or contact info AT sciencialab.com.")
+
+         if mode == "llm":
+             if st.session_state['ner_processing']:
+                 with st.spinner("Processing NER on LLM response..."):
+                     entities = gqa.process_single_text(text_response)
+                     decorated_text = decorate_text_with_annotations(text_response.strip(), entities)
+                     decorated_text = decorated_text.replace('class="label material"', 'style="color:green"')
+                     decorated_text = re.sub(r'class="label[^"]+"', 'style="color:orange"', decorated_text)
+                     text_response = decorated_text
+             messages.chat_message("assistant").markdown(text_response, unsafe_allow_html=True)
+         else:
+             messages.chat_message("assistant").write(text_response)
+         st.session_state.messages.append({"role": "assistant", "mode": mode, "content": text_response})
+
+     elif st.session_state.loaded_embeddings and st.session_state.doc_id:
+         play_old_messages(messages)
+
+ with left_column:
+     if st.session_state['binary']:
+         with st.container(height=600):
+             pdf_viewer(
+                 input=st.session_state['binary'],
+                 annotation_outline_size=2,
+                 annotations=st.session_state['annotations'] if st.session_state['annotations'] else [],
+                 render_text=True,
+                 scroll_to_annotation=1 if (st.session_state['annotations'] and st.session_state[
+                     'scroll_to_first_annotation']) else None
+             )
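As a quick sanity check of the interpolation in `generate_color_gradient` (the function defined in `streamlit_app.py` above): rank 0 is pure orange, and later ranks shift linearly towards blue without ever reaching it, because `i` stops at `num_elements - 1`. The hex values below are approximate, subject to integer truncation.

```python
# Assumes generate_color_gradient from streamlit_app.py is in scope.
print(generate_color_gradient(3))
# Approximately ['#ffa500', '#aa6e54', '#5537a9']: orange fading towards blue.
```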