lfoppiano committed
Commit 916dea4 · verified · 1 Parent(s): 290feff

Upload folder using huggingface_hub
.devcontainer/devcontainer.json ADDED
@@ -0,0 +1,33 @@
{
  "name": "Python 3",
  // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
  "image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye",
  "customizations": {
    "codespaces": {
      "openFiles": [
        "README.md",
        "streamlit_app.py"
      ]
    },
    "vscode": {
      "settings": {},
      "extensions": [
        "ms-python.python",
        "ms-python.vscode-pylance"
      ]
    }
  },
  "updateContentCommand": "[ -f packages.txt ] && sudo apt update && sudo apt upgrade -y && sudo xargs apt install -y <packages.txt; [ -f requirements.txt ] && pip3 install --user -r requirements.txt; pip3 install --user streamlit; echo '✅ Packages installed and Requirements met'",
  "postAttachCommand": {
    "server": "streamlit run streamlit_app.py --server.enableCORS false --server.enableXsrfProtection false"
  },
  "portsAttributes": {
    "8501": {
      "label": "Application",
      "onAutoForward": "openPreview"
    }
  },
  "forwardPorts": [
    8501
  ]
}
.env.example ADDED
@@ -0,0 +1,18 @@
PHI_URL=....
QWEN_URL=...

EMBEDS_URL=...
DEFAULT_MODEL=microsoft/Phi-4-mini-instruct
DEFAULT_EMBEDDING=intfloat/multilingual-e5-large-instruct-modal

API_KEY=...
EMBEDS_API_KEY=...

GROBID_URL=...
GROBID_QUANTITIES_URL=...


QWEN_URL=...
GROBID_MATERIALS_URL=...
API_KEY=...
EMBEDS_API_KEY=...
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ docs/images/screenshot1.png filter=lfs diff=lfs merge=lfs -text
+ docs/images/screenshot2.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,10 @@
.idea
.env
.env.docker
**/**/.chroma
resources/db
build
dist
__pycache__
document_qa/__pycache__
document_qa_engine.egg-info/
.streamlit/config.toml ADDED
@@ -0,0 +1,8 @@
[logger]
level = "info"

[browser]
gatherUsageStats = true

[ui]
hideTopBar = true
CHANGELOG.md ADDED
@@ -0,0 +1,151 @@
# Changelog

All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [0.4.2] - 2024-08-23

### Fixed
+ Removed an invalid promptlayer dependency that had slipped into the build

## [0.4.1] - 2024-08-23

### Added
+ Scroll to the first relevant context passage; if the most relevant passage is at the end, the viewer scrolls to the end of the document
+ Added Mistral NEMO as the default model

### Changed
+ Rearranged the interface to gain more space
+ Updated libraries to their latest versions

### Fixed
+ Fixed the buggy chat message sequence
+ Updated the PDF viewer to the latest version

## [0.4.0] - 2024-06-24

### Added
+ Added selection of embedding functions
+ Added text selection from the PDF viewer (provided by https://github.com/lfoppiano/streamlit-pdf-viewer)
+ Added an experimental feature for calculating a coefficient that relates the question to the embedding database
+ Added the data availability statement to the searchable text

### Changed
+ Removed the obsolete and non-working zephyr and mistral v0.1 models
+ Refactored the underlying library to make it easier to maintain
+ Removed the native PDF viewer
+ Updated langchain and streamlit to the latest versions
+ Removed conversational memory, which was causing more problems than it brought benefits
+ Rearranged the interface to gain more space

### Fixed
+ Updated or removed models that were not working
+ Fixed problems with langchain and other libraries

## [0.3.4] - 2023-12-26

### Added

+ Added gpt4 and gpt4-turbo

### Changed

+ Improved UI: replaced combo boxes with a dropdown box

### Fixed

+ Fixed dependencies when installing as a library

## [0.3.3] - 2023-12-14

### Added

+ Added experimental PDF rendering in the page

### Fixed

+ Fixed the GrobidProcessors API implementation

## [0.3.2] - 2023-12-01

### Fixed

+ Removed memory when using Zephyr-7b-beta, which hallucinates easily

## [0.3.1] - 2023-11-22

### Added

+ Include biblio in embeddings by @lfoppiano in #21

### Fixed

+ Fix conversational memory by @lfoppiano in #20

## [0.3.0] - 2023-11-18

### Added

+ Add zephyr-7b by @lfoppiano in #15
+ Add conversational memory in #18

## [0.2.1] - 2023-11-01

### Fixed

+ Fix env variables by @lfoppiano in #9

## [0.2.0] - 2023-10-31

### Added

+ Selection of the chunk size on which embeddings are created
+ Mistral model, usable freely via the Hugging Face free API

### Changed

+ Improved documentation, adding a privacy statement
+ Moved settings to the sidebar
+ Disabled NER extraction by default, allowing users to activate it
+ Read the API key from the environment variables and, if present, avoid asking the user for it
+ Avoid changing the model after an update

## [0.1.3] - 2023-10-30

### Fixed

+ ChromaDb accumulating information even when new papers were uploaded

## [0.1.2] - 2023-10-26

### Fixed

+ Docker build

## [0.1.1] - 2023-10-26

### Fixed

+ GitHub Actions build
+ Dependencies of langchain and chromadb

## [0.1.0] - 2023-10-26

### Added

+ PyPI package
+ Docker package release

## [0.0.1] - 2023-10-26

### Added

+ Kick-off application
+ Support for GPT-3.5
+ Support for Mistral + SentenceTransformer
+ Streamlit application
+ Docker image
+ PyPI package

<!-- markdownlint-disable-file MD024 MD033 -->
Dockerfile CHANGED
@@ -1,11 +1,30 @@
- FROM lfoppiano/document-insights-qa:latest-develop
- USER root
+ FROM python:3.12-slim
+
  WORKDIR /app
- RUN mkdir -m 777 -p /app/.cache
- RUN mkdir -m 777 -p /.cache
-
- COPY --chown=lfoppiano config.toml .streamlit/config.toml
+
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     curl \
+     git \
+     && rm -rf /var/lib/apt/lists/*
+
+ COPY requirements.txt .
+
+ RUN pip3 install -r requirements.txt
+
+ COPY .streamlit ./.streamlit
+ COPY document_qa ./document_qa
+ COPY streamlit_app.py .
+
+ # extract version
+ COPY .git ./.git
+ RUN git rev-parse --short HEAD > revision.txt
+ RUN rm -rf ./.git
+
+ EXPOSE 8501
+
+ HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health

  ENV PYTHONPATH "${PYTHONPATH}:."

- ENTRYPOINT ["streamlit", "run", "streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
+ ENTRYPOINT ["streamlit", "run", "streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
LICENSE ADDED
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
README.md CHANGED
@@ -1,12 +1,116 @@
  ---
- title: Scientific Document Insights Q/A - Develop
- emoji: 📊
- colorFrom: blue
- colorTo: yellow
- sdk: docker
+ title: Scientific Document Insights Q/A
+ emoji: 📝
+ colorFrom: yellow
+ colorTo: pink
+ sdk: streamlit
+ sdk_version: 1.37.1
+ app_file: streamlit_app.py
  pinned: false
  license: apache-2.0
  app_port: 8501
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # DocumentIQA: Scientific Document Insights Q/A
+
+ **Work in progress** :construction_worker:
+
+ <img src="https://github.com/lfoppiano/document-qa/assets/15426/f0a04a86-96b3-406e-8303-904b93f00015" width=300 align="right" />
+
+ https://lfoppiano-document-qa.hf.space/
+
+ **NOTE**: The LLM API is kindly provided by [Modal.com](https://www.modal.com), which offers $30/month of computing credits. When these run out, the app will stop answering. 😅
+
+ ## Introduction
+
+ Question/answering on scientific documents using LLMs. The tool can be customized to use different types of LLM APIs.
+ The Streamlit application demonstrates a RAG (Retrieval-Augmented Generation) implementation on scientific documents.
+ Unlike most similar projects, we focus on scientific articles and extract text from structured documents.
+ We target only the full text, using [Grobid](https://github.com/kermitt2/grobid), which provides cleaner results than raw PDF-to-text conversion (the approach most other solutions rely on).
+
+ Additionally, this frontend visualises named entities in LLM responses, extracting <span style="color:yellow">physical quantities and measurements</span> (with [grobid-quantities](https://github.com/kermitt2/grobid-quantities)) and <span style="color:blue">materials</span> mentions (with [grobid-superconductors](https://github.com/lfoppiano/grobid-superconductors)).
+
+ (The image on the right was generated with https://huggingface.co/spaces/stabilityai/stable-diffusion)
+
+ [<img src="https://img.youtube.com/vi/M4UaYs5WKGs/hqdefault.jpg" height="300" align="right" />](https://www.youtube.com/embed/M4UaYs5WKGs)
+
+ ## Getting started
+
+ - Upload a scientific article as a PDF document. You will see a spinner or loading indicator while processing is in progress.
+ - Once the spinner disappears, you can proceed to ask your questions.
+
+ ![screenshot2.png](docs%2Fimages%2Fscreenshot2.png)
+
+ ## Documentation
+
+ **For full technical documentation** of the `document-qa-engine` library, see **[`docs/README.md`](docs/README.md)**.
+
+ ### Embedding selection
+ The latest version allows selecting both embedding functions and LLMs. There are some limitations: OpenAI embeddings cannot be used with open-source models, and vice versa.
+
+ ### Context size
+ Allows changing the number of blocks from the original document that are considered when responding.
+ The default size of each block is 250 tokens (which can be changed before uploading the first document).
+ With default settings, each question uses around 1000 tokens.
+
+ **NOTE**: if the chat answers something like "the information is not provided in the given context", **changing the context size will likely help**.
+
+ ### Chunk size
+ When uploaded, each document is split into blocks of a fixed size (250 tokens by default).
+ This setting allows users to modify the size of such blocks.
+ Smaller blocks yield a smaller, more precise context drawn from the document.
+ Larger blocks yield a larger context that is less tightly focused on the question.
+
+ ### Query mode
+ Indicates whether the question is sent to the LLM or to the vector storage.
+ - **LLM** (default) enables question/answering on the document content.
+ - **Embeddings**: the response consists of the raw text from the document related to the question (based on the embeddings). This mode helps diagnose why answers are sometimes unsatisfying or incomplete.
+ - **Question coefficient** (experimental): provides a coefficient indicating how close the question is to the retrieved context.
+
+ ### NER (Named Entity Recognition)
+ This feature is specifically crafted for people working with scientific documents in materials science.
+ It runs NER on the LLM response to identify materials mentions and properties (quantities, measurements).
+ This feature leverages the external [grobid-quantities](https://github.com/kermitt2/grobid-quantities) and [grobid-superconductors](https://github.com/lfoppiano/grobid-superconductors) services.
+
+ ### Troubleshooting
+ Error: `streamlit: Your system has an unsupported version of sqlite3. Chroma requires sqlite3 >= 3.35.0`.
+ Here is the [solution on Linux](https://stackoverflow.com/questions/76958817/streamlit-your-system-has-an-unsupported-version-of-sqlite3-chroma-requires-sq).
+ For more information, see the [details](https://docs.trychroma.com/troubleshooting#sqlite) on the Chroma website.
+
+ ## Disclaimer on Data, Security, and Privacy ⚠️
+
+ Please read carefully:
+
+ - Avoid uploading sensitive data. We temporarily store text from the uploaded PDF documents only for processing your request, and we disclaim any responsibility for subsequent use or handling of the submitted data by third-party LLMs.
+ - Mistral and Zephyr are free to use and do not require an API key, but since we leverage the free API entry point, there is no guarantee that all requests will go through. Use at your own risk.
+ - We do not assume responsibility for how the data is utilized by the LLM API endpoints.
+
+ ## Development notes
+
+ To release a new version:
+
+ - `bump-my-version bump patch`
+ - `git push --tags`
+
+ To use Docker:
+
+ - `docker run lfoppiano/document-insights-qa:{latest_version}`
+ - `docker run lfoppiano/document-insights-qa:latest-develop` for the latest development version
+
+ To install the library from PyPI:
+
+ - `pip install document-qa-engine`
+
+ ## Acknowledgement
+
+ The project was initiated at the [National Institute for Materials Science](https://www.nims.go.jp) (NIMS) in Japan.
+ Currently, development is possible thanks to [ScienciLAB](https://www.sciencialab.com).
+ The project received contributions from [Guillaume Lambard](https://github.com/GLambard) and the [Lambard-ML-Team](https://github.com/Lambard-ML-Team), [Pedro Ortiz Suarez](https://github.com/pjox), and [Tomoya Mato](https://github.com/t29mato).
+ Thanks also to [Patrice Lopez](https://www.science-miner.com), the author of [Grobid](https://github.com/kermitt2/grobid).
docs/README.md ADDED
@@ -0,0 +1,249 @@
# 📝 document-qa-engine documentation

> **License**: Apache 2.0 · **PyPI**: `pip install document-qa-engine`

A Python library and Streamlit application for **Question/Answering on scientific PDF documents** using Retrieval-Augmented Generation (RAG). It uses [GROBID](https://github.com/kermitt2/grobid) for structured text extraction, [ChromaDB](https://www.trychroma.com/) for vector storage, and any OpenAI-compatible LLM for answering.

## Overview

Most PDF Q/A tools feed raw extracted text to an LLM, which is noisy and loses document structure. **document-qa-engine** takes a different approach:

1. **Structured extraction**: sends the PDF to a GROBID server, which returns TEI-XML with separate sections (title, abstract, body paragraphs, figures, back matter) and precise bounding-box coordinates for every paragraph.
2. **Smart chunking**: paragraphs can be kept as-is or merged into larger chunks using token-aware merging, while preserving coordinate metadata (see the sketch after this list).
3. **Vector embeddings**: each chunk is embedded (via a remote API or local model) and stored in an in-memory ChromaDB collection.
4. **Retrieval + LLM answering**: user questions are embedded, the most similar chunks are retrieved, and an LLM generates an answer from that context.
5. **PDF highlighting**: the Streamlit frontend highlights the exact PDF regions the LLM used, with a color gradient (orange = most relevant, blue = least relevant).
6. **NER post-processing** *(optional)*: LLM responses are scanned for physical quantities (via grobid-quantities) and materials mentions (via grobid-superconductors), then annotated inline.
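For orientation, a merged chunk has the shape sketched below. The field names come from `TextMerger.merge_passages` in `document_qa/document_qa_engine.py`; the values are illustrative only.

```python
# Shape of one merged chunk as produced by TextMerger.merge_passages.
# Field names are taken from the library; the values are made up.
chunk = {
    "text": "We synthesized MgB2 thin films ...",  # concatenated paragraph texts
    "coordinates": "1,54.0,337.1,240.3,9.5;1,54.0,349.2,238.7,9.5",  # GROBID boxes, ";"-joined
    "type": "aggregated chunks",
    "section": "mixed",
    "subSection": "mixed",
}
```
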
## Installation

### Option 1: PyPI (library only)

```bash
pip install document-qa-engine
```

### Option 2: From source (full app)

```bash
git clone https://github.com/lfoppiano/document-qa.git
cd document-qa
pip install -r requirements.txt
```

### Option 3: Docker

```bash
# Latest stable release
docker run -p 8501:8501 lfoppiano/document-insights-qa:latest

# Latest development build
docker run -p 8501:8501 lfoppiano/document-insights-qa:latest-develop
```

### Prerequisites

You need access to:

| Service | Required? | Purpose |
|---------|-----------|---------|
| **GROBID server** | ✅ Yes | Parses PDFs into structured text |
| **Embedding API** | ✅ Yes | Converts text to vectors |
| **LLM API** (OpenAI-compatible) | ✅ Yes | Answers questions |
| **grobid-quantities** | ❌ Optional | NER for measurements |
| **grobid-superconductors** | ❌ Optional | NER for materials |

## Configuration

All configuration is through environment variables. Create a `.env` file in the project root:

```env
# ── LLM Endpoints ────────────────────────────────────────
# Each key in API_MODELS maps a model name to its base URL.
PHI_URL=http://localhost:1234/v1              # Phi-4-mini-instruct endpoint
QWEN_URL=http://localhost:1234/v1             # Qwen3-0.6B endpoint
API_KEY=your-llm-api-key                      # Auth key for LLM APIs

# ── Embedding Endpoint ───────────────────────────────────
EMBEDS_URL=http://127.0.0.1:1234/v1           # Embedding service URL
EMBEDS_API_KEY=your-embedding-api-key         # Auth key for embedding API

# ── Defaults ─────────────────────────────────────────────
DEFAULT_MODEL=microsoft/Phi-4-mini-instruct
DEFAULT_EMBEDDING=intfloat/multilingual-e5-large-instruct-modal

# ── GROBID Services ──────────────────────────────────────
GROBID_URL=https://your-grobid-url
GROBID_QUANTITIES_URL=https://your-grobid-quantities-url/
GROBID_MATERIALS_URL=https://your-grobid-superconductors-url/
```

### Variable Reference

| Variable | Description |
|----------|-------------|
| `PHI_URL` | Base URL for the Phi-4-mini-instruct vLLM server (OpenAI-compatible) |
| `QWEN_URL` | Base URL for the Qwen3-0.6B vLLM server (OpenAI-compatible) |
| `API_KEY` | Bearer token for authenticating with the LLM endpoints |
| `EMBEDS_URL` | Base URL for the embedding service (must expose an `/embeddings` endpoint; see the probe sketch below) |
| `EMBEDS_API_KEY` | Bearer token for authenticating with the embedding service |
| `DEFAULT_MODEL` | Model name pre-selected in the UI dropdown |
| `DEFAULT_EMBEDDING` | Embedding name pre-selected in the UI dropdown |
| `GROBID_URL` | Full URL to a running GROBID server |
| `GROBID_QUANTITIES_URL` | URL to a grobid-quantities server (for measurement NER) |
| `GROBID_MATERIALS_URL` | URL to a grobid-superconductors server (for materials NER) |
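To sanity-check the embedding endpoint, the sketch below mirrors the request shape the bundled `ModalEmbeddings` client sends (an OpenAI-compatible `POST {EMBEDS_URL}/embeddings`); the URL and key are placeholders for your own values.

```python
# Minimal sketch: probe the embedding service the way the client does.
import requests

EMBEDS_URL = "http://127.0.0.1:1234/v1"    # placeholder: use your EMBEDS_URL
EMBEDS_API_KEY = "your-embedding-api-key"  # placeholder: use your EMBEDS_API_KEY

resp = requests.post(
    f"{EMBEDS_URL}/embeddings",
    json={"model": "intfloat/multilingual-e5-large-instruct", "input": ["hello world"]},
    headers={"Authorization": f"Bearer {EMBEDS_API_KEY}"},
)
resp.raise_for_status()
print(len(resp.json()["data"][0]["embedding"]))  # embedding dimensionality
```
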
---

## Quick Start — Streamlit App

```bash
# 1. Set up environment
cp .env.example .env  # Edit with your endpoints

# 2. Run the app
streamlit run streamlit_app.py
```

Then open `http://localhost:8501`, upload a PDF, and ask questions.

---

## Quick Start — As a Python Library

```python
from langchain_openai import ChatOpenAI
from document_qa.custom_embeddings import ModalEmbeddings
from document_qa.document_qa_engine import DocumentQAEngine, DataStorage

# 1. Set up the LLM
llm = ChatOpenAI(
    model="microsoft/Phi-4-mini-instruct",
    temperature=0.0,
    base_url="http://localhost:1234/v1",
    api_key="your-api-key"
)

# 2. Set up embeddings
embeddings = ModalEmbeddings(
    url="http://localhost:1234/v1",
    model_name="intfloat/multilingual-e5-large-instruct",
    api_key="your-embedding-key"
)

# 3. Create the storage and engine
storage = DataStorage(embeddings)
engine = DocumentQAEngine(
    llm=llm,
    data_storage=storage,
    grobid_url="https://lfoppiano-grobid.hf.space/"
)

# 4. Load a PDF (creates in-memory embeddings)
doc_id = engine.create_memory_embeddings(
    pdf_path="path/to/paper.pdf",
    chunk_size=500  # tokens per chunk (-1 = keep paragraphs)
)

# 5. Ask a question
_, answer, coordinates = engine.query_document(
    query="What is the main contribution of this paper?",
    doc_id=doc_id,
    context_size=10  # number of chunks to use as context
)
print(answer)

# 6. Or just retrieve relevant passages (no LLM)
passages, coordinates = engine.query_storage(
    query="What materials were studied?",
    doc_id=doc_id,
    context_size=5
)
for p in passages:
    print(p)
```

## Streamlit App Features

### Query Modes

| Mode | What It Does | When to Use |
|------|-------------|-------------|
| **LLM Q/A** | Retrieves context → sends to LLM → returns a natural language answer | Default — for asking questions |
| **Embeddings** | Returns the raw text passages most similar to your question | Debugging — to see what context the LLM would receive |
| **Question Coefficient** | Computes `min_similarity - mean_similarity` as a quality estimate | Experimental — to predict answer reliability (sketched below) |
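As a concrete reading of that formula, here is a minimal sketch. It assumes `similarities` holds the similarity scores of the chunks retrieved for a question; the function name is ours, not the library's.

```python
# Hypothetical sketch of the experimental question coefficient:
# min(similarities) - mean(similarities). A more negative value suggests
# the retrieved chunks vary widely in how well they match the question.
def question_coefficient(similarities: list[float]) -> float:
    return min(similarities) - sum(similarities) / len(similarities)

print(question_coefficient([0.91, 0.80, 0.42]))  # ≈ -0.29 with these illustrative scores
```
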
### Settings

| Setting | Default | Description |
|---------|---------|-------------|
| Chunk size | `-1` (paragraphs) | Token count per text chunk. `-1` keeps GROBID paragraphs intact (see the token-count sketch below). |
| Context size | `10` (paragraphs) / `4` (chunks) | Number of chunks sent to the LLM as context |
| Scroll to context | Off | Auto-scroll the PDF viewer to the most relevant passage |
| NER processing | Off | Run grobid-quantities + grobid-superconductors on LLM responses |
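To get a feel for what a given chunk size means, token counts can be checked with tiktoken, which is what the library's `TextMerger` uses internally (the `gpt2` encoding is its default; the sample sentence is ours).

```python
# Minimal sketch: count tokens the way the chunk merger does.
import tiktoken

enc = tiktoken.get_encoding("gpt2")  # TextMerger's default encoding
paragraph = "We measured the critical temperature of MgB2 thin films under strain."
print(len(enc.encode(paragraph)))  # tokens this paragraph contributes to a chunk
```
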
### PDF Annotations

After each query, the PDF viewer highlights the passages used as context:
- **Orange** (warm) = most relevant passage
- **Blue** (cold) = least relevant passage
- **Dotted border** = the single most relevant passage

## Troubleshooting

### SQLite version error

```
streamlit: Your system has an unsupported version of sqlite3.
Chroma requires sqlite3 >= 3.35.0.
```

**Linux fix**: See [this StackOverflow answer](https://stackoverflow.com/questions/76958817/streamlit-your-system-has-an-unsupported-version-of-sqlite3-chroma-requires-sq).
**More info**: [Chroma troubleshooting docs](https://docs.trychroma.com/troubleshooting#sqlite).
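The workaround described in that answer boils down to installing `pysqlite3-binary` (`pip install pysqlite3-binary`) and aliasing it before anything imports Chroma; a sketch of the commonly used shim:

```python
# Put this at the very top of streamlit_app.py, before chromadb is imported.
# It swaps the stdlib sqlite3 module for the newer pysqlite3-binary build.
__import__("pysqlite3")
import sys

sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
```
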
### "The information is not provided in the given context"

The LLM couldn't find the answer in the retrieved passages. Try:
1. **Increase context size** — use the sidebar slider to retrieve more passages
2. **Decrease chunk size** — smaller chunks may match more precisely
3. **Use Embeddings mode** — switch to "Embeddings" query mode to see which passages are being retrieved and verify they contain the answer

### MissingSchema error on embeddings

```
requests.exceptions.MissingSchema: Invalid URL
```

Ensure `EMBEDS_URL` in your `.env` starts with `https://` or `http://`. Example:
```env
EMBEDS_URL=https://your-modal-endpoint.modal.run/v1
```

### GROBID connection errors

Make sure your GROBID server is running and accessible:
```bash
curl https://grobid.hf.space/api/isalive
```

If using a local GROBID instance:
```bash
docker run --rm -p 8070:8070 lfoppiano/grobid:0.8.0
# Then set GROBID_URL=http://localhost:8070
```

### Embedding API returning empty results

- Verify the API is running: `curl {EMBEDS_URL}/embeddings`
- Check that `EMBEDS_API_KEY` matches the server's expected key
- Ensure the URL does **not** have a trailing `/embeddings` (the client appends it automatically)

---
docs/images/screenshot1.png ADDED

Git LFS Details

  • SHA256: cf082a5479180a7699c1799775e9f24b92cb2c43fbaaa2c3c83d4f85e26a3565
  • Pointer size: 131 Bytes
  • Size of remote file: 275 kB
docs/images/screenshot2.png ADDED

Git LFS Details

  • SHA256: 1b624732c58ce0d5f1a7ef67cd4893f70fc2d9a7dcdec44b2dbcb76a245e89f6
  • Pointer size: 131 Bytes
  • Size of remote file: 155 kB
document_qa/custom_embeddings.py ADDED
@@ -0,0 +1,107 @@
"""Custom LangChain-compatible embedding client.

Provides :class:`ModalEmbeddings`, a drop-in ``Embeddings`` implementation
that calls any service exposing an ``/embeddings`` endpoint (OpenAI,
vLLM, Modal, LM Studio, etc.).
"""

from typing import List

import requests
from langchain_core.embeddings import Embeddings


class ModalEmbeddings(Embeddings):
    """LangChain ``Embeddings`` backed by an OpenAI-compatible HTTP API.

    The service must expose a ``POST /embeddings`` endpoint that accepts
    ``{"model": "…", "input": ["…"]}`` and returns the standard OpenAI
    response shape.

    Args:
        url: Base URL of the embedding service (e.g. ``"http://localhost:1234/v1"``).
        model_name: Model identifier (e.g. ``"intfloat/multilingual-e5-large-instruct"``).
        api_key: Optional bearer token for authenticated endpoints.
    """

    def __init__(self, url: str, model_name: str, api_key: str = None):
        self.url = url
        self.model_name = model_name
        self.api_key = api_key

    def embed(self, text: List[str]) -> List[List[float]]:
        """Embed a list of texts via the configured API.

        Newlines are replaced with spaces before sending, since most
        embedding models treat them as noise.

        Args:
            text: Strings to embed.

        Returns:
            list[list[float]]: One embedding vector per input string.

        Raises:
            requests.HTTPError: If the API returns a non-2xx status.
        """
        # Newlines degrade embedding quality for most models
        cleaned_text = [t.replace("\n", " ") for t in text]

        headers = {
            "Content-Type": "application/json"
        }

        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        response = requests.post(
            f"{self.url}/embeddings",
            json={
                "model": self.model_name,
                "input": cleaned_text
            },
            headers=headers
        )

        response.raise_for_status()

        data = response.json()["data"]
        return [item["embedding"] for item in data]

    def embed_documents(self, text: List[str]) -> List[List[float]]:
        """Embed multiple documents (LangChain interface).

        Args:
            text: Document strings to embed.

        Returns:
            list[list[float]]: One embedding vector per document.
        """
        return self.embed(text)

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string (LangChain interface).

        Args:
            text: The query string.

        Returns:
            list[float]: The embedding vector for *text*.
        """
        return self.embed([text])[0]

    def get_model_name(self) -> str:
        """Return the model identifier used for embedding requests."""
        return self.model_name


if __name__ == "__main__":
    embeds = ModalEmbeddings(
        url="https://lfoppiano--intfloat-multilingual-e5-large-instruct-embed-5da184.modal.run/",
        model_name="intfloat/multilingual-e5-large-instruct"
    )

    print(embeds.embed(
        ["We are surrounded by stupid kids",
         "We are interested in the future of AI"]
    ))
document_qa/deployment/modal_embeddings.py ADDED
@@ -0,0 +1,117 @@
import os
from typing import Annotated, List

from fastapi import Request, HTTPException, Form

import modal
import torch
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel

image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install(
        "transformers",
        "huggingface_hub[hf_transfer]==0.26.2",
        "flashinfer-python==0.2.0.post2",  # pinning, very unstable
        "fastapi[standard]",
        extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # faster model transfers
)

MODELS_DIR = "/llamas"
MODEL_NAME = "intfloat/multilingual-e5-large-instruct"
MODEL_REVISION = "84344a23ee1820ac951bc365f1e91d094a911763"

hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)

app = modal.App("intfloat-multilingual-e5-large-instruct-embeddings")


def get_device():
    return torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def load_model():
    print("Loading model...")
    device = get_device()
    print(f"Using device: {device}")

    tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large-instruct')
    model = AutoModel.from_pretrained('intfloat/multilingual-e5-large-instruct').to(device)
    print("Model loaded successfully.")

    return tokenizer, model, device


N_GPU = 1
MINUTES = 60  # seconds
VLLM_PORT = 8000


def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]


@app.function(
    image=image,
    gpu=f"L40S:{N_GPU}",
    # gpu=f"A10G:{N_GPU}",
    # how long should we stay up with no requests?
    scaledown_window=3 * MINUTES,
    volumes={
        "/root/.cache/huggingface": hf_cache_vol,
        "/root/.cache/vllm": vllm_cache_vol,
    },
    secrets=[modal.Secret.from_name("document-qa-embedding-key")]
)
@modal.concurrent(
    max_inputs=5
)  # how many requests can one replica handle? tune carefully!
@modal.fastapi_endpoint(method="POST")
def embed(request: Request, text: Annotated[str, Form()]):
    api_key = request.headers.get("x-api-key")
    expected_key = os.environ["API_KEY"]

    if api_key != expected_key:
        raise HTTPException(status_code=401, detail="Unauthorized")

    texts = [t for t in text.split("\n") if t.strip()]
    if not texts:
        return []

    tokenizer, model, device = load_model()
    model.eval()

    print(f"Start embedding {len(texts)} texts")
    try:
        with torch.no_grad():
            # Move inputs to the same device as model
            batch_dict = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
            batch_dict = {k: v.to(device) for k, v in batch_dict.items()}

            # Forward pass
            outputs = model(**batch_dict)

            # Process embeddings
            embeddings = average_pool(
                outputs.last_hidden_state,
                batch_dict['attention_mask']
            )
            embeddings = F.normalize(embeddings, p=2, dim=1)

            # Move to CPU and convert to list for serialization
            embeddings = embeddings.cpu().numpy().tolist()

        print("Finished embedding texts.")
        return embeddings

    except RuntimeError as e:
        print(f"Error during embedding: {str(e)}")
        if "CUDA out of memory" in str(e):
            print("CUDA out of memory error. Try reducing batch size or using a smaller model.")
        raise
document_qa/deployment/modal_inference_phi.py ADDED
@@ -0,0 +1,76 @@
import os

import modal

vllm_image = (
    modal.Image.debian_slim(python_version="3.10")
    .pip_install(
        "vllm",
        "huggingface_hub[hf_transfer]==0.26.2",
        "flashinfer-python==0.2.0.post2",  # pinning, very unstable
        extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # faster model transfers
)

MODELS_DIR = "/llamas"
MODEL_NAME = "microsoft/Phi-4-mini-instruct"
MODEL_REVISION = "c0fb9e74abda11b496b7907a9c6c9009a7a0488f"

FAST_BOOT = True

hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)


app = modal.App("phi-4-mini-instruct-qa-vllm")

N_GPU = 1
MINUTES = 60  # seconds
VLLM_PORT = 8000


@app.function(
    image=vllm_image,
    # gpu=f"L40S:{N_GPU}",
    gpu=f"A10G:{N_GPU}",
    # how long should we stay up with no requests?
    scaledown_window=5 * MINUTES,
    volumes={
        "/root/.cache/huggingface": hf_cache_vol,
        "/root/.cache/vllm": vllm_cache_vol,
    },
    secrets=[modal.Secret.from_name("document-qa-api-key")]
)
@modal.concurrent(
    max_inputs=5
)  # how many requests can one replica handle? tune carefully!
@modal.web_server(port=VLLM_PORT, startup_timeout=5 * MINUTES)
def serve():
    import subprocess

    cmd = [
        "vllm",
        "serve",
        "--uvicorn-log-level=info",
        MODEL_NAME,
        "--revision",
        MODEL_REVISION,
        "--max-model-len",
        "32768",
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
        "--api-key",
        os.environ["API_KEY"],
    ]

    # enforce-eager disables both Torch compilation and CUDA graph capture
    # default is no-enforce-eager; see the --compilation-config flag for tighter control
    cmd += ["--enforce-eager" if FAST_BOOT else "--no-enforce-eager"]

    # assume multiple GPUs are for splitting up large matrix multiplications
    cmd += ["--tensor-parallel-size", str(N_GPU)]

    subprocess.Popen(" ".join(cmd), shell=True)
document_qa/deployment/modal_inference_qwen.py ADDED
@@ -0,0 +1,71 @@
import os

import modal

vllm_image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install(
        "vllm",
        "transformers>=4.51.0",
        "huggingface_hub[hf_transfer]>=0.26.2",
        "flashinfer-python==0.2.0.post2",  # pinning, very unstable
        extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # faster model transfers
)

MODELS_DIR = "/llamas"
MODEL_NAME = "Qwen/Qwen3-0.6B"
MODEL_REVISION = "e6de91484c29aa9480d55605af694f39b081c455"

hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)


app = modal.App("gwen-0.6b-qa-vllm")

N_GPU = 1
MINUTES = 60  # seconds
VLLM_PORT = 8000


@app.function(
    image=vllm_image,
    # gpu=f"L40S:{N_GPU}",
    gpu=f"A10G:{N_GPU}",
    # how long should we stay up with no requests?
    scaledown_window=5 * MINUTES,
    volumes={
        "/root/.cache/huggingface": hf_cache_vol,
        "/root/.cache/vllm": vllm_cache_vol,
    },
    secrets=[modal.Secret.from_name("document-qa-api-key")]
)
@modal.concurrent(
    max_inputs=5
)  # how many requests can one replica handle? tune carefully!
@modal.web_server(port=VLLM_PORT, startup_timeout=5 * MINUTES)
def serve():
    import subprocess

    cmd = [
        "vllm",
        "serve",
        "--uvicorn-log-level=info",
        MODEL_NAME,
        "--revision",
        MODEL_REVISION,
        "--enable-reasoning",
        "--reasoning-parser",
        "deepseek_r1",
        "--max-model-len",
        "32768",
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
        "--api-key",
        os.environ["API_KEY"],
    ]

    subprocess.Popen(" ".join(cmd), shell=True)
document_qa/document_qa_engine.py ADDED
@@ -0,0 +1,676 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Core Q/A engine for scientific PDF documents.
2
+
3
+ This module provides the main classes for building a Retrieval-Augmented
4
+ Generation (RAG) pipeline over scientific PDFs.
5
+ """
6
+
7
+ import copy
8
+ import os
9
+ from pathlib import Path
10
+ from typing import Union, Any, List
11
+
12
+ import tiktoken
13
+ from langchain.chains import create_extraction_chain
14
+ from langchain.chains.combine_documents import create_stuff_documents_chain
15
+ from langchain.chains.question_answering import stuff_prompt, refine_prompts, map_reduce_prompt, \
16
+ map_rerank_prompt
17
+ from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
18
+ from langchain.retrievers import MultiQueryRetriever
19
+ from langchain.schema import Document
20
+ from langchain_community.vectorstores.chroma import Chroma
21
+ from langchain_core.vectorstores import VectorStore
22
+ from tqdm import tqdm
23
+
24
+ from document_qa.grobid_processors import GrobidProcessor
25
+ from document_qa.langchain import ChromaAdvancedRetrieval
26
+
27
+
28
+ class TextMerger:
29
+ """Token-aware text merger that preserves PDF coordinate metadata.
30
+
31
+ Unlike LangChain's ``RecursiveTextSplitter``, this merger keeps the
32
+ bounding-box coordinates extracted by GROBID so that downstream
33
+ consumers (e.g. the PDF viewer) can highlight the exact regions.
34
+
35
+ Args:
36
+ model_name: A tiktoken model name (e.g. ``"gpt-4"``). When given,
37
+ the tokenizer for that model is used.
38
+ encoding_name: A tiktoken encoding name (default ``"gpt2"``).
39
+ Ignored when *model_name* is provided.
40
+ """
41
+
42
+ def __init__(self, model_name=None, encoding_name="gpt2"):
43
+ if model_name is not None:
44
+ self.enc = tiktoken.encoding_for_model(model_name)
45
+ else:
46
+ self.enc = tiktoken.get_encoding(encoding_name)
47
+
48
+ def encode(self, text, allowed_special=set(), disallowed_special="all"):
49
+ """Tokenize *text* and return a list of token IDs.
50
+
51
+ Thin wrapper around ``tiktoken.Encoding.encode`` that exposes the
52
+ same special-token controls.
53
+
54
+ Args:
55
+ text: The string to tokenize.
56
+ allowed_special: Set of special tokens allowed in *text*.
57
+ disallowed_special: Special-token handling policy.
58
+
59
+ Returns:
60
+ list[int]: Token IDs produced by the configured tokenizer.
61
+ """
62
+ return self.enc.encode(
63
+ text,
64
+ allowed_special=allowed_special,
65
+ disallowed_special=disallowed_special,
66
+ )
67
+
68
+ def merge_passages(self, passages, chunk_size, tolerance=0.2):
69
+ """Merge consecutive passages into chunks of approximately *chunk_size* tokens.
70
+
71
+ Args:
72
+ passages: List of dicts, each with ``"text"`` (str) and
73
+ ``"coordinates"`` (str) keys — as returned by
74
+ method:`GrobidProcessor.process_structure`.
75
+ chunk_size: Target number of tokens per merged chunk.
76
+ tolerance: Fraction of *chunk_size* allowed as overflow
77
+ (default ``0.2``).
78
+
79
+ Returns:
80
+ list[dict]: Merged passages. Each dict has:
81
+
82
+ - ``"text"`` — concatenated paragraph texts.
83
+ - ``"coordinates"`` — semicolon-joined coordinate strings.
84
+ - ``"type"`` — always ``"aggregated chunks"``.
85
+ - ``"section"`` / ``"subSection"`` — always ``"mixed"``.
86
+ """
87
+ new_passages = []
88
+ new_coordinates = []
89
+ current_texts = []
90
+ current_coordinates = []
91
+ for idx, passage in enumerate(passages):
92
+ text = passage['text']
93
+ coordinates = passage['coordinates']
94
+ current_texts.append(text)
95
+ current_coordinates.append(coordinates)
96
+
97
+ accumulated_text = " ".join(current_texts)
98
+
99
+ encoded_accumulated_text = self.encode(accumulated_text)
100
+
101
+ if len(encoded_accumulated_text) > chunk_size + chunk_size * tolerance:
102
+ if len(current_texts) > 1:
103
+ new_passages.append(current_texts[:-1])
104
+ new_coordinates.append(current_coordinates[:-1])
105
+ current_texts = [current_texts[-1]]
106
+ current_coordinates = [current_coordinates[-1]]
107
+ else:
108
+ new_passages.append(current_texts)
109
+ new_coordinates.append(current_coordinates)
110
+ current_texts = []
111
+ current_coordinates = []
112
+
113
+ elif chunk_size <= len(encoded_accumulated_text) < chunk_size + chunk_size * tolerance:
114
+ new_passages.append(current_texts)
115
+ new_coordinates.append(current_coordinates)
116
+ current_texts = []
117
+ current_coordinates = []
118
+
119
+ if len(current_texts) > 0:
120
+ new_passages.append(current_texts)
121
+ new_coordinates.append(current_coordinates)
122
+
123
+ new_passages_struct = []
124
+ for i, passages in enumerate(new_passages):
125
+ text = " ".join(passages)
126
+ coordinates = ";".join(new_coordinates[i])
127
+
128
+ new_passages_struct.append(
129
+ {
130
+ "text": text,
131
+ "coordinates": coordinates,
132
+ "type": "aggregated chunks",
133
+ "section": "mixed",
134
+ "subSection": "mixed"
135
+ }
136
+ )
137
+
138
+ return new_passages_struct
139
+
140
+
141
+ class BaseRetrieval:
142
+ """Abstract base for retrieval backends.
143
+ """
144
+
145
+ def __init__(
146
+ self,
147
+ persist_directory: Path,
148
+ embedding_function
149
+ ):
150
+ self.embedding_function = embedding_function
151
+ self.persist_directory = persist_directory
152
+
153
+
154
+ class NER_Retrival(VectorStore):
155
+ """
156
+ This class implements retrieval based on NER models.
157
+ It is an alternative to embedding-based retrieval that relies on extracted entities.
158
+ """
159
+ pass
160
+
161
+
162
+ engines = {
163
+ 'chroma': ChromaAdvancedRetrieval,
164
+ 'ner': NER_Retrival
165
+ }
166
+
167
+
168
+ class DataStorage:
169
+ """Manages per-document vector-store collections.
170
+
171
+ Each uploaded PDF gets its own ChromaDB collection,
172
+ keyed by a document ID (typically an MD5 hash). Collections can live
173
+ in memory or be persisted to disk.
174
+
175
+ Args:
176
+ embedding_function: A LangChain-compatible ``Embeddings`` instance.
177
+ root_path: Optional directory for persisted embeddings.
178
+ engine: The vector-store class to use.
179
+
180
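+ Example (a sketch; ``embeddings`` stands for a LangChain
+ ``Embeddings`` instance you provide):
+
+ >>> storage = DataStorage(embedding_function=embeddings)  # doctest: +SKIP
+ >>> storage.embed_document("DOC-MD5", ["chunk one"], [{"section": "body"}])  # doctest: +SKIP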
+ """
181
+
182
+ embeddings_dict = {}
183
+ embeddings_map_from_md5 = {}
184
+ embeddings_map_to_md5 = {}
185
+
186
+ def __init__(
187
+ self,
188
+ embedding_function,
189
+ root_path: Path = None,
190
+ engine=ChromaAdvancedRetrieval,
191
+ ) -> None:
192
+ self.root_path = root_path
193
+ self.engine = engine
194
+ self.embedding_function = embedding_function
195
+
196
+ if root_path is not None:
197
+ self.embeddings_root_path = root_path
198
+ if not os.path.exists(root_path):
199
+ os.makedirs(root_path)
200
+ else:
201
+ self.load_embeddings(self.embeddings_root_path)
202
+
203
+ def load_embeddings(self, embeddings_root_path: Union[str, Path]) -> None:
204
+ """
205
+ Load all persisted vector stores from a single root directory.
206
+ The root directory is expected to contain one data store per document, each in its own subdirectory.
207
+ """
208
+
209
+ embeddings_directories = [f for f in os.scandir(embeddings_root_path) if f.is_dir()]
210
+
211
+ if len(embeddings_directories) == 0:
212
+ print("No available embeddings")
213
+ return
214
+
215
+ for embedding_document_dir in embeddings_directories:
216
+ self.embeddings_dict[embedding_document_dir.name] = self.engine(
217
+ persist_directory=embedding_document_dir.path,
218
+ embedding_function=self.embedding_function
219
+ )
220
+
221
+ filename_list = list(Path(embedding_document_dir).glob('*.storage_filename'))
222
+ if filename_list:
223
+ filename = filename_list[0].name.replace(".storage_filename", "")
224
+ self.embeddings_map_from_md5[embedding_document_dir.name] = filename
225
+ self.embeddings_map_to_md5[filename] = embedding_document_dir.name
226
+
227
+ print("Embedding loaded: ", len(self.embeddings_dict.keys()))
228
+
229
+ def get_loaded_embeddings_ids(self):
230
+ """Return the document IDs (MD5 hashes) of all loaded collections."""
231
+ return list(self.embeddings_dict.keys())
232
+
233
+ def get_md5_from_filename(self, filename):
234
+ """Look up the MD5 document ID for a given original *filename*."""
235
+ return self.embeddings_map_to_md5[filename]
236
+
237
+ def get_filename_from_md5(self, md5):
238
+ """Look up the original filename for a given *md5* document ID."""
239
+ return self.embeddings_map_from_md5[md5]
240
+
241
+ def embed_document(self, doc_id, texts, metadatas):
242
+ """Create (or replace) an in-memory vector collection for a document.
243
+
244
+ Args:
245
+ doc_id: Unique identifier for the document.
246
+ texts: List of text chunks to embed.
247
+ metadatas: List of metadata dicts (one per chunk).
248
+ """
249
+ if doc_id not in self.embeddings_dict.keys():
250
+ self.embeddings_dict[doc_id] = self.engine.from_texts(
251
+ texts,
252
+ embedding=self.embedding_function,
253
+ metadatas=metadatas,
254
+ collection_name=doc_id)
255
+ else:
256
+ # Workaround Chroma (?) breaking change
257
+ self.embeddings_dict[doc_id].delete_collection()
258
+ self.embeddings_dict[doc_id] = self.engine.from_texts(
259
+ texts,
260
+ embedding=self.embedding_function,
261
+ metadatas=metadatas,
262
+ collection_name=doc_id)
263
+
264
+ self.embeddings_root_path = None
265
+
266
+
267
+ class DocumentQAEngine:
268
+ """End-to-end RAG engine for scientific PDF documents.
269
+
270
+ Orchestrates the full pipeline:
271
+
272
+ 1. **PDF parsing** via a GROBID server (structured text + coordinates).
273
+ 2. **Chunking** — paragraphs kept as-is or merged with :class:`TextMerger`.
274
+ 3. **Embedding and storage** — chunks are embedded and stored.
275
+ 4. **Retrieval + LLM** — relevant chunks are retrieved and fed to an LLM
276
+ to produce an answer.
277
+
278
+ Args:
279
+ llm: A LangChain chat model (e.g. ``ChatOpenAI``).
280
+ data_storage: A :class:`DataStorage` instance for managing embeddings.
281
+ grobid_url: URL of the GROBID server.
282
+ memory: Optional ``ConversationBufferMemory`` for multi-turn context.
283
+
284
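+ Example (a sketch; the model name, ``storage``, and URL are
+ hypothetical):
+
+ >>> from langchain_openai import ChatOpenAI  # doctest: +SKIP
+ >>> engine = DocumentQAEngine(  # doctest: +SKIP
+ ...     llm=ChatOpenAI(model="gpt-4o-mini"),
+ ...     data_storage=storage,
+ ...     grobid_url="https://grobid.example.com",
+ ... )
+ >>> doc_id = engine.create_memory_embeddings("paper.pdf")  # doctest: +SKIP
+ >>> _, answer, coordinates = engine.query_document("What is measured?", doc_id)  # doctest: +SKIP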
+ """
285
+
286
+ llm = None
287
+ qa_chain_type = None
288
+
289
+ default_prompts = {
290
+ 'stuff': stuff_prompt,
291
+ 'refine': refine_prompts,
292
+ "map_reduce": map_reduce_prompt,
293
+ "map_rerank": map_rerank_prompt
294
+ }
295
+
296
+ def __init__(self,
297
+ llm,
298
+ data_storage: DataStorage,
299
+ grobid_url=None,
300
+ memory=None
301
+ ):
302
+
303
+ self.llm = llm
304
+ self.memory = memory
305
+ self.chain = create_stuff_documents_chain(llm, self.default_prompts['stuff'].PROMPT)
306
+ self.text_merger = TextMerger()
307
+ self.data_storage = data_storage
308
+
309
+ if grobid_url:
310
+ self.grobid_processor = GrobidProcessor(grobid_url)
311
+
312
+ def query_document(
313
+ self,
314
+ query: str,
315
+ doc_id,
316
+ output_parser=None,
317
+ context_size=4,
318
+ extraction_schema=None,
319
+ verbose=False
320
+ ) -> tuple[Any, str, list]:
321
+ """Ask a question and get an LLM-generated answer.
322
+
323
+ Retrieves the most relevant chunks from the vector store, feeds
324
+ them as context to the LLM, and returns the response.
325
+
326
+ Args:
327
+ query: The natural-language question.
328
+ doc_id: Document identifier returned by :meth:`create_memory_embeddings`.
329
+ output_parser: Optional LangChain output parser. If provided the
330
+ raw LLM response is re-processed into structured output.
331
+ context_size: Number of chunks to retrieve as context (default 4).
332
+ extraction_schema: Optional extraction schema.
333
+ verbose: Print debug information.
334
+
335
+ Returns:
336
+ tuple: ``(parsed_output | None, raw_text_response, coordinates)``
337
+
338
+ - *parsed_output* — structured data if a parser/schema was given,
339
+ otherwise ``None``.
340
+ - *raw_text_response* — the LLM's raw text answer.
341
+ - *coordinates* — list of lists of coordinate strings for each
342
+ retrieved chunk (for PDF highlighting).
343
+ """
344
+ # self.load_embeddings(self.embeddings_root_path)
345
+
346
+ if verbose:
347
+ print(query)
348
+
349
+ response, coordinates = self._run_query(doc_id, query, context_size=context_size)
350
+ response = response['output_text'] if 'output_text' in response else response
351
+
352
+ if verbose:
353
+ print(doc_id, "->", response)
354
+
355
+ if output_parser:
356
+ try:
357
+ return self._parse_json(response, output_parser), response, coordinates
358
+ except Exception as oe:
359
+ print("Failing to parse the response", oe)
360
+ return None, response, coordinates
361
+ elif extraction_schema:
362
+ try:
363
+ chain = create_extraction_chain(extraction_schema, self.llm)
364
+ parsed = chain.run(response)
365
+ return parsed, response, coordinates
366
+ except Exception as oe:
367
+ print("Failing to parse the response", oe)
368
+ return None, response, coordinates
369
+ else:
370
+ return None, response, coordinates
371
+
372
+ def query_storage(self, query: str, doc_id, context_size=4) -> tuple[list, list]:
373
+ """Retrieve relevant text passages without calling the LLM.
374
+
375
+ Useful for debugging which chunks would be used as context, or for
376
+ building custom pipelines on top of the retrieval step.
377
+
378
+ Args:
379
+ query: The natural-language question.
380
+ doc_id: Document identifier.
381
+ context_size: Number of chunks to retrieve (default 4).
382
+
383
+ Returns:
384
+ tuple: ``(texts, coordinates)``
385
+
386
+ - *texts* — list of passage strings.
387
+ - *coordinates* — list of lists of coordinate strings.
388
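+
+ Example (a sketch, reusing the hypothetical ``engine`` above):
+
+ >>> texts, coordinates = engine.query_storage("Which material is studied?", doc_id)  # doctest: +SKIP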
+ """
389
+ documents, coordinates = self._get_context(doc_id, query, context_size)
390
+
391
+ context_as_text = [doc.page_content for doc in documents]
392
+ return context_as_text, coordinates
393
+
394
+ def query_storage_and_embeddings(self, query: str, doc_id, context_size=4) -> List[Document]:
395
+ """Retrieve passages with their similarity scores and raw embeddings.
396
+
397
+ Each returned ``Document`` has extra metadata keys:
398
+
399
+ - ``__similarity`` — cosine distance to the query.
400
+ - ``__embeddings`` — the chunk's embedding vector.
401
+
402
+ Args:
403
+ query: The natural-language question.
404
+ doc_id: Document identifier.
405
+ context_size: Number of chunks to retrieve (default 4).
406
+
407
+ Returns:
408
+ list[Document]: Retrieved documents enriched with similarity and
409
+ embedding metadata.
410
+ """
411
+ db = self.data_storage.embeddings_dict[doc_id]
412
+ retriever = db.as_retriever(
413
+ search_kwargs={"k": context_size},
414
+ search_type="similarity_with_embeddings"
415
+ )
416
+ relevant_documents = retriever.invoke(query)
417
+
418
+ return relevant_documents
419
+
420
+ def analyse_query(self, query, doc_id, context_size=4):
421
+ """Compute a relevance coefficient for *query* against *doc_id*.
422
+
423
+ The coefficient is ``min_similarity - mean_similarity`` over the
424
+ top-k retrieved chunks. A value close to zero suggests the
425
+ question matches multiple passages equally well.
426
+
427
+ Args:
428
+ query: The natural-language question.
429
+ doc_id: Document identifier.
430
+ context_size: Number of chunks to consider (default 4).
431
+
432
+ Returns:
433
+ tuple: ``(summary_string, coordinates)``
434
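+
+ Worked example: for similarities ``[0.82, 0.80, 0.79, 0.78]`` the
+ coefficient is ``0.78 - 0.7975 = -0.0175``, i.e. close to zero, so
+ the query is not strongly anchored to a single passage.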
+ """
435
+ db = self.data_storage.embeddings_dict[doc_id]
436
+ # retriever = db.as_retriever(
437
+ # search_kwargs={"k": context_size, 'score_threshold': 0.0},
438
+ # search_type="similarity_score_threshold"
439
+ # )
440
+ retriever = db.as_retriever(search_kwargs={"k": context_size}, search_type="similarity_with_embeddings")
441
+ relevant_documents = retriever.invoke(query)
442
+ relevant_document_coordinates = [
443
+ doc.metadata['coordinates'].split(";") if 'coordinates' in doc.metadata else []
444
+ for doc in relevant_documents
+ ]
445
+ all_documents = db.get(include=['documents', 'metadatas', 'embeddings'])
446
+ # all_documents_embeddings = all_documents["embeddings"]
447
+ # query_embedding = db._embedding_function.embed_query(query)
448
+
449
+ # distance_evaluator = load_evaluator("pairwise_embedding_distance",
450
+ # embeddings=db._embedding_function,
451
+ # distance_metric=EmbeddingDistance.EUCLIDEAN)
452
+
453
+ # distance_evaluator.evaluate_string_pairs(query=query_embedding, documents="")
454
+
455
+ similarities = [doc.metadata['__similarity'] for doc in relevant_documents]
456
+ min_similarity = min(similarities)
457
+ mean_similarity = sum(similarities) / len(similarities)
458
+ coefficient = min_similarity - mean_similarity
459
+
460
+ return f"Coefficient: {coefficient}, (Min similarity {min_similarity}, Mean similarity: {mean_similarity})", relevant_document_coordinates
461
+
462
+ def _parse_json(self, response, output_parser):
463
+ system_message = "You are an useful assistant expert in materials science, physics, and chemistry " \
464
+ "that can process text and transform it to JSON."
465
+ human_message = """Transform the text between three double quotes in JSON.\n\n\n\n
466
+ {format_instructions}\n\nText: \"\"\"{text}\"\"\""""
467
+
468
+ system_message_prompt = SystemMessagePromptTemplate.from_template(system_message)
469
+ human_message_prompt = HumanMessagePromptTemplate.from_template(human_message)
470
+
471
+ prompt_template = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])
472
+
473
+ results = self.llm(
474
+ prompt_template.format_prompt(
475
+ text=response,
476
+ format_instructions=output_parser.get_format_instructions()
477
+ ).to_messages()
478
+ )
479
+ parsed_output = output_parser.parse(results.content)
480
+
481
+ return parsed_output
482
+
483
+ def _run_query(self, doc_id, query, context_size=4) -> tuple[Any, list]:
484
+ relevant_documents, relevant_document_coordinates = self._get_context(doc_id, query, context_size)
485
+ response = self.chain.invoke({"context": relevant_documents, "question": query})
486
+ return response, relevant_document_coordinates
487
+
488
+ def _get_context(self, doc_id, query, context_size=4) -> tuple[List[Document], list]:
489
+ db = self.data_storage.embeddings_dict[doc_id]
490
+ retriever = db.as_retriever(search_kwargs={"k": context_size})
491
+ relevant_documents = retriever.invoke(query)
492
+ relevant_document_coordinates = [
493
+ doc.metadata['coordinates'].split(";") if 'coordinates' in doc.metadata else []
494
+ for doc in
495
+ relevant_documents
496
+ ]
497
+ if self.memory and len(self.memory.buffer_as_messages) > 0:
498
+ relevant_documents.append(
499
+ Document(
500
+ page_content="""Following, the previous question and answers. Use these information only when in the question there are unspecified references:\n{}\n\n""".format(
501
+ self.memory.buffer_as_str))
502
+ )
503
+ return relevant_documents, relevant_document_coordinates
504
+
505
+ def get_full_context_by_document(self, doc_id):
506
+ """
507
+ Return the full context from the document
508
+ """
509
+ db = self.data_storage.embeddings_dict[doc_id]
510
+ docs = db.get()
511
+ return docs['documents']
512
+
513
+ def _get_context_multiquery(self, doc_id, query, context_size=4):
514
+ db = self.data_storage.embeddings_dict[doc_id].as_retriever(search_kwargs={"k": context_size})
515
+ multi_query_retriever = MultiQueryRetriever.from_llm(retriever=db, llm=self.llm)
516
+ relevant_documents = multi_query_retriever.invoke(query)
517
+ return relevant_documents
518
+
519
+ def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1, verbose=False):
520
+ """Extract and chunk text from a PDF via GROBID.
521
+
522
+ Sends the PDF to the configured GROBID server, parses the returned
523
+ TEI-XML into passages with coordinate metadata, and optionally
524
+ merges passages into larger token-based chunks.
525
+
526
+ Args:
527
+ pdf_file_path: Path to the PDF file on disk.
528
+ chunk_size: Target tokens per chunk. ``-1`` (default) keeps
529
+ GROBID paragraphs as-is; a positive value merges them.
530
+ perc_overlap: Reserved for future overlap support.
531
+ verbose: Print debug information.
532
+
533
+ Returns:
534
+ tuple: ``(texts, metadatas, ids)``
535
+
536
+ - *texts* — list of passage strings.
537
+ - *metadatas* — list of metadata dicts (coordinates, section, …).
538
+ - *ids* — list of integer chunk IDs.
539
+
540
+ Raises:
541
+ AttributeError: If ``grobid_url`` was not provided at init time.
542
+ """
543
+ if verbose:
544
+ print("File", pdf_file_path)
545
+ filename = Path(pdf_file_path).stem
546
+ coordinates = True # if chunk_size == -1 else False
547
+ structure = self.grobid_processor.process_structure(pdf_file_path, coordinates=coordinates)
548
+
549
+ biblio = structure['biblio']
550
+ biblio['filename'] = filename.replace(" ", "_")
551
+
552
+ if verbose:
553
+ print("Generating embeddings for:", hash, ", filename: ", filename)
554
+
555
+ texts = []
556
+ metadatas = []
557
+ ids = []
558
+
559
+ if chunk_size > 0:
560
+ new_passages = self.text_merger.merge_passages(structure['passages'], chunk_size=chunk_size)
561
+ else:
562
+ new_passages = structure['passages']
563
+
564
+ for passage in new_passages:
565
+ biblio_copy = copy.copy(biblio)
566
+ if len(str.strip(passage['text'])) > 0:
567
+ texts.append(passage['text'])
568
+
569
+ biblio_copy['type'] = passage['type']
570
+ biblio_copy['section'] = passage['section']
571
+ biblio_copy['subSection'] = passage['subSection']
572
+ biblio_copy['coordinates'] = passage['coordinates']
573
+ metadatas.append(biblio_copy)
574
+
575
+ # ids.append(passage['passage_id'])
576
+
577
+ ids = list(range(len(new_passages)))
578
+
579
+ return texts, metadatas, ids
580
+
581
+ def create_memory_embeddings(
582
+ self,
583
+ pdf_path,
584
+ doc_id=None,
585
+ chunk_size=500,
586
+ perc_overlap=0.1
587
+ ):
588
+ """Parse a PDF and create an in-memory vector collection.
589
+
590
+ This is the main entry-point for ingesting a new document. It
591
+ calls GROBID, chunks the text, embeds it, and stores everything in `data_storage`.
592
+
593
+ Args:
594
+ pdf_path: Path to the PDF file.
595
+ doc_id: Optional explicit document ID. When ``None``, the
596
+ MD5 hash extracted by GROBID is used.
597
+ chunk_size: Token count per chunk (default 500). Use ``-1``
598
+ to keep GROBID paragraphs intact.
599
+ perc_overlap: Reserved for future overlap support.
600
+
601
+ Returns:
602
+ str: The document ID.
603
+ """
604
+ texts, metadata, ids = self.get_text_from_document(
605
+ pdf_path,
606
+ chunk_size=chunk_size,
607
+ perc_overlap=perc_overlap)
608
+ if doc_id:
609
+ hash = doc_id
610
+ else:
611
+ hash = metadata[0]['hash'] if len(metadata) > 0 and 'hash' in metadata[0] else ""
612
+
613
+ self.data_storage.embed_document(hash, texts, metadata)
614
+
615
+ return hash
616
+
617
+ def create_embeddings(
618
+ self,
619
+ pdfs_dir_path: Path,
620
+ chunk_size=500,
621
+ perc_overlap=0.1,
622
+ include_biblio=False
623
+ ):
624
+ """Batch-process a directory of PDFs and persist their embeddings.
625
+
626
+ Walks *pdfs_dir_path*, processes each ``.pdf`` file through GROBID,
627
+ creates embeddings, and persists the resulting ChromaDB collection
628
+ to a subdirectory named after the file's MD5.
629
+
630
+ Args:
631
+ pdfs_dir_path: Directory containing PDF files.
632
+ chunk_size: Token count per chunk (default 500).
633
+ perc_overlap: Reserved for future overlap support.
634
+ include_biblio: Reserved flag (currently unused).
635
+ """
636
+ input_files = []
637
+ for root, dirs, files in os.walk(pdfs_dir_path, followlinks=False):
638
+ for file_ in files:
639
+ if not (file_.lower().endswith(".pdf")):
640
+ continue
641
+ input_files.append(os.path.join(root, file_))
642
+
643
+ for input_file in tqdm(input_files, total=len(input_files), unit='document',
644
+ desc="Grobid + embeddings processing"):
645
+
646
+ md5 = self.calculate_md5(input_file)
647
+ data_path = os.path.join(self.data_storage.embeddings_root_path, md5)
648
+
649
+ if os.path.exists(data_path):
650
+ print(data_path, "exists. Skipping it ")
651
+ continue
652
+ # include = ["biblio"] if include_biblio else []
653
+ texts, metadata, ids = self.get_text_from_document(
654
+ input_file,
655
+ chunk_size=chunk_size,
656
+ perc_overlap=perc_overlap)
657
+ filename = metadata[0]['filename']
658
+
659
+ vector_db_document = Chroma.from_texts(texts,
660
+ metadatas=metadata,
661
+ embedding=self.data_storage.embedding_function,
662
+ persist_directory=data_path)
663
+ vector_db_document.persist()
664
+
665
+ with open(os.path.join(data_path, filename + ".storage_filename"), 'w') as fo:
666
+ fo.write("")
667
+
668
+ @staticmethod
669
+ def calculate_md5(input_file: Union[Path, str]):
670
+ """Return the uppercase hex MD5 digest of *input_file*."""
671
+
672
+ import hashlib
673
+ md5_hash = hashlib.md5()
674
+ with open(input_file, 'rb') as fi:
675
+ md5_hash.update(fi.read())
676
+ return md5_hash.hexdigest().upper()
document_qa/grobid_processors.py ADDED
@@ -0,0 +1,999 @@
1
+ """GROBID-based processors for scientific text extraction.
2
+
3
+ This module provides processors that interact with GROBID services to:
4
+
5
+ - **Extract structured text** from scientific PDFs (:class:`GrobidProcessor`)
6
+ — parses TEI-XML into passages with section labels and PDF coordinates.
7
+ - **Annotate physical quantities** (:class:`GrobidQuantitiesProcessor`)
8
+ — identifies measurements via the grobid-quantities service.
9
+ - **Annotate materials** (:class:`GrobidMaterialsProcessor`)
10
+ — identifies material mentions via grobid-superconductors.
11
+ - **Aggregate NER results** (:class:`GrobidAggregationProcessor`)
12
+ — combines quantity and material annotations with overlap pruning.
13
+
14
+ """
15
+
16
+ import re
17
+ from collections import OrderedDict
18
+ from html import escape
19
+ from pathlib import Path
20
+
21
+ import dateparser
22
+ import grobid_tei_xml
23
+ from bs4 import BeautifulSoup
24
+ from grobid_client.grobid_client import GrobidClient
25
+
26
+
27
+ def get_span_start(type, title=None):
28
+ """Return an opening ``<span>`` tag for an annotation of the given *type*."""
29
+ title_ = ' title="' + title + '"' if title is not None else ""
30
+ return '<span class="label ' + type + '"' + title_ + '>'
31
+
32
+
33
+ def get_span_end():
34
+ return '</span>'
35
+
36
+
37
+ def get_rs_start(type):
38
+ return '<rs type="' + type + '">'
39
+
40
+
41
+ def get_rs_end():
42
+ return '</rs>'
43
+
44
+
45
+ def has_space_between_value_and_unit(quantity):
46
+ return quantity['offsetEnd'] < quantity['rawUnit']['offsetStart']
47
+
48
+
49
+ def decorate_text_with_annotations(text, spans, tag="span"):
50
+ """Wrap recognised entity spans in markup tags.
51
+
52
+ Produces either HTML (``<span class="label …">``) or TEI-XML
53
+ (``<rs type="…">``) depending on *tag*.
54
+
55
+ Args:
56
+ text: The original plain-text string.
57
+ spans: List of span dicts with at least ``offset_start``,
58
+ ``offset_end``, and ``type`` keys.
59
+ tag: ``"span"`` (default) for HTML output, ``"rs"`` for XML.
60
+
61
+ Returns:
62
+ str: The text with inline annotation markup.
63
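+
+ Example:
+
+ >>> spans = [{"offset_start": 0, "offset_end": 4, "type": "<material>"}]
+ >>> decorate_text_with_annotations("MgB2 is a superconductor", spans)
+ '<span class="label material">MgB2</span> is a superconductor'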
+ """
64
+ sorted_spans = list(sorted(spans, key=lambda item: item['offset_start']))
65
+ annotated_text = ""
66
+ start = 0
67
+ for span in sorted_spans:
68
+ type = span['type'].replace("<", "").replace(">", "")
69
+ if 'unit_type' in span and span['unit_type'] is not None:
70
+ type = span['unit_type'].replace(" ", "_")
71
+ annotated_text += escape(text[start: span['offset_start']])
72
+ title = span['quantified'] if 'quantified' in span else None
73
+ annotated_text += get_span_start(type, title) if tag == "span" else get_rs_start(type)
74
+ annotated_text += escape(text[span['offset_start']: span['offset_end']])
75
+ annotated_text += get_span_end() if tag == "span" else get_rs_end()
76
+
77
+ start = span['offset_end']
78
+ annotated_text += escape(text[start: len(text)])
79
+ return annotated_text
80
+
81
+
82
+ def get_parsed_value_type(quantity):
83
+ if 'parsedValue' in quantity and 'structure' in quantity['parsedValue']:
84
+ return quantity['parsedValue']['structure']['type']
85
+
86
+
87
+ class BaseProcessor(object):
88
+ """Shared post-processing logic for all GROBID-derived processors.
89
+
90
+ Fixes common character-encoding artefacts produced by PDF extraction
91
+ (e.g. ``À`` → ``-``, ``¼`` → ``=``). All processor subclasses
92
+ inherit :meth:`post_process` from here.
93
+ """
94
+
95
+ patterns = [
96
+ r'\d+e\d+'
97
+ ]
98
+
99
+ def post_process(self, text):
100
+ """Clean encoding artefacts and normalise special characters.
101
+
102
+ Args:
103
+ text: Raw extracted text from GROBID.
104
+
105
+ Returns:
106
+ str: Cleaned text.
107
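+
+ Example:
+
+ >>> BaseProcessor().post_process("Tc ¼ 39 K")
+ 'Tc = 39 K'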
+ """
108
+ output = text.replace('À', '-')
109
+ output = output.replace('¼', '=')
110
+ output = output.replace('þ', '+')
111
+ output = output.replace('Â', 'x')
112
+ output = output.replace('$', '~')
113
+ output = output.replace('−', '-')
114
+ output = output.replace('–', '-')
115
+
116
+ for pattern in self.patterns:
117
+ output = re.sub(pattern, lambda match: match.group().replace('e', '-'), output)
118
+
119
+ return output
120
+
121
+
122
+ class GrobidProcessor(BaseProcessor):
123
+ """Extract structured text and coordinates from PDFs via GROBID.
124
+
125
+ Sends a PDF to a running GROBID server, parses the returned TEI-XML,
126
+ and produces a list of passage dicts with text content, section labels,
127
+ and bounding-box coordinates for each paragraph.
128
+
129
+ Args:
130
+ grobid_url: Full URL of the GROBID server
131
+ (e.g. ``"https://grobid.example.com"``).
132
+ ping_server: If ``True`` (default), verify the server is alive
133
+ on init.
134
+
135
+ Raises:
136
+ ServerUnavailableException: If *ping_server* is ``True`` and the
137
+ GROBID server does not respond.
138
+
139
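+ Example (a sketch; the server URL is hypothetical):
+
+ >>> processor = GrobidProcessor("https://grobid.example.com")  # doctest: +SKIP
+ >>> doc = processor.process_structure("paper.pdf", coordinates=True)  # doctest: +SKIP
+ >>> doc["passages"][0]["section"]  # doctest: +SKIP
+ '<header>'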
+ """
140
+
141
+ def __init__(self, grobid_url, ping_server=True):
142
+ grobid_client = GrobidClient(
143
+ grobid_server=grobid_url,
144
+ batch_size=5,
145
+ coordinates=["p", "title", "persName"],
146
+ sleep_time=5,
147
+ timeout=60,
148
+ check_server=ping_server
149
+ )
150
+ self.grobid_client = grobid_client
151
+
152
+ def process_structure(self, input_path, coordinates=False):
153
+ """Send a PDF to GROBID and return structured content.
154
+
155
+ Args:
156
+ input_path: Path to the PDF file.
157
+ coordinates: If ``True``, include bounding-box coordinate
158
+ strings in each passage (needed for PDF highlighting).
159
+
160
+ Returns:
161
+ dict or None: A dict with keys:
162
+
163
+ - ``"biblio"`` — bibliographic metadata (title, authors, DOI, …).
164
+ - ``"passages"`` — list of passage dicts, each containing
165
+ ``text``, ``type``, ``section``, ``subSection``,
166
+ ``passage_id``, and ``coordinates``.
167
+ - ``"filename"`` — stem of the PDF filename.
168
+
169
+ Returns ``None`` if GROBID returns a non-200 status.
170
+ """
171
+ pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
172
+ input_path,
173
+ consolidate_header=True,
174
+ consolidate_citations=False,
175
+ segment_sentences=False,
176
+ tei_coordinates=coordinates,
177
+ include_raw_citations=False,
178
+ include_raw_affiliations=False,
179
+ generateIDs=True)
180
+
181
+ if status != 200:
182
+ return
183
+
184
+ document_object = self.parse_grobid_xml(text, coordinates=coordinates)
185
+ document_object['filename'] = Path(pdf_file).stem.replace(".tei", "")
186
+
187
+ return document_object
188
+
189
+ def process_single(self, input_file):
190
+ doc = self.process_structure(input_file)
191
+
192
+ for paragraph in doc['passages']:
193
+ entities = self.process_single_text(paragraph['text'])
194
+ paragraph['spans'] = entities
195
+
196
+ return doc
197
+
198
+ def parse_grobid_xml(self, text, coordinates=False):
199
+ """Parse GROBID TEI-XML into a structured passage dict.
200
+
201
+ Extracts title, abstract, body paragraphs, back-matter, and
202
+ figure descriptions from the XML, post-processes encoding
203
+ artefacts, and attaches coordinate metadata.
204
+
205
+ Args:
206
+ text: Raw TEI-XML string returned by GROBID.
207
+ coordinates: Whether to extract ``coords`` attributes.
208
+
209
+ Returns:
210
+ dict: ``{"biblio": {…}, "passages": […]}``
211
+ """
212
+ output_data = OrderedDict()
213
+
214
+ doc_biblio = grobid_tei_xml.parse_document_xml(text)
215
+ biblio = {
216
+ "doi": doc_biblio.header.doi if doc_biblio.header.doi is not None else "",
217
+ "authors": ", ".join([author.full_name for author in doc_biblio.header.authors]),
218
+ "title": doc_biblio.header.title,
219
+ "hash": doc_biblio.pdf_md5
220
+ }
221
+ try:
222
+ year = dateparser.parse(doc_biblio.header.date).year
223
+ biblio["publication_year"] = year
224
+ except Exception:
225
+ pass
226
+
227
+ output_data['biblio'] = biblio
228
+ passages = []
229
+ output_data['passages'] = passages
230
+ passage_type = "paragraph"
231
+
232
+ soup = BeautifulSoup(text, 'xml')
233
+ blocks_header = get_xml_nodes_header(soup, use_paragraphs=True)
234
+
235
+ # passages.append({
236
+ # "text": f"authors: {biblio['authors']}",
237
+ # "type": passage_type,
238
+ # "section": "<header>",
239
+ # "subSection": "<authors>",
240
+ # "passage_id": "hauthors",
241
+ # "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
242
+ # blocks_header['authors']])
243
+ # })
244
+
245
+ passages.append({
246
+ "text": self.post_process(" ".join([node.text for node in blocks_header['title']])),
247
+ "type": passage_type,
248
+ "section": "<header>",
249
+ "subSection": "<title>",
250
+ "passage_id": "htitle",
251
+ "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
252
+ blocks_header['title']])
253
+ })
254
+
255
+ passages.append({
256
+ "text": self.post_process(
257
+ ''.join(text for node in blocks_header['abstract'] for text in node.find_all(text=True) if
258
+ text.parent.name != "ref" or (
259
+ text.parent.name == "ref" and text.parent.attrs[
260
+ 'type'] != 'bibr'))),
261
+ "type": passage_type,
262
+ "section": "<header>",
263
+ "subSection": "<abstract>",
264
+ "passage_id": "habstract",
265
+ "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
266
+ blocks_header['abstract']])
267
+ })
268
+
269
+ text_blocks_body = get_xml_nodes_body(soup, verbose=False, use_paragraphs=True)
270
+ text_blocks_body.extend(get_xml_nodes_back(soup, verbose=False, use_paragraphs=True))
271
+
272
+ use_paragraphs = True
273
+ if not use_paragraphs:
274
+ passages.extend([
275
+ {
276
+ "text": self.post_process(''.join(text for text in sentence.find_all(text=True) if
277
+ text.parent.name != "ref" or (
278
+ text.parent.name == "ref" and text.parent.attrs[
279
+ 'type'] != 'bibr'))),
280
+ "type": passage_type,
281
+ "section": "<body>",
282
+ "subSection": "<paragraph>",
283
+ "passage_id": str(paragraph_id),
284
+ "coordinates": paragraph['coords'] if coordinates and sentence.has_attr('coords') else ""
285
+ }
286
+ for paragraph_id, paragraph in enumerate(text_blocks_body) for
287
+ sentence_id, sentence in enumerate(paragraph)
288
+ ])
289
+ else:
290
+ passages.extend([
291
+ {
292
+ "text": self.post_process(''.join(text for text in paragraph.find_all(text=True) if
293
+ text.parent.name != "ref" or (
294
+ text.parent.name == "ref" and text.parent.attrs[
295
+ 'type'] != 'bibr'))),
296
+ "type": passage_type,
297
+ "section": "<body>",
298
+ "subSection": "<paragraph>",
299
+ "passage_id": str(paragraph_id),
300
+ "coordinates": paragraph['coords'] if coordinates and paragraph.has_attr('coords') else ""
301
+ }
302
+ for paragraph_id, paragraph in enumerate(text_blocks_body)
303
+ ])
304
+
305
+ text_blocks_figures = get_xml_nodes_figures(soup, verbose=False)
306
+
307
+ if not use_paragraphs:
308
+ passages.extend([
309
+ {
310
+ "text": self.post_process(''.join(text for text in sentence.find_all(text=True) if
311
+ text.parent.name != "ref" or (
312
+ text.parent.name == "ref" and text.parent.attrs[
313
+ 'type'] != 'bibr'))),
314
+ "type": passage_type,
315
+ "section": "<body>",
316
+ "subSection": "<figure>",
317
+ "passage_id": str(paragraph_id) + str(sentence_id),
318
+ "coordinates": sentence['coords'] if coordinates and 'coords' in sentence else ""
319
+ }
320
+ for paragraph_id, paragraph in enumerate(text_blocks_figures) for
321
+ sentence_id, sentence in enumerate(paragraph)
322
+ ])
323
+ else:
324
+ passages.extend([
325
+ {
326
+ "text": self.post_process(''.join(text for text in paragraph.find_all(text=True) if
327
+ text.parent.name != "ref" or (
328
+ text.parent.name == "ref" and text.parent.attrs[
329
+ 'type'] != 'bibr'))),
330
+ "type": passage_type,
331
+ "section": "<body>",
332
+ "subSection": "<figure>",
333
+ "passage_id": str(paragraph_id),
334
+ "coordinates": paragraph['coords'] if coordinates and paragraph.has_attr('coords') else ""
335
+ }
336
+ for paragraph_id, paragraph in enumerate(text_blocks_figures)
337
+ ])
338
+
339
+ return output_data
340
+
341
+
342
+ class GrobidQuantitiesProcessor(BaseProcessor):
343
+ """NER processor for physical quantities (measurements, units).
344
+
345
+ Wraps the `grobid-quantities <https://github.com/kermitt2/grobid-quantities>`_
346
+ service to identify and normalise measurements in text.
347
+
348
+ Args:
349
+ grobid_quantities_client: A configured quantities API client.
350
+ """
351
+
352
+ def __init__(self, grobid_quantities_client):
353
+ self.grobid_quantities_client = grobid_quantities_client
354
+
355
+ def process(self, text) -> list:
356
+ """Extract quantity spans from *text*.
357
+
358
+ Args:
359
+ text: Plain text to analyse.
360
+
361
+ Returns:
362
+ list[dict]: Span dicts with ``offset_start``, ``offset_end``,
363
+ ``type`` (``"property"``), and optional ``unit_type`` /
364
+ ``quantified`` keys.
365
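+
+ Example (a sketch; assumes a running grobid-quantities service):
+
+ >>> spans = processor.process("The sample was heated to 300 K.")  # doctest: +SKIP
+ >>> spans[0]["type"]  # doctest: +SKIP
+ 'property'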
+ """
366
+ status, result = self.grobid_quantities_client.process_text(text.strip())
367
+
368
+ if status != 200:
369
+ result = {}
370
+
371
+ spans = []
372
+
373
+ if 'measurements' in result:
374
+ found_measurements = self.parse_measurements_output(result)
375
+
376
+ for m in found_measurements:
377
+ item = {
378
+ "text": text[m['offset_start']:m['offset_end']],
379
+ 'offset_start': m['offset_start'],
380
+ 'offset_end': m['offset_end']
381
+ }
382
+
383
+ if 'raw' in m and m['raw'] != item['text']:
384
+ item['text'] = m['raw']
385
+
386
+ if 'quantified_substance' in m:
387
+ item['quantified'] = m['quantified_substance']
388
+
389
+ if 'type' in m:
390
+ item["unit_type"] = m['type']
391
+
392
+ item['type'] = 'property'
393
+ # if 'raw_value' in m:
394
+ # item['raw_value'] = m['raw_value']
395
+
396
+ spans.append(item)
397
+
398
+ return spans
399
+
400
+ @staticmethod
401
+ def parse_measurements_output(result):
402
+ measurements_output = []
403
+
404
+ for measurement in result['measurements']:
405
+ type = measurement['type']
406
+ measurement_output_object = {}
407
+ quantity_type = None
408
+ has_unit = False
409
+ parsed_value_type = None
410
+
411
+ if 'quantified' in measurement:
412
+ if 'normalizedName' in measurement['quantified']:
413
+ quantified_substance = measurement['quantified']['normalizedName']
414
+ measurement_output_object["quantified_substance"] = quantified_substance
415
+
416
+ if 'measurementOffsets' in measurement:
417
+ measurement_output_object["offset_start"] = measurement["measurementOffsets"]['start']
418
+ measurement_output_object["offset_end"] = measurement["measurementOffsets"]['end']
419
+ else:
420
+ # If there are no offsets we skip the measurement
421
+ continue
422
+
423
+ # if 'measurementRaw' in measurement:
424
+ # measurement_output_object['raw_value'] = measurement['measurementRaw']
425
+
426
+ if type == 'value':
427
+ quantity = measurement['quantity']
428
+
429
+ parsed_value = GrobidQuantitiesProcessor.get_parsed(quantity)
430
+ if parsed_value:
431
+ measurement_output_object['parsed'] = parsed_value
432
+
433
+ normalized_value = GrobidQuantitiesProcessor.get_normalized(quantity)
434
+ if normalized_value:
435
+ measurement_output_object['normalized'] = normalized_value
436
+
437
+ raw_value = GrobidQuantitiesProcessor.get_raw(quantity)
438
+ if raw_value:
439
+ measurement_output_object['raw'] = raw_value
440
+
441
+ if 'type' in quantity:
442
+ quantity_type = quantity['type']
443
+
444
+ if 'rawUnit' in quantity:
445
+ has_unit = True
446
+
447
+ parsed_value_type = get_parsed_value_type(quantity)
448
+
449
+ elif type == 'interval':
450
+ if 'quantityMost' in measurement:
451
+ quantityMost = measurement['quantityMost']
452
+ if 'type' in quantityMost:
453
+ quantity_type = quantityMost['type']
454
+
455
+ if 'rawUnit' in quantityMost:
456
+ has_unit = True
457
+
458
+ parsed_value_type = get_parsed_value_type(quantityMost)
459
+
460
+ if 'quantityLeast' in measurement:
461
+ quantityLeast = measurement['quantityLeast']
462
+
463
+ if 'type' in quantityLeast:
464
+ quantity_type = quantityLeast['type']
465
+
466
+ if 'rawUnit' in quantityLeast:
467
+ has_unit = True
468
+
469
+ parsed_value_type = get_parsed_value_type(quantityLeast)
470
+
471
+ elif type == 'listc':
472
+ quantities = measurement['quantities']
473
+
474
+ if 'type' in quantities[0]:
475
+ quantity_type = quantities[0]['type']
476
+
477
+ if 'rawUnit' in quantities[0]:
478
+ has_unit = True
479
+
480
+ parsed_value_type = get_parsed_value_type(quantities[0])
481
+
482
+ if quantity_type is not None or has_unit:
483
+ measurement_output_object['type'] = quantity_type
484
+
485
+ if parsed_value_type is None or parsed_value_type not in ['ALPHABETIC', 'TIME']:
486
+ measurements_output.append(measurement_output_object)
487
+
488
+ return measurements_output
489
+
490
+ @staticmethod
491
+ def get_parsed(quantity):
492
+ parsed_value = parsed_unit = None
493
+ if 'parsedValue' in quantity and 'parsed' in quantity['parsedValue']:
494
+ parsed_value = quantity['parsedValue']['parsed']
495
+ if 'parsedUnit' in quantity and 'name' in quantity['parsedUnit']:
496
+ parsed_unit = quantity['parsedUnit']['name']
497
+
498
+ if parsed_value and parsed_unit:
499
+ if has_space_between_value_and_unit(quantity):
500
+ return str(parsed_value) + " " + str(parsed_unit)
501
+ else:
502
+ return str(parsed_value) + " " + str(parsed_unit)
503
+
504
+ @staticmethod
505
+ def get_normalized(quantity):
506
+ normalized_value = normalized_unit = None
507
+ if 'normalizedQuantity' in quantity:
508
+ normalized_value = quantity['normalizedQuantity']
509
+ if 'normalizedUnit' in quantity and 'name' in quantity['normalizedUnit']:
510
+ normalized_unit = quantity['normalizedUnit']['name']
511
+
512
+ if normalized_value and normalized_unit:
513
+ if has_space_between_value_and_unit(quantity):
514
+ return str(normalized_value) + " " + str(normalized_unit)
515
+ else:
516
+ return str(normalized_value) + str(normalized_unit)
517
+
518
+ @staticmethod
519
+ def get_raw(quantity):
520
+ raw_value = raw_unit = None
521
+ if 'rawValue' in quantity:
522
+ raw_value = quantity['rawValue']
523
+ if 'rawUnit' in quantity and 'name' in quantity['rawUnit']:
524
+ raw_unit = quantity['rawUnit']['name']
525
+
526
+ if raw_value and raw_unit:
527
+ if has_space_between_value_and_unit(quantity):
528
+ return str(raw_value) + " " + str(raw_unit)
529
+ else:
530
+ return str(raw_value) + str(raw_unit)
531
+
532
+
533
+ class GrobidMaterialsProcessor(BaseProcessor):
534
+ """NER processor for material mentions (chemical compounds, etc.).
535
+
536
+ Wraps the `grobid-superconductors <https://github.com/lfoppiano/grobid-superconductors>`_
537
+ service.
538
+
539
+ Args:
540
+ grobid_superconductors_client: A configured
541
+ :class:`~document_qa.ner_client_generic.NERClientGeneric` instance.
542
+ """
543
+
544
+ def __init__(self, grobid_superconductors_client):
545
+ self.grobid_superconductors_client = grobid_superconductors_client
546
+
547
+ def process(self, text):
548
+ """Extract material-mention spans from *text*.
549
+
550
+ Args:
551
+ text: Plain text to analyse.
552
+
553
+ Returns:
554
+ list[dict]: Span dicts with ``offset_start``, ``offset_end``,
555
+ ``type`` (``"material"``), and optional ``formula`` keys.
556
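+
+ Example (a sketch; assumes a running grobid-superconductors service):
+
+ >>> spans = processor.process("MgB2 is superconducting below 39 K.")  # doctest: +SKIP
+ >>> spans[0]["type"]  # doctest: +SKIP
+ 'material'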
+ """
557
+ preprocessed_text = text.strip()
558
+ status, result = self.grobid_superconductors_client.process_text(preprocessed_text,
559
+ "processText_disable_linking")
560
+
561
+ if status != 200:
562
+ result = {}
563
+
564
+ spans = []
565
+
566
+ if 'passages' in result:
567
+ materials = self.parse_superconductors_output(result, preprocessed_text)
568
+
569
+ for m in materials:
570
+ item = {"text": preprocessed_text[m['offset_start']:m['offset_end']]}
571
+
572
+ item['offset_start'] = m['offset_start']
573
+ item['offset_end'] = m['offset_end']
574
+
575
+ if 'formula' in m:
576
+ item["formula"] = m['formula']
577
+
578
+ item['type'] = 'material'
579
+ item['raw_value'] = m['text']
580
+
581
+ spans.append(item)
582
+
583
+ return spans
584
+
585
+ def parse_materials(self, text):
586
+ status, result = self.grobid_superconductors_client.process_texts(text.strip(), "parseMaterials")
587
+
588
+ if status != 200:
589
+ result = []
590
+
591
+ results = []
592
+ for position_material in result:
593
+ compositions = []
594
+ for material in position_material:
595
+ if 'resolvedFormulas' in material:
596
+ for resolved_formula in material['resolvedFormulas']:
597
+ if 'formulaComposition' in resolved_formula:
598
+ compositions.append(resolved_formula['formulaComposition'])
599
+ elif 'formula' in material:
600
+ if 'formulaComposition' in material['formula']:
601
+ compositions.append(material['formula']['formulaComposition'])
602
+ results.append(compositions)
603
+
604
+ return results
605
+
606
+ def parse_material(self, text):
607
+ status, result = self.grobid_superconductors_client.process_text(text.strip(), "parseMaterial")
608
+
609
+ if status != 200:
610
+ result = []
611
+
612
+ compositions = self.output_info(result)
613
+
614
+ return compositions
615
+
616
+ def output_info(self, result):
617
+ compositions = []
618
+ for material in result:
619
+ if 'resolvedFormulas' in material:
620
+ for resolved_formula in material['resolvedFormulas']:
621
+ if 'formulaComposition' in resolved_formula:
622
+ compositions.append(resolved_formula['formulaComposition'])
623
+ elif 'formula' in material:
624
+ if 'formulaComposition' in material['formula']:
625
+ compositions.append(material['formula']['formulaComposition'])
626
+ if 'name' in material:
627
+ compositions.append(material['name'])
628
+ return compositions
629
+
630
+ @staticmethod
631
+ def parse_superconductors_output(result, original_text):
632
+ materials = []
633
+
634
+ for passage in result['passages']:
635
+ sentence_offset = original_text.index(passage['text'])
636
+ if 'spans' in passage:
637
+ spans = passage['spans']
638
+ for material_span in filter(lambda s: s['type'] == '<material>', spans):
639
+ text_ = material_span['text']
640
+
641
+ base_material_information = {
642
+ "text": text_,
643
+ "offset_start": sentence_offset + material_span['offset_start'],
644
+ 'offset_end': sentence_offset + material_span['offset_end']
645
+ }
646
+
647
+ materials.append(base_material_information)
648
+
649
+ return materials
650
+
651
+
652
+ class GrobidAggregationProcessor(GrobidQuantitiesProcessor, GrobidMaterialsProcessor):
653
+ """Combined NER processor that merges quantity and material annotations.
654
+
655
+ Runs both :class:`GrobidQuantitiesProcessor` and
656
+ :class:`GrobidMaterialsProcessor`, then prunes overlapping spans so
657
+ that the output is clean and non-overlapping.
658
+
659
+ Args:
660
+ grobid_quantities_client: Optional quantities API client.
661
+ grobid_superconductors_client: Optional materials NER client.
662
+
663
+ Either or both clients may be ``None``; only the provided services
664
+ will be called.
665
+ """
666
+
667
+ def __init__(self, grobid_quantities_client=None, grobid_superconductors_client=None):
668
+ self.gqp = GrobidQuantitiesProcessor(grobid_quantities_client) if grobid_quantities_client else None
669
+ self.gmp = GrobidMaterialsProcessor(grobid_superconductors_client) if grobid_superconductors_client else None
672
+
673
+ def process_single_text(self, text):
674
+ """Run both NER services on *text* and return merged, deduplicated spans.
675
+
676
+ Args:
677
+ text: Plain text to process.
678
+
679
+ Returns:
680
+ list[dict]: Non-overlapping span dicts sorted by offset.
681
+ """
682
+ extracted_quantities_spans = self.process_properties(text)
683
+ extracted_materials_spans = self.process_materials(text)
684
+ all_entities = extracted_quantities_spans + extracted_materials_spans
685
+ entities = self.prune_overlapping_annotations(all_entities)
686
+ return entities
687
+
688
+ def process_properties(self, text):
689
+ if self.gqp:
690
+ return self.gqp.process(text)
691
+ else:
692
+ return []
693
+
694
+ def process_materials(self, text):
695
+ if self.gmp:
696
+ return self.gmp.process(text)
697
+ else:
698
+ return []
699
+
700
+ @staticmethod
701
+ def box_to_dict(box, color=None, type=None, border=None):
702
+ """Convert a GROBID coordinate list into an annotation dict.
703
+
704
+ Args:
705
+ box: List or tuple of ``[page, x, y, width, height]``.
706
+ color: Optional hex colour string for the annotation.
707
+ type: Optional annotation type label.
708
+ border: Optional border style (e.g. ``"dotted"``).
709
+
710
+ Returns:
711
+ dict: Annotation dict suitable for ``streamlit-pdf-viewer``,
712
+ or empty dict if *box* is invalid.
713
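+
+ Example:
+
+ >>> GrobidAggregationProcessor.box_to_dict([1, 53.2, 110.4, 431.0, 14.1], color="#FF0000")
+ {'page': 1, 'x': 53.2, 'y': 110.4, 'width': 431.0, 'height': 14.1, 'color': '#FF0000'}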
+ """
714
+
715
+ if box is None or box == "" or len(box) < 5:
716
+ return {}
717
+
718
+ item = {"page": box[0], "x": box[1], "y": box[2], "width": box[3], "height": box[4]}
719
+ if color:
720
+ item['color'] = color
721
+
722
+ if type:
723
+ item['type'] = type
724
+
725
+ if border:
726
+ item['border'] = border
727
+
728
+ return item
729
+
730
+ @staticmethod
731
+ def prune_overlapping_annotations(entities: list) -> list:
732
+ """Remove overlapping spans, keeping the most informative one.
733
+
734
+ When two spans overlap, the longer span is preferred. Adjacent
735
+ spans of the same type may be merged (e.g. a split decimal number).
736
+
737
+ Args:
738
+ entities: List of span dicts with ``offset_start``,
739
+ ``offset_end``, ``type``, and ``text`` keys.
740
+
741
+ Returns:
742
+ list[dict]: Pruned, non-overlapping spans sorted by offset.
743
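+
+ Example: when two spans cover exactly the same offsets, the
+ material span wins the tie:
+
+ >>> a = {"offset_start": 0, "offset_end": 4, "type": "material", "text": "MgB2"}
+ >>> b = {"offset_start": 0, "offset_end": 4, "type": "property", "text": "MgB2"}
+ >>> [e["type"] for e in GrobidAggregationProcessor.prune_overlapping_annotations([a, b])]
+ ['material']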
+ """
744
+ # Sorting by offsets
745
+ sorted_entities = sorted(entities, key=lambda d: d['offset_start'])
746
+
747
+ if len(entities) <= 1:
748
+ return sorted_entities
749
+
750
+ to_be_removed = []
751
+
752
+ previous = None
753
+ first = True
754
+
755
+ for current in sorted_entities:
756
+ if first:
757
+ first = False
758
+ previous = current
759
+ continue
760
+
761
+ if previous['offset_start'] < current['offset_start'] \
762
+ and previous['offset_end'] < current['offset_end'] \
763
+ and (previous['offset_end'] < current['offset_start'] \
764
+ and not (previous['text'] == "-" and current['text'][0].isdigit())):
765
+ previous = current
766
+ continue
767
+
768
+ if previous['offset_end'] < current['offset_end']:
769
+ if current['type'] == previous['type']:
770
+ # Type is the same
771
+ if current['offset_start'] == previous['offset_end']:
772
+ if current['type'] == 'property':
773
+ if current['text'].startswith("."):
774
+ print(
775
+ f"Merging. {current['text']} <{current['type']}> with {previous['text']} <{previous['type']}>")
776
+ # current entity starts with a ".", suspiciously looks like a truncated value
777
+ to_be_removed.append(previous)
778
+ current['text'] = previous['text'] + current['text']
779
+ current['raw_value'] = current['text']
780
+ current['offset_start'] = previous['offset_start']
781
+ elif previous['text'].endswith(".") and current['text'][0].isdigit():
782
+ print(
783
+ f"Merging. {current['text']} <{current['type']}> with {previous['text']} <{previous['type']}>")
784
+ # previous entity ends with ".", current entity starts with a number
785
+ to_be_removed.append(previous)
786
+ current['text'] = previous['text'] + current['text']
787
+ current['raw_value'] = current['text']
788
+ current['offset_start'] = previous['offset_start']
789
+ elif previous['text'].startswith("-"):
790
+ print(
791
+ f"Merging. {current['text']} <{current['type']}> with {previous['text']} <{previous['type']}>")
792
+ # previous starts with a `-`, sherlock this is another truncated value
793
+ current['text'] = previous['text'] + current['text']
794
+ current['raw_value'] = current['text']
795
+ current['offset_start'] = previous['offset_start']
796
+ to_be_removed.append(previous)
797
+ else:
798
+ print("Other cases to be considered: ", previous, current)
799
+ else:
800
+ if current['text'].startswith("-"):
801
+ print(
802
+ f"Merging. {current['text']} <{current['type']}> with {previous['text']} <{previous['type']}>")
803
+ # current starts with a `-`, sherlock this is another truncated value
804
+ current['text'] = previous['text'] + current['text']
805
+ current['raw_value'] = current['text']
806
+ current['offset_start'] = previous['offset_start']
807
+ to_be_removed.append(previous)
808
+ else:
809
+ print("Other cases to be considered: ", previous, current)
810
+
811
+ elif previous['text'] == "-" and current['text'][0].isdigit():
812
+ print(
813
+ f"Merging. {current['text']} <{current['type']}> with {previous['text']} <{previous['type']}>")
814
+ # previous is exactly "-" and current starts with a digit, sherlock this is another truncated value
815
+ current['text'] = previous['text'] + " " * (current['offset_start'] - previous['offset_end']) + \
816
+ current['text']
817
+ current['raw_value'] = current['text']
818
+ current['offset_start'] = previous['offset_start']
819
+ to_be_removed.append(previous)
820
+ else:
821
+ print(
822
+ f"Overlapping. {current['text']} <{current['type']}> with {previous['text']} <{previous['type']}>")
823
+
824
+ # take the largest one
825
+ if len(previous['text']) > len(current['text']):
826
+ to_be_removed.append(current)
827
+ elif len(previous['text']) < len(current['text']):
828
+ to_be_removed.append(previous)
829
+ else:
830
+ to_be_removed.append(previous)
831
+ elif current['type'] != previous['type']:
832
+ print(
833
+ f"Overlapping. {current['text']} <{current['type']}> with {previous['text']} <{previous['type']}>")
834
+
835
+ if len(previous['text']) > len(current['text']):
836
+ to_be_removed.append(current)
837
+ elif len(previous['text']) < len(current['text']):
838
+ to_be_removed.append(previous)
839
+ else:
840
+ if current['type'] == "material":
841
+ to_be_removed.append(previous)
842
+ else:
843
+ to_be_removed.append(current)
844
+ previous = current
845
+
846
+ elif previous['offset_end'] > current['offset_end']:
847
+ to_be_removed.append(current)
848
+ # the previous span extends past the current one, so we keep the previous and discard the current
849
+ else:
850
+ if current['type'] == "material":
851
+ to_be_removed.append(previous)
852
+ else:
853
+ to_be_removed.append(current)
854
+ previous = current
855
+
856
+ new_sorted_entities = [e for e in sorted_entities if e not in to_be_removed]
857
+
858
+ return new_sorted_entities
859
+
860
+
861
+ class XmlProcessor(BaseProcessor):
862
+ def __init__(self):
863
+ super().__init__()
864
+
865
+ def process_structure(self, input_file):
866
+ text = ""
867
+ with open(input_file, encoding='utf-8') as fi:
868
+ text = fi.read()
869
+
870
+ output_data = self.parse_xml(text)
871
+ output_data['filename'] = Path(input_file).stem.replace(".tei", "")
872
+
873
+ return output_data
874
+
875
+ # def process_single(self, input_file):
876
+ # doc = self.process_structure(input_file)
877
+ #
878
+ # for paragraph in doc['passages']:
879
+ # entities = self.process_single_text(paragraph['text'])
880
+ # paragraph['spans'] = entities
881
+ #
882
+ # return doc
883
+
884
+ def process(self, text):
885
+ output_data = OrderedDict()
886
+ soup = BeautifulSoup(text, 'xml')
887
+ text_blocks_children = get_children_list_supermat(soup, verbose=False)
888
+
889
+ passages = []
890
+ output_data['passages'] = passages
891
+ passages.extend([
892
+ {
893
+ "text": self.post_process(''.join(text for text in sentence.find_all(text=True) if
894
+ text.parent.name != "ref" or (
895
+ text.parent.name == "ref" and text.parent.attrs[
896
+ 'type'] != 'bibr'))),
897
+ "type": "paragraph",
898
+ "section": "<body>",
899
+ "subSection": "<paragraph>",
900
+ "passage_id": str(paragraph_id) + str(sentence_id)
901
+ }
902
+ for paragraph_id, paragraph in enumerate(text_blocks_children) for
903
+ sentence_id, sentence in enumerate(paragraph)
904
+ ])
905
+
906
+ return output_data
907
+
908
+
909
+ def get_children_list_supermat(soup, use_paragraphs=False, verbose=False):
910
+ children = []
911
+
912
+ child_name = "p" if use_paragraphs else "s"
913
+ for child in soup.tei.children:
914
+ if child.name == 'teiHeader':
915
+ pass
916
+ children.append(child.find_all("title"))
917
+ children.extend([subchild.find_all(child_name) for subchild in child.find_all("abstract")])
918
+ children.extend([subchild.find_all(child_name) for subchild in child.find_all("ab", {"type": "keywords"})])
919
+ elif child.name == 'text':
920
+ children.extend([subchild.find_all(child_name) for subchild in child.find_all("body")])
921
+
922
+ if verbose:
923
+ print(str(children))
924
+
925
+ return children
926
+
927
+
928
+ def get_children_list_grobid(soup: object, use_paragraphs: bool = True, verbose: bool = False) -> list:
929
+ children = []
930
+
931
+ child_name = "p" if use_paragraphs else "s"
932
+ for child in soup.TEI.children:
933
+ if child.name == 'teiHeader':
934
+ pass
935
+ # children.extend(child.find_all("title", attrs={"level": "a"}, limit=1))
936
+ # children.extend([subchild.find_all(child_name) for subchild in child.find_all("abstract")])
937
+ elif child.name == 'text':
938
+ children.extend([subchild.find_all(child_name) for subchild in child.find_all("body")])
939
+ children.extend([subchild.find_all("figDesc") for subchild in child.find_all("body")])
940
+
941
+ if verbose:
942
+ print(str(children))
943
+
944
+ return children
945
+
946
+
947
+ def get_xml_nodes_header(soup: object, use_paragraphs: bool = True) -> dict:
948
+ sub_tag = "p" if use_paragraphs else "s"
949
+
950
+ header_elements = {
951
+ "authors": [persNameNode for persNameNode in soup.teiHeader.find_all("persName")],
952
+ "abstract": [p_in_abstract for abstractNodes in soup.teiHeader.find_all("abstract") for p_in_abstract in
953
+ abstractNodes.find_all(sub_tag)],
954
+ "title": [soup.teiHeader.fileDesc.title]
955
+ }
956
+
957
+ return header_elements
958
+
959
+
960
+ def get_xml_nodes_body(soup: object, use_paragraphs: bool = True, verbose: bool = False) -> list:
961
+ nodes = []
962
+ tag_name = "p" if use_paragraphs else "s"
963
+ for child in soup.TEI.children:
964
+ if child.name == 'text':
965
+ # nodes.extend([subchild.find_all(tag_name) for subchild in child.find_all("body")])
966
+ nodes.extend(
967
+ [subsubchild for subchild in child.find_all("body") for subsubchild in subchild.find_all(tag_name)])
968
+
969
+ if verbose:
970
+ print(str(nodes))
971
+
972
+ return nodes
973
+
974
+
975
+ def get_xml_nodes_back(soup: object, use_paragraphs: bool = True, verbose: bool = False) -> list:
976
+ nodes = []
977
+ tag_name = "p" if use_paragraphs else "s"
978
+ for child in soup.TEI.children:
979
+ if child.name == 'text':
980
+ nodes.extend(
981
+ [subsubchild for subchild in child.find_all("back") for subsubchild in subchild.find_all(tag_name)])
982
+
983
+ if verbose:
984
+ print(str(nodes))
985
+
986
+ return nodes
987
+
988
+
989
+ def get_xml_nodes_figures(soup: object, verbose: bool = False) -> list:
990
+ children = []
991
+ for child in soup.TEI.children:
992
+ if child.name == 'text':
993
+ children.extend(
994
+ [subchild for subchilds in child.find_all("body") for subchild in subchilds.find_all("figDesc")])
995
+
996
+ if verbose:
997
+ print(str(children))
998
+
999
+ return children
document_qa/langchain.py ADDED
@@ -0,0 +1,222 @@
1
+ """LangChain vector store extensions for document-qa.
2
+
3
+ Extends ChromaDB with support for returning similarity scores **and**
4
+ raw embedding vectors alongside retrieved documents. This enables
5
+ the Streamlit frontend to compute relevance gradients and the
6
+ ``question_coefficient`` analysis mode.
7
+
8
+ """
9
+
10
+ from typing import Any, Optional, List, Dict, Tuple, ClassVar, Collection
11
+
12
+ from langchain.schema import Document
13
+ from langchain_community.vectorstores.chroma import Chroma, DEFAULT_K
14
+ from langchain_core.callbacks import CallbackManagerForRetrieverRun
15
+ from langchain_core.utils import xor_args
16
+ from langchain_core.vectorstores import VectorStore, VectorStoreRetriever
17
+
18
+
19
+ class AdvancedVectorStoreRetriever(VectorStoreRetriever):
20
+ """Retriever that can enrich documents with similarity scores and embeddings.
21
+
22
+ Extends LangChain's ``VectorStoreRetriever`` with a
23
+ ``"similarity_with_embeddings"`` search type. When used, each
24
+ returned document's ``metadata`` dict gains ``__similarity`` (float)
25
+ and ``__embeddings`` (list[float]) keys.
26
+ """
27
+
28
+ allowed_search_types: ClassVar[Collection[str]] = (
29
+ "similarity",
30
+ "similarity_score_threshold",
31
+ "mmr",
32
+ "similarity_with_embeddings"
33
+ )
34
+
35
+ def _get_relevant_documents(
36
+ self, query: str, *, run_manager: CallbackManagerForRetrieverRun
37
+ ) -> List[Document]:
38
+ """Fetch relevant documents for the configured search type.
39
+
40
+ Supports all standard search types plus
41
+ ``"similarity_with_embeddings"`` which attaches score and
42
+ embedding vector metadata to each document.
43
+
44
+ Args:
45
+ query: The search query string.
46
+ run_manager: LangChain callback manager.
47
+
48
+ Returns:
49
+ list[Document]: Retrieved documents, optionally enriched
50
+ with similarity scores and embeddings.
51
+ """
52
+
53
+ if self.search_type == "similarity_with_embeddings":
54
+ docs_scores_and_embeddings = (
55
+ self.vectorstore.advanced_similarity_search(
56
+ query, **self.search_kwargs
57
+ )
58
+ )
59
+
60
+ for doc, score, embeddings in docs_scores_and_embeddings:
61
+ if '__embeddings' not in doc.metadata.keys():
62
+ doc.metadata['__embeddings'] = embeddings
63
+ if '__similarity' not in doc.metadata.keys():
64
+ doc.metadata['__similarity'] = score
65
+
66
+ docs = [doc for doc, _, _ in docs_scores_and_embeddings]
67
+ elif self.search_type == "similarity_score_threshold":
68
+ docs_and_similarities = (
69
+ self.vectorstore.similarity_search_with_relevance_scores(
70
+ query, **self.search_kwargs
71
+ )
72
+ )
73
+ for doc, similarity in docs_and_similarities:
74
+ if '__similarity' not in doc.metadata.keys():
75
+ doc.metadata['__similarity'] = similarity
76
+
77
+ docs = [doc for doc, _ in docs_and_similarities]
78
+ else:
79
+ docs = super()._get_relevant_documents(query, run_manager=run_manager)
80
+
81
+ return docs
82
+
83
+
84
+ class AdvancedVectorStore(VectorStore):
85
+ """
86
+ Extension of LangChain's VectorStore that returns a custom retriever
87
+ supporting advanced search features.
88
+ """
89
+
90
+ def as_retriever(self, **kwargs: Any) -> AdvancedVectorStoreRetriever:
91
+ """Create a retriever supporting ``similarity_with_embeddings``.
92
+
93
+ Accepts the same keyword arguments as the base ``as_retriever``.
94
+ """
95
+ tags = kwargs.pop("tags", None) or []
96
+ tags.extend(self._get_retriever_tags())
97
+ return AdvancedVectorStoreRetriever(vectorstore=self, **kwargs, tags=tags)
98
+
99
+
100
+ class ChromaAdvancedRetrieval(Chroma, AdvancedVectorStore):
101
+ """Chroma vector store with support for embeddings + similarity scores.
102
+
103
+ Extends the standard LangChain ``Chroma`` store with
104
+ `advanced_similarity_search` which returns ``(Document, score,
105
+ embedding)`` triples.
106
+ """
107
+
108
+ def __init__(self, **kwargs):
109
+ super().__init__(**kwargs)
110
+
111
+ @xor_args(("query_texts", "query_embeddings"))
112
+ def __query_collection(
113
+ self,
114
+ query_texts: Optional[List[str]] = None,
115
+ query_embeddings: Optional[List[List[float]]] = None,
116
+ n_results: int = 4,
117
+ where: Optional[Dict[str, str]] = None,
118
+ where_document: Optional[Dict[str, str]] = None,
119
+ **kwargs: Any,
120
+ ) -> List[Document]:
121
+ """Query the chroma collection."""
122
+ try:
123
+ import chromadb # noqa: F401
124
+ except ImportError:
125
+ raise ValueError(
126
+ "Could not import chromadb python package. "
127
+ "Please install it with `pip install chromadb`."
128
+ )
129
+ return self._collection.query(
130
+ query_texts=query_texts,
131
+ query_embeddings=query_embeddings,
132
+ n_results=n_results,
133
+ where=where,
134
+ where_document=where_document,
135
+ **kwargs,
136
+ )
137
+
138
+ def advanced_similarity_search(
139
+ self,
140
+ query: str,
141
+ k: int = DEFAULT_K,
142
+ filter: Optional[Dict[str, str]] = None,
143
+ **kwargs: Any,
144
+ ) -> List[Tuple[Document, float, List[float]]]:
145
+ """Return documents, similarity scores, and embeddings for *query*.
146
+
147
+ Args:
148
+ query: The search query.
149
+ k: Number of results to return.
150
+ filter: Optional Chroma metadata filter.
151
+
152
+ Returns:
153
+ list[tuple[Document, float, list[float]]]: Triples of
154
+ (document, distance, embedding_vector).
155
+ """
156
+ docs_scores_and_embeddings = self.similarity_search_with_scores_and_embeddings(query, k, filter=filter)
157
+ return docs_scores_and_embeddings
158
+
159
+ def similarity_search_with_scores_and_embeddings(
160
+ self,
161
+ query: str,
162
+ k: int = DEFAULT_K,
163
+ filter: Optional[Dict[str, str]] = None,
164
+ where_document: Optional[Dict[str, str]] = None,
165
+ **kwargs: Any,
166
+ ) -> List[Tuple[Document, float, List[float]]]:
167
+ """Low-level search returning docs with scores and embeddings.
168
+
169
+ Queries the Chroma collection requesting ``distances`` and
170
+ ``embeddings`` in addition to the usual documents and metadata.
171
+
172
+ Args:
173
+ query: The search query.
174
+ k: Number of results.
175
+ filter: Optional metadata filter.
176
+ where_document: Optional document-content filter.
177
+
178
+ Returns:
179
+ list[tuple[Document, float, list[float]]]: Triples of
180
+ (document, distance, embedding_vector).
181
+ """
182
+
183
+ if self._embedding_function is None:
184
+ results = self.__query_collection(
185
+ query_texts=[query],
186
+ n_results=k,
187
+ where=filter,
188
+ where_document=where_document,
189
+ include=['metadatas', 'documents', 'embeddings', 'distances']
190
+ )
191
+ else:
192
+ query_embedding = self._embedding_function.embed_query(query)
193
+ results = self.__query_collection(
194
+ query_embeddings=[query_embedding],
195
+ n_results=k,
196
+ where=filter,
197
+ where_document=where_document,
198
+ include=['metadatas', 'documents', 'embeddings', 'distances']
199
+ )
200
+
201
+ return _results_to_docs_scores_and_embeddings(results)
202
+
203
+
204
+ def _results_to_docs_scores_and_embeddings(results: Any) -> List[Tuple[Document, float, List[float]]]:
205
+ """Unpack raw Chroma query results into ``(Document, score, embedding)`` tuples.
206
+
207
+ Args:
208
+ results: Dict returned by ``Collection.query()`` with
209
+ ``include=['documents', 'metadatas', 'distances', 'embeddings']``.
210
+
211
+ Returns:
212
+ list[tuple[Document, float, list[float]]]: One tuple per result.
213
+ """
214
+ return [
215
+ (Document(page_content=result[0], metadata=result[1] or {}), result[2], result[3])
216
+ for result in zip(
217
+ results["documents"][0],
218
+ results["metadatas"][0],
219
+ results["distances"][0],
220
+ results["embeddings"][0],
221
+ )
222
+ ]
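A minimal usage sketch of the pieces above (the two-passage corpus and embedding model are placeholders, not part of the repository): build a `ChromaAdvancedRetrieval` store, request the `similarity_with_embeddings` search type, and read the scores back from each document's metadata.

```python
from langchain_huggingface import HuggingFaceEmbeddings

from document_qa.langchain import ChromaAdvancedRetrieval

# Hypothetical corpus, embedded with an off-the-shelf sentence-transformers model.
store = ChromaAdvancedRetrieval.from_texts(
    ["MgB2 becomes superconducting below 39 K.", "The sample was annealed at 600 C."],
    embedding=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
)

retriever = store.as_retriever(search_type="similarity_with_embeddings", search_kwargs={"k": 2})
for doc in retriever.invoke("What is the critical temperature?"):
    # '__similarity' holds the Chroma distance (smaller = closer);
    # '__embeddings' holds the raw vector of the retrieved passage.
    print(round(doc.metadata["__similarity"], 3), doc.page_content)
```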
document_qa/ner_client_generic.py ADDED
@@ -0,0 +1,461 @@
+ import os
+ import time
+
+ import yaml
+
+ '''
+ This client is a generic client for any Grobid application and its sub-modules.
+ At the moment, it supports only single document processing.
+
+ Source: https://github.com/kermitt2/grobid-client-python
+ '''
+
+ """ Generic API Client """
+ from copy import deepcopy
+ import json
+ import requests
+
+ try:
+     from urlparse import urljoin
+ except ImportError:
+     from urllib.parse import urljoin
+
+
+ class ApiClient(object):
+     """ Client to interact with a generic REST API.
+
+     Subclasses should implement functionality in accordance with the
+     provided service methods, i.e. ``get``, ``post``, ``put`` and ``delete``.
+     """
+
+     accept_type = 'application/xml'
+     api_base = None
+
+     def __init__(
+             self,
+             base_url,
+             username=None,
+             api_key=None,
+             status_endpoint=None,
+             timeout=60
+     ):
+         """ Initialise client.
+
+         Args:
+             base_url (str): The base URL to the service being used.
+             username (str): The username to authenticate with.
+             api_key (str): The API key to authenticate with.
+             timeout (int): Maximum time before timing out.
+         """
+         self.base_url = base_url
+         self.username = username
+         self.api_key = api_key
+         self.status_endpoint = urljoin(self.base_url, status_endpoint)
+         self.timeout = timeout
+
+     @staticmethod
+     def encode(request, data):
+         """ Add request content data to request body, set Content-type header.
+
+         Should be overridden by subclasses if not using JSON encoding.
+
+         Args:
+             request (HTTPRequest): The request object.
+             data (dict, None): Data to be encoded.
+
+         Returns:
+             HTTPRequest: The request object.
+         """
+         if data is None:
+             return request
+
+         request.add_header('Content-Type', 'application/json')
+         request.extracted_data = json.dumps(data)
+
+         return request
+
+     @staticmethod
+     def decode(response):
+         """ Decode the returned data in the response.
+
+         Should be overridden by subclasses if something other than JSON is
+         expected.
+
+         Args:
+             response (HTTPResponse): The response object.
+
+         Returns:
+             dict or None.
+         """
+         try:
+             return response.json()
+         except ValueError as e:
+             return str(e)
+
+     def get_credentials(self):
+         """ Returns parameters to be added to authenticate the request.
+
+         This lives on its own to make it easier to re-implement it if needed.
+
+         Returns:
+             dict: A dictionary containing the credentials.
+         """
+         return {"username": self.username, "api_key": self.api_key}
+
+     def call_api(
+             self,
+             method,
+             url,
+             headers=None,
+             params=None,
+             data=None,
+             files=None,
+             timeout=None,
+     ):
+         """ Call the API.
+
+         This returns an object containing data, with error details if
+         applicable.
+
+         Args:
+             method (str): The HTTP method to use.
+             url (str): Resource location relative to the base URL.
+             headers (dict or None): Extra request headers to set.
+             params (dict or None): Query-string parameters.
+             data (dict or None): Request body contents for POST or PUT requests.
+             files (dict or None): Files to be passed to the request.
+             timeout (int): Maximum time before timing out.
+
+         Returns:
+             ResultParser or ErrorParser.
+         """
+         headers = deepcopy(headers) or {}
+         headers['Accept'] = self.accept_type if 'Accept' not in headers else headers['Accept']
+         params = deepcopy(params) or {}
+         data = data or {}
+         files = files or {}
+         # if self.username is not None and self.api_key is not None:
+         #     params.update(self.get_credentials())
+         r = requests.request(
+             method,
+             url,
+             headers=headers,
+             params=params,
+             files=files,
+             data=data,
+             timeout=timeout,
+         )
+
+         return r, r.status_code
+
+     def get(self, url, params=None, **kwargs):
+         """ Call the API with a GET request.
+
+         Args:
+             url (str): Resource location relative to the base URL.
+             params (dict or None): Query-string parameters.
+
+         Returns:
+             ResultParser or ErrorParser.
+         """
+         return self.call_api(
+             "GET",
+             url,
+             params=params,
+             **kwargs
+         )
+
+     def delete(self, url, params=None, **kwargs):
+         """ Call the API with a DELETE request.
+
+         Args:
+             url (str): Resource location relative to the base URL.
+             params (dict or None): Query-string parameters.
+
+         Returns:
+             ResultParser or ErrorParser.
+         """
+         return self.call_api(
+             "DELETE",
+             url,
+             params=params,
+             **kwargs
+         )
+
+     def put(self, url, params=None, data=None, files=None, **kwargs):
+         """ Call the API with a PUT request.
+
+         Args:
+             url (str): Resource location relative to the base URL.
+             params (dict or None): Query-string parameters.
+             data (dict or None): Request body contents.
+             files (dict or None): Files to be passed to the request.
+
+         Returns:
+             An instance of ResultParser or ErrorParser.
+         """
+         return self.call_api(
+             "PUT",
+             url,
+             params=params,
+             data=data,
+             files=files,
+             **kwargs
+         )
+
+     def post(self, url, params=None, data=None, files=None, **kwargs):
+         """ Call the API with a POST request.
+
+         Args:
+             url (str): Resource location relative to the base URL.
+             params (dict or None): Query-string parameters.
+             data (dict or None): Request body contents.
+             files (dict or None): Files to be passed to the request.
+
+         Returns:
+             An instance of ResultParser or ErrorParser.
+         """
+         return self.call_api(
+             method="POST",
+             url=url,
+             params=params,
+             data=data,
+             files=files,
+             **kwargs
+         )
+
+     def service_status(self, **kwargs):
+         """ Call the API to get the status of the service.
+
+         Returns:
+             An instance of ResultParser or ErrorParser.
+         """
+         return self.call_api(
+             'GET',
+             self.status_endpoint,
+             params={'format': 'json'},
+             **kwargs
+         )
+
+
+ class NERClientGeneric(ApiClient):
+
+     def __init__(self, config_path=None, ping=False):
+         self.config = None
+         if config_path is not None:
+             self.config = self._load_yaml_config_from_file(path=config_path)
+             super().__init__(self.config['grobid']['server'])
+
+             if ping:
+                 result = self.ping_service()
+                 if not result:
+                     raise Exception("Grobid is down.")
+
+         os.environ['NO_PROXY'] = "nims.go.jp"
+
+     @staticmethod
+     def _load_json_config_from_file(path='./config.json'):
+         """
+         Load the JSON configuration.
+         """
+         config = {}
+         with open(path, 'r') as fp:
+             config = json.load(fp)
+
+         return config
+
+     @staticmethod
+     def _load_yaml_config_from_file(path='./config.yaml'):
+         """
+         Load the YAML configuration.
+         """
+         config = {}
+         try:
+             with open(path, 'r') as the_file:
+                 raw_configuration = the_file.read()
+
+             config = yaml.safe_load(raw_configuration)
+         except Exception as e:
+             print("Configuration could not be loaded: ", str(e))
+             exit(1)
+
+         return config
+
+     def set_config(self, config, ping=False):
+         self.config = config
+         if ping:
+             try:
+                 result = self.ping_service()
+                 if not result:
+                     raise Exception("Grobid is down.")
+             except Exception as e:
+                 raise Exception("Grobid is down or other problems were encountered. ", e)
+
+     def ping_service(self):
+         # test if the server is up and running...
+         ping_url = self.get_url("ping")
+
+         r = requests.get(ping_url)
+         status = r.status_code
+
+         if status != 200:
+             print('GROBID server does not appear to be up and running: ' + str(status))
+             return False
+         else:
+             print("GROBID server is up and running")
+             return True
+
+     def get_url(self, action):
+         grobid_config = self.config['grobid']
+         base_url = grobid_config['server']
+         action_url = base_url + grobid_config['url_mapping'][action]
+
+         return action_url
+
+     def process_texts(self, input, method_name='superconductors', params={}, headers={"Accept": "application/json"}):
+
+         files = {
+             'texts': input
+         }
+
+         the_url = self.get_url(method_name)
+         params, the_url = self.get_params_from_url(the_url)
+
+         res, status = self.post(
+             url=the_url,
+             files=files,
+             data=params,
+             headers=headers
+         )
+
+         if status == 503:
+             time.sleep(self.config['sleep_time'])
+             return self.process_texts(input, method_name, params, headers)
+         elif status != 200:
+             print('Processing failed with error ' + str(status))
+             return status, None
+         else:
+             return status, json.loads(res.text)
+
+     def process_text(self, input, method_name='superconductors', params={}, headers={"Accept": "application/json"}):
+
+         files = {
+             'text': input
+         }
+
+         the_url = self.get_url(method_name)
+         params, the_url = self.get_params_from_url(the_url)
+
+         res, status = self.post(
+             url=the_url,
+             files=files,
+             data=params,
+             headers=headers
+         )
+
+         if status == 503:
+             time.sleep(self.config['sleep_time'])
+             return self.process_text(input, method_name, params, headers)
+         elif status != 200:
+             print('Processing failed with error ' + str(status))
+             return status, None
+         else:
+             return status, json.loads(res.text)
+
+     def process_pdf(self,
+                     form_data: dict,
+                     method_name='superconductors',
+                     params={},
+                     headers={"Accept": "application/json"}
+                     ):
+
+         the_url = self.get_url(method_name)
+         params, the_url = self.get_params_from_url(the_url)
+
+         res, status = self.post(
+             url=the_url,
+             files=form_data,
+             data=params,
+             headers=headers
+         )
+
+         if status == 503:
+             time.sleep(self.config['sleep_time'])
+             return self.process_pdf(form_data, method_name, params, headers)
+         elif status != 200:
+             print('Processing failed with error ' + str(status))
+         else:
+             return res.text
+
+     def process_pdfs(self, pdf_files, params={}):
+         pass
+
+     # NOTE: this definition shadows the form_data variant of process_pdf
+     # above; only this file-path variant is available on instances.
+     def process_pdf(
+             self,
+             pdf_file,
+             method_name,
+             params={},
+             headers={"Accept": "application/json"},
+             verbose=False,
+             retry=None
+     ):
+
+         files = {
+             'input': (
+                 pdf_file,
+                 open(pdf_file, 'rb'),
+                 'application/pdf',
+                 {'Expires': '0'}
+             )
+         }
+
+         the_url = self.get_url(method_name)
+
+         params, the_url = self.get_params_from_url(the_url)
+
+         res, status = self.post(
+             url=the_url,
+             files=files,
+             data=params,
+             headers=headers
+         )
+
+         if status == 503 or status == 429:
+             if retry is None:
+                 retry = self.config['max_retry'] - 1
+             else:
+                 if retry - 1 == 0:
+                     if verbose:
+                         print("Retries exhausted. Aborting request.")
+                     return None, status
+                 else:
+                     retry -= 1
+
+             sleep_time = self.config['sleep_time']
+             if verbose:
+                 print("Server is saturated, waiting", sleep_time, "seconds and trying again.")
+             time.sleep(sleep_time)
+             return self.process_pdf(pdf_file, method_name, params, headers, verbose=verbose, retry=retry)
+         elif status == 204:
+             # print('No content returned. Moving on. ')
+             return None, status
+         elif status != 200:
+             desc = None
+             if res.content:
+                 c = json.loads(res.text)
+                 desc = c['description'] if 'description' in c else None
+             return desc, status
+         else:
+             return res.text, status
+
+     def get_params_from_url(self, the_url):
+         """
+         Extract predefined query parameters embedded in the URL and return
+         them together with the URL stripped of its query string.
+         """
+         params = {}
+         if "?" in the_url:
+             split = the_url.split("?")
+             the_url = split[0]
+             params = split[1]
+
+             params = {param.split("=")[0]: param.split("=")[1] for param in params.split("&")}
+         return params, the_url
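A minimal configuration sketch for this client (server URL and endpoint mapping are placeholder values, not from the repository): endpoints are resolved through `config['grobid']['url_mapping']`, so a text-processing call reduces to a keyed lookup plus a POST.

```python
from document_qa.ner_client_generic import NERClientGeneric

# Hypothetical configuration; in the app these values come from the environment.
config = {
    'grobid': {
        'server': "http://localhost:8072",
        'sleep_time': 5,
        'timeout': 60,
        'url_mapping': {
            'superconductors': "/service/process/text",
        }
    }
}

client = NERClientGeneric()
client.set_config(config)

# POSTs the text to server + url_mapping['superconductors']; returns (status, parsed JSON).
status, entities = client.process_text("MgB2 superconducts below 39 K.", method_name='superconductors')
```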
pyproject.toml ADDED
@@ -0,0 +1,41 @@
+ [build-system]
+ requires = ["setuptools", "setuptools-scm"]
+ build-backend = "setuptools.build_meta"
+
+ [tool.bumpversion]
+ current_version = "0.5.1"
+ commit = "true"
+ tag = "true"
+ tag_name = "v{new_version}"
+
+ #[[tool.bumpversion.files]]
+ #filename = "version.txt"
+ #search = "{current_version}"
+ #replace = "{new_version}"
+
+ [project]
+ name = "document-qa-engine"
+ license = { file = "LICENSE" }
+ authors = [
+     { name = "Luca Foppiano", email = "lucanoro@duck.com" },
+ ]
+ maintainers = [
+     { name = "Luca Foppiano", email = "lucanoro@duck.com" }
+ ]
+ description = "Scientific Document Insight Q/A"
+ readme = "README.md"
+
+ dynamic = ['version', "dependencies"]
+
+ [tool.setuptools]
+ license-files = []
+
+ [tool.setuptools.dynamic]
+ dependencies = {file = ["requirements.txt"]}
+
+ [tool.setuptools_scm]
+
+ [project.urls]
+ Homepage = "https://document-insights.streamlit.app"
+ Repository = "https://github.com/lfoppiano/document-qa"
+ Changelog = "https://github.com/lfoppiano/document-qa/blob/main/CHANGELOG.md"
pytest.ini ADDED
@@ -0,0 +1,2 @@
+ [pytest]
+ testpaths = tests
requirements.txt ADDED
@@ -0,0 +1,34 @@
+ # Grobid
+ grobid-quantities-client==0.4.0
+ grobid-client-python==0.0.9
+ grobid-tei-xml==0.1.3
+
+ # Utils
+ tqdm==4.66.3
+ pyyaml==6.0.1
+ pytest==8.1.1
+ streamlit==1.45.1
+ lxml==5.2.1
+ beautifulsoup4==4.12.3
+ python-dotenv==1.0.1
+ watchdog==4.0.0
+ dateparser==1.2.0
+ requests>=2.31.0
+ numpy==1.26.4
+
+ # LLM
+ chromadb==0.4.24
+ tiktoken==0.9.0
+ openai==1.82.0
+ langchain==0.3.25
+ langchain-core==0.3.61
+ langchain-openai==0.3.18
+ langchain-huggingface==0.2.0
+ langchain-community==0.3.21
+ typing-inspect==0.9.0
+ typing_extensions==4.12.2
+ pydantic==2.10.6
+ sentence-transformers==2.6.1
+ streamlit-pdf-viewer==0.0.25
+ umap-learn==0.5.6
+ plotly==5.20.0
streamlit_app.py ADDED
@@ -0,0 +1,490 @@
+ """Streamlit frontend for the Document Q/A system.
+
+ This module implements the web UI for uploading scientific PDFs,
+ asking questions via an LLM-powered RAG pipeline, and viewing
+ highlighted PDF passages. It is the main entry-point when running::
+
+     streamlit run streamlit_app.py
+
+ Configuration is loaded from environment variables (see ``.env.example``).
+ """
+
+ import os
+ import re
+ from hashlib import blake2b
+ from tempfile import NamedTemporaryFile
+
+ import dotenv
+ from grobid_quantities.quantities import QuantitiesAPI
+ from langchain.memory import ConversationBufferMemory
+ from langchain_openai import ChatOpenAI
+ from streamlit_pdf_viewer import pdf_viewer
+
+ from document_qa.custom_embeddings import ModalEmbeddings
+ from document_qa.ner_client_generic import NERClientGeneric
+
+ dotenv.load_dotenv(override=True)
+
+ import streamlit as st
+ from document_qa.document_qa_engine import DocumentQAEngine, DataStorage
+ from document_qa.grobid_processors import GrobidAggregationProcessor, decorate_text_with_annotations
+
+ API_MODELS = {
+     "microsoft/Phi-4-mini-instruct": os.environ["PHI_URL"],
+     "Qwen/Qwen3-0.6B": os.environ["QWEN_URL"]
+ }
+
+ API_EMBEDDINGS = {
+     'intfloat/multilingual-e5-large-instruct-modal': os.environ['EMBEDS_URL']
+ }
+
+ if 'rqa' not in st.session_state:
+     st.session_state['rqa'] = {}
+
+ if 'model' not in st.session_state:
+     st.session_state['model'] = None
+
+ if 'api_keys' not in st.session_state:
+     st.session_state['api_keys'] = {}
+
+ if 'doc_id' not in st.session_state:
+     st.session_state['doc_id'] = None
+
+ if 'loaded_embeddings' not in st.session_state:
+     st.session_state['loaded_embeddings'] = None
+
+ if 'hash' not in st.session_state:
+     st.session_state['hash'] = None
+
+ if 'git_rev' not in st.session_state:
+     st.session_state['git_rev'] = "unknown"
+     if os.path.exists("revision.txt"):
+         with open("revision.txt", 'r') as fr:
+             from_file = fr.read()
+         st.session_state['git_rev'] = from_file if len(from_file) > 0 else "unknown"
+
+ if "messages" not in st.session_state:
+     st.session_state.messages = []
+
+ if 'ner_processing' not in st.session_state:
+     st.session_state['ner_processing'] = False
+
+ if 'uploaded' not in st.session_state:
+     st.session_state['uploaded'] = False
+
+ if 'memory' not in st.session_state:
+     st.session_state['memory'] = None
+
+ if 'binary' not in st.session_state:
+     st.session_state['binary'] = None
+
+ if 'annotations' not in st.session_state:
+     st.session_state['annotations'] = None
+
+ if 'should_show_annotations' not in st.session_state:
+     st.session_state['should_show_annotations'] = True
+
+ if 'pdf' not in st.session_state:
+     st.session_state['pdf'] = None
+
+ if 'embeddings' not in st.session_state:
+     st.session_state['embeddings'] = None
+
+ if 'scroll_to_first_annotation' not in st.session_state:
+     st.session_state['scroll_to_first_annotation'] = False
+
+ st.set_page_config(
+     page_title="Scientific Document Insights Q/A",
+     page_icon="📝",
+     initial_sidebar_state="expanded",
+     layout="wide",
+     menu_items={
+         'Get Help': 'https://github.com/lfoppiano/document-qa',
+         'Report a bug': "https://github.com/lfoppiano/document-qa/issues",
+         'About': "Upload a scientific article in PDF, ask questions, get insights."
+     }
+ )
+
+ st.markdown(
+     """
+     <style>
+     .block-container {
+         padding-top: 3rem;
+         padding-bottom: 1rem;
+         padding-left: 1rem;
+         padding-right: 1rem;
+     }
+     </style>
+     """,
+     unsafe_allow_html=True
+ )
+
+
+ def new_file():
+     """Reset session state when a new file is uploaded.
+
+     Clears previous embeddings, annotations, and conversation memory
+     so the pipeline starts fresh for the new document.
+     """
+     st.session_state['loaded_embeddings'] = None
+     st.session_state['doc_id'] = None
+     st.session_state['uploaded'] = True
+     st.session_state['annotations'] = []
+     if st.session_state['memory']:
+         st.session_state['memory'].clear()
+
+
+ def clear_memory():
+     """Clear the conversation buffer memory (chat history)."""
+     st.session_state['memory'].clear()
+
+
+ # @st.cache_resource
+ def init_qa(model_name, embeddings_name):
+     """Initialise the Q/A engine with the selected LLM and embedding models.
+
+     Args:
+         model_name: Key from ``API_MODELS`` selecting the LLM.
+         embeddings_name: Key from ``API_EMBEDDINGS`` selecting the
+             embedding model.
+
+     Returns:
+         DocumentQAEngine: Ready-to-use engine instance.
+     """
+     st.session_state['memory'] = ConversationBufferMemory(
+         memory_key="chat_history",
+         return_messages=True
+     )
+     chat = ChatOpenAI(
+         model=model_name,
+         temperature=0.0,
+         base_url=API_MODELS[model_name],
+         api_key=os.environ.get('API_KEY')
+     )
+
+     embeddings = ModalEmbeddings(
+         url=API_EMBEDDINGS[embeddings_name],
+         model_name=embeddings_name,
+         api_key=os.environ.get('EMBEDS_API_KEY')
+     )
+
+     storage = DataStorage(embeddings)
+     return DocumentQAEngine(chat, storage, grobid_url=os.environ['GROBID_URL'], memory=st.session_state['memory'])
+
+
+ @st.cache_resource
+ def init_ner():
+     """Initialise the NER aggregation processor (quantities + materials).
+
+     Uses ``GROBID_QUANTITIES_URL`` and ``GROBID_MATERIALS_URL`` from
+     the environment. Results are cached across Streamlit reruns.
+
+     Returns:
+         GrobidAggregationProcessor: Configured processor instance.
+     """
+     quantities_client = QuantitiesAPI(os.environ['GROBID_QUANTITIES_URL'], check_server=True)
+
+     materials_client = NERClientGeneric(ping=True)
+     config_materials = {
+         'grobid': {
+             "server": os.environ['GROBID_MATERIALS_URL'],
+             'sleep_time': 5,
+             'timeout': 60,
+             'url_mapping': {
+                 'processText_disable_linking': "/service/process/text?disableLinking=True",
+                 # 'processText_disable_linking': "/service/process/text"
+             }
+         }
+     }
+
+     materials_client.set_config(config_materials)
+
+     gqa = GrobidAggregationProcessor(grobid_quantities_client=quantities_client,
+                                      grobid_superconductors_client=materials_client)
+     return gqa
+
+
+ gqa = init_ner()
+
+
+ def get_file_hash(fname):
+     """Compute a BLAKE2b hex digest for the file at *fname*.
+
+     Used to generate deterministic document IDs from file content.
+     """
+     file_hash = blake2b()
+     with open(fname, "rb") as f:
+         for chunk in iter(lambda: f.read(4096), b""):
+             file_hash.update(chunk)
+     return file_hash.hexdigest()
+
+
+ def play_old_messages(container):
+     """Re-render previous chat messages into *container*.
+
+     Called on Streamlit reruns to restore the visible conversation
+     history from ``st.session_state['messages']``.
+     """
+     if st.session_state['messages']:
+         for message in st.session_state['messages']:
+             if message['role'] == 'user':
+                 container.chat_message("user").markdown(message['content'])
+             elif message['role'] == 'assistant':
+                 if message['mode'] == "llm":
+                     container.chat_message("assistant").markdown(message['content'], unsafe_allow_html=True)
+                 else:
+                     container.chat_message("assistant").write(message['content'])
+
+
+ # is_api_key_provided = st.session_state['api_key']
+
+ with st.sidebar:
+     st.title("📝 Document Q/A")
+     st.markdown("Upload a scientific article in PDF, ask questions, get insights.")
+     st.markdown(
+         ":warning: [Usage disclaimer](https://github.com/lfoppiano/document-qa?tab=readme-ov-file#disclaimer-on-data-security-and-privacy-%EF%B8%8F) :warning: ")
+     st.markdown("The LLM and embeddings are powered by [Modal.com](https://modal.com/)")
+
+     st.divider()
+     st.session_state['model'] = model = st.selectbox(
+         "Model:",
+         options=API_MODELS.keys(),
+         index=(list(API_MODELS.keys())).index(
+             os.environ["DEFAULT_MODEL"]) if "DEFAULT_MODEL" in os.environ and os.environ["DEFAULT_MODEL"] else 0,
+         placeholder="Select model",
+         help="Select the LLM model",
+         disabled=st.session_state['doc_id'] is not None or st.session_state['uploaded']
+     )
+
+     st.session_state['embeddings'] = embedding_name = st.selectbox(
+         "Embeddings:",
+         options=API_EMBEDDINGS.keys(),
+         index=(list(API_EMBEDDINGS.keys())).index(
+             os.environ["DEFAULT_EMBEDDING"]) if "DEFAULT_EMBEDDING" in os.environ and os.environ[
+             "DEFAULT_EMBEDDING"] else 0,
+         placeholder="Select embedding",
+         help="Select the embedding function",
+         disabled=st.session_state['doc_id'] is not None or st.session_state['uploaded']
+     )
+
+     api_key = os.environ['API_KEY']
+
+     if model not in st.session_state['rqa'] or model not in st.session_state['api_keys']:
+         with st.spinner("Preparing environment"):
+             st.session_state['rqa'][model] = init_qa(model, st.session_state['embeddings'])
+             st.session_state['api_keys'][model] = api_key
+
+ left_column, right_column = st.columns([5, 4])
+ right_column = right_column.container(border=True)
+ left_column = left_column.container(border=True)
+
+ with right_column:
+     uploaded_file = st.file_uploader(
+         "Upload a scientific article",
+         type="pdf",
+         on_change=new_file,
+         disabled=st.session_state['model'] is not None and st.session_state['model'] not in
+                  st.session_state['api_keys'],
+         help="The full text is extracted using Grobid."
+     )
+
+     placeholder = st.empty()
+     messages = st.container(height=300)
+
+ question = st.chat_input(
+     "Ask something about the article",
+     # placeholder="Can you give me a short summary?",
+     disabled=not uploaded_file
+ )
+
+ query_modes = {
+     "llm": "LLM Q/A",
+     "embeddings": "Embeddings",
+     "question_coefficient": "Question coefficient"
+ }
+
+ with st.sidebar:
+     st.header("Settings")
+     mode = st.radio(
+         "Query mode",
+         ("llm", "embeddings", "question_coefficient"),
+         disabled=not uploaded_file,
+         index=0,
+         horizontal=True,
+         format_func=lambda x: query_modes[x],
+         help="LLM answers the question; Embeddings shows the paragraphs of the paper "
+              "most relevant to the question; Question coefficient attempts to estimate "
+              "how effectively the question can be answered."
+     )
+     st.session_state['scroll_to_first_annotation'] = st.checkbox(
+         "Scroll to context",
+         help='The PDF viewer will automatically scroll to the first relevant passage in the document.'
+     )
+     st.session_state['ner_processing'] = st.checkbox(
+         "Identify materials and properties.",
+         help='The LLM responses undergo post-processing to extract physical quantities, measurements, and materials mentions.'
+     )
+
+     # Add a checkbox for showing annotations
+     # st.session_state['show_annotations'] = st.checkbox("Show annotations", value=True)
+     # st.session_state['should_show_annotations'] = st.checkbox("Show annotations", value=True)
+
+     chunk_size = st.slider("Text chunks size", -1, 2000, value=-1,
+                            help="Size of the chunks into which the document is split. "
+                                 "-1: use paragraphs; > 0: paragraphs are aggregated into chunks of the given size.",
+                            disabled=uploaded_file is not None)
+     if chunk_size == -1:
+         context_size = st.slider("Context size (paragraphs)", 3, 20, value=10,
+                                  help="Number of paragraphs to consider when answering a question",
+                                  disabled=not uploaded_file)
+     else:
+         context_size = st.slider("Context size (chunks)", 3, 10, value=4,
+                                  help="Number of chunks to consider when answering a question",
+                                  disabled=not uploaded_file)
+
+     st.divider()
+
+     st.header("Documentation")
+     st.markdown("https://github.com/lfoppiano/document-qa")
+     st.markdown(
+         """Upload a scientific article as PDF document. Once the spinner stops, you can proceed to ask your questions.""")
+
+     if st.session_state['git_rev'] != "unknown":
+         st.markdown("**Revision number**: [" + st.session_state[
+             'git_rev'] + "](https://github.com/lfoppiano/document-qa/commit/" + st.session_state['git_rev'] + ")")
+
+ if uploaded_file and not st.session_state.loaded_embeddings:
+     if model not in st.session_state['api_keys']:
+         st.error("Before uploading a document, you must enter the API key.")
+         st.stop()
+
+     with left_column:
+         with st.spinner('Reading file, calling Grobid, and creating in-memory embeddings...'):
+             binary = uploaded_file.getvalue()
+             tmp_file = NamedTemporaryFile()
+             tmp_file.write(bytearray(binary))
+             tmp_file.flush()
+             st.session_state['binary'] = binary
+
+             st.session_state['doc_id'] = st.session_state['rqa'][model].create_memory_embeddings(
+                 tmp_file.name,
+                 chunk_size=chunk_size,
+                 perc_overlap=0.1
+             )
+             st.session_state['loaded_embeddings'] = True
+             st.session_state.messages = []
+
+
+ def rgb_to_hex(rgb):
+     """Convert an ``(R, G, B)`` tuple to a ``#rrggbb`` hex string."""
+     return "#{:02x}{:02x}{:02x}".format(*rgb)
+
+
+ def generate_color_gradient(num_elements):
+     """Generate a warm-to-cold hex colour gradient for annotation ranking.
+
+     The first colour (most relevant passage) is orange; the last (least
+     relevant) is blue. Intermediate colours are linearly interpolated.
+
+     Args:
+         num_elements: Number of gradient stops to produce.
+
+     Returns:
+         list[str]: Hex colour strings, e.g. ``['#ffa500', …, '#0000ff']``.
+     """
+     # Define warm and cold colors in RGB format
+     warm_color = (255, 165, 0)  # Orange
+     cold_color = (0, 0, 255)  # Blue
+
+     # Generate a linear gradient of colors
+     color_gradient = [
+         rgb_to_hex(tuple(int(warm * (1 - i / num_elements) + cold * (i / num_elements)) for warm, cold in
+                          zip(warm_color, cold_color)))
+         for i in range(num_elements)
+     ]
+
+     return color_gradient
+
+
+ with right_column:
+     if st.session_state.loaded_embeddings and question and len(question) > 0 and st.session_state.doc_id:
+         st.session_state.messages.append({"role": "user", "mode": mode, "content": question})
+
+         for message in st.session_state.messages:
+             # with messages.chat_message(message["role"]):
+             if message['mode'] == "llm":
+                 messages.chat_message(message["role"]).markdown(message["content"], unsafe_allow_html=True)
+             elif message['mode'] == "embeddings":
+                 messages.chat_message(message["role"]).write(message["content"])
+             elif message['mode'] == "question_coefficient":
+                 messages.chat_message(message["role"]).markdown(message["content"], unsafe_allow_html=True)
+         if model not in st.session_state['rqa']:
+             st.error("The API key for " + model + " is missing. Please add it before sending any query.")
+             st.stop()
+
+         text_response = None
+         if mode == "embeddings":
+             with placeholder:
+                 with st.spinner("Fetching the relevant context..."):
+                     text_response, coordinates = st.session_state['rqa'][model].query_storage(
+                         question,
+                         st.session_state.doc_id,
+                         context_size=context_size
+                     )
+         elif mode == "llm":
+             with placeholder:
+                 with st.spinner("Generating LLM response..."):
+                     _, text_response, coordinates = st.session_state['rqa'][model].query_document(
+                         question,
+                         st.session_state.doc_id,
+                         context_size=context_size
+                     )
+
+         elif mode == "question_coefficient":
+             with st.spinner("Estimating question/context relevancy..."):
+                 text_response, coordinates = st.session_state['rqa'][model].analyse_query(
+                     question,
+                     st.session_state.doc_id,
+                     context_size=context_size
+                 )
+
+         annotations = [[GrobidAggregationProcessor.box_to_dict([cs for cs in c.split(",")]) for c in coord_doc]
+                        for coord_doc in coordinates]
+         gradients = generate_color_gradient(len(annotations))
+         for i, color in enumerate(gradients):
+             for annotation in annotations[i]:
+                 annotation['color'] = color
+                 if i == 0:
+                     annotation['border'] = "dotted"
+
+         st.session_state['annotations'] = [annotation for annotation_doc in annotations for annotation in
+                                            annotation_doc]
+
+         if not text_response:
+             st.error("Something went wrong. Please report the issue on GitHub, or contact info AT sciencialab.com.")
+
+         if mode == "llm":
+             if st.session_state['ner_processing']:
+                 with st.spinner("Processing NER on LLM response..."):
+                     entities = gqa.process_single_text(text_response)
+                     decorated_text = decorate_text_with_annotations(text_response.strip(), entities)
+                     decorated_text = decorated_text.replace('class="label material"', 'style="color:green"')
+                     decorated_text = re.sub(r'class="label[^"]+"', 'style="color:orange"', decorated_text)
+                     text_response = decorated_text
+             messages.chat_message("assistant").markdown(text_response, unsafe_allow_html=True)
+         else:
+             messages.chat_message("assistant").write(text_response)
+         st.session_state.messages.append({"role": "assistant", "mode": mode, "content": text_response})
+
+     elif st.session_state.loaded_embeddings and st.session_state.doc_id:
+         play_old_messages(messages)
+
+ with left_column:
+     if st.session_state['binary']:
+         with st.container(height=600):
+             pdf_viewer(
+                 input=st.session_state['binary'],
+                 annotation_outline_size=2,
+                 annotations=st.session_state['annotations'] if st.session_state['annotations'] else [],
+                 render_text=True,
+                 scroll_to_annotation=1 if (st.session_state['annotations'] and st.session_state[
+                     'scroll_to_first_annotation']) else None
+             )
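As a quick sanity check of the interpolation in `generate_color_gradient` (the function defined in `streamlit_app.py` above): rank 0 is pure orange, and later ranks shift linearly towards blue without ever reaching it, because `i` stops at `num_elements - 1`. The hex values below are approximate, subject to integer truncation.

```python
# Assumes generate_color_gradient from streamlit_app.py is in scope.
print(generate_color_gradient(3))
# Approximately ['#ffa500', '#aa6e54', '#5537a9']: orange fading towards blue.
```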