BryanW committed
Commit 9f88436 · verified · 1 Parent(s): 04b5c8e

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiosignal-1.3.2.dist-info/INSTALLER +1 -0
  2. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiosignal-1.3.2.dist-info/LICENSE +201 -0
  3. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiosignal-1.3.2.dist-info/METADATA +123 -0
  4. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiosignal-1.3.2.dist-info/RECORD +10 -0
  5. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiosignal-1.3.2.dist-info/WHEEL +6 -0
  6. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiosignal-1.3.2.dist-info/top_level.txt +1 -0
  7. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs/__init__.py +69 -0
  8. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs/__init__.pyi +263 -0
  9. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs/converters.py +3 -0
  10. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs/exceptions.py +3 -0
  11. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs/filters.py +3 -0
  12. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs/py.typed +0 -0
  13. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs/setters.py +3 -0
  14. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs/validators.py +3 -0
  15. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/__init__.py +79 -0
  16. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/arrow_writer.py +746 -0
  17. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/combine.py +215 -0
  18. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/data_files.py +825 -0
  19. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/fingerprint.py +494 -0
  20. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/hub.py +230 -0
  21. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/info.py +593 -0
  22. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/iterable_dataset.py +0 -0
  23. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/metric.py +652 -0
  24. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/search.py +785 -0
  25. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/splits.py +635 -0
  26. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/table.py +2422 -0
  27. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/frozenlist/__init__.py +98 -0
  28. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/frozenlist/__init__.pyi +47 -0
  29. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/frozenlist/_frozenlist.pyx +123 -0
  30. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/frozenlist/py.typed +1 -0
  31. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_models.py +516 -0
  32. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_ssl.py +9 -0
  33. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_utils.py +37 -0
  34. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/py.typed +0 -0
  35. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/jinja2-3.1.6.dist-info/INSTALLER +1 -0
  36. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/jinja2-3.1.6.dist-info/METADATA +84 -0
  37. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/jinja2-3.1.6.dist-info/RECORD +57 -0
  38. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/jinja2-3.1.6.dist-info/WHEEL +4 -0
  39. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/jinja2-3.1.6.dist-info/entry_points.txt +3 -0
  40. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml-6.0.2.dist-info/INSTALLER +1 -0
  41. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml-6.0.2.dist-info/METADATA +103 -0
  42. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml-6.0.2.dist-info/RECORD +204 -0
  43. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml-6.0.2.dist-info/WHEEL +6 -0
  44. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml-6.0.2.dist-info/top_level.txt +1 -0
  45. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia_curand_cu12-10.3.9.90.dist-info/INSTALLER +1 -0
  46. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia_curand_cu12-10.3.9.90.dist-info/License.txt +1568 -0
  47. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia_curand_cu12-10.3.9.90.dist-info/METADATA +44 -0
  48. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia_curand_cu12-10.3.9.90.dist-info/RECORD +32 -0
  49. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia_curand_cu12-10.3.9.90.dist-info/WHEEL +5 -0
  50. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia_curand_cu12-10.3.9.90.dist-info/top_level.txt +1 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiosignal-1.3.2.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
+pip
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiosignal-1.3.2.dist-info/LICENSE ADDED
@@ -0,0 +1,201 @@
+Apache License
+Version 2.0, January 2004
+http://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+1. Definitions.
+
+"License" shall mean the terms and conditions for use, reproduction,
+and distribution as defined by Sections 1 through 9 of this document.
+
+"Licensor" shall mean the copyright owner or entity authorized by
+the copyright owner that is granting the License.
+
+"Legal Entity" shall mean the union of the acting entity and all
+other entities that control, are controlled by, or are under common
+control with that entity. For the purposes of this definition,
+"control" means (i) the power, direct or indirect, to cause the
+direction or management of such entity, whether by contract or
+otherwise, or (ii) ownership of fifty percent (50%) or more of the
+outstanding shares, or (iii) beneficial ownership of such entity.
+
+"You" (or "Your") shall mean an individual or Legal Entity
+exercising permissions granted by this License.
+
+"Source" form shall mean the preferred form for making modifications,
+including but not limited to software source code, documentation
+source, and configuration files.
+
+"Object" form shall mean any form resulting from mechanical
+transformation or translation of a Source form, including but
+not limited to compiled object code, generated documentation,
+and conversions to other media types.
+
+"Work" shall mean the work of authorship, whether in Source or
+Object form, made available under the License, as indicated by a
+copyright notice that is included in or attached to the work
+(an example is provided in the Appendix below).
+
+"Derivative Works" shall mean any work, whether in Source or Object
+form, that is based on (or derived from) the Work and for which the
+editorial revisions, annotations, elaborations, or other modifications
+represent, as a whole, an original work of authorship. For the purposes
+of this License, Derivative Works shall not include works that remain
+separable from, or merely link (or bind by name) to the interfaces of,
+the Work and Derivative Works thereof.
+
+"Contribution" shall mean any work of authorship, including
+the original version of the Work and any modifications or additions
+to that Work or Derivative Works thereof, that is intentionally
+submitted to Licensor for inclusion in the Work by the copyright owner
+or by an individual or Legal Entity authorized to submit on behalf of
+the copyright owner. For the purposes of this definition, "submitted"
+means any form of electronic, verbal, or written communication sent
+to the Licensor or its representatives, including but not limited to
+communication on electronic mailing lists, source code control systems,
+and issue tracking systems that are managed by, or on behalf of, the
+Licensor for the purpose of discussing and improving the Work, but
+excluding communication that is conspicuously marked or otherwise
+designated in writing by the copyright owner as "Not a Contribution."
+
+"Contributor" shall mean Licensor and any individual or Legal Entity
+on behalf of whom a Contribution has been received by Licensor and
+subsequently incorporated within the Work.
+
+2. Grant of Copyright License. Subject to the terms and conditions of
+this License, each Contributor hereby grants to You a perpetual,
+worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+copyright license to reproduce, prepare Derivative Works of,
+publicly display, publicly perform, sublicense, and distribute the
+Work and such Derivative Works in Source or Object form.
+
+3. Grant of Patent License. Subject to the terms and conditions of
+this License, each Contributor hereby grants to You a perpetual,
+worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+(except as stated in this section) patent license to make, have made,
+use, offer to sell, sell, import, and otherwise transfer the Work,
+where such license applies only to those patent claims licensable
+by such Contributor that are necessarily infringed by their
+Contribution(s) alone or by combination of their Contribution(s)
+with the Work to which such Contribution(s) was submitted. If You
+institute patent litigation against any entity (including a
+cross-claim or counterclaim in a lawsuit) alleging that the Work
+or a Contribution incorporated within the Work constitutes direct
+or contributory patent infringement, then any patent licenses
+granted to You under this License for that Work shall terminate
+as of the date such litigation is filed.
+
+4. Redistribution. You may reproduce and distribute copies of the
+Work or Derivative Works thereof in any medium, with or without
+modifications, and in Source or Object form, provided that You
+meet the following conditions:
+
+(a) You must give any other recipients of the Work or
+Derivative Works a copy of this License; and
+
+(b) You must cause any modified files to carry prominent notices
+stating that You changed the files; and
+
+(c) You must retain, in the Source form of any Derivative Works
+that You distribute, all copyright, patent, trademark, and
+attribution notices from the Source form of the Work,
+excluding those notices that do not pertain to any part of
+the Derivative Works; and
+
+(d) If the Work includes a "NOTICE" text file as part of its
+distribution, then any Derivative Works that You distribute must
+include a readable copy of the attribution notices contained
+within such NOTICE file, excluding those notices that do not
+pertain to any part of the Derivative Works, in at least one
+of the following places: within a NOTICE text file distributed
+as part of the Derivative Works; within the Source form or
+documentation, if provided along with the Derivative Works; or,
+within a display generated by the Derivative Works, if and
+wherever such third-party notices normally appear. The contents
+of the NOTICE file are for informational purposes only and
+do not modify the License. You may add Your own attribution
+notices within Derivative Works that You distribute, alongside
+or as an addendum to the NOTICE text from the Work, provided
+that such additional attribution notices cannot be construed
+as modifying the License.
+
+You may add Your own copyright statement to Your modifications and
+may provide additional or different license terms and conditions
+for use, reproduction, or distribution of Your modifications, or
+for any such Derivative Works as a whole, provided Your use,
+reproduction, and distribution of the Work otherwise complies with
+the conditions stated in this License.
+
+5. Submission of Contributions. Unless You explicitly state otherwise,
+any Contribution intentionally submitted for inclusion in the Work
+by You to the Licensor shall be under the terms and conditions of
+this License, without any additional terms or conditions.
+Notwithstanding the above, nothing herein shall supersede or modify
+the terms of any separate license agreement you may have executed
+with Licensor regarding such Contributions.
+
+6. Trademarks. This License does not grant permission to use the trade
+names, trademarks, service marks, or product names of the Licensor,
+except as required for reasonable and customary use in describing the
+origin of the Work and reproducing the content of the NOTICE file.
+
+7. Disclaimer of Warranty. Unless required by applicable law or
+agreed to in writing, Licensor provides the Work (and each
+Contributor provides its Contributions) on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+implied, including, without limitation, any warranties or conditions
+of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+PARTICULAR PURPOSE. You are solely responsible for determining the
+appropriateness of using or redistributing the Work and assume any
+risks associated with Your exercise of permissions under this License.
+
+8. Limitation of Liability. In no event and under no legal theory,
+whether in tort (including negligence), contract, or otherwise,
+unless required by applicable law (such as deliberate and grossly
+negligent acts) or agreed to in writing, shall any Contributor be
+liable to You for damages, including any direct, indirect, special,
+incidental, or consequential damages of any character arising as a
+result of this License or out of the use or inability to use the
+Work (including but not limited to damages for loss of goodwill,
+work stoppage, computer failure or malfunction, or any and all
+other commercial damages or losses), even if such Contributor
+has been advised of the possibility of such damages.
+
+9. Accepting Warranty or Additional Liability. While redistributing
+the Work or Derivative Works thereof, You may choose to offer,
+and charge a fee for, acceptance of support, warranty, indemnity,
+or other liability obligations and/or rights consistent with this
+License. However, in accepting such obligations, You may act only
+on Your own behalf and on Your sole responsibility, not on behalf
+of any other Contributor, and only if You agree to indemnify,
+defend, and hold each Contributor harmless for any liability
+incurred by, or claims asserted against, such Contributor by reason
+of your accepting any such warranty or additional liability.
+
+END OF TERMS AND CONDITIONS
+
+APPENDIX: How to apply the Apache License to your work.
+
+To apply the Apache License to your work, attach the following
+boilerplate notice, with the fields enclosed by brackets "{}"
+replaced with your own identifying information. (Don't include
+the brackets!) The text should be enclosed in the appropriate
+comment syntax for the file format. We also recommend that a
+file or class name and description of purpose be included on the
+same "printed page" as the copyright notice for easier
+identification within third-party archives.
+
+Copyright 2013-2019 Nikolay Kim and Andrew Svetlov
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiosignal-1.3.2.dist-info/METADATA ADDED
@@ -0,0 +1,123 @@
+Metadata-Version: 2.1
+Name: aiosignal
+Version: 1.3.2
+Summary: aiosignal: a list of registered asynchronous callbacks
+Home-page: https://github.com/aio-libs/aiosignal
+Maintainer: aiohttp team <team@aiohttp.org>
+Maintainer-email: team@aiohttp.org
+License: Apache 2.0
+Project-URL: Chat: Gitter, https://gitter.im/aio-libs/Lobby
+Project-URL: CI: GitHub Actions, https://github.com/aio-libs/aiosignal/actions
+Project-URL: Coverage: codecov, https://codecov.io/github/aio-libs/aiosignal
+Project-URL: Docs: RTD, https://docs.aiosignal.org
+Project-URL: GitHub: issues, https://github.com/aio-libs/aiosignal/issues
+Project-URL: GitHub: repo, https://github.com/aio-libs/aiosignal
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Intended Audience :: Developers
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Operating System :: POSIX
+Classifier: Operating System :: MacOS :: MacOS X
+Classifier: Operating System :: Microsoft :: Windows
+Classifier: Framework :: AsyncIO
+Requires-Python: >=3.9
+Description-Content-Type: text/x-rst
+License-File: LICENSE
+Requires-Dist: frozenlist>=1.1.0
+
+=========
+aiosignal
+=========
+
+.. image:: https://github.com/aio-libs/aiosignal/workflows/CI/badge.svg
+   :target: https://github.com/aio-libs/aiosignal/actions?query=workflow%3ACI
+   :alt: GitHub status for master branch
+
+.. image:: https://codecov.io/gh/aio-libs/aiosignal/branch/master/graph/badge.svg
+   :target: https://codecov.io/gh/aio-libs/aiosignal
+   :alt: codecov.io status for master branch
+
+.. image:: https://badge.fury.io/py/aiosignal.svg
+   :target: https://pypi.org/project/aiosignal
+   :alt: Latest PyPI package version
+
+.. image:: https://readthedocs.org/projects/aiosignal/badge/?version=latest
+   :target: https://aiosignal.readthedocs.io/
+   :alt: Latest Read The Docs
+
+.. image:: https://img.shields.io/discourse/topics?server=https%3A%2F%2Faio-libs.discourse.group%2F
+   :target: https://aio-libs.discourse.group/
+   :alt: Discourse group for io-libs
+
+.. image:: https://badges.gitter.im/Join%20Chat.svg
+   :target: https://gitter.im/aio-libs/Lobby
+   :alt: Chat on Gitter
+
+Introduction
+============
+
+A project to manage callbacks in `asyncio` projects.
+
+``Signal`` is a list of registered asynchronous callbacks.
+
+The signal's life-cycle has two stages: after creation its content
+could be filled by using standard list operations: ``sig.append()``
+etc.
+
+After you call ``sig.freeze()`` the signal is *frozen*: adding, removing
+and dropping callbacks is forbidden.
+
+The only available operation is calling the previously registered
+callbacks by using ``await sig.send(data)``.
+
+For concrete usage examples see the `Signals
+<https://docs.aiohttp.org/en/stable/web_advanced.html#aiohttp-web-signals>
+section of the `Web Server Advanced
+<https://docs.aiohttp.org/en/stable/web_advanced.html>` chapter of the `aiohttp
+documentation`_.
+
+
+Installation
+------------
+
+::
+
+   $ pip install aiosignal
+
+The library requires Python 3.8 or newer.
+
+
+Documentation
+=============
+
+https://aiosignal.readthedocs.io/
+
+Communication channels
+======================
+
+*gitter chat* https://gitter.im/aio-libs/Lobby
+
+Requirements
+============
+
+- Python >= 3.8
+- frozenlist >= 1.0.0
+
+License
+=======
+
+``aiosignal`` is offered under the Apache 2 license.
+
+Source code
+===========
+
+The project is hosted on GitHub_
+
+Please file an issue in the `bug tracker
+<https://github.com/aio-libs/aiosignal/issues>`_ if you have found a bug
+or have some suggestions to improve the library.
+
+.. _GitHub: https://github.com/aio-libs/aiosignal
+.. _aiohttp documentation: https://docs.aiohttp.org/
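Editor's note: the README above describes ``Signal``'s two-stage life-cycle (fill it with list operations, then ``freeze()`` it, then ``await sig.send(...)``). A minimal runnable sketch of that life-cycle, not part of this commit, assuming the aiosignal 1.3.2 API packaged here (``Signal(owner)`` takes an owner object that is only used in the signal's repr):

import asyncio

from aiosignal import Signal


async def main() -> None:
    sig = Signal(owner=object())  # owner is only shown in the signal's repr

    async def on_event(data: str) -> None:
        print(f"received: {data}")

    sig.append(on_event)     # stage 1: mutate like a list
    sig.freeze()             # stage 2: adding/removing callbacks is now forbidden
    await sig.send("hello")  # stage 3: the only remaining operation


asyncio.run(main())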
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiosignal-1.3.2.dist-info/RECORD ADDED
@@ -0,0 +1,10 @@
+aiosignal-1.3.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
+aiosignal-1.3.2.dist-info/LICENSE,sha256=b9UkPpLdf5jsacesN3co50kFcJ_1J6W_mNbQJjwE9bY,11332
+aiosignal-1.3.2.dist-info/METADATA,sha256=TeI_xgZ191qgx37rviEnpMWC0QnYsg_j9EGVivNqqjc,3753
+aiosignal-1.3.2.dist-info/RECORD,,
+aiosignal-1.3.2.dist-info/WHEEL,sha256=pxeNX5JdtCe58PUSYP9upmc7jdRPgvT0Gm9kb1SHlVw,109
+aiosignal-1.3.2.dist-info/top_level.txt,sha256=z45aNOKGDdrI1roqZY3BGXQ22kJFPHBmVdwtLYLtXC0,10
+aiosignal/__init__.py,sha256=1oIrRl6kNpqFh32e7HfMFbMV_35v8sqJJFfnuKgmtEU,867
+aiosignal/__init__.pyi,sha256=xeCddYSS8fZAkz8S4HuKSR2IDe3N7RW_LKcXDPPA1Xk,311
+aiosignal/__pycache__/__init__.cpython-312.pyc,,
+aiosignal/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
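Editor's note: each RECORD row is ``path,sha256=<digest>,<size>``, where the digest is an unpadded urlsafe-base64 SHA-256 of the file, following the wheel RECORD convention. A small sketch, not part of this commit, that recomputes the digest for the one-line INSTALLER file above; the site-packages prefix is hypothetical and must be adjusted to your environment:

import base64
import hashlib
from pathlib import Path


def record_hash(path: Path) -> str:
    """Return the sha256 digest in the urlsafe-base64, unpadded form RECORD uses."""
    digest = hashlib.sha256(path.read_bytes()).digest()
    return base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")


# Hypothetical location; adjust to your own venv.
site = Path(".venv/lib/python3.12/site-packages")
installer = site / "aiosignal-1.3.2.dist-info" / "INSTALLER"
# Should match the digest recorded in the RECORD entry above.
assert record_hash(installer) == "zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg"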
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiosignal-1.3.2.dist-info/WHEEL ADDED
@@ -0,0 +1,6 @@
+Wheel-Version: 1.0
+Generator: setuptools (75.6.0)
+Root-Is-Purelib: true
+Tag: py2-none-any
+Tag: py3-none-any
+
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiosignal-1.3.2.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+aiosignal
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs/__init__.py ADDED
@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: MIT
+
+from attr import (
+    NOTHING,
+    Attribute,
+    AttrsInstance,
+    Converter,
+    Factory,
+    NothingType,
+    _make_getattr,
+    assoc,
+    cmp_using,
+    define,
+    evolve,
+    field,
+    fields,
+    fields_dict,
+    frozen,
+    has,
+    make_class,
+    mutable,
+    resolve_types,
+    validate,
+)
+from attr._next_gen import asdict, astuple
+
+from . import converters, exceptions, filters, setters, validators
+
+
+__all__ = [
+    "NOTHING",
+    "Attribute",
+    "AttrsInstance",
+    "Converter",
+    "Factory",
+    "NothingType",
+    "__author__",
+    "__copyright__",
+    "__description__",
+    "__doc__",
+    "__email__",
+    "__license__",
+    "__title__",
+    "__url__",
+    "__version__",
+    "__version_info__",
+    "asdict",
+    "assoc",
+    "astuple",
+    "cmp_using",
+    "converters",
+    "define",
+    "evolve",
+    "exceptions",
+    "field",
+    "fields",
+    "fields_dict",
+    "filters",
+    "frozen",
+    "has",
+    "make_class",
+    "mutable",
+    "resolve_types",
+    "setters",
+    "validate",
+    "validators",
+]
+
+__getattr__ = _make_getattr(__name__)
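Editor's note: this module is a thin facade that re-exports the modern ``attr`` APIs (``define``, ``field``, ``frozen``, ...) under the ``attrs`` name. A quick illustration of the re-exported API, not part of this commit:

from attrs import define, field


@define
class Point:
    # define() generates __init__, __repr__, __eq__ and slots automatically
    x: int
    y: int = field(default=0)


p = Point(1)
assert p == Point(1, 0)
print(p)  # Point(x=1, y=0)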
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs/__init__.pyi ADDED
@@ -0,0 +1,263 @@
+import sys
+
+from typing import (
+    Any,
+    Callable,
+    Mapping,
+    Sequence,
+    overload,
+    TypeVar,
+)
+
+# Because we need to type our own stuff, we have to make everything from
+# attr explicitly public too.
+from attr import __author__ as __author__
+from attr import __copyright__ as __copyright__
+from attr import __description__ as __description__
+from attr import __email__ as __email__
+from attr import __license__ as __license__
+from attr import __title__ as __title__
+from attr import __url__ as __url__
+from attr import __version__ as __version__
+from attr import __version_info__ as __version_info__
+from attr import assoc as assoc
+from attr import Attribute as Attribute
+from attr import AttrsInstance as AttrsInstance
+from attr import cmp_using as cmp_using
+from attr import converters as converters
+from attr import Converter as Converter
+from attr import evolve as evolve
+from attr import exceptions as exceptions
+from attr import Factory as Factory
+from attr import fields as fields
+from attr import fields_dict as fields_dict
+from attr import filters as filters
+from attr import has as has
+from attr import make_class as make_class
+from attr import NOTHING as NOTHING
+from attr import resolve_types as resolve_types
+from attr import setters as setters
+from attr import validate as validate
+from attr import validators as validators
+from attr import attrib, asdict as asdict, astuple as astuple
+from attr import NothingType as NothingType
+
+if sys.version_info >= (3, 11):
+    from typing import dataclass_transform
+else:
+    from typing_extensions import dataclass_transform
+
+_T = TypeVar("_T")
+_C = TypeVar("_C", bound=type)
+
+_EqOrderType = bool | Callable[[Any], Any]
+_ValidatorType = Callable[[Any, "Attribute[_T]", _T], Any]
+_CallableConverterType = Callable[[Any], Any]
+_ConverterType = _CallableConverterType | Converter[Any, Any]
+_ReprType = Callable[[Any], str]
+_ReprArgType = bool | _ReprType
+_OnSetAttrType = Callable[[Any, "Attribute[Any]", Any], Any]
+_OnSetAttrArgType = _OnSetAttrType | list[_OnSetAttrType] | setters._NoOpType
+_FieldTransformer = Callable[
+    [type, list["Attribute[Any]"]], list["Attribute[Any]"]
+]
+# FIXME: in reality, if multiple validators are passed they must be in a list
+# or tuple, but those are invariant and so would prevent subtypes of
+# _ValidatorType from working when passed in a list or tuple.
+_ValidatorArgType = _ValidatorType[_T] | Sequence[_ValidatorType[_T]]
+
+@overload
+def field(
+    *,
+    default: None = ...,
+    validator: None = ...,
+    repr: _ReprArgType = ...,
+    hash: bool | None = ...,
+    init: bool = ...,
+    metadata: Mapping[Any, Any] | None = ...,
+    converter: None = ...,
+    factory: None = ...,
+    kw_only: bool = ...,
+    eq: bool | None = ...,
+    order: bool | None = ...,
+    on_setattr: _OnSetAttrArgType | None = ...,
+    alias: str | None = ...,
+    type: type | None = ...,
+) -> Any: ...
+
+# This form catches an explicit None or no default and infers the type from the
+# other arguments.
+@overload
+def field(
+    *,
+    default: None = ...,
+    validator: _ValidatorArgType[_T] | None = ...,
+    repr: _ReprArgType = ...,
+    hash: bool | None = ...,
+    init: bool = ...,
+    metadata: Mapping[Any, Any] | None = ...,
+    converter: _ConverterType
+    | list[_ConverterType]
+    | tuple[_ConverterType]
+    | None = ...,
+    factory: Callable[[], _T] | None = ...,
+    kw_only: bool = ...,
+    eq: _EqOrderType | None = ...,
+    order: _EqOrderType | None = ...,
+    on_setattr: _OnSetAttrArgType | None = ...,
+    alias: str | None = ...,
+    type: type | None = ...,
+) -> _T: ...
+
+# This form catches an explicit default argument.
+@overload
+def field(
+    *,
+    default: _T,
+    validator: _ValidatorArgType[_T] | None = ...,
+    repr: _ReprArgType = ...,
+    hash: bool | None = ...,
+    init: bool = ...,
+    metadata: Mapping[Any, Any] | None = ...,
+    converter: _ConverterType
+    | list[_ConverterType]
+    | tuple[_ConverterType]
+    | None = ...,
+    factory: Callable[[], _T] | None = ...,
+    kw_only: bool = ...,
+    eq: _EqOrderType | None = ...,
+    order: _EqOrderType | None = ...,
+    on_setattr: _OnSetAttrArgType | None = ...,
+    alias: str | None = ...,
+    type: type | None = ...,
+) -> _T: ...
+
+# This form covers type=non-Type: e.g. forward references (str), Any
+@overload
+def field(
+    *,
+    default: _T | None = ...,
+    validator: _ValidatorArgType[_T] | None = ...,
+    repr: _ReprArgType = ...,
+    hash: bool | None = ...,
+    init: bool = ...,
+    metadata: Mapping[Any, Any] | None = ...,
+    converter: _ConverterType
+    | list[_ConverterType]
+    | tuple[_ConverterType]
+    | None = ...,
+    factory: Callable[[], _T] | None = ...,
+    kw_only: bool = ...,
+    eq: _EqOrderType | None = ...,
+    order: _EqOrderType | None = ...,
+    on_setattr: _OnSetAttrArgType | None = ...,
+    alias: str | None = ...,
+    type: type | None = ...,
+) -> Any: ...
+@overload
+@dataclass_transform(field_specifiers=(attrib, field))
+def define(
+    maybe_cls: _C,
+    *,
+    these: dict[str, Any] | None = ...,
+    repr: bool = ...,
+    unsafe_hash: bool | None = ...,
+    hash: bool | None = ...,
+    init: bool = ...,
+    slots: bool = ...,
+    frozen: bool = ...,
+    weakref_slot: bool = ...,
+    str: bool = ...,
+    auto_attribs: bool = ...,
+    kw_only: bool = ...,
+    cache_hash: bool = ...,
+    auto_exc: bool = ...,
+    eq: bool | None = ...,
+    order: bool | None = ...,
+    auto_detect: bool = ...,
+    getstate_setstate: bool | None = ...,
+    on_setattr: _OnSetAttrArgType | None = ...,
+    field_transformer: _FieldTransformer | None = ...,
+    match_args: bool = ...,
+) -> _C: ...
+@overload
+@dataclass_transform(field_specifiers=(attrib, field))
+def define(
+    maybe_cls: None = ...,
+    *,
+    these: dict[str, Any] | None = ...,
+    repr: bool = ...,
+    unsafe_hash: bool | None = ...,
+    hash: bool | None = ...,
+    init: bool = ...,
+    slots: bool = ...,
+    frozen: bool = ...,
+    weakref_slot: bool = ...,
+    str: bool = ...,
+    auto_attribs: bool = ...,
+    kw_only: bool = ...,
+    cache_hash: bool = ...,
+    auto_exc: bool = ...,
+    eq: bool | None = ...,
+    order: bool | None = ...,
+    auto_detect: bool = ...,
+    getstate_setstate: bool | None = ...,
+    on_setattr: _OnSetAttrArgType | None = ...,
+    field_transformer: _FieldTransformer | None = ...,
+    match_args: bool = ...,
+) -> Callable[[_C], _C]: ...
+
+mutable = define
+
+@overload
+@dataclass_transform(frozen_default=True, field_specifiers=(attrib, field))
+def frozen(
+    maybe_cls: _C,
+    *,
+    these: dict[str, Any] | None = ...,
+    repr: bool = ...,
+    unsafe_hash: bool | None = ...,
+    hash: bool | None = ...,
+    init: bool = ...,
+    slots: bool = ...,
+    frozen: bool = ...,
+    weakref_slot: bool = ...,
+    str: bool = ...,
+    auto_attribs: bool = ...,
+    kw_only: bool = ...,
+    cache_hash: bool = ...,
+    auto_exc: bool = ...,
+    eq: bool | None = ...,
+    order: bool | None = ...,
+    auto_detect: bool = ...,
+    getstate_setstate: bool | None = ...,
+    on_setattr: _OnSetAttrArgType | None = ...,
+    field_transformer: _FieldTransformer | None = ...,
+    match_args: bool = ...,
+) -> _C: ...
+@overload
+@dataclass_transform(frozen_default=True, field_specifiers=(attrib, field))
+def frozen(
+    maybe_cls: None = ...,
+    *,
+    these: dict[str, Any] | None = ...,
+    repr: bool = ...,
+    unsafe_hash: bool | None = ...,
+    hash: bool | None = ...,
+    init: bool = ...,
+    slots: bool = ...,
+    frozen: bool = ...,
+    weakref_slot: bool = ...,
+    str: bool = ...,
+    auto_attribs: bool = ...,
+    kw_only: bool = ...,
+    cache_hash: bool = ...,
+    auto_exc: bool = ...,
+    eq: bool | None = ...,
+    order: bool | None = ...,
+    auto_detect: bool = ...,
+    getstate_setstate: bool | None = ...,
+    on_setattr: _OnSetAttrArgType | None = ...,
+    field_transformer: _FieldTransformer | None = ...,
+    match_args: bool = ...,
+) -> Callable[[_C], _C]: ...
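Editor's note: the paired overloads above exist so that both the bare decorator form (``maybe_cls: _C``) and the called form (``maybe_cls: None``, returning ``Callable[[_C], _C]``) type-check, while ``@dataclass_transform`` tells checkers that these decorators synthesize ``__init__`` from the listed field specifiers. A short sketch, not part of this commit, exercising both forms:

from attrs import define, frozen
from attrs.exceptions import FrozenInstanceError


@define  # bare form: matches the maybe_cls: _C overload
class Mutable:
    value: int


@frozen()  # called form: matches the maybe_cls: None = ... overload
class Immutable:
    value: int


m = Mutable(1)
m.value = 2  # allowed

i = Immutable(1)
try:
    i.value = 2
except FrozenInstanceError:
    print("frozen classes reject attribute assignment")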
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs/converters.py ADDED
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: MIT
+
+from attr.converters import *  # noqa: F403
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs/exceptions.py ADDED
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: MIT
+
+from attr.exceptions import *  # noqa: F403
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs/filters.py ADDED
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: MIT
+
+from attr.filters import *  # noqa: F403
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs/py.typed ADDED
File without changes
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs/setters.py ADDED
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: MIT
+
+from attr.setters import *  # noqa: F403
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs/validators.py ADDED
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: MIT
+
+from attr.validators import *  # noqa: F403
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/__init__.py ADDED
@@ -0,0 +1,79 @@
+# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__version__ = "2.20.0"
+
+from .arrow_dataset import Dataset
+from .arrow_reader import ReadInstruction
+from .builder import ArrowBasedBuilder, BeamBasedBuilder, BuilderConfig, DatasetBuilder, GeneratorBasedBuilder
+from .combine import concatenate_datasets, interleave_datasets
+from .dataset_dict import DatasetDict, IterableDatasetDict
+from .download import *
+from .features import *
+from .fingerprint import disable_caching, enable_caching, is_caching_enabled, set_caching_enabled
+from .info import DatasetInfo, MetricInfo
+from .inspect import (
+    get_dataset_config_info,
+    get_dataset_config_names,
+    get_dataset_default_config_name,
+    get_dataset_infos,
+    get_dataset_split_names,
+    inspect_dataset,
+    inspect_metric,
+    list_datasets,
+    list_metrics,
+)
+from .iterable_dataset import IterableDataset
+from .load import load_dataset, load_dataset_builder, load_from_disk, load_metric
+from .metric import Metric
+from .splits import (
+    NamedSplit,
+    NamedSplitAll,
+    Split,
+    SplitBase,
+    SplitDict,
+    SplitGenerator,
+    SplitInfo,
+    SubSplitInfo,
+    percent,
+)
+from .tasks import *
+from .utils import *
+from .utils import logging
+
+
+# isort: split
+
+# Deprecated modules
+from . import arrow_dataset as _arrow_dataset
+from . import utils as _utils
+from .exceptions import ExpectedMoreDownloadedFiles, ExpectedMoreSplits, UnexpectedDownloadedFile, UnexpectedSplits
+from .utils import download_manager as _deprecated_download_manager
+from .utils import info_utils as _deprecated_info_utils
+
+
+_arrow_dataset.concatenate_datasets = concatenate_datasets
+_utils.DownloadConfig = DownloadConfig
+_utils.DownloadManager = DownloadManager
+_utils.DownloadMode = DownloadMode
+_deprecated_download_manager.DownloadConfig = DownloadConfig
+_deprecated_download_manager.DownloadMode = DownloadMode
+_deprecated_download_manager.DownloadManager = DownloadManager
+_deprecated_info_utils.ExpectedMoreDownloadedFiles = ExpectedMoreDownloadedFiles
+_deprecated_info_utils.ExpectedMoreSplits = ExpectedMoreSplits
+_deprecated_info_utils.UnexpectedDownloadedFile = UnexpectedDownloadedFile
+_deprecated_info_utils.UnexpectedSplits = UnexpectedSplits
+
+del _arrow_dataset, _utils, _deprecated_download_manager
+del _deprecated_info_utils, ExpectedMoreDownloadedFiles, ExpectedMoreSplits, UnexpectedDownloadedFile, UnexpectedSplits
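Editor's note: this package root pins ``__version__ = "2.20.0"`` and re-exports the library's main entry points (``Dataset``, ``load_dataset``, ``concatenate_datasets``, ...). A tiny offline sketch of that surface, not part of this commit; ``Dataset.from_dict`` is used so nothing is downloaded:

from datasets import Dataset, concatenate_datasets

# Two small in-memory datasets; no hub access required.
a = Dataset.from_dict({"text": ["foo", "bar"]})
b = Dataset.from_dict({"text": ["baz"]})

combined = concatenate_datasets([a, b])
assert len(combined) == 3
print(combined[0])  # {'text': 'foo'}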
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/arrow_writer.py ADDED
@@ -0,0 +1,746 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ # Lint as: python3
14
+ """To write records into Parquet files."""
15
+
16
+ import errno
17
+ import json
18
+ import os
19
+ import sys
20
+ from pathlib import Path
21
+ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
22
+
23
+ import fsspec
24
+ import numpy as np
25
+ import pyarrow as pa
26
+ import pyarrow.parquet as pq
27
+ from fsspec.core import url_to_fs
28
+
29
+ from . import config
30
+ from .features import Features, Image, Value
31
+ from .features.features import (
32
+ FeatureType,
33
+ _ArrayXDExtensionType,
34
+ cast_to_python_objects,
35
+ generate_from_arrow_type,
36
+ get_nested_type,
37
+ list_of_np_array_to_pyarrow_listarray,
38
+ numpy_to_pyarrow_listarray,
39
+ to_pyarrow_listarray,
40
+ )
41
+ from .filesystems import is_remote_filesystem
42
+ from .info import DatasetInfo
43
+ from .keyhash import DuplicatedKeysError, KeyHasher
44
+ from .table import array_cast, cast_array_to_feature, embed_table_storage, table_cast
45
+ from .utils import logging
46
+ from .utils import tqdm as hf_tqdm
47
+ from .utils.file_utils import hash_url_to_filename
48
+ from .utils.py_utils import asdict, first_non_null_value
49
+
50
+
51
+ logger = logging.get_logger(__name__)
52
+
53
+ type_ = type # keep python's type function
54
+
55
+
56
+ class SchemaInferenceError(ValueError):
57
+ pass
58
+
59
+
60
+ class TypedSequence:
61
+ """
62
+ This data container generalizes the typing when instantiating pyarrow arrays, tables or batches.
63
+
64
+ More specifically it adds several features:
65
+ - Support extension types like ``datasets.features.Array2DExtensionType``:
66
+ By default pyarrow arrays don't return extension arrays. One has to call
67
+ ``pa.ExtensionArray.from_storage(type, pa.array(data, type.storage_type))``
68
+ in order to get an extension array.
69
+ - Support for ``try_type`` parameter that can be used instead of ``type``:
70
+ When an array is transformed, we like to keep the same type as before if possible.
71
+ For example when calling :func:`datasets.Dataset.map`, we don't want to change the type
72
+ of each column by default.
73
+ - Better error message when a pyarrow array overflows.
74
+
75
+ Example::
76
+
77
+ from datasets.features import Array2D, Array2DExtensionType, Value
78
+ from datasets.arrow_writer import TypedSequence
79
+ import pyarrow as pa
80
+
81
+ arr = pa.array(TypedSequence([1, 2, 3], type=Value("int32")))
82
+ assert arr.type == pa.int32()
83
+
84
+ arr = pa.array(TypedSequence([1, 2, 3], try_type=Value("int32")))
85
+ assert arr.type == pa.int32()
86
+
87
+ arr = pa.array(TypedSequence(["foo", "bar"], try_type=Value("int32")))
88
+ assert arr.type == pa.string()
89
+
90
+ arr = pa.array(TypedSequence([[[1, 2, 3]]], type=Array2D((1, 3), "int64")))
91
+ assert arr.type == Array2DExtensionType((1, 3), "int64")
92
+
93
+ table = pa.Table.from_pydict({
94
+ "image": TypedSequence([[[1, 2, 3]]], type=Array2D((1, 3), "int64"))
95
+ })
96
+ assert table["image"].type == Array2DExtensionType((1, 3), "int64")
97
+
98
+ """
99
+
100
+ def __init__(
101
+ self,
102
+ data: Iterable,
103
+ type: Optional[FeatureType] = None,
104
+ try_type: Optional[FeatureType] = None,
105
+ optimized_int_type: Optional[FeatureType] = None,
106
+ ):
107
+ # assert type is None or try_type is None,
108
+ if type is not None and try_type is not None:
109
+ raise ValueError("You cannot specify both type and try_type")
110
+ # set attributes
111
+ self.data = data
112
+ self.type = type
113
+ self.try_type = try_type # is ignored if it doesn't match the data
114
+ self.optimized_int_type = optimized_int_type
115
+ # when trying a type (is ignored if data is not compatible)
116
+ self.trying_type = self.try_type is not None
117
+ self.trying_int_optimization = optimized_int_type is not None and type is None and try_type is None
118
+ # used to get back the inferred type after __arrow_array__() is called once
119
+ self._inferred_type = None
120
+
121
+ def get_inferred_type(self) -> FeatureType:
122
+ """Return the inferred feature type.
123
+ This is done by converting the sequence to an Arrow array, and getting the corresponding
124
+ feature type.
125
+
126
+ Since building the Arrow array can be expensive, the value of the inferred type is cached
127
+ as soon as pa.array is called on the typed sequence.
128
+
129
+ Returns:
130
+ FeatureType: inferred feature type of the sequence.
131
+ """
132
+ if self._inferred_type is None:
133
+ self._inferred_type = generate_from_arrow_type(pa.array(self).type)
134
+ return self._inferred_type
135
+
136
+ @staticmethod
137
+ def _infer_custom_type_and_encode(data: Iterable) -> Tuple[Iterable, Optional[FeatureType]]:
138
+ """Implement type inference for custom objects like PIL.Image.Image -> Image type.
139
+
140
+ This function is only used for custom python objects that can't be direclty passed to build
141
+ an Arrow array. In such cases is infers the feature type to use, and it encodes the data so
142
+ that they can be passed to an Arrow array.
143
+
144
+ Args:
145
+ data (Iterable): array of data to infer the type, e.g. a list of PIL images.
146
+
147
+ Returns:
148
+ Tuple[Iterable, Optional[FeatureType]]: a tuple with:
149
+ - the (possibly encoded) array, if the inferred feature type requires encoding
150
+ - the inferred feature type if the array is made of supported custom objects like
151
+ PIL images, else None.
152
+ """
153
+ if config.PIL_AVAILABLE and "PIL" in sys.modules:
154
+ import PIL.Image
155
+
156
+ non_null_idx, non_null_value = first_non_null_value(data)
157
+ if isinstance(non_null_value, PIL.Image.Image):
158
+ return [Image().encode_example(value) if value is not None else None for value in data], Image()
159
+ return data, None
160
+
161
+ def __arrow_array__(self, type: Optional[pa.DataType] = None):
162
+ """This function is called when calling pa.array(typed_sequence)"""
163
+
164
+ if type is not None:
165
+ raise ValueError("TypedSequence is supposed to be used with pa.array(typed_sequence, type=None)")
166
+ del type # make sure we don't use it
167
+ data = self.data
168
+ # automatic type inference for custom objects
169
+ if self.type is None and self.try_type is None:
170
+ data, self._inferred_type = self._infer_custom_type_and_encode(data)
171
+ if self._inferred_type is None:
172
+ type = self.try_type if self.trying_type else self.type
173
+ else:
174
+ type = self._inferred_type
175
+ pa_type = get_nested_type(type) if type is not None else None
176
+ optimized_int_pa_type = (
177
+ get_nested_type(self.optimized_int_type) if self.optimized_int_type is not None else None
178
+ )
179
+ trying_cast_to_python_objects = False
180
+ try:
181
+ # custom pyarrow types
182
+ if isinstance(pa_type, _ArrayXDExtensionType):
183
+ storage = to_pyarrow_listarray(data, pa_type)
184
+ return pa.ExtensionArray.from_storage(pa_type, storage)
185
+
186
+ # efficient np array to pyarrow array
187
+ if isinstance(data, np.ndarray):
188
+ out = numpy_to_pyarrow_listarray(data)
189
+ elif isinstance(data, list) and data and isinstance(first_non_null_value(data)[1], np.ndarray):
190
+ out = list_of_np_array_to_pyarrow_listarray(data)
191
+ else:
192
+ trying_cast_to_python_objects = True
193
+ out = pa.array(cast_to_python_objects(data, only_1d_for_numpy=True))
194
+ # use smaller integer precisions if possible
195
+ if self.trying_int_optimization:
196
+ if pa.types.is_int64(out.type):
197
+ out = out.cast(optimized_int_pa_type)
198
+ elif pa.types.is_list(out.type):
199
+ if pa.types.is_int64(out.type.value_type):
200
+ out = array_cast(out, pa.list_(optimized_int_pa_type))
201
+ elif pa.types.is_list(out.type.value_type) and pa.types.is_int64(out.type.value_type.value_type):
202
+ out = array_cast(out, pa.list_(pa.list_(optimized_int_pa_type)))
203
+ # otherwise we can finally use the user's type
204
+ elif type is not None:
205
+ # We use cast_array_to_feature to support casting to custom types like Audio and Image
206
+ # Also, when trying type "string", we don't want to convert integers or floats to "string".
207
+ # We only do it if trying_type is False - since this is what the user asks for.
208
+ out = cast_array_to_feature(
209
+ out, type, allow_primitive_to_str=not self.trying_type, allow_decimal_to_str=not self.trying_type
210
+ )
211
+ return out
212
+ except (
213
+ TypeError,
214
+ pa.lib.ArrowInvalid,
215
+ pa.lib.ArrowNotImplementedError,
216
+ ) as e: # handle type errors and overflows
217
+ # Ignore ArrowNotImplementedError caused by trying type, otherwise re-raise
218
+ if not self.trying_type and isinstance(e, pa.lib.ArrowNotImplementedError):
219
+ raise
220
+
221
+ if self.trying_type:
222
+ try: # second chance
223
+ if isinstance(data, np.ndarray):
224
+ return numpy_to_pyarrow_listarray(data)
225
+ elif isinstance(data, list) and data and any(isinstance(value, np.ndarray) for value in data):
226
+ return list_of_np_array_to_pyarrow_listarray(data)
227
+ else:
228
+ trying_cast_to_python_objects = True
229
+ return pa.array(cast_to_python_objects(data, only_1d_for_numpy=True))
230
+ except pa.lib.ArrowInvalid as e:
231
+ if "overflow" in str(e):
232
+ raise OverflowError(
233
+ f"There was an overflow with type {type_(data)}. Try to reduce writer_batch_size to have batches smaller than 2GB.\n({e})"
234
+ ) from None
235
+ elif self.trying_int_optimization and "not in range" in str(e):
236
+ optimized_int_pa_type_str = np.dtype(optimized_int_pa_type.to_pandas_dtype()).name
237
+ logger.info(
238
+ f"Failed to cast a sequence to {optimized_int_pa_type_str}. Falling back to int64."
239
+ )
240
+ return out
241
+ elif trying_cast_to_python_objects and "Could not convert" in str(e):
242
+ out = pa.array(
243
+ cast_to_python_objects(data, only_1d_for_numpy=True, optimize_list_casting=False)
244
+ )
245
+ if type is not None:
246
+ out = cast_array_to_feature(
247
+ out, type, allow_primitive_to_str=True, allow_decimal_to_str=True
248
+ )
249
+ return out
250
+ else:
251
+ raise
252
+ elif "overflow" in str(e):
253
+ raise OverflowError(
254
+ f"There was an overflow with type {type_(data)}. Try to reduce writer_batch_size to have batches smaller than 2GB.\n({e})"
255
+ ) from None
256
+ elif self.trying_int_optimization and "not in range" in str(e):
257
+ optimized_int_pa_type_str = np.dtype(optimized_int_pa_type.to_pandas_dtype()).name
258
+ logger.info(f"Failed to cast a sequence to {optimized_int_pa_type_str}. Falling back to int64.")
259
+ return out
260
+ elif trying_cast_to_python_objects and "Could not convert" in str(e):
261
+ out = pa.array(cast_to_python_objects(data, only_1d_for_numpy=True, optimize_list_casting=False))
262
+ if type is not None:
263
+ out = cast_array_to_feature(out, type, allow_primitive_to_str=True, allow_decimal_to_str=True)
264
+ return out
265
+ else:
266
+ raise
267
+
268
+
269
+ class OptimizedTypedSequence(TypedSequence):
270
+ def __init__(
271
+ self,
272
+ data,
273
+ type: Optional[FeatureType] = None,
274
+ try_type: Optional[FeatureType] = None,
275
+ col: Optional[str] = None,
276
+ optimized_int_type: Optional[FeatureType] = None,
277
+ ):
278
+ optimized_int_type_by_col = {
279
+ "attention_mask": Value("int8"), # binary tensor
280
+ "special_tokens_mask": Value("int8"),
281
+ "input_ids": Value("int32"), # typical vocab size: 0-50k (max ~500k, never > 1M)
282
+ "token_type_ids": Value(
283
+ "int8"
284
+ ), # binary mask; some (XLNetModel) use an additional token represented by a 2
285
+ }
286
+ if type is None and try_type is None:
287
+ optimized_int_type = optimized_int_type_by_col.get(col, None)
288
+ super().__init__(data, type=type, try_type=try_type, optimized_int_type=optimized_int_type)
289
+
290
+
291
+ class ArrowWriter:
292
+ """Shuffles and writes Examples to Arrow files."""
293
+
294
+ _WRITER_CLASS = pa.RecordBatchStreamWriter
295
+
296
+ def __init__(
297
+ self,
298
+ schema: Optional[pa.Schema] = None,
299
+ features: Optional[Features] = None,
300
+ path: Optional[str] = None,
301
+ stream: Optional[pa.NativeFile] = None,
302
+ fingerprint: Optional[str] = None,
303
+ writer_batch_size: Optional[int] = None,
304
+ hash_salt: Optional[str] = None,
305
+ check_duplicates: Optional[bool] = False,
306
+ disable_nullable: bool = False,
307
+ update_features: bool = False,
308
+ with_metadata: bool = True,
309
+ unit: str = "examples",
310
+ embed_local_files: bool = False,
311
+ storage_options: Optional[dict] = None,
312
+ ):
313
+ if path is None and stream is None:
314
+            raise ValueError("At least one of path and stream must be provided.")
+        if features is not None:
+            self._features = features
+            self._schema = None
+        elif schema is not None:
+            self._schema: pa.Schema = schema
+            self._features = Features.from_arrow_schema(self._schema)
+        else:
+            self._features = None
+            self._schema = None
+
+        if hash_salt is not None:
+            # Create KeyHasher instance using split name as hash salt
+            self._hasher = KeyHasher(hash_salt)
+        else:
+            self._hasher = KeyHasher("")
+
+        self._check_duplicates = check_duplicates
+        self._disable_nullable = disable_nullable
+
+        if stream is None:
+            fs, path = url_to_fs(path, **(storage_options or {}))
+            self._fs: fsspec.AbstractFileSystem = fs
+            self._path = path if not is_remote_filesystem(self._fs) else self._fs.unstrip_protocol(path)
+            self.stream = self._fs.open(path, "wb")
+            self._closable_stream = True
+        else:
+            self._fs = None
+            self._path = None
+            self.stream = stream
+            self._closable_stream = False
+
+        self.fingerprint = fingerprint
+        self.disable_nullable = disable_nullable
+        self.writer_batch_size = writer_batch_size or config.DEFAULT_MAX_BATCH_SIZE
+        self.update_features = update_features
+        self.with_metadata = with_metadata
+        self.unit = unit
+        self.embed_local_files = embed_local_files
+
+        self._num_examples = 0
+        self._num_bytes = 0
+        self.current_examples: List[Tuple[Dict[str, Any], str]] = []
+        self.current_rows: List[pa.Table] = []
+        self.pa_writer: Optional[pa.RecordBatchStreamWriter] = None
+        self.hkey_record = []
+
+    def __len__(self):
+        """Return the number of written and staged examples"""
+        return self._num_examples + len(self.current_examples) + len(self.current_rows)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+
+    def close(self):
+        # Try closing if opened; if closed: pyarrow.lib.ArrowInvalid: Invalid operation on closed file
+        if self.pa_writer:  # it might be None
+            try:
+                self.pa_writer.close()
+            except Exception:  # pyarrow.lib.ArrowInvalid, OSError
+                pass
+        if self._closable_stream and not self.stream.closed:
+            self.stream.close()  # This also closes self.pa_writer if it is opened
+
+    def _build_writer(self, inferred_schema: pa.Schema):
+        schema = self.schema
+        inferred_features = Features.from_arrow_schema(inferred_schema)
+        if self._features is not None:
+            if self.update_features:  # keep original features if they match, or update them
+                fields = {field.name: field for field in self._features.type}
+                for inferred_field in inferred_features.type:
+                    name = inferred_field.name
+                    if name in fields:
+                        if inferred_field == fields[name]:
+                            inferred_features[name] = self._features[name]
+                self._features = inferred_features
+                schema: pa.Schema = inferred_schema
+        else:
+            self._features = inferred_features
+            schema: pa.Schema = inferred_features.arrow_schema
+        if self.disable_nullable:
+            schema = pa.schema(pa.field(field.name, field.type, nullable=False) for field in schema)
+        if self.with_metadata:
+            schema = schema.with_metadata(self._build_metadata(DatasetInfo(features=self._features), self.fingerprint))
+        else:
+            schema = schema.with_metadata({})
+        self._schema = schema
+        self.pa_writer = self._WRITER_CLASS(self.stream, schema)
+
+    @property
+    def schema(self):
+        _schema = (
+            self._schema
+            if self._schema is not None
+            else (pa.schema(self._features.type) if self._features is not None else None)
+        )
+        if self._disable_nullable and _schema is not None:
+            _schema = pa.schema(pa.field(field.name, field.type, nullable=False) for field in _schema)
+        return _schema if _schema is not None else []
+
+    @staticmethod
+    def _build_metadata(info: DatasetInfo, fingerprint: Optional[str] = None) -> Dict[str, str]:
+        info_keys = ["features"]  # we can add support for more DatasetInfo keys in the future
+        info_as_dict = asdict(info)
+        metadata = {}
+        metadata["info"] = {key: info_as_dict[key] for key in info_keys}
+        if fingerprint is not None:
+            metadata["fingerprint"] = fingerprint
+        return {"huggingface": json.dumps(metadata)}
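
The metadata built here ends up in the Arrow schema under a single `"huggingface"` key. A minimal sketch of what that looks like for a toy `Features` object; the exact JSON layout of the serialized features can vary across `datasets` versions:

```python
# Illustrative sketch, not part of the file above.
from datasets import Features, Value
from datasets.arrow_writer import ArrowWriter
from datasets.info import DatasetInfo

info = DatasetInfo(features=Features({"text": Value("string")}))
metadata = ArrowWriter._build_metadata(info, fingerprint="abc123")
# Roughly: {'huggingface': '{"info": {"features": {"text": {"dtype": "string", ...}}}, "fingerprint": "abc123"}'}
print(metadata["huggingface"])
```
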
+
+    def write_examples_on_file(self):
+        """Write stored examples from the write-pool of examples. It makes a table out of the examples and writes it."""
+        if not self.current_examples:
+            return
+        # preserve the order of the columns
+        if self.schema:
+            schema_cols = set(self.schema.names)
+            examples_cols = self.current_examples[0][0].keys()  # .keys() preserves the order (unlike set)
+            common_cols = [col for col in self.schema.names if col in examples_cols]
+            extra_cols = [col for col in examples_cols if col not in schema_cols]
+            cols = common_cols + extra_cols
+        else:
+            cols = list(self.current_examples[0][0])
+        batch_examples = {}
+        for col in cols:
+            # We use row[0][col] since current_examples contains (example, key) tuples.
+            # Moreover, examples could be Arrow arrays of 1 element.
+            # This can happen in `.map()` when we want to re-write the same Arrow data
+            if all(isinstance(row[0][col], (pa.Array, pa.ChunkedArray)) for row in self.current_examples):
+                arrays = [row[0][col] for row in self.current_examples]
+                arrays = [
+                    chunk
+                    for array in arrays
+                    for chunk in (array.chunks if isinstance(array, pa.ChunkedArray) else [array])
+                ]
+                batch_examples[col] = pa.concat_arrays(arrays)
+            else:
+                batch_examples[col] = [
+                    row[0][col].to_pylist()[0] if isinstance(row[0][col], (pa.Array, pa.ChunkedArray)) else row[0][col]
+                    for row in self.current_examples
+                ]
+        self.write_batch(batch_examples=batch_examples)
+        self.current_examples = []
+
+    def write_rows_on_file(self):
+        """Write stored rows from the write-pool of rows. It concatenates the single-row tables and writes the resulting table."""
+        if not self.current_rows:
+            return
+        table = pa.concat_tables(self.current_rows)
+        self.write_table(table)
+        self.current_rows = []
+
+    def write(
+        self,
+        example: Dict[str, Any],
+        key: Optional[Union[str, int, bytes]] = None,
+        writer_batch_size: Optional[int] = None,
+    ):
+        """Add a given (Example, Key) pair to the write-pool of examples which is written to file.
+
+        Args:
+            example: the Example to add.
+            key: Optional, a unique identifier (str, int or bytes) associated with each example
+        """
+        # Use the keys for duplicate checking when `self._check_duplicates` is passed True
+        if self._check_duplicates:
+            # Create unique hash from key and store as (key, example) pairs
+            hash = self._hasher.hash(key)
+            self.current_examples.append((example, hash))
+            # Maintain record of keys and their respective hashes for checking duplicates
+            self.hkey_record.append((hash, key))
+        else:
+            # Store example as a tuple so as to keep the structure of `self.current_examples` uniform
+            self.current_examples.append((example, ""))
+
+        if writer_batch_size is None:
+            writer_batch_size = self.writer_batch_size
+        if writer_batch_size is not None and len(self.current_examples) >= writer_batch_size:
+            if self._check_duplicates:
+                self.check_duplicate_keys()
+                # Re-initializing to empty list for next batch
+                self.hkey_record = []
+
+            self.write_examples_on_file()
+
+    def check_duplicate_keys(self):
+        """Raises error if duplicates found in a batch"""
+        tmp_record = set()
+        for hash, key in self.hkey_record:
+            if hash in tmp_record:
+                duplicate_key_indices = [
+                    str(self._num_examples + index)
+                    for index, (duplicate_hash, _) in enumerate(self.hkey_record)
+                    if duplicate_hash == hash
+                ]
+
+                raise DuplicatedKeysError(key, duplicate_key_indices)
+            else:
+                tmp_record.add(hash)
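
A minimal sketch of the duplicate-key path above: with `check_duplicates=True`, each key is hashed with the split-salted `KeyHasher`, and a batch flush (or `finalize()`) raises on collisions. The output path is hypothetical, and `DuplicatedKeysError` is assumed importable from `datasets.keyhash` (it lives in `datasets.exceptions` in some versions):

```python
from datasets import Features, Value
from datasets.arrow_writer import ArrowWriter
from datasets.keyhash import DuplicatedKeysError  # assumption: import location varies by version

features = Features({"text": Value("string")})
with ArrowWriter(features=features, path="out.arrow", hash_salt="train", check_duplicates=True) as writer:
    writer.write({"text": "a"}, key=0)
    writer.write({"text": "b"}, key=1)
    writer.write({"text": "c"}, key=0)  # same key as the first example
    try:
        writer.finalize(close_stream=False)  # duplicates are detected per batch / on finalize
    except DuplicatedKeysError as e:
        print("duplicate key detected:", e)
```
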
+
+    def write_row(self, row: pa.Table, writer_batch_size: Optional[int] = None):
+        """Add a given single-row Table to the write-pool of rows which is written to file.
+
+        Args:
+            row: the row to add.
+        """
+        if len(row) != 1:
+            raise ValueError(f"Only single-row pyarrow tables are allowed but got table with {len(row)} rows.")
+        self.current_rows.append(row)
+        if writer_batch_size is None:
+            writer_batch_size = self.writer_batch_size
+        if writer_batch_size is not None and len(self.current_rows) >= writer_batch_size:
+            self.write_rows_on_file()
+
+    def write_batch(
+        self,
+        batch_examples: Dict[str, List],
+        writer_batch_size: Optional[int] = None,
+    ):
+        """Write a batch of Examples to file.
+        Ignores the batch if it appears to be empty,
+        preventing a potential schema update of unknown types.
+
+        Args:
+            batch_examples: the batch of examples to add.
+        """
+        if batch_examples and len(next(iter(batch_examples.values()))) == 0:
+            return
+        features = None if self.pa_writer is None and self.update_features else self._features
+        try_features = self._features if self.pa_writer is None and self.update_features else None
+        arrays = []
+        inferred_features = Features()
+        # preserve the order of the columns
+        if self.schema:
+            schema_cols = set(self.schema.names)
+            batch_cols = batch_examples.keys()  # .keys() preserves the order (unlike set)
+            common_cols = [col for col in self.schema.names if col in batch_cols]
+            extra_cols = [col for col in batch_cols if col not in schema_cols]
+            cols = common_cols + extra_cols
+        else:
+            cols = list(batch_examples)
+        for col in cols:
+            col_values = batch_examples[col]
+            col_type = features[col] if features else None
+            if isinstance(col_values, (pa.Array, pa.ChunkedArray)):
+                array = cast_array_to_feature(col_values, col_type) if col_type is not None else col_values
+                arrays.append(array)
+                inferred_features[col] = generate_from_arrow_type(col_values.type)
+            else:
+                col_try_type = try_features[col] if try_features is not None and col in try_features else None
+                typed_sequence = OptimizedTypedSequence(col_values, type=col_type, try_type=col_try_type, col=col)
+                arrays.append(pa.array(typed_sequence))
+                inferred_features[col] = typed_sequence.get_inferred_type()
+        schema = inferred_features.arrow_schema if self.pa_writer is None else self.schema
+        pa_table = pa.Table.from_arrays(arrays, schema=schema)
+        self.write_table(pa_table, writer_batch_size)
+
+    def write_table(self, pa_table: pa.Table, writer_batch_size: Optional[int] = None):
+        """Write a Table to file.
+
+        Args:
+            pa_table: the Table to add.
+        """
+        if writer_batch_size is None:
+            writer_batch_size = self.writer_batch_size
+        if self.pa_writer is None:
+            self._build_writer(inferred_schema=pa_table.schema)
+        pa_table = pa_table.combine_chunks()
+        pa_table = table_cast(pa_table, self._schema)
+        if self.embed_local_files:
+            pa_table = embed_table_storage(pa_table)
+        self._num_bytes += pa_table.nbytes
+        self._num_examples += pa_table.num_rows
+        self.pa_writer.write_table(pa_table, writer_batch_size)
+
+    def finalize(self, close_stream=True):
+        self.write_rows_on_file()
+        # In case current_examples < writer_batch_size, but user uses finalize()
+        if self._check_duplicates:
+            self.check_duplicate_keys()
+            # Re-initializing to empty list for next batch
+            self.hkey_record = []
+        self.write_examples_on_file()
+        # If schema is known, infer features even if no examples were written
+        if self.pa_writer is None and self.schema:
+            self._build_writer(self.schema)
+        if self.pa_writer is not None:
+            self.pa_writer.close()
+            self.pa_writer = None
+            if close_stream:
+                self.stream.close()
+        else:
+            if close_stream:
+                self.stream.close()
+            raise SchemaInferenceError("Please pass `features` or at least one example when writing data")
+        logger.debug(
+            f"Done writing {self._num_examples} {self.unit} in {self._num_bytes} bytes {self._path if self._path else ''}."
+        )
+        return self._num_examples, self._num_bytes
+
+
+class ParquetWriter(ArrowWriter):
+    _WRITER_CLASS = pq.ParquetWriter
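
Putting the writer together: `write`/`write_batch` stage data, `_build_writer` lazily creates the underlying pyarrow writer from the first schema it sees, and `finalize` flushes and returns `(num_examples, num_bytes)`. `ParquetWriter` reuses all of this and only swaps `_WRITER_CLASS`. A minimal sketch with hypothetical local paths:

```python
from datasets import Features, Value
from datasets.arrow_writer import ArrowWriter, ParquetWriter

features = Features({"id": Value("int64"), "text": Value("string")})

# Arrow IPC stream output
writer = ArrowWriter(features=features, path="data.arrow")
writer.write_batch({"id": [0, 1], "text": ["a", "b"]})
num_examples, num_bytes = writer.finalize()  # also closes the stream by default

# Identical driver code for Parquet output: only _WRITER_CLASS differs
writer = ParquetWriter(features=features, path="data.parquet")
writer.write_batch({"id": [0, 1], "text": ["a", "b"]})
writer.finalize()
```
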
+
+
+class BeamWriter:
+    """
+    Shuffles and writes Examples to Arrow files.
+    The Arrow files are converted from Parquet files that are the output of Apache Beam pipelines.
+    """
+
+    def __init__(
+        self,
+        features: Optional[Features] = None,
+        schema: Optional[pa.Schema] = None,
+        path: Optional[str] = None,
+        namespace: Optional[str] = None,
+        cache_dir: Optional[str] = None,
+    ):
+        if features is None and schema is None:
+            raise ValueError("At least one of features and schema must be provided.")
+        if path is None:
+            raise ValueError("Path must be provided.")
+
+        if features is not None:
+            self._features: Features = features
+            self._schema: pa.Schema = features.arrow_schema
+        else:
+            self._schema: pa.Schema = schema
+            self._features: Features = Features.from_arrow_schema(schema)
+
+        self._path = path
+        self._parquet_path = os.path.splitext(path)[0]  # remove extension
+        self._namespace = namespace or "default"
+        self._num_examples = None
+        self._cache_dir = cache_dir or config.HF_DATASETS_CACHE
+
+    def write_from_pcollection(self, pcoll_examples):
+        """Add the final steps of the beam pipeline: write to parquet files."""
+        import apache_beam as beam
+
+        def inc_num_examples(example):
+            beam.metrics.Metrics.counter(self._namespace, "num_examples").inc()
+
+        # count examples
+        _ = pcoll_examples | "Count N. Examples" >> beam.Map(inc_num_examples)
+
+        # save dataset
+        return (
+            pcoll_examples
+            | "Get values" >> beam.Values()
+            | "Save to parquet"
+            >> beam.io.parquetio.WriteToParquet(
+                self._parquet_path, self._schema, shard_name_template="-SSSSS-of-NNNNN.parquet"
+            )
+        )
+
+    def finalize(self, metrics_query_result: dict):
+        """
+        Run after the pipeline has finished.
+        It converts the resulting parquet files to arrow and completes the info from the pipeline metrics.
+
+        Args:
+            metrics_query_result: `dict` obtained from pipeline_results.metrics().query(m_filter). Make sure
+                that the filter keeps only the metrics for the considered split, under the namespace `split_name`.
+        """
+
+        # Beam FileSystems require the system's path separator in the older versions
+        fs, parquet_path = url_to_fs(self._parquet_path)
+        parquet_path = str(Path(parquet_path)) if not is_remote_filesystem(fs) else fs.unstrip_protocol(parquet_path)
+
+        shards = fs.glob(parquet_path + "*.parquet")
+        num_bytes = sum(fs.sizes(shards))
+        shard_lengths = get_parquet_lengths(shards)
+
+        # Convert to arrow
+        if self._path.endswith(".arrow"):
+            logger.info(f"Converting parquet files {self._parquet_path} to arrow {self._path}")
+            try:  # stream conversion
+                num_bytes = 0
+                for shard in hf_tqdm(shards, unit="shards"):
+                    with fs.open(shard, "rb") as source:
+                        with fs.open(shard.replace(".parquet", ".arrow"), "wb") as destination:
+                            shard_num_bytes, _ = parquet_to_arrow(source, destination)
+                            num_bytes += shard_num_bytes
+            except OSError as e:  # broken pipe can happen if the connection is unstable, do local conversion instead
+                if e.errno != errno.EPIPE:  # not a broken pipe
+                    raise
+                logger.warning(
+                    "Broken Pipe during stream conversion from parquet to arrow. Using local convert instead"
+                )
+                local_convert_dir = os.path.join(self._cache_dir, "beam_convert")
+                os.makedirs(local_convert_dir, exist_ok=True)
+                num_bytes = 0
+                for shard in hf_tqdm(shards, unit="shards"):
+                    local_parquet_path = os.path.join(local_convert_dir, hash_url_to_filename(shard) + ".parquet")
+                    fs.download(shard, local_parquet_path)
+                    local_arrow_path = local_parquet_path.replace(".parquet", ".arrow")
+                    shard_num_bytes, _ = parquet_to_arrow(local_parquet_path, local_arrow_path)
+                    num_bytes += shard_num_bytes
+                    remote_arrow_path = shard.replace(".parquet", ".arrow")
+                    fs.upload(local_arrow_path, remote_arrow_path)
+
+        # Save metrics
+        counters_dict = {metric.key.metric.name: metric.result for metric in metrics_query_result["counters"]}
+        self._num_examples = counters_dict["num_examples"]
+        self._num_bytes = num_bytes
+        self._shard_lengths = shard_lengths
+        return self._num_examples, self._num_bytes
+
+
+def get_parquet_lengths(sources) -> List[int]:
+    shard_lengths = []
+    for source in hf_tqdm(sources, unit="parquet files"):
+        parquet_file = pa.parquet.ParquetFile(source)
+        shard_lengths.append(parquet_file.metadata.num_rows)
+    return shard_lengths
+
+
+def parquet_to_arrow(source, destination) -> Tuple[int, int]:
+    """Convert parquet file to arrow file. Inputs can be str paths or file-like objects"""
+    stream = None if isinstance(destination, str) else destination
+    parquet_file = pa.parquet.ParquetFile(source)
+    # Beam can create empty Parquet files, so we need to pass the source Parquet file's schema
+    with ArrowWriter(schema=parquet_file.schema_arrow, path=destination, stream=stream) as writer:
+        for record_batch in parquet_file.iter_batches():
+            pa_table = pa.Table.from_batches([record_batch])
+            writer.write_table(pa_table)
+        num_examples, num_bytes = writer.finalize()  # finalize() returns (num_examples, num_bytes)
+    return num_bytes, num_examples
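
A small usage sketch for the conversion helper above, with hypothetical file names; note that it returns `(num_bytes, num_examples)`:

```python
import pyarrow as pa
import pyarrow.parquet as pq
from datasets.arrow_writer import parquet_to_arrow

pq.write_table(pa.table({"id": [1, 2, 3]}), "shard.parquet")  # toy input shard
num_bytes, num_examples = parquet_to_arrow("shard.parquet", "shard.arrow")
print(num_examples, "examples,", num_bytes, "bytes")
```
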
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/combine.py ADDED
@@ -0,0 +1,215 @@
+from typing import List, Optional, TypeVar
+
+from .arrow_dataset import Dataset, _concatenate_map_style_datasets, _interleave_map_style_datasets
+from .dataset_dict import DatasetDict, IterableDatasetDict
+from .info import DatasetInfo
+from .iterable_dataset import IterableDataset, _concatenate_iterable_datasets, _interleave_iterable_datasets
+from .splits import NamedSplit
+from .utils import logging
+from .utils.py_utils import Literal
+
+
+logger = logging.get_logger(__name__)
+
+
+DatasetType = TypeVar("DatasetType", Dataset, IterableDataset)
+
+
+def interleave_datasets(
+    datasets: List[DatasetType],
+    probabilities: Optional[List[float]] = None,
+    seed: Optional[int] = None,
+    info: Optional[DatasetInfo] = None,
+    split: Optional[NamedSplit] = None,
+    stopping_strategy: Literal["first_exhausted", "all_exhausted"] = "first_exhausted",
+) -> DatasetType:
+    """
+    Interleave several datasets (sources) into a single dataset.
+    The new dataset is constructed by alternating between the sources to get the examples.
+
+    You can use this function on a list of [`Dataset`] objects, or on a list of [`IterableDataset`] objects.
+
+    - If `probabilities` is `None` (default) the new dataset is constructed by cycling between each source to get the examples.
+    - If `probabilities` is not `None`, the new dataset is constructed by getting examples from a random source at a time according to the provided probabilities.
+
+    The resulting dataset ends when one of the source datasets runs out of examples, except when `stopping_strategy` is `all_exhausted`,
+    in which case the resulting dataset ends once every dataset has run out of examples at least once.
+
+    Note for iterable datasets:
+
+    In a distributed setup or in PyTorch DataLoader workers, the stopping strategy is applied per process.
+    Therefore the "first_exhausted" strategy on a sharded iterable dataset can generate fewer samples in total (up to 1 missing sample per subdataset per worker).
+
+    Args:
+        datasets (`List[Dataset]` or `List[IterableDataset]`):
+            List of datasets to interleave.
+        probabilities (`List[float]`, *optional*, defaults to `None`):
+            If specified, the new dataset is constructed by sampling
+            examples from one source at a time according to these probabilities.
+        seed (`int`, *optional*, defaults to `None`):
+            The random seed used to choose a source for each example.
+        info ([`DatasetInfo`], *optional*):
+            Dataset information, like description, citation, etc.
+            <Added version="2.4.0"/>
+        split ([`NamedSplit`], *optional*):
+            Name of the dataset split.
+            <Added version="2.4.0"/>
+        stopping_strategy (`str`, defaults to `first_exhausted`):
+            Two strategies are proposed right now, `first_exhausted` and `all_exhausted`.
+            By default, `first_exhausted` is an undersampling strategy, i.e. the dataset construction is stopped as soon as one dataset runs out of samples.
+            If the strategy is `all_exhausted`, we use an oversampling strategy, i.e. the dataset construction is stopped as soon as every sample of every dataset has been added at least once.
+            Note that if the strategy is `all_exhausted`, the interleaved dataset size can get enormous:
+            - with no probabilities, the resulting dataset will have `max_length_datasets*nb_dataset` samples.
+            - with given probabilities, the resulting dataset can have many more samples if some datasets have a very low probability of being visited.
+    Returns:
+        [`Dataset`] or [`IterableDataset`]: Return type depends on the input `datasets`
+        parameter. `Dataset` if the input is a list of `Dataset`, `IterableDataset` if the input is a list of
+        `IterableDataset`.
+
+    Example:
+
+        For regular datasets (map-style):
+
+        ```python
+        >>> from datasets import Dataset, interleave_datasets
+        >>> d1 = Dataset.from_dict({"a": [0, 1, 2]})
+        >>> d2 = Dataset.from_dict({"a": [10, 11, 12]})
+        >>> d3 = Dataset.from_dict({"a": [20, 21, 22]})
+        >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42, stopping_strategy="all_exhausted")
+        >>> dataset["a"]
+        [10, 0, 11, 1, 2, 20, 12, 10, 0, 1, 2, 21, 0, 11, 1, 2, 0, 1, 12, 2, 10, 0, 22]
+        >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42)
+        >>> dataset["a"]
+        [10, 0, 11, 1, 2]
+        >>> dataset = interleave_datasets([d1, d2, d3])
+        >>> dataset["a"]
+        [0, 10, 20, 1, 11, 21, 2, 12, 22]
+        >>> dataset = interleave_datasets([d1, d2, d3], stopping_strategy="all_exhausted")
+        >>> dataset["a"]
+        [0, 10, 20, 1, 11, 21, 2, 12, 22]
+        >>> d1 = Dataset.from_dict({"a": [0, 1, 2]})
+        >>> d2 = Dataset.from_dict({"a": [10, 11, 12, 13]})
+        >>> d3 = Dataset.from_dict({"a": [20, 21, 22, 23, 24]})
+        >>> dataset = interleave_datasets([d1, d2, d3])
+        >>> dataset["a"]
+        [0, 10, 20, 1, 11, 21, 2, 12, 22]
+        >>> dataset = interleave_datasets([d1, d2, d3], stopping_strategy="all_exhausted")
+        >>> dataset["a"]
+        [0, 10, 20, 1, 11, 21, 2, 12, 22, 0, 13, 23, 1, 10, 24]
+        >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42)
+        >>> dataset["a"]
+        [10, 0, 11, 1, 2]
+        >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42, stopping_strategy="all_exhausted")
+        >>> dataset["a"]
+        [10, 0, 11, 1, 2, 20, 12, 13, ..., 0, 1, 2, 0, 24]
+        ```
+
+        For datasets in streaming mode (iterable):
+
+        ```python
+        >>> from datasets import load_dataset, interleave_datasets
+        >>> d1 = load_dataset("oscar", "unshuffled_deduplicated_en", split="train", streaming=True)
+        >>> d2 = load_dataset("oscar", "unshuffled_deduplicated_fr", split="train", streaming=True)
+        >>> dataset = interleave_datasets([d1, d2])
+        >>> iterator = iter(dataset)
+        >>> next(iterator)
+        {'text': 'Mtendere Village was inspired by the vision...}
+        >>> next(iterator)
+        {'text': "Média de débat d'idées, de culture...}
+        ```
+    """
+    from .arrow_dataset import Dataset
+    from .iterable_dataset import IterableDataset
+
+    if not datasets:
+        raise ValueError("Unable to interleave an empty list of datasets.")
+    for i, dataset in enumerate(datasets):
+        if not isinstance(dataset, (Dataset, IterableDataset)):
+            if isinstance(dataset, (DatasetDict, IterableDatasetDict)):
+                if not dataset:
+                    raise ValueError(
+                        f"Expected a list of Dataset objects or a list of IterableDataset objects, but element at position {i} "
+                        "is an empty dataset dictionary."
+                    )
+                raise ValueError(
+                    f"Dataset at position {i} has at least one split: {list(dataset)}\n"
+                    f"Please pick one to interleave with the other datasets, for example: dataset['{next(iter(dataset))}']"
+                )
+            raise ValueError(
+                f"Expected a list of Dataset objects or a list of IterableDataset objects, but element at position {i} is a {type(dataset).__name__}."
+            )
+        if i == 0:
+            dataset_type, other_type = (
+                (Dataset, IterableDataset) if isinstance(dataset, Dataset) else (IterableDataset, Dataset)
+            )
+        elif not isinstance(dataset, dataset_type):
+            raise ValueError(
+                f"Unable to interleave a {dataset_type.__name__} (at position 0) with a {other_type.__name__} (at position {i}). Expected a list of Dataset objects or a list of IterableDataset objects."
+            )
+    if stopping_strategy not in ["first_exhausted", "all_exhausted"]:
+        raise ValueError(f"{stopping_strategy} is not supported. Please enter a valid stopping_strategy.")
+    if dataset_type is Dataset:
+        return _interleave_map_style_datasets(
+            datasets, probabilities, seed, info=info, split=split, stopping_strategy=stopping_strategy
+        )
+    else:
+        return _interleave_iterable_datasets(
+            datasets, probabilities, seed, info=info, split=split, stopping_strategy=stopping_strategy
+        )
+
+
+def concatenate_datasets(
+    dsets: List[DatasetType],
+    info: Optional[DatasetInfo] = None,
+    split: Optional[NamedSplit] = None,
+    axis: int = 0,
+) -> DatasetType:
+    """
+    Converts a list of [`Dataset`] with the same schema into a single [`Dataset`].
+
+    Args:
+        dsets (`List[datasets.Dataset]`):
+            List of Datasets to concatenate.
+        info (`DatasetInfo`, *optional*):
+            Dataset information, like description, citation, etc.
+        split (`NamedSplit`, *optional*):
+            Name of the dataset split.
+        axis (`{0, 1}`, defaults to `0`):
+            Axis to concatenate over, where `0` means over rows (vertically) and `1` means over columns
+            (horizontally).
+
+            <Added version="1.6.0"/>
+
+    Example:
+
+    ```py
+    >>> ds3 = concatenate_datasets([ds1, ds2])
+    ```
+    """
+
+    if not dsets:
+        raise ValueError("Unable to concatenate an empty list of datasets.")
+    for i, dataset in enumerate(dsets):
+        if not isinstance(dataset, (Dataset, IterableDataset)):
+            if isinstance(dataset, (DatasetDict, IterableDatasetDict)):
+                if not dataset:
+                    raise ValueError(
+                        f"Expected a list of Dataset objects or a list of IterableDataset objects, but element at position {i} "
+                        "is an empty dataset dictionary."
+                    )
+                raise ValueError(
+                    f"Dataset at position {i} has at least one split: {list(dataset)}\n"
+                    f"Please pick one to concatenate with the other datasets, for example: dataset['{next(iter(dataset))}']"
+                )
+            raise ValueError(
+                f"Expected a list of Dataset objects or a list of IterableDataset objects, but element at position {i} is a {type(dataset).__name__}."
+            )
+        if i == 0:
+            dataset_type, other_type = (
+                (Dataset, IterableDataset) if isinstance(dataset, Dataset) else (IterableDataset, Dataset)
+            )
+        elif not isinstance(dataset, dataset_type):
+            raise ValueError(
+                f"Unable to concatenate a {dataset_type.__name__} (at position 0) with a {other_type.__name__} (at position {i}). Expected a list of Dataset objects or a list of IterableDataset objects."
+            )
+    if dataset_type is Dataset:
+        return _concatenate_map_style_datasets(dsets, info=info, split=split, axis=axis)
+    else:
+        return _concatenate_iterable_datasets(dsets, info=info, split=split, axis=axis)
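
A short sketch of both concatenation axes with toy in-memory datasets (not from the source above): `axis=0` appends rows of datasets sharing a schema, while `axis=1` glues columns of datasets with the same number of rows:

```python
from datasets import Dataset, concatenate_datasets

ds1 = Dataset.from_dict({"a": [0, 1, 2]})
ds2 = Dataset.from_dict({"a": [3, 4, 5]})
ds3 = Dataset.from_dict({"b": ["x", "y", "z"]})

rows = concatenate_datasets([ds1, ds2])          # axis=0: 6 rows, one column
cols = concatenate_datasets([ds1, ds3], axis=1)  # axis=1: 3 rows, two columns
print(rows.num_rows, cols.column_names)          # 6 ['a', 'b']
```
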
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/data_files.py ADDED
@@ -0,0 +1,825 @@
+import os
+import re
+from functools import partial
+from glob import has_magic
+from pathlib import Path, PurePath
+from typing import Callable, Dict, List, Optional, Set, Tuple, Union
+
+import huggingface_hub
+from fsspec.core import url_to_fs
+from fsspec.implementations.http import HTTPFileSystem
+from huggingface_hub import HfFileSystem
+from packaging import version
+from tqdm.contrib.concurrent import thread_map
+
+from . import config
+from .download import DownloadConfig
+from .naming import _split_re
+from .splits import Split
+from .utils import logging
+from .utils import tqdm as hf_tqdm
+from .utils.file_utils import _prepare_path_and_storage_options, is_local_path, is_relative_path, xbasename, xjoin
+from .utils.py_utils import glob_pattern_to_regex, string_to_dict
+
+
+SingleOriginMetadata = Union[Tuple[str, str], Tuple[str], Tuple[()]]
+
+
+SANITIZED_DEFAULT_SPLIT = str(Split.TRAIN)
+
+
+logger = logging.get_logger(__name__)
+
+
+class Url(str):
+    pass
+
+
+class EmptyDatasetError(FileNotFoundError):
+    pass
+
+
+SPLIT_PATTERN_SHARDED = "data/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*"
+
+SPLIT_KEYWORDS = {
+    Split.TRAIN: ["train", "training"],
+    Split.VALIDATION: ["validation", "valid", "dev", "val"],
+    Split.TEST: ["test", "testing", "eval", "evaluation"],
+}
+NON_WORDS_CHARS = "-._ 0-9"
+if config.FSSPEC_VERSION < version.parse("2023.9.0"):
+    KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**[{sep}/]{keyword}[{sep}]*", "{keyword}[{sep}]*"]
+    KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [
+        "{keyword}/**",
+        "{keyword}[{sep}]*/**",
+        "**[{sep}/]{keyword}/**",
+        "**[{sep}/]{keyword}[{sep}]*/**",
+    ]
+elif config.FSSPEC_VERSION < version.parse("2023.12.0"):
+    KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**/*[{sep}/]{keyword}[{sep}]*", "{keyword}[{sep}]*"]
+    KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [
+        "{keyword}/**/*",
+        "{keyword}[{sep}]*/**/*",
+        "**/*[{sep}/]{keyword}/**/*",
+        "**/*[{sep}/]{keyword}[{sep}]*/**/*",
+    ]
+else:
+    KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**/{keyword}[{sep}]*", "**/*[{sep}]{keyword}[{sep}]*"]
+    KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [
+        "**/{keyword}/**",
+        "**/{keyword}[{sep}]*/**",
+        "**/*[{sep}]{keyword}/**",
+        "**/*[{sep}]{keyword}[{sep}]*/**",
+    ]
+
+DEFAULT_SPLITS = [Split.TRAIN, Split.VALIDATION, Split.TEST]
+DEFAULT_PATTERNS_SPLIT_IN_FILENAME = {
+    split: [
+        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
+        for keyword in SPLIT_KEYWORDS[split]
+        for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS
+    ]
+    for split in DEFAULT_SPLITS
+}
+DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME = {
+    split: [
+        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
+        for keyword in SPLIT_KEYWORDS[split]
+        for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS
+    ]
+    for split in DEFAULT_SPLITS
+}
+
+
+DEFAULT_PATTERNS_ALL = {
+    Split.TRAIN: ["**"],
+}
+
+ALL_SPLIT_PATTERNS = [SPLIT_PATTERN_SHARDED]
+ALL_DEFAULT_PATTERNS = [
+    DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME,
+    DEFAULT_PATTERNS_SPLIT_IN_FILENAME,
+    DEFAULT_PATTERNS_ALL,
+]
+if config.FSSPEC_VERSION < version.parse("2023.9.0"):
+    METADATA_PATTERNS = [
+        "metadata.csv",
+        "**/metadata.csv",
+        "metadata.jsonl",
+        "**/metadata.jsonl",
+    ]  # metadata file for ImageFolder and AudioFolder
+else:
+    METADATA_PATTERNS = [
+        "**/metadata.csv",
+        "**/metadata.jsonl",
+    ]  # metadata file for ImageFolder and AudioFolder
+WILDCARD_CHARACTERS = "*[]"
+FILES_TO_IGNORE = [
+    "README.md",
+    "config.json",
+    "dataset_info.json",
+    "dataset_infos.json",
+    "dummy_data.zip",
+    "dataset_dict.json",
+]
+
+
+def contains_wildcards(pattern: str) -> bool:
+    return any(wildcard_character in pattern for wildcard_character in WILDCARD_CHARACTERS)
+
+
+def sanitize_patterns(patterns: Union[Dict, List, str]) -> Dict[str, Union[List[str], "DataFilesList"]]:
+    """
+    Take the data_files patterns from the user, and format them into a dictionary.
+    Each key is the name of the split, and each value is a list of data files patterns (paths or urls).
+    The default split is "train".
+
+    Returns:
+        patterns: dictionary of split_name -> list of patterns
+    """
+    if isinstance(patterns, dict):
+        return {str(key): value if isinstance(value, list) else [value] for key, value in patterns.items()}
+    elif isinstance(patterns, str):
+        return {SANITIZED_DEFAULT_SPLIT: [patterns]}
+    elif isinstance(patterns, list):
+        if any(isinstance(pattern, dict) for pattern in patterns):
+            for pattern in patterns:
+                if not (
+                    isinstance(pattern, dict)
+                    and len(pattern) == 2
+                    and "split" in pattern
+                    and isinstance(pattern.get("path"), (str, list))
+                ):
+                    raise ValueError(
+                        f"Expected each split to have a 'path' key which can be a string or a list of strings, but got {pattern}"
+                    )
+            splits = [pattern["split"] for pattern in patterns]
+            if len(set(splits)) != len(splits):
+                raise ValueError(f"Some splits are duplicated in data_files: {splits}")
+            return {
+                str(pattern["split"]): pattern["path"] if isinstance(pattern["path"], list) else [pattern["path"]]
+                for pattern in patterns
+            }
+        else:
+            return {SANITIZED_DEFAULT_SPLIT: patterns}
+    else:
+        return sanitize_patterns(list(patterns))
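
A quick sketch of the accepted `data_files` shapes and how `sanitize_patterns` normalizes them (toy patterns):

```python
from datasets.data_files import sanitize_patterns

print(sanitize_patterns("data/*.csv"))
# {'train': ['data/*.csv']}
print(sanitize_patterns(["train.csv", "extra.csv"]))
# {'train': ['train.csv', 'extra.csv']}
print(sanitize_patterns({"train": "train.csv", "test": ["test_*.csv"]}))
# {'train': ['train.csv'], 'test': ['test_*.csv']}
```
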
+
+
+def _is_inside_unrequested_special_dir(matched_rel_path: str, pattern: str) -> bool:
+    """
+    When a path matches a pattern, we additionally check if it's inside a special directory
+    we ignore by default (if it starts with a double underscore).
+
+    Users can still explicitly request a filepath inside such a directory if "__pycache__" is
+    mentioned explicitly in the requested pattern.
+
+    Some examples:
+
+    base directory:
+
+        ./
+        └── __pycache__
+            └── b.txt
+
+    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "**")
+    True
+    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "*/b.txt")
+    True
+    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__pycache__/*")
+    False
+    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__*/*")
+    False
+    """
+    # We just need to check if every special directory from the path is present explicitly in the pattern.
+    # Since we assume that the path matches the pattern, it's equivalent to counting that both
+    # the parent path and the parent pattern have the same number of special directories.
+    data_dirs_to_ignore_in_path = [part for part in PurePath(matched_rel_path).parent.parts if part.startswith("__")]
+    data_dirs_to_ignore_in_pattern = [part for part in PurePath(pattern).parent.parts if part.startswith("__")]
+    return len(data_dirs_to_ignore_in_path) != len(data_dirs_to_ignore_in_pattern)
+
+
+def _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(matched_rel_path: str, pattern: str) -> bool:
+    """
+    When a path matches a pattern, we additionally check if it's a hidden file or if it's inside
+    a hidden directory we ignore by default, i.e. if the file name or a parent directory name starts with a dot.
+
+    Users can still explicitly request a filepath that is hidden or is inside a hidden directory
+    if the hidden part is mentioned explicitly in the requested pattern.
+
+    Some examples:
+
+    base directory:
+
+        ./
+        └── .hidden_file.txt
+
+    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", "**")
+    True
+    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", ".*")
+    False
+
+    base directory:
+
+        ./
+        └── .hidden_dir
+            └── a.txt
+
+    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", "**")
+    True
+    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".*/*")
+    False
+    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".hidden_dir/*")
+    False
+
+    base directory:
+
+        ./
+        └── .hidden_dir
+            └── .hidden_file.txt
+
+    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", "**")
+    True
+    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/*")
+    True
+    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/.*")
+    False
+    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/*")
+    True
+    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/.*")
+    False
+    """
+    # We just need to check if every hidden part from the path is present explicitly in the pattern.
+    # Since we assume that the path matches the pattern, it's equivalent to counting that both
+    # the path and the pattern have the same number of hidden parts.
+    hidden_directories_in_path = [
+        part for part in PurePath(matched_rel_path).parts if part.startswith(".") and not set(part) == {"."}
+    ]
+    hidden_directories_in_pattern = [
+        part for part in PurePath(pattern).parts if part.startswith(".") and not set(part) == {"."}
+    ]
+    return len(hidden_directories_in_path) != len(hidden_directories_in_pattern)
+
+
+def _get_data_files_patterns(pattern_resolver: Callable[[str], List[str]]) -> Dict[str, List[str]]:
+    """
+    Get the default pattern from a directory or repository by testing all the supported patterns.
+    The first pattern to return a non-empty list of data files is returned.
+
+    In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.
+    """
+    # first check the split patterns like data/{split}-00000-of-00001.parquet
+    for split_pattern in ALL_SPLIT_PATTERNS:
+        pattern = split_pattern.replace("{split}", "*")
+        try:
+            data_files = pattern_resolver(pattern)
+        except FileNotFoundError:
+            continue
+        if len(data_files) > 0:
+            splits: Set[str] = {
+                string_to_dict(xbasename(p), glob_pattern_to_regex(xbasename(split_pattern)))["split"]
+                for p in data_files
+            }
+            if any(not re.match(_split_re, split) for split in splits):
+                raise ValueError(f"Split name should match '{_split_re}' but got '{splits}'.")
+            sorted_splits = [str(split) for split in DEFAULT_SPLITS if split in splits] + sorted(
+                splits - set(DEFAULT_SPLITS)
+            )
+            return {split: [split_pattern.format(split=split)] for split in sorted_splits}
+    # then check the default patterns based on train/valid/test splits
+    for patterns_dict in ALL_DEFAULT_PATTERNS:
+        non_empty_splits = []
+        for split, patterns in patterns_dict.items():
+            for pattern in patterns:
+                try:
+                    data_files = pattern_resolver(pattern)
+                except FileNotFoundError:
+                    continue
+                if len(data_files) > 0:
+                    non_empty_splits.append(split)
+                    break
+        if non_empty_splits:
+            return {split: patterns_dict[split] for split in non_empty_splits}
+    raise FileNotFoundError(f"Couldn't resolve pattern {pattern} with resolver {pattern_resolver}")
+
+
+def _get_metadata_files_patterns(pattern_resolver: Callable[[str], List[str]]) -> List[str]:
+    """
+    Get the supported metadata patterns from a directory or repository.
+    """
+    non_empty_patterns = []
+    for pattern in METADATA_PATTERNS:
+        try:
+            metadata_files = pattern_resolver(pattern)
+            if len(metadata_files) > 0:
+                non_empty_patterns.append(pattern)
+        except FileNotFoundError:
+            pass
+    if non_empty_patterns:
+        return non_empty_patterns
+    raise FileNotFoundError(f"Couldn't resolve pattern {pattern} with resolver {pattern_resolver}")
+
+
+def resolve_pattern(
+    pattern: str,
+    base_path: str,
+    allowed_extensions: Optional[List[str]] = None,
+    download_config: Optional[DownloadConfig] = None,
+) -> List[str]:
+    """
+    Resolve the paths and URLs of the data files from the pattern passed by the user.
+
+    You can use patterns to resolve multiple local files. Here are a few examples:
+    - *.csv to match all the CSV files at the first level
+    - **.csv to match all the CSV files at any level
+    - data/* to match all the files inside "data"
+    - data/** to match all the files inside "data" and its subdirectories
+
+    The patterns are resolved using the fsspec glob. In fsspec>=2023.12.0 this is equivalent to
+    Python's glob.glob, Path.glob, Path.match and fnmatch where ** is unsupported with a prefix/suffix
+    other than a forward slash /.
+
+    More generally:
+    - '*' matches any character except a forward-slash (to match just the file or directory name)
+    - '**' matches any character including a forward-slash /
+
+    Hidden files and directories (i.e. whose names start with a dot) are ignored, unless they are explicitly requested.
+    The same applies to special directories that start with a double underscore like "__pycache__".
+    You can still include one if the pattern explicitly mentions it:
+    - to include a hidden file: "*/.hidden.txt" or "*/.*"
+    - to include a hidden directory: ".hidden/*" or ".*/*"
+    - to include a special directory: "__special__/*" or "__*/*"
+
+    Example::
+
+        >>> from datasets.data_files import resolve_pattern
+        >>> base_path = "."
+        >>> resolve_pattern("docs/**/*.py", base_path)
+        ['/Users/mariosasko/Desktop/projects/datasets/docs/source/_config.py']
+
+    Args:
+        pattern (str): Unix pattern or paths or URLs of the data files to resolve.
+            The paths can be absolute or relative to base_path.
+            Remote filesystems using fsspec are supported, e.g. with the hf:// protocol.
+        base_path (str): Base path to use when resolving relative paths.
+        allowed_extensions (Optional[list], optional): White-list of file extensions to use. Defaults to None (all extensions).
+            For example: allowed_extensions=[".csv", ".json", ".txt", ".parquet"]
+        download_config ([`DownloadConfig`], *optional*): Specific download configuration parameters.
+    Returns:
+        List[str]: List of paths or URLs to the local or remote files that match the patterns.
+    """
+    if is_relative_path(pattern):
+        pattern = xjoin(base_path, pattern)
+    elif is_local_path(pattern):
+        base_path = os.path.splitdrive(pattern)[0] + os.sep
+    else:
+        base_path = ""
+    pattern, storage_options = _prepare_path_and_storage_options(pattern, download_config=download_config)
+    fs, fs_pattern = url_to_fs(pattern, **storage_options)
+    files_to_ignore = set(FILES_TO_IGNORE) - {xbasename(pattern)}
+    protocol = fs.protocol if isinstance(fs.protocol, str) else fs.protocol[0]
+    protocol_prefix = protocol + "://" if protocol != "file" else ""
+    glob_kwargs = {}
+    if protocol == "hf" and config.HF_HUB_VERSION >= version.parse("0.20.0"):
+        # 10 times faster glob with detail=True (ignores costly info like lastCommit)
+        glob_kwargs["expand_info"] = False
+    matched_paths = [
+        filepath if filepath.startswith(protocol_prefix) else protocol_prefix + filepath
+        for filepath, info in fs.glob(pattern, detail=True, **glob_kwargs).items()
+        if info["type"] == "file"
+        and (xbasename(filepath) not in files_to_ignore)
+        and not _is_inside_unrequested_special_dir(filepath, fs_pattern)
+        and not _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(filepath, fs_pattern)
+    ]  # ignore .ipynb and __pycache__, but keep /../
+    if allowed_extensions is not None:
+        out = [
+            filepath
+            for filepath in matched_paths
+            if any("." + suffix in allowed_extensions for suffix in xbasename(filepath).split(".")[1:])
+        ]
+        if len(out) < len(matched_paths):
+            invalid_matched_files = list(set(matched_paths) - set(out))
+            logger.info(
+                f"Some files matched the pattern '{pattern}' but don't have valid data file extensions: {invalid_matched_files}"
+            )
+    else:
+        out = matched_paths
+    if not out:
+        error_msg = f"Unable to find '{pattern}'"
+        if allowed_extensions is not None:
+            error_msg += f" with any supported extension {list(allowed_extensions)}"
+        raise FileNotFoundError(error_msg)
+    return out
+
+
+def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig] = None) -> Dict[str, List[str]]:
+    """
+    Get the default pattern from a directory by testing all the supported patterns.
+    The first pattern to return a non-empty list of data files is returned.
+
+    Some examples of supported patterns:
+
+    Input:
+
+        my_dataset_repository/
+        ├── README.md
+        └── dataset.csv
+
+    Output:
+
+        {'train': ['**']}
+
+    Input:
+
+        my_dataset_repository/
+        ├── README.md
+        ├── train.csv
+        └── test.csv
+
+        my_dataset_repository/
+        ├── README.md
+        └── data/
+            ├── train.csv
+            └── test.csv
+
+        my_dataset_repository/
+        ├── README.md
+        ├── train_0.csv
+        ├── train_1.csv
+        ├── train_2.csv
+        ├── train_3.csv
+        ├── test_0.csv
+        └── test_1.csv
+
+    Output:
+
+        {'train': ['**/train[-._ 0-9]*', '**/*[-._ 0-9]train[-._ 0-9]*', '**/training[-._ 0-9]*', '**/*[-._ 0-9]training[-._ 0-9]*'],
+         'test': ['**/test[-._ 0-9]*', '**/*[-._ 0-9]test[-._ 0-9]*', '**/testing[-._ 0-9]*', '**/*[-._ 0-9]testing[-._ 0-9]*', ...]}
+
+    Input:
+
+        my_dataset_repository/
+        ├── README.md
+        └── data/
+            ├── train/
+            │   ├── shard_0.csv
+            │   ├── shard_1.csv
+            │   ├── shard_2.csv
+            │   └── shard_3.csv
+            └── test/
+                ├── shard_0.csv
+                └── shard_1.csv
+
+    Output:
+
+        {'train': ['**/train/**', '**/train[-._ 0-9]*/**', '**/*[-._ 0-9]train/**', '**/*[-._ 0-9]train[-._ 0-9]*/**', ...],
+         'test': ['**/test/**', '**/test[-._ 0-9]*/**', '**/*[-._ 0-9]test/**', '**/*[-._ 0-9]test[-._ 0-9]*/**', ...]}
+
+    Input:
+
+        my_dataset_repository/
+        ├── README.md
+        └── data/
+            ├── train-00000-of-00003.csv
+            ├── train-00001-of-00003.csv
+            ├── train-00002-of-00003.csv
+            ├── test-00000-of-00001.csv
+            ├── random-00000-of-00003.csv
+            ├── random-00001-of-00003.csv
+            └── random-00002-of-00003.csv
+
+    Output:
+
+        {'train': ['data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
+         'test': ['data/test-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
+         'random': ['data/random-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*']}
+
+    In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.
+    """
+    resolver = partial(resolve_pattern, base_path=base_path, download_config=download_config)
+    try:
+        return _get_data_files_patterns(resolver)
+    except FileNotFoundError:
+        raise EmptyDatasetError(f"The directory at {base_path} doesn't contain any data files") from None
+
+
+def get_metadata_patterns(
+    base_path: str,
+    download_config: Optional[DownloadConfig] = None,
+) -> List[str]:
+    """
+    Get the supported metadata patterns from a local directory.
+    """
+    resolver = partial(resolve_pattern, base_path=base_path, download_config=download_config)
+    try:
+        return _get_metadata_files_patterns(resolver)
+    except FileNotFoundError:
+        raise FileNotFoundError(f"The directory at {base_path} doesn't contain any metadata file") from None
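
A minimal sketch of pattern discovery plus eager resolution, assuming a hypothetical local directory containing e.g. `train.csv` and `test.csv` (`DataFilesDict` is defined further below in this file):

```python
from datasets.data_files import DataFilesDict, get_data_patterns

base_path = "path/to/my_dataset"                 # hypothetical directory
patterns = get_data_patterns(base_path)          # e.g. {'train': [...], 'test': [...]}
data_files = DataFilesDict.from_patterns(patterns, base_path=base_path)
print({split: list(files) for split, files in data_files.items()})
```
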
+
+
+def _get_single_origin_metadata(
+    data_file: str,
+    download_config: Optional[DownloadConfig] = None,
+) -> SingleOriginMetadata:
+    data_file, storage_options = _prepare_path_and_storage_options(data_file, download_config=download_config)
+    fs, *_ = url_to_fs(data_file, **storage_options)
+    if isinstance(fs, HfFileSystem):
+        resolved_path = fs.resolve_path(data_file)
+        return resolved_path.repo_id, resolved_path.revision
+    elif isinstance(fs, HTTPFileSystem) and data_file.startswith(config.HF_ENDPOINT):
+        hffs = HfFileSystem(endpoint=config.HF_ENDPOINT, token=download_config.token)
+        data_file = "hf://" + data_file[len(config.HF_ENDPOINT) + 1 :].replace("/resolve/", "@", 1)
+        resolved_path = hffs.resolve_path(data_file)
+        return resolved_path.repo_id, resolved_path.revision
+    info = fs.info(data_file)
+    # s3fs uses "ETag", gcsfs uses "etag", and for local we simply check mtime
+    for key in ["ETag", "etag", "mtime"]:
+        if key in info:
+            return (str(info[key]),)
+    return ()
+
+
+def _get_origin_metadata(
+    data_files: List[str],
+    download_config: Optional[DownloadConfig] = None,
+    max_workers: Optional[int] = None,
+) -> List[SingleOriginMetadata]:
+    max_workers = max_workers if max_workers is not None else config.HF_DATASETS_MULTITHREADING_MAX_WORKERS
+    return thread_map(
+        partial(_get_single_origin_metadata, download_config=download_config),
+        data_files,
+        max_workers=max_workers,
+        tqdm_class=hf_tqdm,
+        desc="Resolving data files",
+        # set `disable=None` rather than `disable=False` by default to disable progress bar when no TTY attached
+        disable=len(data_files) <= 16 or None,
+    )
+
+
+class DataFilesList(List[str]):
+    """
+    List of data files (absolute local paths or URLs).
+    It has two construction methods given the user's data files patterns:
+    - ``from_hf_repo``: resolve patterns inside a dataset repository
+    - ``from_local_or_remote``: resolve patterns from a local path
+
+    Moreover, DataFilesList has an additional attribute ``origin_metadata``.
+    It can store:
+    - the last modified time of local files
+    - ETag of remote files
+    - commit sha of a dataset repository
+
+    Thanks to this additional attribute, it is possible to hash the list
+    and get a different hash if and only if at least one file changed.
+    This is useful for caching Dataset objects that are obtained from a list of data files.
+    """
+
+    def __init__(self, data_files: List[str], origin_metadata: List[SingleOriginMetadata]) -> None:
+        super().__init__(data_files)
+        self.origin_metadata = origin_metadata
+
+    def __add__(self, other: "DataFilesList") -> "DataFilesList":
+        return DataFilesList([*self, *other], self.origin_metadata + other.origin_metadata)
+
+    @classmethod
+    def from_hf_repo(
+        cls,
+        patterns: List[str],
+        dataset_info: huggingface_hub.hf_api.DatasetInfo,
+        base_path: Optional[str] = None,
+        allowed_extensions: Optional[List[str]] = None,
+        download_config: Optional[DownloadConfig] = None,
+    ) -> "DataFilesList":
+        base_path = f"hf://datasets/{dataset_info.id}@{dataset_info.sha}/{base_path or ''}".rstrip("/")
+        return cls.from_patterns(
+            patterns, base_path=base_path, allowed_extensions=allowed_extensions, download_config=download_config
+        )
+
+    @classmethod
+    def from_local_or_remote(
+        cls,
+        patterns: List[str],
+        base_path: Optional[str] = None,
+        allowed_extensions: Optional[List[str]] = None,
+        download_config: Optional[DownloadConfig] = None,
+    ) -> "DataFilesList":
+        base_path = base_path if base_path is not None else Path().resolve().as_posix()
+        return cls.from_patterns(
+            patterns, base_path=base_path, allowed_extensions=allowed_extensions, download_config=download_config
+        )
+
+    @classmethod
+    def from_patterns(
+        cls,
+        patterns: List[str],
+        base_path: Optional[str] = None,
+        allowed_extensions: Optional[List[str]] = None,
+        download_config: Optional[DownloadConfig] = None,
+    ) -> "DataFilesList":
+        base_path = base_path if base_path is not None else Path().resolve().as_posix()
+        data_files = []
+        for pattern in patterns:
+            try:
+                data_files.extend(
+                    resolve_pattern(
+                        pattern,
+                        base_path=base_path,
+                        allowed_extensions=allowed_extensions,
+                        download_config=download_config,
+                    )
+                )
+            except FileNotFoundError:
+                if not has_magic(pattern):
+                    raise
+        origin_metadata = _get_origin_metadata(data_files, download_config=download_config)
+        return cls(data_files, origin_metadata)
+
+    def filter_extensions(self, extensions: List[str]) -> "DataFilesList":
+        pattern = "|".join("\\" + ext for ext in extensions)
+        pattern = re.compile(f".*({pattern})(\\..+)?$")
+        return DataFilesList(
+            [data_file for data_file in self if pattern.match(data_file)],
+            origin_metadata=self.origin_metadata,
+        )
+
+
+class DataFilesDict(Dict[str, DataFilesList]):
+    """
+    Dict of split_name -> list of data files (absolute local paths or URLs).
+    It has two construction methods given the user's data files patterns:
+    - ``from_hf_repo``: resolve patterns inside a dataset repository
+    - ``from_local_or_remote``: resolve patterns from a local path
+
+    Moreover, each list is a DataFilesList. It is possible to hash the dictionary
+    and get a different hash if and only if at least one file changed.
+    For more info, see [`DataFilesList`].
+
+    This is useful for caching Dataset objects that are obtained from a list of data files.
+
+    Changing the order of the keys of this dictionary also doesn't change its hash.
+    """
+
+    @classmethod
+    def from_local_or_remote(
+        cls,
+        patterns: Dict[str, Union[List[str], DataFilesList]],
+        base_path: Optional[str] = None,
+        allowed_extensions: Optional[List[str]] = None,
+        download_config: Optional[DownloadConfig] = None,
+    ) -> "DataFilesDict":
+        out = cls()
+        for key, patterns_for_key in patterns.items():
+            out[key] = (
+                patterns_for_key
+                if isinstance(patterns_for_key, DataFilesList)
+                else DataFilesList.from_local_or_remote(
+                    patterns_for_key,
+                    base_path=base_path,
+                    allowed_extensions=allowed_extensions,
+                    download_config=download_config,
+                )
+            )
+        return out
+
+    @classmethod
+    def from_hf_repo(
+        cls,
+        patterns: Dict[str, Union[List[str], DataFilesList]],
+        dataset_info: huggingface_hub.hf_api.DatasetInfo,
+        base_path: Optional[str] = None,
+        allowed_extensions: Optional[List[str]] = None,
+        download_config: Optional[DownloadConfig] = None,
+    ) -> "DataFilesDict":
+        out = cls()
+        for key, patterns_for_key in patterns.items():
+            out[key] = (
+                patterns_for_key
+                if isinstance(patterns_for_key, DataFilesList)
+                else DataFilesList.from_hf_repo(
+                    patterns_for_key,
+                    dataset_info=dataset_info,
+                    base_path=base_path,
+                    allowed_extensions=allowed_extensions,
+                    download_config=download_config,
+                )
+            )
+        return out
+
+    @classmethod
+    def from_patterns(
+        cls,
+        patterns: Dict[str, Union[List[str], DataFilesList]],
+        base_path: Optional[str] = None,
+        allowed_extensions: Optional[List[str]] = None,
+        download_config: Optional[DownloadConfig] = None,
+    ) -> "DataFilesDict":
+        out = cls()
+        for key, patterns_for_key in patterns.items():
+            out[key] = (
+                patterns_for_key
+                if isinstance(patterns_for_key, DataFilesList)
+                else DataFilesList.from_patterns(
+                    patterns_for_key,
+                    base_path=base_path,
+                    allowed_extensions=allowed_extensions,
+                    download_config=download_config,
+                )
+            )
+        return out
+
+    def filter_extensions(self, extensions: List[str]) -> "DataFilesDict":
+        out = type(self)()
+        for key, data_files_list in self.items():
+            out[key] = data_files_list.filter_extensions(extensions)
+        return out
+
736
+
737
+ class DataFilesPatternsList(List[str]):
738
+ """
739
+ List of data files patterns (absolute local paths or URLs).
740
+ For each pattern there should also be a list of allowed extensions
741
+ to keep, or a None ot keep all the files for the pattern.
742
+ """
743
+
744
+ def __init__(
745
+ self,
746
+ patterns: List[str],
747
+ allowed_extensions: List[Optional[List[str]]],
748
+ ):
749
+ super().__init__(patterns)
750
+ self.allowed_extensions = allowed_extensions
751
+
752
+ def __add__(self, other):
753
+ return DataFilesList([*self, *other], self.allowed_extensions + other.allowed_extensions)
754
+
755
+ @classmethod
756
+ def from_patterns(
757
+ cls, patterns: List[str], allowed_extensions: Optional[List[str]] = None
758
+ ) -> "DataFilesPatternsList":
759
+ return cls(patterns, [allowed_extensions] * len(patterns))
760
+
761
+ def resolve(
762
+ self,
763
+ base_path: str,
764
+ download_config: Optional[DownloadConfig] = None,
765
+ ) -> "DataFilesList":
766
+ base_path = base_path if base_path is not None else Path().resolve().as_posix()
767
+ data_files = []
768
+ for pattern, allowed_extensions in zip(self, self.allowed_extensions):
769
+ try:
770
+ data_files.extend(
771
+ resolve_pattern(
772
+ pattern,
773
+ base_path=base_path,
774
+ allowed_extensions=allowed_extensions,
775
+ download_config=download_config,
776
+ )
777
+ )
778
+ except FileNotFoundError:
779
+ if not has_magic(pattern):
780
+ raise
781
+ origin_metadata = _get_origin_metadata(data_files, download_config=download_config)
782
+ return DataFilesList(data_files, origin_metadata)
783
+
784
+ def filter_extensions(self, extensions: List[str]) -> "DataFilesPatternsList":
785
+ return DataFilesPatternsList(
786
+ self, [allowed_extensions + extensions for allowed_extensions in self.allowed_extensions]
787
+ )
788
+
789
+
790
+ class DataFilesPatternsDict(Dict[str, DataFilesPatternsList]):
791
+ """
792
+ Dict of split_name -> list of data files patterns (absolute local paths or URLs).
793
+ """
794
+
795
+ @classmethod
796
+ def from_patterns(
797
+ cls, patterns: Dict[str, List[str]], allowed_extensions: Optional[List[str]] = None
798
+ ) -> "DataFilesPatternsDict":
799
+ out = cls()
800
+ for key, patterns_for_key in patterns.items():
801
+ out[key] = (
802
+ patterns_for_key
803
+ if isinstance(patterns_for_key, DataFilesPatternsList)
804
+ else DataFilesPatternsList.from_patterns(
805
+ patterns_for_key,
806
+ allowed_extensions=allowed_extensions,
807
+ )
808
+ )
809
+ return out
810
+
811
+ def resolve(
812
+ self,
813
+ base_path: str,
814
+ download_config: Optional[DownloadConfig] = None,
815
+ ) -> "DataFilesDict":
816
+ out = DataFilesDict()
817
+ for key, data_files_patterns_list in self.items():
818
+ out[key] = data_files_patterns_list.resolve(base_path, download_config)
819
+ return out
820
+
821
+ def filter_extensions(self, extensions: List[str]) -> "DataFilesPatternsDict":
822
+ out = type(self)()
823
+ for key, data_files_patterns_list in self.items():
824
+ out[key] = data_files_patterns_list.filter_extensions(extensions)
825
+ return out
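
The classes above turn per-split glob patterns into concrete, resolved file lists. A minimal sketch of how they fit together (not part of the uploaded file; it assumes `datasets` is importable and that a local `data/` directory with matching CSV files exists):

```py
# Hypothetical usage sketch: resolve split patterns into data files.
from datasets.data_files import DataFilesPatternsDict

# One pattern list per split; allowed_extensions restricts what is kept.
patterns = DataFilesPatternsDict.from_patterns(
    {"train": ["data/train-*.csv"], "test": ["data/test-*.csv"]},
    allowed_extensions=[".csv"],
)

# resolve() expands each pattern against base_path and returns a DataFilesDict,
# whose hash is stable under key reordering (useful for caching).
data_files = patterns.resolve(base_path=".")
print(list(data_files))  # e.g. ['train', 'test']
```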
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/fingerprint.py ADDED
@@ -0,0 +1,494 @@
+ import inspect
+ import os
+ import random
+ import shutil
+ import tempfile
+ import weakref
+ from functools import wraps
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
+
+ import numpy as np
+ import xxhash
+
+ from . import config
+ from .naming import INVALID_WINDOWS_CHARACTERS_IN_PATH
+ from .utils._dill import dumps
+ from .utils.deprecation_utils import deprecated
+ from .utils.logging import get_logger
+
+
+ if TYPE_CHECKING:
+     from .arrow_dataset import Dataset
+
+
+ logger = get_logger(__name__)
+
+
+ # Fingerprinting makes it possible to have one deterministic fingerprint per dataset state.
+ # A dataset fingerprint is updated after each transform.
+ # Re-running the same transforms on a dataset in a different session results in the same fingerprint.
+ # This is possible thanks to a custom hashing function that works with most python objects.
+
+ # Fingerprinting is the main mechanism that enables caching.
+ # The caching mechanism allows reloading an existing cache file if it's already been computed.
+
+
+ #################
+ # Caching
+ #################
+
+ _CACHING_ENABLED = True
+ _TEMP_DIR_FOR_TEMP_CACHE_FILES: Optional["_TempCacheDir"] = None
+ _DATASETS_WITH_TABLE_IN_TEMP_DIR: Optional[weakref.WeakSet] = None
+
+
+ class _TempCacheDir:
+     """
+     A temporary directory for storing cached Arrow files with a cleanup that frees references to the Arrow files
+     before deleting the directory itself to avoid permission errors on Windows.
+     """
+
+     def __init__(self):
+         self.name = tempfile.mkdtemp(prefix=config.TEMP_CACHE_DIR_PREFIX)
+         self._finalizer = weakref.finalize(self, self._cleanup)
+
+     def _cleanup(self):
+         for dset in get_datasets_with_cache_file_in_temp_dir():
+             dset.__del__()
+         if os.path.exists(self.name):
+             try:
+                 shutil.rmtree(self.name)
+             except Exception as e:
+                 raise OSError(
+                     f"An error occurred while trying to delete temporary cache directory {self.name}. Please delete it manually."
+                 ) from e
+
+     def cleanup(self):
+         if self._finalizer.detach():
+             self._cleanup()
+
+
+ def maybe_register_dataset_for_temp_dir_deletion(dataset):
+     """
+     This function registers the datasets that have cache files in _TEMP_DIR_FOR_TEMP_CACHE_FILES in order
+     to properly delete them before deleting the temporary directory.
+     The temporary directory _TEMP_DIR_FOR_TEMP_CACHE_FILES is used when caching is disabled.
+     """
+     if _TEMP_DIR_FOR_TEMP_CACHE_FILES is None:
+         return
+
+     global _DATASETS_WITH_TABLE_IN_TEMP_DIR
+     if _DATASETS_WITH_TABLE_IN_TEMP_DIR is None:
+         _DATASETS_WITH_TABLE_IN_TEMP_DIR = weakref.WeakSet()
+     if any(
+         Path(_TEMP_DIR_FOR_TEMP_CACHE_FILES.name) in Path(cache_file["filename"]).parents
+         for cache_file in dataset.cache_files
+     ):
+         _DATASETS_WITH_TABLE_IN_TEMP_DIR.add(dataset)
+
+
+ def get_datasets_with_cache_file_in_temp_dir():
+     return list(_DATASETS_WITH_TABLE_IN_TEMP_DIR) if _DATASETS_WITH_TABLE_IN_TEMP_DIR is not None else []
+
+
+ def enable_caching():
+     """
+     When applying transforms on a dataset, the data are stored in cache files.
+     The caching mechanism allows reloading an existing cache file if it's already been computed.
+
+     Reloading a dataset is possible since the cache files are named using the dataset fingerprint, which is updated
+     after each transform.
+
+     If disabled, the library will no longer reload cached dataset files when applying transforms to the datasets.
+     More precisely, if the caching is disabled:
+     - cache files are always recreated
+     - cache files are written to a temporary directory that is deleted when the session closes
+     - cache files are named using a random hash instead of the dataset fingerprint
+     - use [`~datasets.Dataset.save_to_disk`] to save a transformed dataset or it will be deleted when the session closes
+     - caching doesn't affect [`~datasets.load_dataset`]. If you want to regenerate a dataset from scratch you should use
+       the `download_mode` parameter in [`~datasets.load_dataset`].
+     """
+     global _CACHING_ENABLED
+     _CACHING_ENABLED = True
+
+
+ def disable_caching():
+     """
+     When applying transforms on a dataset, the data are stored in cache files.
+     The caching mechanism allows reloading an existing cache file if it's already been computed.
+
+     Reloading a dataset is possible since the cache files are named using the dataset fingerprint, which is updated
+     after each transform.
+
+     If disabled, the library will no longer reload cached dataset files when applying transforms to the datasets.
+     More precisely, if the caching is disabled:
+     - cache files are always recreated
+     - cache files are written to a temporary directory that is deleted when the session closes
+     - cache files are named using a random hash instead of the dataset fingerprint
+     - use [`~datasets.Dataset.save_to_disk`] to save a transformed dataset or it will be deleted when the session closes
+     - caching doesn't affect [`~datasets.load_dataset`]. If you want to regenerate a dataset from scratch you should use
+       the `download_mode` parameter in [`~datasets.load_dataset`].
+     """
+     global _CACHING_ENABLED
+     _CACHING_ENABLED = False
+
+
+ @deprecated(
+     "Use datasets.enable_caching() or datasets.disable_caching() instead. This function will be removed in a future version of datasets."
+ )
+ def set_caching_enabled(boolean: bool):
+     """
+     When applying transforms on a dataset, the data are stored in cache files.
+     The caching mechanism allows reloading an existing cache file if it's already been computed.
+
+     Reloading a dataset is possible since the cache files are named using the dataset fingerprint, which is updated
+     after each transform.
+
+     If disabled, the library will no longer reload cached dataset files when applying transforms to the datasets.
+     More precisely, if the caching is disabled:
+     - cache files are always recreated
+     - cache files are written to a temporary directory that is deleted when the session closes
+     - cache files are named using a random hash instead of the dataset fingerprint
+     - use :func:`datasets.Dataset.save_to_disk` to save a transformed dataset or it will be deleted when the session closes
+     - caching doesn't affect :func:`datasets.load_dataset`. If you want to regenerate a dataset from scratch you should use
+       the ``download_mode`` parameter in :func:`datasets.load_dataset`.
+     """
+     global _CACHING_ENABLED
+     _CACHING_ENABLED = bool(boolean)
+
+
+ def is_caching_enabled() -> bool:
+     """
+     When applying transforms on a dataset, the data are stored in cache files.
+     The caching mechanism allows reloading an existing cache file if it's already been computed.
+
+     Reloading a dataset is possible since the cache files are named using the dataset fingerprint, which is updated
+     after each transform.
+
+     If disabled, the library will no longer reload cached dataset files when applying transforms to the datasets.
+     More precisely, if the caching is disabled:
+     - cache files are always recreated
+     - cache files are written to a temporary directory that is deleted when the session closes
+     - cache files are named using a random hash instead of the dataset fingerprint
+     - use [`~datasets.Dataset.save_to_disk`] to save a transformed dataset or it will be deleted when the session closes
+     - caching doesn't affect [`~datasets.load_dataset`]. If you want to regenerate a dataset from scratch you should use
+       the `download_mode` parameter in [`~datasets.load_dataset`].
+     """
+     global _CACHING_ENABLED
+     return bool(_CACHING_ENABLED)
+
+
+ def get_temporary_cache_files_directory() -> str:
+     """Return a directory that is deleted when the session closes."""
+     global _TEMP_DIR_FOR_TEMP_CACHE_FILES
+     if _TEMP_DIR_FOR_TEMP_CACHE_FILES is None:
+         _TEMP_DIR_FOR_TEMP_CACHE_FILES = _TempCacheDir()
+     return _TEMP_DIR_FOR_TEMP_CACHE_FILES.name
+
+
+ #################
+ # Hashing
+ #################
+
+
+ @deprecated("Use `copyreg.pickle` to register a custom reducer.")
+ def hashregister(*types):
+     def proxy(func):
+         for t in types:
+             Hasher.dispatch[t] = func
+         return func
+
+     return proxy
+
+
+ class Hasher:
+     """Hasher that accepts python objects as inputs."""
+
+     dispatch: Dict = {}
+
+     def __init__(self):
+         self.m = xxhash.xxh64()
+
+     @classmethod
+     def hash_bytes(cls, value: Union[bytes, List[bytes]]) -> str:
+         value = [value] if isinstance(value, bytes) else value
+         m = xxhash.xxh64()
+         for x in value:
+             m.update(x)
+         return m.hexdigest()
+
+     @classmethod
+     @deprecated("Use `Hasher.hash` instead.")
+     def hash_default(cls, value: Any) -> str:
+         return cls.hash(value)
+
+     @classmethod
+     def hash(cls, value: Any) -> str:
+         return cls.hash_bytes(dumps(value))
+
+     def update(self, value: Any) -> None:
+         header_for_update = f"=={type(value)}=="
+         value_for_update = self.hash(value)
+         self.m.update(header_for_update.encode("utf8"))
+         self.m.update(value_for_update.encode("utf-8"))
+
+     def hexdigest(self) -> str:
+         return self.m.hexdigest()
+
+
+ #################
+ # Fingerprinting
+ #################
+
+ fingerprint_rng = random.Random()
+ # we show a warning only once when fingerprinting fails to avoid spam
+ fingerprint_warnings: Dict[str, bool] = {}
+
+
+ def generate_fingerprint(dataset: "Dataset") -> str:
+     state = dataset.__dict__
+     hasher = Hasher()
+     for key in sorted(state):
+         if key == "_fingerprint":
+             continue
+         hasher.update(key)
+         hasher.update(state[key])
+     # hash data files last modification timestamps as well
+     for cache_file in dataset.cache_files:
+         hasher.update(os.path.getmtime(cache_file["filename"]))
+     return hasher.hexdigest()
+
+
+ def generate_random_fingerprint(nbits: int = 64) -> str:
+     return f"{fingerprint_rng.getrandbits(nbits):0{nbits//4}x}"
+
+
+ def update_fingerprint(fingerprint, transform, transform_args):
+     global fingerprint_warnings
+     hasher = Hasher()
+     hasher.update(fingerprint)
+     try:
+         hasher.update(transform)
+     except:  # noqa: various errors might be raised here by pickle or dill
+         if _CACHING_ENABLED:
+             if not fingerprint_warnings.get("update_fingerprint_transform_hash_failed", False):
+                 logger.warning(
+                     f"Transform {transform} couldn't be hashed properly, a random hash was used instead. "
+                     "Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. "
+                     "If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. "
+                     "This warning is only shown once. Subsequent hashing failures won't be shown."
+                 )
+                 fingerprint_warnings["update_fingerprint_transform_hash_failed"] = True
+             else:
+                 logger.info(f"Transform {transform} couldn't be hashed properly, a random hash was used instead.")
+         else:
+             logger.info(
+                 f"Transform {transform} couldn't be hashed properly, a random hash was used instead. This doesn't affect caching since it's disabled."
+             )
+
+         return generate_random_fingerprint()
+     for key in sorted(transform_args):
+         hasher.update(key)
+         try:
+             hasher.update(transform_args[key])
+         except:  # noqa: various errors might be raised here by pickle or dill
+             if _CACHING_ENABLED:
+                 if not fingerprint_warnings.get("update_fingerprint_transform_hash_failed", False):
+                     logger.warning(
+                         f"Parameter '{key}'={transform_args[key]} of the transform {transform} couldn't be hashed properly, a random hash was used instead. "
+                         "Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. "
+                         "If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. "
+                         "This warning is only shown once. Subsequent hashing failures won't be shown."
+                     )
+                     fingerprint_warnings["update_fingerprint_transform_hash_failed"] = True
+                 else:
+                     logger.info(
+                         f"Parameter '{key}'={transform_args[key]} of the transform {transform} couldn't be hashed properly, a random hash was used instead."
+                     )
+             else:
+                 logger.info(
+                     f"Parameter '{key}'={transform_args[key]} of the transform {transform} couldn't be hashed properly, a random hash was used instead. This doesn't affect caching since it's disabled."
+                 )
+             return generate_random_fingerprint()
+     return hasher.hexdigest()
+
+
+ def validate_fingerprint(fingerprint: str, max_length=64):
+     """
+     Make sure the fingerprint is a non-empty string that is not longer than max_length (64 by default),
+     so that the fingerprint can be used to name cache files without issues.
+     """
+     if not isinstance(fingerprint, str) or not fingerprint:
+         raise ValueError(f"Invalid fingerprint '{fingerprint}': it should be a non-empty string.")
+     for invalid_char in INVALID_WINDOWS_CHARACTERS_IN_PATH:
+         if invalid_char in fingerprint:
+             raise ValueError(
+                 f"Invalid fingerprint. Bad characters from blacklist '{INVALID_WINDOWS_CHARACTERS_IN_PATH}' found in '{fingerprint}'. "
+                 f"They could create issues when creating cache files."
+             )
+     if len(fingerprint) > max_length:
+         raise ValueError(
+             f"Invalid fingerprint. Maximum length is {max_length} but '{fingerprint}' has length {len(fingerprint)}. "
+             "It could create issues when creating cache files."
+         )
+
+
+ def format_transform_for_fingerprint(func: Callable, version: Optional[str] = None) -> str:
+     """
+     Format a transform to the format that will be used to update the fingerprint.
+     """
+     transform = f"{func.__module__}.{func.__qualname__}"
+     if version is not None:
+         transform += f"@{version}"
+     return transform
+
+
+ def format_kwargs_for_fingerprint(
+     func: Callable,
+     args: Tuple,
+     kwargs: Dict[str, Any],
+     use_kwargs: Optional[List[str]] = None,
+     ignore_kwargs: Optional[List[str]] = None,
+     randomized_function: bool = False,
+ ) -> Dict[str, Any]:
+     """
+     Format the kwargs of a transform to the format that will be used to update the fingerprint.
+     """
+     kwargs_for_fingerprint = kwargs.copy()
+     if args:
+         params = [p.name for p in inspect.signature(func).parameters.values() if p.kind != p.VAR_KEYWORD]
+         args = args[1:]  # assume the first argument is the dataset
+         params = params[1:]
+         kwargs_for_fingerprint.update(zip(params, args))
+     else:
+         del kwargs_for_fingerprint[
+             next(iter(inspect.signature(func).parameters))
+         ]  # assume the first key is the dataset
+
+     # keep the right kwargs to be hashed to generate the fingerprint
+
+     if use_kwargs:
+         kwargs_for_fingerprint = {k: v for k, v in kwargs_for_fingerprint.items() if k in use_kwargs}
+     if ignore_kwargs:
+         kwargs_for_fingerprint = {k: v for k, v in kwargs_for_fingerprint.items() if k not in ignore_kwargs}
+     if randomized_function:  # randomized functions have `seed` and `generator` parameters
+         if kwargs_for_fingerprint.get("seed") is None and kwargs_for_fingerprint.get("generator") is None:
+             _, seed, pos, *_ = np.random.get_state()
+             seed = seed[pos] if pos < 624 else seed[0]
+             kwargs_for_fingerprint["generator"] = np.random.default_rng(seed)
+
+     # remove kwargs that are the default values
+
+     default_values = {
+         p.name: p.default for p in inspect.signature(func).parameters.values() if p.default != inspect._empty
+     }
+     for default_varname, default_value in default_values.items():
+         if default_varname in kwargs_for_fingerprint and kwargs_for_fingerprint[default_varname] == default_value:
+             kwargs_for_fingerprint.pop(default_varname)
+     return kwargs_for_fingerprint
+
+
+ def fingerprint_transform(
+     inplace: bool,
+     use_kwargs: Optional[List[str]] = None,
+     ignore_kwargs: Optional[List[str]] = None,
+     fingerprint_names: Optional[List[str]] = None,
+     randomized_function: bool = False,
+     version: Optional[str] = None,
+ ):
+     """
+     Wrapper for dataset transforms that updates the dataset fingerprint using ``update_fingerprint``.
+
+     Args:
+         inplace (:obj:`bool`): If inplace is True, the fingerprint of the dataset is updated inplace.
+             Otherwise, a parameter "new_fingerprint" is passed to the wrapped method, which should take care of
+             setting the fingerprint of the returned Dataset.
+         use_kwargs (:obj:`List[str]`, optional): optional whitelist of argument names to take into account
+             when updating the fingerprint. By default all the arguments are used.
+         ignore_kwargs (:obj:`List[str]`, optional): optional blacklist of argument names to ignore
+             when updating the fingerprint. Note that ignore_kwargs takes precedence over use_kwargs.
+         fingerprint_names (:obj:`List[str]`, optional, defaults to ["new_fingerprint"]):
+             If the dataset transform is not inplace and returns a DatasetDict, then it can require
+             several fingerprints (one per dataset in the DatasetDict). By specifying fingerprint_names,
+             one fingerprint named after each element of fingerprint_names is going to be passed.
+         randomized_function (:obj:`bool`, defaults to False): If the dataset transform is random and has
+             optional parameters "seed" and "generator", then you can set randomized_function to True.
+             This way, even if users set "seed" and "generator" to None, the fingerprint is
+             going to be randomly generated depending on numpy's current state. In this case, the
+             generator is set to np.random.default_rng(np.random.get_state()[1][0]).
+         version (:obj:`str`, optional): version of the transform. The version is taken into account when
+             computing the fingerprint. If a dataset transform changes (or at least if the output data
+             that are cached changes), then one should increase the version. If the version stays the
+             same, then old cached data could be reused that are not compatible with the new transform.
+             It should be in the format "MAJOR.MINOR.PATCH".
+     """
+
+     if use_kwargs is not None and not isinstance(use_kwargs, list):
+         raise ValueError(f"use_kwargs is supposed to be a list, not {type(use_kwargs)}")
+
+     if ignore_kwargs is not None and not isinstance(ignore_kwargs, list):
+         raise ValueError(f"ignore_kwargs is supposed to be a list, not {type(ignore_kwargs)}")
+
+     if inplace and fingerprint_names:
+         raise ValueError("fingerprint_names are only used when inplace is False")
+
+     fingerprint_names = fingerprint_names if fingerprint_names is not None else ["new_fingerprint"]
+
+     def _fingerprint(func):
+         if not inplace and not all(name in func.__code__.co_varnames for name in fingerprint_names):
+             raise ValueError(f"function {func} is missing parameters {fingerprint_names} in signature")
+
+         if randomized_function:  # randomized functions have seed and generator parameters
+             if "seed" not in func.__code__.co_varnames:
+                 raise ValueError(f"'seed' must be in {func}'s signature")
+             if "generator" not in func.__code__.co_varnames:
+                 raise ValueError(f"'generator' must be in {func}'s signature")
+         # this call has to be outside the wrapper since __qualname__ changes in multiprocessing
+         transform = format_transform_for_fingerprint(func, version=version)
+
+         @wraps(func)
+         def wrapper(*args, **kwargs):
+             kwargs_for_fingerprint = format_kwargs_for_fingerprint(
+                 func,
+                 args,
+                 kwargs,
+                 use_kwargs=use_kwargs,
+                 ignore_kwargs=ignore_kwargs,
+                 randomized_function=randomized_function,
+             )
+
+             if args:
+                 dataset: Dataset = args[0]
+                 args = args[1:]
+             else:
+                 dataset: Dataset = kwargs.pop(next(iter(inspect.signature(func).parameters)))
+
+             # compute new_fingerprint and add it to the kwargs of non-in-place transforms
+             if inplace:
+                 new_fingerprint = update_fingerprint(dataset._fingerprint, transform, kwargs_for_fingerprint)
+             else:
+                 for fingerprint_name in fingerprint_names:  # transforms like `train_test_split` have several hashes
+                     if kwargs.get(fingerprint_name) is None:
+                         kwargs_for_fingerprint["fingerprint_name"] = fingerprint_name
+                         kwargs[fingerprint_name] = update_fingerprint(
+                             dataset._fingerprint, transform, kwargs_for_fingerprint
+                         )
+                     else:
+                         validate_fingerprint(kwargs[fingerprint_name])
+
+             # Call actual function
+
+             out = func(dataset, *args, **kwargs)
+
+             # Update fingerprint of in-place transforms + update in-place history of transforms
+
+             if inplace:  # update after calling func so that the fingerprint doesn't change if the function fails
+                 dataset._fingerprint = new_fingerprint
+
+             return out
+
+         wrapper._decorator_name_ = "fingerprint"
+         return wrapper
+
+     return _fingerprint
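
The module above is what makes `datasets` caching deterministic: a fingerprint is a stable hash of a dataset's state, and every transform folds its name and arguments into it. A minimal sketch of the public pieces (not part of the uploaded file; the starting fingerprint and transform name below are made-up values for illustration):

```py
# Hypothetical usage sketch of the fingerprinting API.
from datasets.fingerprint import Hasher, is_caching_enabled, update_fingerprint

assert is_caching_enabled()  # caching is on by default

# Hasher.hash is deterministic across sessions for picklable objects.
h = Hasher.hash({"batched": True, "num_proc": 4})

# update_fingerprint chains an old fingerprint with a transform and its args;
# if anything fails to hash, it falls back to a random fingerprint.
new_fp = update_fingerprint(
    "0123456789abcdef", "datasets.arrow_dataset.Dataset.map", {"batched": True}
)
print(h, new_fp)  # two 16-character xxhash64 hex digests
```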
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/hub.py ADDED
@@ -0,0 +1,230 @@
+ import time
+ from itertools import chain
+ from typing import Optional, Union
+
+ from huggingface_hub import (
+     CommitInfo,
+     CommitOperationAdd,
+     CommitOperationDelete,
+     DatasetCard,
+     DatasetCardData,
+     HfApi,
+     HfFileSystem,
+ )
+ from huggingface_hub.utils import HfHubHTTPError
+
+ import datasets.config
+ from datasets.info import DatasetInfosDict
+ from datasets.inspect import get_dataset_config_names, get_dataset_default_config_name
+ from datasets.load import load_dataset, load_dataset_builder
+ from datasets.utils.metadata import MetadataConfigs
+
+
+ def convert_to_parquet(
+     repo_id: str,
+     revision: Optional[str] = None,
+     token: Optional[Union[bool, str]] = None,
+     trust_remote_code: Optional[bool] = None,
+ ) -> CommitInfo:
+     """Convert a Hub [script-based dataset](dataset_script) to a Parquet [data-only dataset](repository_structure), so that
+     the dataset viewer will be supported.
+
+     This function:
+     - makes a copy of the script on the "main" branch into a dedicated branch called "script" (if it does not already exist)
+     - creates a pull request to the Hub dataset to convert it to Parquet files (and deletes the script from the main branch)
+
+     If in the future you need to recreate the Parquet files from the "script" branch, pass the `revision="script"` argument.
+
+     Note that you should pass the `trust_remote_code=True` argument only if you trust the remote code to be executed locally on your machine.
+
+     Args:
+         repo_id (`str`): ID of the source Hub dataset repository, in the following format: `<user>/<dataset_name>` or
+             `<org>/<dataset_name>`.
+         revision (`str`, *optional*): Branch of the source Hub dataset repository. Defaults to the `"main"` branch.
+         token (`bool` or `str`, *optional*): Authentication token for the Hugging Face Hub.
+         trust_remote_code (`bool`, defaults to `True`): Whether you trust the remote code of the Hub script-based
+             dataset to be executed locally on your machine. This option should only be set to `True` for repositories
+             where you have read the code and which you trust.
+
+             <Tip warning={true}>
+
+             `trust_remote_code` will default to False in the next major release.
+
+             </Tip>
+
+     Returns:
+         `huggingface_hub.CommitInfo`
+     """
+     print(f"{repo_id}")
+     configs = get_dataset_config_names(repo_id, token=token, revision=revision, trust_remote_code=trust_remote_code)
+     print(f"{configs = }")
+     default_config = get_dataset_default_config_name(
+         repo_id, token=token, revision=revision, trust_remote_code=trust_remote_code
+     )
+     print(f"{default_config = }")
+     if default_config:
+         config = default_config
+         configs.remove(default_config)
+     else:
+         config = configs.pop(0)
+     print(f"{config = }")
+     dataset = load_dataset(repo_id, config, revision=revision, trust_remote_code=trust_remote_code)
+     commit_info = dataset.push_to_hub(
+         repo_id,
+         config_name=config,
+         commit_message="Convert dataset to Parquet",
+         commit_description="Convert dataset to Parquet.",
+         create_pr=True,
+         token=token,
+         set_default=default_config is not None,
+     )
+     time.sleep(5)
+     pr_revision, pr_url = commit_info.pr_revision, commit_info.pr_url
+     for config in configs:
+         print(f"{config = }")
+         dataset = load_dataset(repo_id, config, revision=revision, trust_remote_code=trust_remote_code)
+         dataset.push_to_hub(
+             repo_id,
+             config_name=config,
+             commit_message=f"Add '{config}' config data files",
+             revision=pr_revision,
+             token=token,
+         )
+         time.sleep(5)
+     _delete_files(repo_id, revision=pr_revision, token=token)
+     if not revision:
+         api = HfApi(endpoint=datasets.config.HF_ENDPOINT, token=token)
+         try:
+             api.create_branch(repo_id, branch="script", repo_type="dataset", token=token, exist_ok=True)
+         except HfHubHTTPError:
+             pass
+     print(f"You can find your PR to convert the dataset to Parquet at: {pr_url}")
+     return commit_info
+
+
+ def delete_from_hub(
+     repo_id: str,
+     config_name: str,
+     revision: Optional[str] = None,
+     token: Optional[Union[bool, str]] = None,
+ ) -> CommitInfo:
+     """Delete a dataset configuration from a [data-only dataset](repository_structure) on the Hub.
+
+     Args:
+         repo_id (`str`): ID of the Hub dataset repository, in the following format: `<user>/<dataset_name>` or
+             `<org>/<dataset_name>`.
+         config_name (`str`): Name of the dataset configuration.
+         revision (`str`, *optional*): Branch to delete the configuration from. Defaults to the `"main"` branch.
+         token (`bool` or `str`, *optional*): Authentication token for the Hugging Face Hub.
+
+     Returns:
+         `huggingface_hub.CommitInfo`
+     """
+     operations = []
+     # data_files
+     fs = HfFileSystem(endpoint=datasets.config.HF_ENDPOINT, token=token)
+     builder = load_dataset_builder(repo_id, config_name, revision=revision, token=token, trust_remote_code=False)
+     for data_file in chain(*builder.config.data_files.values()):
+         data_file_resolved_path = fs.resolve_path(data_file)
+         if data_file_resolved_path.repo_id == repo_id:
+             operations.append(CommitOperationDelete(path_in_repo=data_file_resolved_path.path_in_repo))
+     # README.md
+     dataset_card = DatasetCard.load(repo_id)
+     # config_names
+     if dataset_card.data.get("config_names", None) and config_name in dataset_card.data["config_names"]:
+         dataset_card.data["config_names"].remove(config_name)
+     # metadata_configs
+     metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card.data)
+     if metadata_configs:
+         _ = metadata_configs.pop(config_name, None)
+         dataset_card_data = DatasetCardData()
+         metadata_configs.to_dataset_card_data(dataset_card_data)
+         if datasets.config.METADATA_CONFIGS_FIELD in dataset_card_data:
+             dataset_card.data[datasets.config.METADATA_CONFIGS_FIELD] = dataset_card_data[
+                 datasets.config.METADATA_CONFIGS_FIELD
+             ]
+         else:
+             _ = dataset_card.data.pop(datasets.config.METADATA_CONFIGS_FIELD, None)
+     # dataset_info
+     dataset_infos: DatasetInfosDict = DatasetInfosDict.from_dataset_card_data(dataset_card.data)
+     if dataset_infos:
+         _ = dataset_infos.pop(config_name, None)
+         dataset_card_data = DatasetCardData()
+         dataset_infos.to_dataset_card_data(dataset_card_data)
+         if "dataset_info" in dataset_card_data:
+             dataset_card.data["dataset_info"] = dataset_card_data["dataset_info"]
+         else:
+             _ = dataset_card.data.pop("dataset_info", None)
+     # Commit
+     operations.append(
+         CommitOperationAdd(path_in_repo=datasets.config.REPOCARD_FILENAME, path_or_fileobj=str(dataset_card).encode())
+     )
+     api = HfApi(endpoint=datasets.config.HF_ENDPOINT, token=token)
+     commit_info = api.create_commit(
+         repo_id,
+         operations=operations,
+         commit_message=f"Delete '{config_name}' config",
+         commit_description=f"Delete '{config_name}' config.",
+         token=token,
+         repo_type="dataset",
+         revision=revision,
+         create_pr=True,
+     )
+     print(f"You can find your PR to delete the dataset config at: {commit_info.pr_url}")
+     return commit_info
+
+
+ def _delete_files(dataset_id, revision=None, token=None):
+     dataset_name = dataset_id.split("/")[-1]
+     hf_api = HfApi(endpoint=datasets.config.HF_ENDPOINT, token=token)
+     repo_files = hf_api.list_repo_files(
+         dataset_id,
+         repo_type="dataset",
+     )
+     if repo_files:
+         legacy_json_file = []
+         python_files = []
+         data_files = []
+         for filename in repo_files:
+             if filename in {".gitattributes", "README.md"}:
+                 continue
+             elif filename == f"{dataset_name}.py":
+                 hf_api.delete_file(
+                     filename,
+                     dataset_id,
+                     repo_type="dataset",
+                     revision=revision,
+                     commit_message="Delete loading script",
+                 )
+             elif filename == "dataset_infos.json":
+                 legacy_json_file.append(filename)
+             elif filename.endswith(".py"):
+                 python_files.append(filename)
+             else:
+                 data_files.append(filename)
+         if legacy_json_file:
+             hf_api.delete_file(
+                 "dataset_infos.json",
+                 dataset_id,
+                 repo_type="dataset",
+                 revision=revision,
+                 commit_message="Delete legacy dataset_infos.json",
+             )
+         if python_files:
+             for filename in python_files:
+                 hf_api.delete_file(
+                     filename,
+                     dataset_id,
+                     repo_type="dataset",
+                     revision=revision,
+                     commit_message="Delete loading script auxiliary file",
+                 )
+         if data_files:
+             for filename in data_files:
+                 hf_api.delete_file(
+                     filename,
+                     dataset_id,
+                     repo_type="dataset",
+                     revision=revision,
+                     commit_message="Delete data file",
+                 )
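
Both helpers above open pull requests against a Hub repository, so they need a token with write access and will actually modify the target repo. A minimal sketch of how they might be called (not part of the uploaded file; `username/my_script_dataset` is a hypothetical repository id):

```py
# Hypothetical usage sketch of the Hub maintenance helpers.
from datasets.hub import convert_to_parquet, delete_from_hub

# Opens a PR that replaces the loading script with Parquet data files and
# backs the script up on a "script" branch. trust_remote_code=True is required
# because the script is executed locally to generate the data.
commit_info = convert_to_parquet("username/my_script_dataset", trust_remote_code=True)
print(commit_info.pr_url)

# Opens a PR that removes one configuration's data files and card metadata.
delete_from_hub("username/my_script_dataset", config_name="legacy")
```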
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/info.py ADDED
@@ -0,0 +1,593 @@
+ # Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Lint as: python3
16
+ """DatasetInfo and MetricInfo record information we know about a dataset and a metric.
17
+
18
+ This includes things that we know about the dataset statically, i.e.:
19
+ - description
20
+ - canonical location
21
+ - does it have validation and tests splits
22
+ - size
23
+ - etc.
24
+
25
+ This also includes the things that can and should be computed once we've
26
+ processed the dataset as well:
27
+ - number of examples (in each split)
28
+ - etc.
29
+ """
30
+
31
+ import copy
32
+ import dataclasses
33
+ import json
34
+ import os
35
+ import posixpath
36
+ import warnings
37
+ from dataclasses import dataclass
38
+ from pathlib import Path
39
+ from typing import ClassVar, Dict, List, Optional, Union
40
+
41
+ import fsspec
42
+ from fsspec.core import url_to_fs
43
+ from huggingface_hub import DatasetCard, DatasetCardData
44
+
45
+ from . import config
46
+ from .features import Features, Value
47
+ from .splits import SplitDict
48
+ from .tasks import TaskTemplate, task_template_from_dict
49
+ from .utils import Version
50
+ from .utils.logging import get_logger
51
+ from .utils.py_utils import asdict, unique_values
52
+
53
+
54
+ logger = get_logger(__name__)
55
+
56
+
57
+ @dataclass
58
+ class SupervisedKeysData:
59
+ input: str = ""
60
+ output: str = ""
61
+
62
+
63
+ @dataclass
64
+ class DownloadChecksumsEntryData:
65
+ key: str = ""
66
+ value: str = ""
67
+
68
+
69
+ class MissingCachedSizesConfigError(Exception):
70
+ """The expected cached sizes of the download file are missing."""
71
+
72
+
73
+ class NonMatchingCachedSizesError(Exception):
74
+ """The prepared split doesn't have expected sizes."""
75
+
76
+
77
+ @dataclass
78
+ class PostProcessedInfo:
79
+ features: Optional[Features] = None
80
+ resources_checksums: Optional[dict] = None
81
+
82
+ def __post_init__(self):
83
+ # Convert back to the correct classes when we reload from dict
84
+ if self.features is not None and not isinstance(self.features, Features):
85
+ self.features = Features.from_dict(self.features)
86
+
87
+ @classmethod
88
+ def from_dict(cls, post_processed_info_dict: dict) -> "PostProcessedInfo":
89
+ field_names = {f.name for f in dataclasses.fields(cls)}
90
+ return cls(**{k: v for k, v in post_processed_info_dict.items() if k in field_names})
91
+
92
+
93
+ @dataclass
94
+ class DatasetInfo:
95
+ """Information about a dataset.
96
+
97
+ `DatasetInfo` documents datasets, including its name, version, and features.
98
+ See the constructor arguments and properties for a full list.
99
+
100
+ Not all fields are known on construction and may be updated later.
101
+
102
+ Attributes:
103
+ description (`str`):
104
+ A description of the dataset.
105
+ citation (`str`):
106
+ A BibTeX citation of the dataset.
107
+ homepage (`str`):
108
+ A URL to the official homepage for the dataset.
109
+ license (`str`):
110
+ The dataset's license. It can be the name of the license or a paragraph containing the terms of the license.
111
+ features ([`Features`], *optional*):
112
+ The features used to specify the dataset's column types.
113
+ post_processed (`PostProcessedInfo`, *optional*):
114
+ Information regarding the resources of a possible post-processing of a dataset. For example, it can contain the information of an index.
115
+ supervised_keys (`SupervisedKeysData`, *optional*):
116
+ Specifies the input feature and the label for supervised learning if applicable for the dataset (legacy from TFDS).
117
+ builder_name (`str`, *optional*):
118
+ The name of the `GeneratorBasedBuilder` subclass used to create the dataset. Usually matched to the corresponding script name. It is also the snake_case version of the dataset builder class name.
119
+ config_name (`str`, *optional*):
120
+ The name of the configuration derived from [`BuilderConfig`].
121
+ version (`str` or [`Version`], *optional*):
122
+ The version of the dataset.
123
+ splits (`dict`, *optional*):
124
+ The mapping between split name and metadata.
125
+ download_checksums (`dict`, *optional*):
126
+ The mapping between the URL to download the dataset's checksums and corresponding metadata.
127
+ download_size (`int`, *optional*):
128
+ The size of the files to download to generate the dataset, in bytes.
129
+ post_processing_size (`int`, *optional*):
130
+ Size of the dataset in bytes after post-processing, if any.
131
+ dataset_size (`int`, *optional*):
132
+ The combined size in bytes of the Arrow tables for all splits.
133
+ size_in_bytes (`int`, *optional*):
134
+ The combined size in bytes of all files associated with the dataset (downloaded files + Arrow files).
135
+ task_templates (`List[TaskTemplate]`, *optional*):
136
+ The task templates to prepare the dataset for during training and evaluation. Each template casts the dataset's [`Features`] to standardized column names and types as detailed in `datasets.tasks`.
137
+ **config_kwargs (additional keyword arguments):
138
+ Keyword arguments to be passed to the [`BuilderConfig`] and used in the [`DatasetBuilder`].
139
+ """
140
+
141
+ # Set in the dataset scripts
142
+ description: str = dataclasses.field(default_factory=str)
143
+ citation: str = dataclasses.field(default_factory=str)
144
+ homepage: str = dataclasses.field(default_factory=str)
145
+ license: str = dataclasses.field(default_factory=str)
146
+ features: Optional[Features] = None
147
+ post_processed: Optional[PostProcessedInfo] = None
148
+ supervised_keys: Optional[SupervisedKeysData] = None
149
+ task_templates: Optional[List[TaskTemplate]] = None
150
+
151
+ # Set later by the builder
152
+ builder_name: Optional[str] = None
153
+ dataset_name: Optional[str] = None # for packaged builders, to be different from builder_name
154
+ config_name: Optional[str] = None
155
+ version: Optional[Union[str, Version]] = None
156
+ # Set later by `download_and_prepare`
157
+ splits: Optional[dict] = None
158
+ download_checksums: Optional[dict] = None
159
+ download_size: Optional[int] = None
160
+ post_processing_size: Optional[int] = None
161
+ dataset_size: Optional[int] = None
162
+ size_in_bytes: Optional[int] = None
163
+
164
+ _INCLUDED_INFO_IN_YAML: ClassVar[List[str]] = [
165
+ "config_name",
166
+ "download_size",
167
+ "dataset_size",
168
+ "features",
169
+ "splits",
170
+ ]
171
+
172
+ def __post_init__(self):
173
+ # Convert back to the correct classes when we reload from dict
174
+ if self.features is not None and not isinstance(self.features, Features):
175
+ self.features = Features.from_dict(self.features)
176
+ if self.post_processed is not None and not isinstance(self.post_processed, PostProcessedInfo):
177
+ self.post_processed = PostProcessedInfo.from_dict(self.post_processed)
178
+ if self.version is not None and not isinstance(self.version, Version):
179
+ if isinstance(self.version, str):
180
+ self.version = Version(self.version)
181
+ else:
182
+ self.version = Version.from_dict(self.version)
183
+ if self.splits is not None and not isinstance(self.splits, SplitDict):
184
+ self.splits = SplitDict.from_split_dict(self.splits)
185
+ if self.supervised_keys is not None and not isinstance(self.supervised_keys, SupervisedKeysData):
186
+ if isinstance(self.supervised_keys, (tuple, list)):
187
+ self.supervised_keys = SupervisedKeysData(*self.supervised_keys)
188
+ else:
189
+ self.supervised_keys = SupervisedKeysData(**self.supervised_keys)
190
+
191
+ # Parse and make a list of templates
192
+ if self.task_templates is not None:
193
+ if isinstance(self.task_templates, (list, tuple)):
194
+ templates = [
195
+ template if isinstance(template, TaskTemplate) else task_template_from_dict(template)
196
+ for template in self.task_templates
197
+ ]
198
+ self.task_templates = [template for template in templates if template is not None]
199
+ elif isinstance(self.task_templates, TaskTemplate):
200
+ self.task_templates = [self.task_templates]
201
+ else:
202
+ template = task_template_from_dict(self.task_templates)
203
+ self.task_templates = [template] if template is not None else []
204
+
205
+ # Align task templates with features
206
+ if self.task_templates is not None:
207
+ self.task_templates = list(self.task_templates)
208
+ if self.features is not None:
209
+ self.task_templates = [
210
+ template.align_with_features(self.features) for template in (self.task_templates)
211
+ ]
212
+
213
+ def write_to_directory(
214
+ self, dataset_info_dir, pretty_print=False, fs="deprecated", storage_options: Optional[dict] = None
215
+ ):
216
+ """Write `DatasetInfo` and license (if present) as JSON files to `dataset_info_dir`.
217
+
218
+ Args:
219
+ dataset_info_dir (`str`):
220
+ Destination directory.
221
+ pretty_print (`bool`, defaults to `False`):
222
+ If `True`, the JSON will be pretty-printed with the indent level of 4.
223
+ fs (`fsspec.spec.AbstractFileSystem`, *optional*):
224
+ Instance of the remote filesystem used to download the files from.
225
+
226
+ <Deprecated version="2.9.0">
227
+
228
+ `fs` was deprecated in version 2.9.0 and will be removed in 3.0.0.
229
+ Please use `storage_options` instead, e.g. `storage_options=fs.storage_options`.
230
+
231
+ </Deprecated>
232
+
233
+ storage_options (`dict`, *optional*):
234
+ Key/value pairs to be passed on to the file-system backend, if any.
235
+
236
+ <Added version="2.9.0"/>
237
+
238
+ Example:
239
+
240
+ ```py
241
+ >>> from datasets import load_dataset
242
+ >>> ds = load_dataset("rotten_tomatoes", split="validation")
243
+ >>> ds.info.write_to_directory("/path/to/directory/")
244
+ ```
245
+ """
246
+ if fs != "deprecated":
247
+ warnings.warn(
248
+ "'fs' was deprecated in favor of 'storage_options' in version 2.9.0 and will be removed in 3.0.0.\n"
249
+ "You can remove this warning by passing 'storage_options=fs.storage_options' instead.",
250
+ FutureWarning,
251
+ )
252
+ storage_options = fs.storage_options
253
+
254
+ fs: fsspec.AbstractFileSystem
255
+ fs, *_ = url_to_fs(dataset_info_dir, **(storage_options or {}))
256
+ with fs.open(posixpath.join(dataset_info_dir, config.DATASET_INFO_FILENAME), "wb") as f:
257
+ self._dump_info(f, pretty_print=pretty_print)
258
+ if self.license:
259
+ with fs.open(posixpath.join(dataset_info_dir, config.LICENSE_FILENAME), "wb") as f:
260
+ self._dump_license(f)
261
+
262
+ def _dump_info(self, file, pretty_print=False):
263
+ """Dump info in `file` file-like object open in bytes mode (to support remote files)"""
264
+ file.write(json.dumps(asdict(self), indent=4 if pretty_print else None).encode("utf-8"))
265
+
266
+ def _dump_license(self, file):
267
+ """Dump license in `file` file-like object open in bytes mode (to support remote files)"""
268
+ file.write(self.license.encode("utf-8"))
269
+
270
+ @classmethod
271
+ def from_merge(cls, dataset_infos: List["DatasetInfo"]):
272
+ dataset_infos = [dset_info.copy() for dset_info in dataset_infos if dset_info is not None]
273
+
274
+ if len(dataset_infos) > 0 and all(dataset_infos[0] == dset_info for dset_info in dataset_infos):
275
+ # if all dataset_infos are equal we don't need to merge. Just return the first.
276
+ return dataset_infos[0]
277
+
278
+ description = "\n\n".join(unique_values(info.description for info in dataset_infos)).strip()
279
+ citation = "\n\n".join(unique_values(info.citation for info in dataset_infos)).strip()
280
+ homepage = "\n\n".join(unique_values(info.homepage for info in dataset_infos)).strip()
281
+ license = "\n\n".join(unique_values(info.license for info in dataset_infos)).strip()
282
+ features = None
283
+ supervised_keys = None
284
+ task_templates = None
285
+
286
+ # Find common task templates across all dataset infos
287
+ all_task_templates = [info.task_templates for info in dataset_infos if info.task_templates is not None]
288
+ if len(all_task_templates) > 1:
289
+ task_templates = list(set(all_task_templates[0]).intersection(*all_task_templates[1:]))
290
+ elif len(all_task_templates):
291
+ task_templates = list(set(all_task_templates[0]))
292
+ # If no common task templates found, replace empty list with None
293
+ task_templates = task_templates if task_templates else None
294
+
295
+ return cls(
296
+ description=description,
297
+ citation=citation,
298
+ homepage=homepage,
299
+ license=license,
300
+ features=features,
301
+ supervised_keys=supervised_keys,
302
+ task_templates=task_templates,
303
+ )
304
+
305
+ @classmethod
306
+ def from_directory(
307
+ cls, dataset_info_dir: str, fs="deprecated", storage_options: Optional[dict] = None
308
+ ) -> "DatasetInfo":
309
+ """Create [`DatasetInfo`] from the JSON file in `dataset_info_dir`.
310
+
311
+ This function updates all the dynamically generated fields (num_examples,
312
+ hash, time of creation,...) of the [`DatasetInfo`].
313
+
314
+ This will overwrite all previous metadata.
315
+
316
+ Args:
317
+ dataset_info_dir (`str`):
318
+ The directory containing the metadata file. This
319
+ should be the root directory of a specific dataset version.
320
+ fs (`fsspec.spec.AbstractFileSystem`, *optional*):
321
+ Instance of the remote filesystem used to download the files from.
322
+
323
+ <Deprecated version="2.9.0">
324
+
325
+ `fs` was deprecated in version 2.9.0 and will be removed in 3.0.0.
326
+ Please use `storage_options` instead, e.g. `storage_options=fs.storage_options`.
327
+
328
+ </Deprecated>
329
+
330
+ storage_options (`dict`, *optional*):
331
+ Key/value pairs to be passed on to the file-system backend, if any.
332
+
333
+ <Added version="2.9.0"/>
334
+
335
+ Example:
336
+
337
+ ```py
338
+ >>> from datasets import DatasetInfo
339
+ >>> ds_info = DatasetInfo.from_directory("/path/to/directory/")
340
+ ```
341
+ """
342
+ if fs != "deprecated":
343
+ warnings.warn(
344
+ "'fs' was deprecated in favor of 'storage_options' in version 2.9.0 and will be removed in 3.0.0.\n"
345
+ "You can remove this warning by passing 'storage_options=fs.storage_options' instead.",
346
+ FutureWarning,
347
+ )
348
+ storage_options = fs.storage_options
349
+
350
+ fs: fsspec.AbstractFileSystem
351
+ fs, *_ = url_to_fs(dataset_info_dir, **(storage_options or {}))
352
+ logger.info(f"Loading Dataset info from {dataset_info_dir}")
353
+ if not dataset_info_dir:
354
+ raise ValueError("Calling DatasetInfo.from_directory() with undefined dataset_info_dir.")
355
+ with fs.open(posixpath.join(dataset_info_dir, config.DATASET_INFO_FILENAME), "r", encoding="utf-8") as f:
356
+ dataset_info_dict = json.load(f)
357
+ return cls.from_dict(dataset_info_dict)
358
+
359
+ @classmethod
360
+ def from_dict(cls, dataset_info_dict: dict) -> "DatasetInfo":
361
+ field_names = {f.name for f in dataclasses.fields(cls)}
362
+ return cls(**{k: v for k, v in dataset_info_dict.items() if k in field_names})
363
+
364
+ def update(self, other_dataset_info: "DatasetInfo", ignore_none=True):
365
+ self_dict = self.__dict__
366
+ self_dict.update(
367
+ **{
368
+ k: copy.deepcopy(v)
369
+ for k, v in other_dataset_info.__dict__.items()
370
+ if (v is not None or not ignore_none)
371
+ }
372
+ )
373
+
374
+ def copy(self) -> "DatasetInfo":
375
+ return self.__class__(**{k: copy.deepcopy(v) for k, v in self.__dict__.items()})
376
+
377
+ def _to_yaml_dict(self) -> dict:
378
+ yaml_dict = {}
379
+ dataset_info_dict = asdict(self)
380
+ for key in dataset_info_dict:
381
+ if key in self._INCLUDED_INFO_IN_YAML:
382
+ value = getattr(self, key)
383
+ if hasattr(value, "_to_yaml_list"): # Features, SplitDict
384
+ yaml_dict[key] = value._to_yaml_list()
385
+ elif hasattr(value, "_to_yaml_string"): # Version
386
+ yaml_dict[key] = value._to_yaml_string()
387
+ else:
388
+ yaml_dict[key] = value
389
+ return yaml_dict
390
+
391
+ @classmethod
392
+ def _from_yaml_dict(cls, yaml_data: dict) -> "DatasetInfo":
393
+ yaml_data = copy.deepcopy(yaml_data)
394
+ if yaml_data.get("features") is not None:
395
+ yaml_data["features"] = Features._from_yaml_list(yaml_data["features"])
396
+ if yaml_data.get("splits") is not None:
397
+ yaml_data["splits"] = SplitDict._from_yaml_list(yaml_data["splits"])
398
+ field_names = {f.name for f in dataclasses.fields(cls)}
399
+ return cls(**{k: v for k, v in yaml_data.items() if k in field_names})
400
+
401
+
402
+ class DatasetInfosDict(Dict[str, DatasetInfo]):
403
+ def write_to_directory(self, dataset_infos_dir, overwrite=False, pretty_print=False) -> None:
404
+ total_dataset_infos = {}
405
+ dataset_infos_path = os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME)
406
+ dataset_readme_path = os.path.join(dataset_infos_dir, config.REPOCARD_FILENAME)
407
+ if not overwrite:
408
+ total_dataset_infos = self.from_directory(dataset_infos_dir)
409
+ total_dataset_infos.update(self)
410
+ if os.path.exists(dataset_infos_path):
411
+ # for backward compatibility, let's update the JSON file if it exists
412
+ with open(dataset_infos_path, "w", encoding="utf-8") as f:
413
+ dataset_infos_dict = {
414
+ config_name: asdict(dset_info) for config_name, dset_info in total_dataset_infos.items()
415
+ }
416
+ json.dump(dataset_infos_dict, f, indent=4 if pretty_print else None)
417
+ # Dump the infos in the YAML part of the README.md file
418
+ if os.path.exists(dataset_readme_path):
419
+ dataset_card = DatasetCard.load(dataset_readme_path)
420
+ dataset_card_data = dataset_card.data
421
+ else:
422
+ dataset_card = None
423
+ dataset_card_data = DatasetCardData()
424
+ if total_dataset_infos:
425
+ total_dataset_infos.to_dataset_card_data(dataset_card_data)
426
+ dataset_card = (
427
+ DatasetCard("---\n" + str(dataset_card_data) + "\n---\n") if dataset_card is None else dataset_card
428
+ )
429
+ dataset_card.save(Path(dataset_readme_path))
430
+
431
+ @classmethod
432
+ def from_directory(cls, dataset_infos_dir) -> "DatasetInfosDict":
433
+ logger.info(f"Loading Dataset Infos from {dataset_infos_dir}")
434
+ # Load the info from the YAML part of README.md
435
+ if os.path.exists(os.path.join(dataset_infos_dir, config.REPOCARD_FILENAME)):
436
+ dataset_card_data = DatasetCard.load(Path(dataset_infos_dir) / config.REPOCARD_FILENAME).data
437
+ if "dataset_info" in dataset_card_data:
438
+ return cls.from_dataset_card_data(dataset_card_data)
439
+ if os.path.exists(os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME)):
440
+ # this is just to have backward compatibility with dataset_infos.json files
441
+ with open(os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME), encoding="utf-8") as f:
442
+ return cls(
443
+ {
444
+ config_name: DatasetInfo.from_dict(dataset_info_dict)
445
+ for config_name, dataset_info_dict in json.load(f).items()
446
+ }
447
+ )
448
+ else:
449
+ return cls()
450
+
451
+ @classmethod
452
+ def from_dataset_card_data(cls, dataset_card_data: DatasetCardData) -> "DatasetInfosDict":
453
+ if isinstance(dataset_card_data.get("dataset_info"), (list, dict)):
454
+ if isinstance(dataset_card_data["dataset_info"], list):
455
+ return cls(
456
+ {
457
+ dataset_info_yaml_dict.get("config_name", "default"): DatasetInfo._from_yaml_dict(
458
+ dataset_info_yaml_dict
459
+ )
460
+ for dataset_info_yaml_dict in dataset_card_data["dataset_info"]
461
+ }
462
+ )
463
+ else:
464
+ dataset_info = DatasetInfo._from_yaml_dict(dataset_card_data["dataset_info"])
465
+ dataset_info.config_name = dataset_card_data["dataset_info"].get("config_name", "default")
466
+ return cls({dataset_info.config_name: dataset_info})
467
+ else:
468
+ return cls()
469
+
470
+ def to_dataset_card_data(self, dataset_card_data: DatasetCardData) -> None:
471
+ if self:
472
+ # first get existing metadata info
473
+ if "dataset_info" in dataset_card_data and isinstance(dataset_card_data["dataset_info"], dict):
474
+ dataset_metadata_infos = {
475
+ dataset_card_data["dataset_info"].get("config_name", "default"): dataset_card_data["dataset_info"]
476
+ }
477
+ elif "dataset_info" in dataset_card_data and isinstance(dataset_card_data["dataset_info"], list):
478
+ dataset_metadata_infos = {
479
+ config_metadata["config_name"]: config_metadata
480
+ for config_metadata in dataset_card_data["dataset_info"]
481
+ }
482
+ else:
483
+ dataset_metadata_infos = {}
484
+ # update/rewrite existing metadata info with the one to dump
485
+ total_dataset_infos = {
486
+ **dataset_metadata_infos,
487
+ **{config_name: dset_info._to_yaml_dict() for config_name, dset_info in self.items()},
488
+ }
489
+ # the config_name from the dataset_infos_dict takes over the config_name of the DatasetInfo
490
+ for config_name, dset_info_yaml_dict in total_dataset_infos.items():
491
+ dset_info_yaml_dict["config_name"] = config_name
492
+ if len(total_dataset_infos) == 1:
493
+ # use a struct instead of a list of configurations, since there's only one
494
+ dataset_card_data["dataset_info"] = next(iter(total_dataset_infos.values()))
495
+ config_name = dataset_card_data["dataset_info"].pop("config_name", None)
496
+ if config_name != "default":
497
+ # if config_name is not "default" preserve it and put at the first position
498
+ dataset_card_data["dataset_info"] = {
499
+ "config_name": config_name,
500
+ **dataset_card_data["dataset_info"],
501
+ }
502
+ else:
503
+ dataset_card_data["dataset_info"] = []
504
+ for config_name, dataset_info_yaml_dict in sorted(total_dataset_infos.items()):
505
+ # add the config_name field in first position
506
+ dataset_info_yaml_dict.pop("config_name", None)
507
+ dataset_info_yaml_dict = {"config_name": config_name, **dataset_info_yaml_dict}
508
+ dataset_card_data["dataset_info"].append(dataset_info_yaml_dict)
509
+
510
+
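To illustrate the struct-vs-list behavior implemented above, a hedged sketch (config names are made up; it assumes `huggingface_hub`'s `DatasetCardData`):

```py
from datasets import DatasetInfo
from datasets.info import DatasetInfosDict
from huggingface_hub import DatasetCardData

# One config: stored as a single mapping (struct).
card = DatasetCardData()
DatasetInfosDict({"default": DatasetInfo()}).to_dataset_card_data(card)
print(type(card["dataset_info"]))  # <class 'dict'>

# Several configs: stored as a list, each entry led by its config_name.
card = DatasetCardData()
DatasetInfosDict({"en": DatasetInfo(), "fr": DatasetInfo()}).to_dataset_card_data(card)
print(type(card["dataset_info"]))  # <class 'list'>
```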
511
+ @dataclass
512
+ class MetricInfo:
513
+ """Information about a metric.
514
+
515
+ `MetricInfo` documents a metric, including its name, version, and features.
516
+ See the constructor arguments and properties for a full list.
517
+
518
+ Note: Not all fields are known on construction and may be updated later.
519
+ """
520
+
521
+ # Set in the dataset scripts
522
+ description: str
523
+ citation: str
524
+ features: Features
525
+ inputs_description: str = dataclasses.field(default_factory=str)
526
+ homepage: str = dataclasses.field(default_factory=str)
527
+ license: str = dataclasses.field(default_factory=str)
528
+ codebase_urls: List[str] = dataclasses.field(default_factory=list)
529
+ reference_urls: List[str] = dataclasses.field(default_factory=list)
530
+ streamable: bool = False
531
+ format: Optional[str] = None
532
+
533
+ # Set later by the builder
534
+ metric_name: Optional[str] = None
535
+ config_name: Optional[str] = None
536
+ experiment_id: Optional[str] = None
537
+
538
+ def __post_init__(self):
539
+ if self.format is not None:
540
+ for key, value in self.features.items():
541
+ if not isinstance(value, Value):
542
+ raise ValueError(
543
+ f"When using 'numpy' format, all features should be a `datasets.Value` feature. "
544
+ f"Here {key} is an instance of {value.__class__.__name__}"
545
+ )
546
+
547
+ def write_to_directory(self, metric_info_dir, pretty_print=False):
548
+ """Write `MetricInfo` as JSON to `metric_info_dir`.
549
+ Also save the license separately in LICENCE.
550
+ If `pretty_print` is True, the JSON will be pretty-printed with the indent level of 4.
551
+
552
+ Example:
553
+
554
+ ```py
555
+ >>> from datasets import load_metric
556
+ >>> metric = load_metric("accuracy")
557
+ >>> metric.info.write_to_directory("/path/to/directory/")
558
+ ```
559
+ """
560
+ with open(os.path.join(metric_info_dir, config.METRIC_INFO_FILENAME), "w", encoding="utf-8") as f:
561
+ json.dump(asdict(self), f, indent=4 if pretty_print else None)
562
+
563
+ if self.license:
564
+ with open(os.path.join(metric_info_dir, config.LICENSE_FILENAME), "w", encoding="utf-8") as f:
565
+ f.write(self.license)
566
+
567
+ @classmethod
568
+ def from_directory(cls, metric_info_dir) -> "MetricInfo":
569
+ """Create MetricInfo from the JSON file in `metric_info_dir`.
570
+
571
+ Args:
572
+ metric_info_dir: `str` The directory containing the metadata file. This
573
+ should be the root directory of a specific metric version.
574
+
575
+ Example:
576
+
577
+ ```py
578
+ >>> from datasets import MetricInfo
579
+ >>> metric_info = MetricInfo.from_directory("/path/to/directory/")
580
+ ```
581
+ """
582
+ logger.info(f"Loading Metric info from {metric_info_dir}")
583
+ if not metric_info_dir:
584
+ raise ValueError("Calling MetricInfo.from_directory() with undefined metric_info_dir.")
585
+
586
+ with open(os.path.join(metric_info_dir, config.METRIC_INFO_FILENAME), encoding="utf-8") as f:
587
+ metric_info_dict = json.load(f)
588
+ return cls.from_dict(metric_info_dict)
589
+
590
+ @classmethod
591
+ def from_dict(cls, metric_info_dict: dict) -> "MetricInfo":
592
+ field_names = {f.name for f in dataclasses.fields(cls)}
593
+ return cls(**{k: v for k, v in metric_info_dict.items() if k in field_names})
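A hedged round-trip sketch for `MetricInfo` (feature names, strings, and the temporary directory are illustrative):

```py
# Writes metric_info.json (and a LICENSE file), then reads it back.
import tempfile

from datasets import Features, MetricInfo, Value

info = MetricInfo(
    description="toy accuracy metric",
    citation="n/a",
    features=Features({"predictions": Value("int32"), "references": Value("int32")}),
    license="apache-2.0",
)
with tempfile.TemporaryDirectory() as tmp:
    info.write_to_directory(tmp, pretty_print=True)
    restored = MetricInfo.from_directory(tmp)
assert restored.description == info.description
```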
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/iterable_dataset.py ADDED
The diff for this file is too large to render. See raw diff
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/metric.py ADDED
@@ -0,0 +1,652 @@
+ # Copyright 2020 The HuggingFace Datasets Authors
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Lint as: python3
16
+ """Metrics base class."""
17
+
18
+ import os
19
+ import types
20
+ import uuid
21
+ from typing import Any, Dict, List, Optional, Tuple, Union
22
+
23
+ import numpy as np
24
+ import pyarrow as pa
25
+ from filelock import BaseFileLock, Timeout
26
+
27
+ from . import config
28
+ from .arrow_dataset import Dataset
29
+ from .arrow_reader import ArrowReader
30
+ from .arrow_writer import ArrowWriter
31
+ from .download.download_config import DownloadConfig
32
+ from .download.download_manager import DownloadManager
33
+ from .features import Features
34
+ from .info import DatasetInfo, MetricInfo
35
+ from .naming import camelcase_to_snakecase
36
+ from .utils._filelock import FileLock
37
+ from .utils.deprecation_utils import deprecated
38
+ from .utils.logging import get_logger
39
+ from .utils.py_utils import copyfunc, temp_seed
40
+
41
+
42
+ logger = get_logger(__name__)
43
+
44
+
45
+ class FileFreeLock(BaseFileLock):
46
+ """Thread lock until a file **cannot** be locked"""
47
+
48
+ def __init__(self, lock_file, *args, **kwargs):
49
+ self.filelock = FileLock(lock_file)
50
+ super().__init__(self.filelock.lock_file, *args, **kwargs)
51
+
52
+ def _acquire(self):
53
+ try:
54
+ self.filelock.acquire(timeout=0.01, poll_intervall=0.02) # Try to lock once
55
+ except Timeout:
56
+ # We couldn't acquire the lock, the file is locked!
57
+ self._context.lock_file_fd = self.filelock.lock_file
58
+ else:
59
+ # We were able to acquire the lock, the file is not yet locked!
60
+ self.filelock.release()
61
+ self._context.lock_file_fd = None
62
+
63
+ def _release(self):
64
+ self._context.lock_file_fd = None
65
+
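For intuition, a hedged sketch of the inverse semantics (Unix-style file locking and a placeholder lock file assumed): a `FileFreeLock` can be acquired only while some other `FileLock` holds the file.

```py
from filelock import FileLock

real_lock = FileLock("example.arrow.lock")  # placeholder lock file
real_lock.acquire()
try:
    probe = FileFreeLock("example.arrow.lock")
    probe.acquire(timeout=1)  # succeeds: the file *is* locked elsewhere
    probe.release()
finally:
    real_lock.release()
```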
66
+
67
+ # lists - summarize long lists similarly to NumPy
68
+ # arrays/tensors - let the frameworks control formatting
69
+ def summarize_if_long_list(obj):
70
+ if not type(obj) == list or len(obj) <= 6: # noqa: E721
71
+ return f"{obj}"
72
+
73
+ def format_chunk(chunk):
74
+ return ", ".join(repr(x) for x in chunk)
75
+
76
+ return f"[{format_chunk(obj[:3])}, ..., {format_chunk(obj[-3:])}]"
77
+
78
+
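For illustration, the helper truncates long plain lists and leaves everything else untouched (sample values are arbitrary):

```py
print(summarize_if_long_list(list(range(10))))  # [0, 1, 2, ..., 7, 8, 9]
print(summarize_if_long_list([1, 2, 3]))        # [1, 2, 3]
print(summarize_if_long_list((1, 2, 3, 4, 5, 6, 7)))  # tuples pass through as-is
```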
79
+ class MetricInfoMixin:
80
+ """This base class exposes some attributes of MetricInfo
81
+ at the base level of the Metric for easy access.
82
+
83
+ <Deprecated version="2.5.0">
84
+
85
+ Use the new library 🤗 Evaluate instead: https://huggingface.co/docs/evaluate
86
+
87
+ </Deprecated>
88
+
89
+ """
90
+
91
+ def __init__(self, info: MetricInfo):
92
+ self._metric_info = info
93
+
94
+ @property
95
+ def info(self):
96
+ """:class:`datasets.MetricInfo` object containing all the metadata in the metric."""
97
+ return self._metric_info
98
+
99
+ @property
100
+ def name(self) -> str:
101
+ return self._metric_info.metric_name
102
+
103
+ @property
104
+ def experiment_id(self) -> Optional[str]:
105
+ return self._metric_info.experiment_id
106
+
107
+ @property
108
+ def description(self) -> str:
109
+ return self._metric_info.description
110
+
111
+ @property
112
+ def citation(self) -> str:
113
+ return self._metric_info.citation
114
+
115
+ @property
116
+ def features(self) -> Features:
117
+ return self._metric_info.features
118
+
119
+ @property
120
+ def inputs_description(self) -> str:
121
+ return self._metric_info.inputs_description
122
+
123
+ @property
124
+ def homepage(self) -> Optional[str]:
125
+ return self._metric_info.homepage
126
+
127
+ @property
128
+ def license(self) -> str:
129
+ return self._metric_info.license
130
+
131
+ @property
132
+ def codebase_urls(self) -> Optional[List[str]]:
133
+ return self._metric_info.codebase_urls
134
+
135
+ @property
136
+ def reference_urls(self) -> Optional[List[str]]:
137
+ return self._metric_info.reference_urls
138
+
139
+ @property
140
+ def streamable(self) -> bool:
141
+ return self._metric_info.streamable
142
+
143
+ @property
144
+ def format(self) -> Optional[str]:
145
+ return self._metric_info.format
146
+
147
+
148
+ class Metric(MetricInfoMixin):
149
+ """A Metric is the base class and common API for all metrics.
150
+
151
+ <Deprecated version="2.5.0">
152
+
153
+ Use the new library 🤗 Evaluate instead: https://huggingface.co/docs/evaluate
154
+
155
+ </Deprecated>
156
+
157
+ Args:
158
+ config_name (``str``): This is used to define a hash specific to a metric computation script and prevents the metric's data
+ from being overridden when the metric loading script is modified.
+ keep_in_memory (:obj:`bool`): Keep all predictions and references in memory. Not possible in distributed settings.
+ cache_dir (``str``): Path to a directory in which temporary prediction/references data will be stored.
+ The data directory should be located on a shared file-system in distributed setups.
+ num_process (``int``): Specify the total number of nodes in a distributed setting.
+ This is useful to compute metrics in distributed setups (in particular non-additive metrics like F1).
+ process_id (``int``): Specify the id of the current process in a distributed setup (between 0 and num_process-1).
+ This is useful to compute metrics in distributed setups (in particular non-additive metrics like F1).
+ seed (:obj:`int`, optional): If specified, this will temporarily set numpy's random seed when :func:`datasets.Metric.compute` is run.
+ experiment_id (``str``): A specific experiment id. This is used if several distributed evaluations share the same file system.
+ This is useful to compute metrics in distributed setups (in particular non-additive metrics like F1).
+ max_concurrent_cache_files (``int``): Max number of concurrent metric cache files (default 10000).
+ timeout (``Union[int, float]``): Timeout in seconds for distributed setting synchronization.
172
+ """
173
+
174
+ @deprecated("Use the new library 🤗 Evaluate instead: https://huggingface.co/docs/evaluate")
175
+ def __init__(
176
+ self,
177
+ config_name: Optional[str] = None,
178
+ keep_in_memory: bool = False,
179
+ cache_dir: Optional[str] = None,
180
+ num_process: int = 1,
181
+ process_id: int = 0,
182
+ seed: Optional[int] = None,
183
+ experiment_id: Optional[str] = None,
184
+ max_concurrent_cache_files: int = 10000,
185
+ timeout: Union[int, float] = 100,
186
+ **kwargs,
187
+ ):
188
+ # prepare info
189
+ self.config_name = config_name or "default"
190
+ info = self._info()
191
+ info.metric_name = camelcase_to_snakecase(self.__class__.__name__)
192
+ info.config_name = self.config_name
193
+ info.experiment_id = experiment_id or "default_experiment"
194
+ MetricInfoMixin.__init__(self, info) # For easy access on low level
195
+
196
+ # Safety checks on num_process and process_id
197
+ if not isinstance(process_id, int) or process_id < 0:
198
+ raise ValueError("'process_id' should be a number greater than 0")
199
+ if not isinstance(num_process, int) or num_process <= process_id:
200
+ raise ValueError("'num_process' should be a number greater than process_id")
201
+ if keep_in_memory and num_process != 1:
202
+ raise ValueError("Using 'keep_in_memory' is not possible in distributed setting (num_process > 1).")
203
+
204
+ self.num_process = num_process
205
+ self.process_id = process_id
206
+ self.max_concurrent_cache_files = max_concurrent_cache_files
207
+
208
+ self.keep_in_memory = keep_in_memory
209
+ self._data_dir_root = os.path.expanduser(cache_dir or config.HF_METRICS_CACHE)
210
+ self.data_dir = self._build_data_dir()
211
+ if seed is None:
212
+ _, seed, pos, *_ = np.random.get_state()
213
+ self.seed: int = seed[pos] if pos < 624 else seed[0]
214
+ else:
215
+ self.seed: int = seed
216
+ self.timeout: Union[int, float] = timeout
217
+
218
+ # Update 'compute' and 'add' docstring
219
+ # methods need to be copied otherwise it changes the docstrings of every instance
220
+ self.compute = types.MethodType(copyfunc(self.compute), self)
221
+ self.add_batch = types.MethodType(copyfunc(self.add_batch), self)
222
+ self.add = types.MethodType(copyfunc(self.add), self)
223
+ self.compute.__func__.__doc__ += self.info.inputs_description
224
+ self.add_batch.__func__.__doc__ += self.info.inputs_description
225
+ self.add.__func__.__doc__ += self.info.inputs_description
226
+
227
+ # self.arrow_schema = pa.schema(field for field in self.info.features.type)
228
+ self.buf_writer = None
229
+ self.writer = None
230
+ self.writer_batch_size = None
231
+ self.data = None
232
+
233
+ # This is the cache file we store our predictions/references in
234
+ # Keep it None for now so we can (cloud)pickle the object
235
+ self.cache_file_name = None
236
+ self.filelock = None
237
+ self.rendez_vous_lock = None
238
+
239
+ # This is all the cache files on which we have a lock when we are in a distributed setting
240
+ self.file_paths = None
241
+ self.filelocks = None
242
+
243
+ def __len__(self):
244
+ """Return the number of examples (predictions or predictions/references pair)
245
+ currently stored in the metric's cache.
246
+ """
247
+ return 0 if self.writer is None else len(self.writer)
248
+
249
+ def __repr__(self):
250
+ return (
251
+ f'Metric(name: "{self.name}", features: {self.features}, '
252
+ f'usage: """{self.inputs_description}""", '
253
+ f"stored examples: {len(self)})"
254
+ )
255
+
256
+ def _build_data_dir(self):
257
+ """Path of this metric in cache_dir:
258
+ Will be:
259
+ self._data_dir_root/self.name/self.config_name/self.hash (if not none)/
260
+ If any of these element is missing or if ``with_version=False`` the corresponding subfolders are dropped.
261
+ """
262
+ builder_data_dir = self._data_dir_root
263
+ builder_data_dir = os.path.join(builder_data_dir, self.name, self.config_name)
264
+ os.makedirs(builder_data_dir, exist_ok=True)
265
+ return builder_data_dir
266
+
267
+ def _create_cache_file(self, timeout=1) -> Tuple[str, FileLock]:
268
+ """Create a new cache file. If the default cache file is used, we generated a new hash."""
269
+ file_path = os.path.join(self.data_dir, f"{self.experiment_id}-{self.num_process}-{self.process_id}.arrow")
270
+ filelock = None
271
+ for i in range(self.max_concurrent_cache_files):
272
+ filelock = FileLock(file_path + ".lock")
273
+ try:
274
+ filelock.acquire(timeout=timeout)
275
+ except Timeout:
276
+ # If we have reached the max number of attempts or we are not allowed to find a free name (distributed setup)
277
+ # We raise an error
278
+ if self.num_process != 1:
279
+ raise ValueError(
280
+ f"Error in _create_cache_file: another metric instance is already using the local cache file at {file_path}. "
281
+ f"Please specify an experiment_id (currently: {self.experiment_id}) to avoid collision "
282
+ f"between distributed metric instances."
283
+ ) from None
284
+ if i == self.max_concurrent_cache_files - 1:
285
+ raise ValueError(
286
+ f"Cannot acquire lock, too many metric instance are operating concurrently on this file system."
287
+ f"You should set a larger value of max_concurrent_cache_files when creating the metric "
288
+ f"(current value is {self.max_concurrent_cache_files})."
289
+ ) from None
290
+ # In other cases (allowed to find a new file name + not yet at max num of attempts) we can try to sample a new hashed name.
291
+ file_uuid = str(uuid.uuid4())
292
+ file_path = os.path.join(
293
+ self.data_dir, f"{self.experiment_id}-{file_uuid}-{self.num_process}-{self.process_id}.arrow"
294
+ )
295
+ else:
296
+ break
297
+
298
+ return file_path, filelock
299
+
300
+ def _get_all_cache_files(self) -> Tuple[List[str], List[FileLock]]:
301
+ """Get a lock on all the cache files in a distributed setup.
302
+ We wait for `timeout` seconds to let all the distributed nodes finish their tasks (default is 100 seconds).
303
+ """
304
+ if self.num_process == 1:
305
+ if self.cache_file_name is None:
306
+ raise ValueError(
307
+ "Metric cache file doesn't exist. Please make sure that you call `add` or `add_batch` "
308
+ "at least once before calling `compute`."
309
+ )
310
+ file_paths = [self.cache_file_name]
311
+ else:
312
+ file_paths = [
313
+ os.path.join(self.data_dir, f"{self.experiment_id}-{self.num_process}-{process_id}.arrow")
314
+ for process_id in range(self.num_process)
315
+ ]
316
+
317
+ # Let's acquire a lock on each process's file to be sure they are finished writing
318
+ filelocks = []
319
+ for process_id, file_path in enumerate(file_paths):
320
+ if process_id == 0: # process 0 already has its lock file
321
+ filelocks.append(self.filelock)
322
+ else:
323
+ filelock = FileLock(file_path + ".lock")
324
+ try:
325
+ filelock.acquire(timeout=self.timeout)
326
+ except Timeout:
327
+ raise ValueError(
328
+ f"Cannot acquire lock on cached file {file_path} for process {process_id}."
329
+ ) from None
330
+ else:
331
+ filelocks.append(filelock)
332
+
333
+ return file_paths, filelocks
334
+
335
+ def _check_all_processes_locks(self):
336
+ expected_lock_file_names = [
337
+ os.path.join(self.data_dir, f"{self.experiment_id}-{self.num_process}-{process_id}.arrow.lock")
338
+ for process_id in range(self.num_process)
339
+ ]
340
+ for expected_lock_file_name in expected_lock_file_names:
341
+ nofilelock = FileFreeLock(expected_lock_file_name)
342
+ try:
343
+ nofilelock.acquire(timeout=self.timeout)
344
+ except Timeout:
345
+ raise ValueError(
346
+ f"Expected to find locked file {expected_lock_file_name} from process {self.process_id} but it doesn't exist."
347
+ ) from None
348
+ else:
349
+ nofilelock.release()
350
+
351
+ def _check_rendez_vous(self):
352
+ expected_lock_file_name = os.path.join(self.data_dir, f"{self.experiment_id}-{self.num_process}-0.arrow.lock")
353
+ nofilelock = FileFreeLock(expected_lock_file_name)
354
+ try:
355
+ nofilelock.acquire(timeout=self.timeout)
356
+ except Timeout:
357
+ raise ValueError(
358
+ f"Expected to find locked file {expected_lock_file_name} from process {self.process_id} but it doesn't exist."
359
+ ) from None
360
+ else:
361
+ nofilelock.release()
362
+ lock_file_name = os.path.join(self.data_dir, f"{self.experiment_id}-{self.num_process}-rdv.lock")
363
+ rendez_vous_lock = FileLock(lock_file_name)
364
+ try:
365
+ rendez_vous_lock.acquire(timeout=self.timeout)
366
+ except Timeout:
367
+ raise ValueError(f"Couldn't acquire lock on {lock_file_name} from process {self.process_id}.") from None
368
+ else:
369
+ rendez_vous_lock.release()
370
+
371
+ def _finalize(self):
372
+ """Close all the writing process and load/gather the data
373
+ from all the nodes if main node or all_process is True.
374
+ """
375
+ if self.writer is not None:
376
+ self.writer.finalize()
377
+ self.writer = None
378
+ # release the locks of the processes > 0 so that process 0 can lock them to read + delete the data
379
+ if self.filelock is not None and self.process_id > 0:
380
+ self.filelock.release()
381
+
382
+ if self.keep_in_memory:
383
+ # Read the predictions and references
384
+ reader = ArrowReader(path=self.data_dir, info=DatasetInfo(features=self.features))
385
+ self.data = Dataset.from_buffer(self.buf_writer.getvalue())
386
+
387
+ elif self.process_id == 0:
388
+ # Let's acquire a lock on each node's file to be sure they are finished writing
389
+ file_paths, filelocks = self._get_all_cache_files()
390
+
391
+ # Read the predictions and references
392
+ try:
393
+ reader = ArrowReader(path="", info=DatasetInfo(features=self.features))
394
+ self.data = Dataset(**reader.read_files([{"filename": f} for f in file_paths]))
395
+ except FileNotFoundError:
396
+ raise ValueError(
397
+ "Error in finalize: another metric instance is already using the local cache file. "
398
+ "Please specify an experiment_id to avoid collision between distributed metric instances."
399
+ ) from None
400
+
401
+ # Store file paths and locks and we will release/delete them after the computation.
402
+ self.file_paths = file_paths
403
+ self.filelocks = filelocks
404
+
405
+ def compute(self, *, predictions=None, references=None, **kwargs) -> Optional[dict]:
406
+ """Compute the metrics.
407
+
408
+ Usage of positional arguments is not allowed to prevent mistakes.
409
+
410
+ Args:
411
+ predictions (list/array/tensor, optional): Predictions.
412
+ references (list/array/tensor, optional): References.
413
+ **kwargs (optional): Keyword arguments that will be forwarded to the metrics :meth:`_compute`
414
+ method (see details in the docstring).
415
+
416
+ Return:
417
+ dict or None
418
+
419
+ - Dictionary with the metrics if this metric is run on the main process (``process_id == 0``).
420
+ - None if the metric is not run on the main process (``process_id != 0``).
421
+
422
+ Example:
423
+
424
+ ```py
425
+ >>> from datasets import load_metric
426
+ >>> metric = load_metric("accuracy")
427
+ >>> accuracy = metric.compute(predictions=model_prediction, references=labels)
428
+ ```
429
+ """
430
+ all_kwargs = {"predictions": predictions, "references": references, **kwargs}
431
+ if predictions is None and references is None:
432
+ missing_kwargs = {k: None for k in self.features if k not in all_kwargs}
433
+ all_kwargs.update(missing_kwargs)
434
+ else:
435
+ missing_inputs = [k for k in self.features if k not in all_kwargs]
436
+ if missing_inputs:
437
+ raise ValueError(
438
+ f"Metric inputs are missing: {missing_inputs}. All required inputs are {list(self.features)}"
439
+ )
440
+ inputs = {input_name: all_kwargs[input_name] for input_name in self.features}
441
+ compute_kwargs = {k: kwargs[k] for k in kwargs if k not in self.features}
442
+
443
+ if any(v is not None for v in inputs.values()):
444
+ self.add_batch(**inputs)
445
+ self._finalize()
446
+
447
+ self.cache_file_name = None
448
+ self.filelock = None
449
+
450
+ if self.process_id == 0:
451
+ self.data.set_format(type=self.info.format)
452
+
453
+ inputs = {input_name: self.data[input_name] for input_name in self.features}
454
+ with temp_seed(self.seed):
455
+ output = self._compute(**inputs, **compute_kwargs)
456
+
457
+ if self.buf_writer is not None:
458
+ self.buf_writer = None
459
+ del self.data
460
+ self.data = None
461
+ else:
462
+ # Release locks and delete all the cache files. Process 0 is released last.
463
+ for filelock, file_path in reversed(list(zip(self.filelocks, self.file_paths))):
464
+ logger.info(f"Removing {file_path}")
465
+ del self.data
466
+ self.data = None
467
+ del self.writer
468
+ self.writer = None
469
+ os.remove(file_path)
470
+ filelock.release()
471
+
472
+ return output
473
+ else:
474
+ return None
475
+
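Putting `add_batch` and `compute` together, a hedged single-process sketch (the metric name and the toy batches are illustrative; note that `load_metric` is deprecated in favor of 🤗 Evaluate):

```py
from datasets import load_metric

metric = load_metric("accuracy")
for preds, refs in [([0, 1], [0, 1]), ([1, 1], [0, 1])]:  # toy batches
    metric.add_batch(predictions=preds, references=refs)
score = metric.compute()  # gathers the cached batches, then runs _compute
print(score)  # e.g. {'accuracy': 0.75}
```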
476
+ def add_batch(self, *, predictions=None, references=None, **kwargs):
477
+ """Add a batch of predictions and references for the metric's stack.
478
+
479
+ Args:
480
+ predictions (list/array/tensor, optional): Predictions.
481
+ references (list/array/tensor, optional): References.
482
+
483
+ Example:
484
+
485
+ ```py
486
+ >>> from datasets import load_metric
487
+ >>> metric = load_metric("accuracy")
488
+ >>> metric.add_batch(predictions=model_prediction, references=labels)
489
+ ```
490
+ """
491
+ bad_inputs = [input_name for input_name in kwargs if input_name not in self.features]
492
+ if bad_inputs:
493
+ raise ValueError(f"Bad inputs for metric: {bad_inputs}. All required inputs are {list(self.features)}")
494
+ batch = {"predictions": predictions, "references": references, **kwargs}
495
+ batch = {input_name: batch[input_name] for input_name in self.features}
496
+ batch = self.info.features.encode_batch(batch)
497
+ if self.writer is None:
498
+ self._init_writer()
499
+ try:
500
+ self.writer.write_batch(batch)
501
+ except pa.ArrowInvalid:
502
+ if any(len(batch[c]) != len(next(iter(batch.values()))) for c in batch):
503
+ col0 = next(iter(batch))
504
+ bad_col = [c for c in batch if len(batch[c]) != len(batch[col0])][0]
505
+ error_msg = (
506
+ f"Mismatch in the number of {col0} ({len(batch[col0])}) and {bad_col} ({len(batch[bad_col])})"
507
+ )
508
+ elif sorted(self.features) != ["predictions", "references"]:
509
+ error_msg = f"Metric inputs don't match the expected format.\n" f"Expected format: {self.features},\n"
510
+ error_msg_inputs = ",\n".join(
511
+ f"Input {input_name}: {summarize_if_long_list(batch[input_name])}" for input_name in self.features
512
+ )
513
+ error_msg += error_msg_inputs
514
+ else:
515
+ error_msg = (
516
+ f"Predictions and/or references don't match the expected format.\n"
517
+ f"Expected format: {self.features},\n"
518
+ f"Input predictions: {summarize_if_long_list(predictions)},\n"
519
+ f"Input references: {summarize_if_long_list(references)}"
520
+ )
521
+ raise ValueError(error_msg) from None
522
+
523
+ def add(self, *, prediction=None, reference=None, **kwargs):
524
+ """Add one prediction and reference for the metric's stack.
525
+
526
+ Args:
527
+ prediction (list/array/tensor, optional): Prediction.
+ reference (list/array/tensor, optional): Reference.
529
+
530
+ Example:
531
+
532
+ ```py
533
+ >>> from datasets import load_metric
534
+ >>> metric = load_metric("accuracy")
535
+ >>> metric.add(prediction=model_prediction, reference=label)
536
+ ```
537
+ """
538
+ bad_inputs = [input_name for input_name in kwargs if input_name not in self.features]
539
+ if bad_inputs:
540
+ raise ValueError(f"Bad inputs for metric: {bad_inputs}. All required inputs are {list(self.features)}")
541
+ example = {"predictions": prediction, "references": reference, **kwargs}
542
+ example = {input_name: example[input_name] for input_name in self.features}
543
+ example = self.info.features.encode_example(example)
544
+ if self.writer is None:
545
+ self._init_writer()
546
+ try:
547
+ self.writer.write(example)
548
+ except pa.ArrowInvalid:
549
+ error_msg = f"Metric inputs don't match the expected format.\n" f"Expected format: {self.features},\n"
550
+ error_msg_inputs = ",\n".join(
551
+ f"Input {input_name}: {summarize_if_long_list(example[input_name])}" for input_name in self.features
552
+ )
553
+ error_msg += error_msg_inputs
554
+ raise ValueError(error_msg) from None
555
+
556
+ def _init_writer(self, timeout=1):
557
+ if self.num_process > 1:
558
+ if self.process_id == 0:
559
+ file_path = os.path.join(self.data_dir, f"{self.experiment_id}-{self.num_process}-rdv.lock")
560
+ self.rendez_vous_lock = FileLock(file_path)
561
+ try:
562
+ self.rendez_vous_lock.acquire(timeout=timeout)
563
+ except TimeoutError:
564
+ raise ValueError(
565
+ f"Error in _init_writer: another metric instance is already using the local cache file at {file_path}. "
566
+ f"Please specify an experiment_id (currently: {self.experiment_id}) to avoid collision "
567
+ f"between distributed metric instances."
568
+ ) from None
569
+
570
+ if self.keep_in_memory:
571
+ self.buf_writer = pa.BufferOutputStream()
572
+ self.writer = ArrowWriter(
573
+ features=self.info.features, stream=self.buf_writer, writer_batch_size=self.writer_batch_size
574
+ )
575
+ else:
576
+ self.buf_writer = None
577
+
578
+ # Get cache file name and lock it
579
+ if self.cache_file_name is None or self.filelock is None:
580
+ cache_file_name, filelock = self._create_cache_file() # get ready
581
+ self.cache_file_name = cache_file_name
582
+ self.filelock = filelock
583
+
584
+ self.writer = ArrowWriter(
585
+ features=self.info.features, path=self.cache_file_name, writer_batch_size=self.writer_batch_size
586
+ )
587
+ # Setup the rendez-vous here to synchronize all processes if we are in a distributed setting
588
+ if self.num_process > 1:
589
+ if self.process_id == 0:
590
+ self._check_all_processes_locks() # wait for everyone to be ready
591
+ self.rendez_vous_lock.release() # let everyone go
592
+ else:
593
+ self._check_rendez_vous() # wait for master to be ready and to let everyone go
594
+
595
+ def _info(self) -> MetricInfo:
596
+ """Construct the MetricInfo object. See `MetricInfo` for details.
597
+
598
+ Warning: This function is only called once and the result is cached for all
599
+ following .info() calls.
600
+
601
+ Returns:
602
+ info: (MetricInfo) The metrics information
603
+ """
604
+ raise NotImplementedError
605
+
606
+ def download_and_prepare(
607
+ self,
608
+ download_config: Optional[DownloadConfig] = None,
609
+ dl_manager: Optional[DownloadManager] = None,
610
+ ):
611
+ """Downloads and prepares dataset for reading.
612
+
613
+ Args:
614
+ download_config (:class:`DownloadConfig`, optional): Specific download configuration parameters.
615
+ dl_manager (:class:`DownloadManager`, optional): Specific download manager to use.
616
+ """
617
+ if dl_manager is None:
618
+ if download_config is None:
619
+ download_config = DownloadConfig()
620
+ download_config.cache_dir = os.path.join(self.data_dir, "downloads")
621
+ download_config.force_download = False
622
+
623
+ dl_manager = DownloadManager(
624
+ dataset_name=self.name, download_config=download_config, data_dir=self.data_dir
625
+ )
626
+
627
+ self._download_and_prepare(dl_manager)
628
+
629
+ def _download_and_prepare(self, dl_manager):
630
+ """Downloads and prepares resources for the metric.
631
+
632
+ This is the internal implementation, meant to be overridden, that is called when the user calls
633
+ `download_and_prepare`. It should download all required resources for the metric.
634
+
635
+ Args:
636
+ dl_manager (:class:`DownloadManager`): `DownloadManager` used to download and cache data.
637
+ """
638
+ return None
639
+
640
+ def _compute(self, *, predictions=None, references=None, **kwargs) -> Dict[str, Any]:
641
+ """This method defines the common API for all the metrics in the library"""
642
+ raise NotImplementedError
643
+
644
+ def __del__(self):
645
+ if hasattr(self, "filelock") and self.filelock is not None:
646
+ self.filelock.release()
647
+ if hasattr(self, "rendez_vous_lock") and self.rendez_vous_lock is not None:
648
+ self.rendez_vous_lock.release()
649
+ if hasattr(self, "writer"): # in case it was already deleted
650
+ del self.writer
651
+ if hasattr(self, "data"): # in case it was already deleted
652
+ del self.data
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/search.py ADDED
@@ -0,0 +1,785 @@
+ import importlib.util
2
+ import os
3
+ import tempfile
4
+ from pathlib import PurePath
5
+ from typing import TYPE_CHECKING, Dict, List, NamedTuple, Optional, Union
6
+
7
+ import fsspec
8
+ import numpy as np
9
+
10
+ from .features import Sequence
11
+ from .utils import logging
12
+ from .utils import tqdm as hf_tqdm
13
+
14
+
15
+ if TYPE_CHECKING:
16
+ from .arrow_dataset import Dataset # noqa: F401
17
+
18
+ try:
19
+ from elasticsearch import Elasticsearch # noqa: F401
20
+
21
+ except ImportError:
22
+ pass
23
+ try:
24
+ import faiss # noqa: F401
25
+
26
+ except ImportError:
27
+ pass
28
+
29
+ _has_elasticsearch = importlib.util.find_spec("elasticsearch") is not None
30
+ _has_faiss = importlib.util.find_spec("faiss") is not None
31
+
32
+
33
+ logger = logging.get_logger(__name__)
34
+
35
+
36
+ class MissingIndex(Exception):
37
+ pass
38
+
39
+
40
+ class SearchResults(NamedTuple):
41
+ scores: List[float]
42
+ indices: List[int]
43
+
44
+
45
+ class BatchedSearchResults(NamedTuple):
46
+ total_scores: List[List[float]]
47
+ total_indices: List[List[int]]
48
+
49
+
50
+ class NearestExamplesResults(NamedTuple):
51
+ scores: List[float]
52
+ examples: dict
53
+
54
+
55
+ class BatchedNearestExamplesResults(NamedTuple):
56
+ total_scores: List[List[float]]
57
+ total_examples: List[dict]
58
+
59
+
60
+ class BaseIndex:
61
+ """Base class for indexing"""
62
+
63
+ def search(self, query, k: int = 10, **kwargs) -> SearchResults:
64
+ """
65
+ To implement.
66
+ This method has to return the scores and the indices of the retrieved examples given a certain query.
67
+ """
68
+ raise NotImplementedError
69
+
70
+ def search_batch(self, queries, k: int = 10, **kwargs) -> BatchedSearchResults:
71
+ """Find the nearest examples indices to the query.
72
+
73
+ Args:
74
+ queries (`Union[List[str], np.ndarray]`): The queries as a list of strings if `column` is a text index or as a numpy array if `column` is a vector index.
75
+ k (`int`): The number of examples to retrieve per query.
76
+
77
+ Ouput:
78
+ total_scores (`List[List[float]`): The retrieval scores of the retrieved examples per query.
79
+ total_indices (`List[List[int]]`): The indices of the retrieved examples per query.
80
+ """
81
+ total_scores, total_indices = [], []
82
+ for query in queries:
83
+ scores, indices = self.search(query, k)
84
+ total_scores.append(scores)
85
+ total_indices.append(indices)
86
+ return BatchedSearchResults(total_scores, total_indices)
87
+
88
+ def save(self, file: Union[str, PurePath]):
89
+ """Serialize the index on disk"""
90
+ raise NotImplementedError
91
+
92
+ @classmethod
93
+ def load(cls, file: Union[str, PurePath]) -> "BaseIndex":
94
+ """Deserialize the index from disk"""
95
+ raise NotImplementedError
96
+
97
+
98
+ class ElasticSearchIndex(BaseIndex):
99
+ """
100
+ Sparse index using Elasticsearch. It is used to index text and run queries based on BM25 similarity.
101
+ An Elasticsearch server needs to be accessible, and a python client is declared with
102
+ ```
103
+ es_client = Elasticsearch([{'host': 'localhost', 'port': '9200'}])
104
+ ```
105
+ for example.
106
+ """
107
+
108
+ def __init__(
109
+ self,
110
+ host: Optional[str] = None,
111
+ port: Optional[int] = None,
112
+ es_client: Optional["Elasticsearch"] = None,
113
+ es_index_name: Optional[str] = None,
114
+ es_index_config: Optional[dict] = None,
115
+ ):
116
+ if not _has_elasticsearch:
117
+ raise ImportError(
118
+ "You must install ElasticSearch to use ElasticSearchIndex. To do so you can run `pip install elasticsearch==7.7.1 for example`"
119
+ )
120
+ if es_client is not None and (host is not None or port is not None):
121
+ raise ValueError("Please specify either `es_client` or `(host, port)`, but not both.")
122
+ host = host or "localhost"
123
+ port = port or 9200
124
+
125
+ import elasticsearch.helpers # noqa: F401 - need this to properly load all the es features
126
+ from elasticsearch import Elasticsearch # noqa: F811
127
+
128
+ self.es_client = es_client if es_client is not None else Elasticsearch([{"host": host, "port": str(port)}])
129
+ self.es_index_name = (
130
+ es_index_name
131
+ if es_index_name is not None
132
+ else "huggingface_datasets_" + os.path.basename(tempfile.NamedTemporaryFile().name)
133
+ )
134
+ self.es_index_config = (
135
+ es_index_config
136
+ if es_index_config is not None
137
+ else {
138
+ "settings": {
139
+ "number_of_shards": 1,
140
+ "analysis": {"analyzer": {"stop_standard": {"type": "standard", " stopwords": "_english_"}}},
141
+ },
142
+ "mappings": {"properties": {"text": {"type": "text", "analyzer": "standard", "similarity": "BM25"}}},
143
+ }
144
+ )
145
+
146
+ def add_documents(self, documents: Union[List[str], "Dataset"], column: Optional[str] = None):
147
+ """
148
+ Add documents to the index.
149
+ If the documents are inside a certain column, you can specify it using the `column` argument.
150
+ """
151
+ index_name = self.es_index_name
152
+ index_config = self.es_index_config
153
+ self.es_client.indices.create(index=index_name, body=index_config)
154
+ number_of_docs = len(documents)
155
+ progress = hf_tqdm(unit="docs", total=number_of_docs)
156
+ successes = 0
157
+
158
+ def passage_generator():
159
+ if column is not None:
160
+ for i, example in enumerate(documents):
161
+ yield {"text": example[column], "_id": i}
162
+ else:
163
+ for i, example in enumerate(documents):
164
+ yield {"text": example, "_id": i}
165
+
166
+ # stream the documents into the index created above
167
+ import elasticsearch as es
168
+
169
+ for ok, action in es.helpers.streaming_bulk(
170
+ client=self.es_client,
171
+ index=index_name,
172
+ actions=passage_generator(),
173
+ ):
174
+ progress.update(1)
175
+ successes += ok
176
+ if successes != len(documents):
177
+ logger.warning(
178
+ f"Some documents failed to be added to ElasticSearch. Failures: {len(documents)-successes}/{len(documents)}"
179
+ )
180
+ logger.info(f"Indexed {successes:d} documents")
181
+
182
+ def search(self, query: str, k=10, **kwargs) -> SearchResults:
183
+ """Find the nearest examples indices to the query.
184
+
185
+ Args:
186
+ query (`str`): The query as a string.
187
+ k (`int`): The number of examples to retrieve.
188
+
189
+ Output:
+ scores (`List[float]`): The retrieval scores of the retrieved examples.
+ indices (`List[int]`): The indices of the retrieved examples.
192
+ """
193
+ response = self.es_client.search(
194
+ index=self.es_index_name,
195
+ body={"query": {"multi_match": {"query": query, "fields": ["text"], "type": "cross_fields"}}, "size": k},
196
+ **kwargs,
197
+ )
198
+ hits = response["hits"]["hits"]
199
+ return SearchResults([hit["_score"] for hit in hits], [int(hit["_id"]) for hit in hits])
200
+
201
+ def search_batch(self, queries, k: int = 10, max_workers=10, **kwargs) -> BatchedSearchResults:
202
+ import concurrent.futures
203
+
204
+ total_scores, total_indices = [None] * len(queries), [None] * len(queries)
205
+ with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
206
+ future_to_index = {executor.submit(self.search, query, k, **kwargs): i for i, query in enumerate(queries)}
207
+ for future in concurrent.futures.as_completed(future_to_index):
208
+ index = future_to_index[future]
209
+ results: SearchResults = future.result()
210
+ total_scores[index] = results.scores
211
+ total_indices[index] = results.indices
212
+ return BatchedSearchResults(total_indices=total_indices, total_scores=total_scores)
213
+
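A minimal usage sketch, assuming an Elasticsearch server is reachable on localhost:9200 (host, port, and the sample documents are illustrative):

```py
from datasets.search import ElasticSearchIndex

index = ElasticSearchIndex(host="localhost", port=9200)  # assumes a running server
index.add_documents(["the cat sat", "dogs bark loudly", "cats purr"])
scores, ids = index.search("cat", k=2)  # BM25 over the "text" field
print(ids)  # indices of the two best-matching documents
```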
214
+
215
+ class FaissIndex(BaseIndex):
216
+ """
217
+ Dense index using Faiss. It is used to index vectors.
218
+ Faiss is a library for efficient similarity search and clustering of dense vectors.
219
+ It contains algorithms that search in sets of vectors of any size, up to ones that possibly do not fit in RAM.
220
+ You can find more information about Faiss here:
221
+ - For index types and the string factory: https://github.com/facebookresearch/faiss/wiki/The-index-factory
222
+ - For GPU settings: https://github.com/facebookresearch/faiss/wiki/Faiss-on-the-GPU
223
+ """
224
+
225
+ def __init__(
226
+ self,
227
+ device: Optional[Union[int, List[int]]] = None,
228
+ string_factory: Optional[str] = None,
229
+ metric_type: Optional[int] = None,
230
+ custom_index: Optional["faiss.Index"] = None,
231
+ ):
232
+ """
233
+ Create a Dense index using Faiss. You can specify `device` if you want to run it on GPU (`device` must be the GPU index).
234
+ You can find more information about Faiss here:
235
+ - For `string factory`: https://github.com/facebookresearch/faiss/wiki/The-index-factory
236
+ """
237
+ if string_factory is not None and custom_index is not None:
238
+ raise ValueError("Please specify either `string_factory` or `custom_index` but not both.")
239
+ if device is not None and custom_index is not None:
240
+ raise ValueError(
241
+ "Cannot pass both 'custom_index' and 'device'. "
242
+ "Pass 'custom_index' already transferred to the target device instead."
243
+ )
244
+ self.device = device
245
+ self.string_factory = string_factory
246
+ self.metric_type = metric_type
247
+ self.faiss_index = custom_index
248
+ if not _has_faiss:
249
+ raise ImportError(
250
+ "You must install Faiss to use FaissIndex. To do so you can run `conda install -c pytorch faiss-cpu` or `conda install -c pytorch faiss-gpu`. "
251
+ "A community supported package is also available on pypi: `pip install faiss-cpu` or `pip install faiss-gpu`. "
252
+ "Note that pip may not have the latest version of FAISS, and thus, some of the latest features and bug fixes may not be available."
253
+ )
254
+
255
+ def add_vectors(
256
+ self,
257
+ vectors: Union[np.array, "Dataset"],
258
+ column: Optional[str] = None,
259
+ batch_size: int = 1000,
260
+ train_size: Optional[int] = None,
261
+ faiss_verbose: Optional[bool] = None,
262
+ ):
263
+ """
264
+ Add vectors to the index.
265
+ If the arrays are inside a certain column, you can specify it using the `column` argument.
266
+ """
267
+ import faiss # noqa: F811
268
+
269
+ if column and not isinstance(vectors.features[column], Sequence):
270
+ raise ValueError(
271
+ f"Wrong feature type for column '{column}'. Expected 1d array, got {vectors.features[column]}"
272
+ )
273
+
274
+ # Create index
275
+ if self.faiss_index is None:
276
+ size = len(vectors[0]) if column is None else len(vectors[0][column])
277
+ if self.string_factory is not None:
278
+ if self.metric_type is None:
279
+ index = faiss.index_factory(size, self.string_factory)
280
+ else:
281
+ index = faiss.index_factory(size, self.string_factory, self.metric_type)
282
+ else:
283
+ if self.metric_type is None:
284
+ index = faiss.IndexFlat(size)
285
+ else:
286
+ index = faiss.IndexFlat(size, self.metric_type)
287
+
288
+ self.faiss_index = self._faiss_index_to_device(index, self.device)
289
+ logger.info(f"Created faiss index of type {type(self.faiss_index)}")
290
+
291
+ # Set verbosity level
292
+ if faiss_verbose is not None:
293
+ self.faiss_index.verbose = faiss_verbose
294
+ if hasattr(self.faiss_index, "index") and self.faiss_index.index is not None:
295
+ self.faiss_index.index.verbose = faiss_verbose
296
+ if hasattr(self.faiss_index, "quantizer") and self.faiss_index.quantizer is not None:
297
+ self.faiss_index.quantizer.verbose = faiss_verbose
298
+ if hasattr(self.faiss_index, "clustering_index") and self.faiss_index.clustering_index is not None:
299
+ self.faiss_index.clustering_index.verbose = faiss_verbose
300
+
301
+ # Train
302
+ if train_size is not None:
303
+ train_vecs = vectors[:train_size] if column is None else vectors[:train_size][column]
304
+ logger.info(f"Training the index with the first {len(train_vecs)} vectors")
305
+ self.faiss_index.train(train_vecs)
306
+ else:
307
+ logger.info("Ignored the training step of the faiss index as `train_size` is None.")
308
+
309
+ # Add vectors
310
+ logger.info(f"Adding {len(vectors)} vectors to the faiss index")
311
+ for i in hf_tqdm(range(0, len(vectors), batch_size)):
312
+ vecs = vectors[i : i + batch_size] if column is None else vectors[i : i + batch_size][column]
313
+ self.faiss_index.add(vecs)
314
+
315
+ @staticmethod
316
+ def _faiss_index_to_device(index: "faiss.Index", device: Optional[Union[int, List[int]]] = None) -> "faiss.Index":
317
+ """
318
+ Sends a faiss index to a device.
319
+ A device can either be a positive integer (GPU id), a negative integer (all GPUs),
320
+ or a list of positive integers (select GPUs to use), or `None` for CPU.
321
+ """
322
+
323
+ # If device is not specified, then it runs on CPU.
324
+ if device is None:
325
+ return index
326
+
327
+ import faiss # noqa: F811
328
+
329
+ # If the device id is given as an integer
330
+ if isinstance(device, int):
331
+ # Positive integers are directly mapped to GPU ids
332
+ if device > -1:
333
+ faiss_res = faiss.StandardGpuResources()
334
+ index = faiss.index_cpu_to_gpu(faiss_res, device, index)
335
+ # And negative integers mean using all GPUs
336
+ else:
337
+ index = faiss.index_cpu_to_all_gpus(index)
338
+ # Device ids given as a list mean mapping to those devices specified.
339
+ elif isinstance(device, (list, tuple)):
340
+ index = faiss.index_cpu_to_gpus_list(index, gpus=list(device))
341
+ else:
342
+ raise TypeError(
343
+ f"The argument type: {type(device)} is not expected. "
344
+ + "Please pass in either nothing, a positive int, a negative int, or a list of positive ints."
345
+ )
346
+
347
+ return index
348
+
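For orientation, how the `device` argument maps in practice (a hedged sketch; the non-None cases require a GPU build of faiss):

```py
import faiss

index = faiss.IndexFlat(8)                            # toy CPU index
cpu = FaissIndex._faiss_index_to_device(index, None)  # unchanged, stays on CPU
# FaissIndex._faiss_index_to_device(index, 0)         # move to GPU 0
# FaissIndex._faiss_index_to_device(index, -1)        # replicate on all GPUs
# FaissIndex._faiss_index_to_device(index, [0, 1])    # use GPUs 0 and 1 only
```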
349
+ def search(self, query: np.array, k=10, **kwargs) -> SearchResults:
350
+ """Find the nearest examples indices to the query.
351
+
352
+ Args:
353
+ query (`np.array`): The query as a numpy array.
354
+ k (`int`): The number of examples to retrieve.
355
+
356
+ Output:
+ scores (`List[float]`): The retrieval scores of the retrieved examples.
+ indices (`List[int]`): The indices of the retrieved examples.
359
+ """
360
+ if len(query.shape) != 1 and (len(query.shape) != 2 or query.shape[0] != 1):
361
+ raise ValueError("Shape of query is incorrect, it has to be either a 1D array or 2D (1, N)")
362
+
363
+ queries = query.reshape(1, -1)
364
+ if not queries.flags.c_contiguous:
365
+ queries = np.asarray(queries, order="C")
366
+ scores, indices = self.faiss_index.search(queries, k, **kwargs)
367
+ return SearchResults(scores[0], indices[0].astype(int))
368
+
369
+ def search_batch(self, queries: np.array, k=10, **kwargs) -> BatchedSearchResults:
370
+ """Find the nearest examples indices to the queries.
371
+
372
+ Args:
373
+ queries (`np.array`): The queries as a numpy array.
374
+ k (`int`): The number of examples to retrieve.
375
+
376
+ Output:
+ total_scores (`List[List[float]]`): The retrieval scores of the retrieved examples per query.
+ total_indices (`List[List[int]]`): The indices of the retrieved examples per query.
379
+ """
380
+ if len(queries.shape) != 2:
381
+ raise ValueError("Shape of query must be 2D")
382
+ if not queries.flags.c_contiguous:
383
+ queries = np.asarray(queries, order="C")
384
+ scores, indices = self.faiss_index.search(queries, k, **kwargs)
385
+ return BatchedSearchResults(scores, indices.astype(int))
386
+
387
+ def save(self, file: Union[str, PurePath], storage_options: Optional[Dict] = None):
388
+ """Serialize the FaissIndex on disk"""
389
+ import faiss # noqa: F811
390
+
391
+ if self.device is not None and isinstance(self.device, (int, list, tuple)):
392
+ index = faiss.index_gpu_to_cpu(self.faiss_index)
393
+ else:
394
+ index = self.faiss_index
395
+
396
+ with fsspec.open(str(file), "wb", **(storage_options or {})) as f:
397
+ faiss.write_index(index, faiss.BufferedIOWriter(faiss.PyCallbackIOWriter(f.write)))
398
+
399
+ @classmethod
400
+ def load(
401
+ cls,
402
+ file: Union[str, PurePath],
403
+ device: Optional[Union[int, List[int]]] = None,
404
+ storage_options: Optional[Dict] = None,
405
+ ) -> "FaissIndex":
406
+ """Deserialize the FaissIndex from disk"""
407
+ import faiss # noqa: F811
408
+
409
+ # An instance of FaissIndex is essentially just a wrapper around a faiss index.
410
+ faiss_index = cls(device=device)
411
+ with fsspec.open(str(file), "rb", **(storage_options or {})) as f:
412
+ index = faiss.read_index(faiss.BufferedIOReader(faiss.PyCallbackIOReader(f.read)))
413
+ faiss_index.faiss_index = faiss_index._faiss_index_to_device(index, faiss_index.device)
414
+ return faiss_index
415
+
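An end-to-end sketch for `FaissIndex` (hedged: requires `faiss` installed; the dimensions, vectors, and file name are made up):

```py
import numpy as np

from datasets.search import FaissIndex

vectors = np.random.rand(100, 8).astype(np.float32)  # toy 8-dim embeddings
index = FaissIndex()                                 # defaults to an exact faiss.IndexFlat
index.add_vectors(vectors)
scores, ids = index.search(vectors[0], k=5)          # a 1D query is reshaped to (1, N)
print(ids)                                           # nearest-neighbor row indices

index.save("my.index")                               # illustrative path (fsspec URIs also work)
restored = FaissIndex.load("my.index")
```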
416
+
417
+ class IndexableMixin:
418
+ """Add indexing features to `datasets.Dataset`"""
419
+
420
+ def __init__(self):
421
+ self._indexes: Dict[str, BaseIndex] = {}
422
+
423
+ def __len__(self):
424
+ raise NotImplementedError
425
+
426
+ def __getitem__(self, key):
427
+ raise NotImplementedError
428
+
429
+ def is_index_initialized(self, index_name: str) -> bool:
430
+ return index_name in self._indexes
431
+
432
+ def _check_index_is_initialized(self, index_name: str):
433
+ if not self.is_index_initialized(index_name):
434
+ raise MissingIndex(
435
+ f"Index with index_name '{index_name}' not initialized yet. Please make sure that you call `add_faiss_index` or `add_elasticsearch_index` first."
436
+ )
437
+
438
+ def list_indexes(self) -> List[str]:
439
+ """List the `colindex_nameumns`/identifiers of all the attached indexes."""
440
+ return list(self._indexes)
441
+
442
+ def get_index(self, index_name: str) -> BaseIndex:
443
+ """List the `index_name`/identifiers of all the attached indexes.
444
+
445
+ Args:
446
+ index_name (`str`): Index name.
447
+
448
+ Returns:
449
+ [`BaseIndex`]
450
+ """
451
+ self._check_index_is_initialized(index_name)
452
+ return self._indexes[index_name]
453
+
454
+ def add_faiss_index(
455
+ self,
456
+ column: str,
457
+ index_name: Optional[str] = None,
458
+ device: Optional[Union[int, List[int]]] = None,
459
+ string_factory: Optional[str] = None,
460
+ metric_type: Optional[int] = None,
461
+ custom_index: Optional["faiss.Index"] = None,
462
+ batch_size: int = 1000,
463
+ train_size: Optional[int] = None,
464
+ faiss_verbose: bool = False,
465
+ ):
466
+ """Add a dense index using Faiss for fast retrieval.
467
+ The index is created using the vectors of the specified column.
468
+ You can specify `device` if you want to run it on GPU (`device` must be the GPU index, see more below).
469
+ You can find more information about Faiss here:
470
+ - For `string factory`: https://github.com/facebookresearch/faiss/wiki/The-index-factory
471
+
472
+ Args:
473
+ column (`str`): The column of the vectors to add to the index.
474
+ index_name (Optional `str`): The index_name/identifier of the index. This is the index_name that is used to call `.get_nearest` or `.search`.
475
+ By default it corresponds to `column`.
476
+ device (Optional `Union[int, List[int]]`): If positive integer, this is the index of the GPU to use. If negative integer, use all GPUs.
477
+ If a list of positive integers is passed in, run only on those GPUs. By default it uses the CPU.
478
+ string_factory (Optional `str`): This is passed to the index factory of Faiss to create the index. Default index class is IndexFlat.
479
+ metric_type (Optional `int`): Type of metric. Ex: `faiss.METRIC_INNER_PRODUCT` or `faiss.METRIC_L2`.
480
+ custom_index (Optional `faiss.Index`): Custom Faiss index that you already have instantiated and configured for your needs.
481
+ batch_size (Optional `int`): Size of the batch to use while adding vectors to the FaissIndex. Default value is 1000.
482
+ <Added version="2.4.0"/>
483
+ train_size (Optional `int`): If the index needs a training step, specifies how many vectors will be used to train the index.
484
+ faiss_verbose (`bool`, defaults to False): Enable the verbosity of the Faiss index.
485
+ """
486
+ index_name = index_name if index_name is not None else column
487
+ faiss_index = FaissIndex(
488
+ device=device, string_factory=string_factory, metric_type=metric_type, custom_index=custom_index
489
+ )
490
+ faiss_index.add_vectors(
491
+ self, column=column, batch_size=batch_size, train_size=train_size, faiss_verbose=faiss_verbose
492
+ )
493
+ self._indexes[index_name] = faiss_index
494
+
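Tying it to `datasets.Dataset`, a hedged sketch (the column name, toy embeddings, and query are illustrative; `get_nearest_examples` is defined further down in this file):

```py
import numpy as np

from datasets import Dataset

ds = Dataset.from_dict({
    "text": ["a", "b", "c"],
    "embeddings": np.random.rand(3, 8).astype(np.float32).tolist(),  # toy vectors
})
ds.add_faiss_index(column="embeddings")      # index_name defaults to "embeddings"
query = np.random.rand(8).astype(np.float32)
scores, examples = ds.get_nearest_examples("embeddings", query, k=2)
print(examples["text"])
```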
495
+ def add_faiss_index_from_external_arrays(
496
+ self,
497
+ external_arrays: np.array,
498
+ index_name: str,
499
+ device: Optional[Union[int, List[int]]] = None,
500
+ string_factory: Optional[str] = None,
501
+ metric_type: Optional[int] = None,
502
+ custom_index: Optional["faiss.Index"] = None,
503
+ batch_size: int = 1000,
504
+ train_size: Optional[int] = None,
505
+ faiss_verbose: bool = False,
506
+ ):
507
+ """Add a dense index using Faiss for fast retrieval.
508
+ The index is created using the vectors of `external_arrays`.
509
+ You can specify `device` if you want to run it on GPU (`device` must be the GPU index).
510
+ You can find more information about Faiss here:
511
+ - For `string factory`: https://github.com/facebookresearch/faiss/wiki/The-index-factory
512
+
513
+ Args:
514
+ external_arrays (`np.array`): If you want to use arrays from outside the lib for the index, you can set `external_arrays`.
515
+ It will use `external_arrays` to create the Faiss index instead of the arrays in the given `column`.
516
+ index_name (`str`): The index_name/identifier of the index. This is the index_name that is used to call `.get_nearest` or `.search`.
517
+ device (Optional `Union[int, List[int]]`): If positive integer, this is the index of the GPU to use. If negative integer, use all GPUs.
518
+ If a list of positive integers is passed in, run only on those GPUs. By default it uses the CPU.
519
+ string_factory (Optional `str`): This is passed to the index factory of Faiss to create the index. Default index class is IndexFlat.
520
+ metric_type (Optional `int`): Type of metric. Ex: `faiss.METRIC_INNER_PRODUCT` or `faiss.METRIC_L2`.
521
+ custom_index (Optional `faiss.Index`): Custom Faiss index that you already have instantiated and configured for your needs.
522
+ batch_size (Optional `int`): Size of the batch to use while adding vectors to the FaissIndex. Default value is 1000.
523
+ <Added version="2.4.0"/>
524
+ train_size (Optional `int`): If the index needs a training step, specifies how many vectors will be used to train the index.
525
+ faiss_verbose (`bool`, defaults to False): Enable the verbosity of the Faiss index.
526
+ """
527
+ faiss_index = FaissIndex(
528
+ device=device, string_factory=string_factory, metric_type=metric_type, custom_index=custom_index
529
+ )
530
+ faiss_index.add_vectors(
531
+ external_arrays, column=None, batch_size=batch_size, train_size=train_size, faiss_verbose=faiss_verbose
532
+ )
533
+ self._indexes[index_name] = faiss_index
534
+
535
+ def save_faiss_index(self, index_name: str, file: Union[str, PurePath], storage_options: Optional[Dict] = None):
536
+ """Save a FaissIndex on disk.
537
+
538
+ Args:
539
+ index_name (`str`): The index_name/identifier of the index. This is the index_name that is used to call `.get_nearest` or `.search`.
540
+ file (`str`): The path to the serialized faiss index on disk or remote URI (e.g. `"s3://my-bucket/index.faiss"`).
541
+ storage_options (`dict`, *optional*):
542
+ Key/value pairs to be passed on to the file-system backend, if any.
543
+
544
+ <Added version="2.11.0"/>
545
+
546
+ """
547
+ index = self.get_index(index_name)
548
+ if not isinstance(index, FaissIndex):
549
+ raise ValueError(f"Index '{index_name}' is not a FaissIndex but a '{type(index)}'")
550
+ index.save(file, storage_options=storage_options)
551
+ logger.info(f"Saved FaissIndex {index_name} at {file}")
552
+
553
+ def load_faiss_index(
554
+ self,
555
+ index_name: str,
556
+ file: Union[str, PurePath],
557
+ device: Optional[Union[int, List[int]]] = None,
558
+ storage_options: Optional[Dict] = None,
559
+ ):
560
+ """Load a FaissIndex from disk.
561
+
562
+ If you want to do additional configurations, you can have access to the faiss index object by doing
563
+ `.get_index(index_name).faiss_index` to make it fit your needs.
564
+
565
+ Args:
566
+ index_name (`str`): The index_name/identifier of the index. This is the index_name that is used to
567
+ call `.get_nearest` or `.search`.
568
+ file (`str`): The path to the serialized faiss index on disk or remote URI (e.g. `"s3://my-bucket/index.faiss"`).
569
+ device (Optional `Union[int, List[int]]`): If positive integer, this is the index of the GPU to use. If negative integer, use all GPUs.
570
+ If a list of positive integers is passed in, run only on those GPUs. By default it uses the CPU.
571
+ storage_options (`dict`, *optional*):
572
+ Key/value pairs to be passed on to the file-system backend, if any.
573
+
574
+ <Added version="2.11.0"/>
575
+
576
+ """
577
+ index = FaissIndex.load(file, device=device, storage_options=storage_options)
578
+ if index.faiss_index.ntotal != len(self):
579
+ raise ValueError(
580
+ f"Index size should match Dataset size, but Index '{index_name}' at {file} has {index.faiss_index.ntotal} elements while the dataset has {len(self)} examples."
581
+ )
582
+ self._indexes[index_name] = index
583
+ logger.info(f"Loaded FaissIndex {index_name} from {file}")
584
+
585
+ def add_elasticsearch_index(
586
+ self,
587
+ column: str,
588
+ index_name: Optional[str] = None,
589
+ host: Optional[str] = None,
590
+ port: Optional[int] = None,
591
+ es_client: Optional["Elasticsearch"] = None,
592
+ es_index_name: Optional[str] = None,
593
+ es_index_config: Optional[dict] = None,
594
+ ):
595
+ """Add a text index using ElasticSearch for fast retrieval.
596
+
597
+ Args:
598
+ column (`str`): The column of the documents to add to the index.
599
+ index_name (Optional `str`): The index_name/identifier of the index. This is the index name that is used to call `.get_nearest` or `.search`.
600
+ By default it corresponds to `column`.
601
+ host (Optional `str`, defaults to localhost):
602
+ Host where ElasticSearch is running.
603
+ port (Optional `int`, defaults to 9200):
604
+ Port where ElasticSearch is running.
605
+ es_client (Optional `elasticsearch.Elasticsearch`):
606
+ The elasticsearch client used to create the index if host and port are None.
607
+ es_index_name (Optional `str`): The elasticsearch index name used to create the index.
608
+ es_index_config (Optional `dict`):
609
+ The configuration of the elasticsearch index.
610
+ Default config is:
611
+
612
+ Config::
613
+
614
+ {
615
+ "settings": {
616
+ "number_of_shards": 1,
617
+ "analysis": {"analyzer": {"stop_standard": {"type": "standard", " stopwords": "_english_"}}},
618
+ },
619
+ "mappings": {
620
+ "properties": {
621
+ "text": {
622
+ "type": "text",
623
+ "analyzer": "standard",
624
+ "similarity": "BM25"
625
+ },
626
+ }
627
+ },
628
+ }
629
+ """
630
+ index_name = index_name if index_name is not None else column
631
+ es_index = ElasticSearchIndex(
632
+ host=host, port=port, es_client=es_client, es_index_name=es_index_name, es_index_config=es_index_config
633
+ )
634
+ es_index.add_documents(self, column=column)
635
+ self._indexes[index_name] = es_index
636
+
637
+ def load_elasticsearch_index(
638
+ self,
639
+ index_name: str,
640
+ es_index_name: str,
641
+ host: Optional[str] = None,
642
+ port: Optional[int] = None,
643
+ es_client: Optional["Elasticsearch"] = None,
644
+ es_index_config: Optional[dict] = None,
645
+ ):
646
+ """Load an existing text index using ElasticSearch for fast retrieval.
647
+
648
+ Args:
649
+ index_name (`str`):
650
+ The `index_name`/identifier of the index. This is the index name that is used to call `get_nearest` or `search`.
651
+ es_index_name (`str`):
652
+ The name of elasticsearch index to load.
653
+ host (`str`, *optional*, defaults to `localhost`):
654
+ Host where ElasticSearch is running.
655
+ port (`int`, *optional*, defaults to `9200`):
656
+ Port where ElasticSearch is running.
657
+ es_client (`elasticsearch.Elasticsearch`, *optional*):
658
+ The elasticsearch client used to create the index if host and port are `None`.
659
+ es_index_config (`dict`, *optional*):
660
+ The configuration of the elasticsearch index.
661
+ Default config is:
662
+ ```
663
+ {
664
+ "settings": {
665
+ "number_of_shards": 1,
666
+ "analysis": {"analyzer": {"stop_standard": {"type": "standard", " stopwords": "_english_"}}},
667
+ },
668
+ "mappings": {
669
+ "properties": {
670
+ "text": {
671
+ "type": "text",
672
+ "analyzer": "standard",
673
+ "similarity": "BM25"
674
+ },
675
+ }
676
+ },
677
+ }
678
+ ```
679
+ """
680
+ self._indexes[index_name] = ElasticSearchIndex(
681
+ host=host, port=port, es_client=es_client, es_index_name=es_index_name, es_index_config=es_index_config
682
+ )
683
+
684
+ def drop_index(self, index_name: str):
685
+ """Drop the index with the specified column.
686
+
687
+ Args:
688
+ index_name (`str`):
689
+ The `index_name`/identifier of the index.
690
+ """
691
+ del self._indexes[index_name]
692
+
693
+ def search(self, index_name: str, query: Union[str, np.array], k: int = 10, **kwargs) -> SearchResults:
694
+ """Find the nearest examples indices in the dataset to the query.
695
+
696
+ Args:
697
+ index_name (`str`):
698
+ The name/identifier of the index.
699
+ query (`Union[str, np.ndarray]`):
700
+ The query as a string if `index_name` is a text index or as a numpy array if `index_name` is a vector index.
701
+ k (`int`):
702
+ The number of examples to retrieve.
703
+
704
+ Returns:
705
+ `(scores, indices)`:
706
+ A tuple of `(scores, indices)` where:
707
+ - **scores** (`List[List[float]]`): the retrieval scores from either FAISS (`IndexFlatL2` by default) or ElasticSearch of the retrieved examples
708
+ - **indices** (`List[List[int]]`): the indices of the retrieved examples
709
+ """
710
+ self._check_index_is_initialized(index_name)
711
+ return self._indexes[index_name].search(query, k, **kwargs)
712
+
713
+ def search_batch(
714
+ self, index_name: str, queries: Union[List[str], np.array], k: int = 10, **kwargs
715
+ ) -> BatchedSearchResults:
716
+ """Find the nearest examples indices in the dataset to the query.
717
+
718
+ Args:
719
+ index_name (`str`):
720
+ The `index_name`/identifier of the index.
721
+ queries (`Union[List[str], np.ndarray]`):
722
+ The queries as a list of strings if `index_name` is a text index or as a numpy array if `index_name` is a vector index.
723
+ k (`int`):
724
+ The number of examples to retrieve per query.
725
+
726
+ Returns:
727
+ `(total_scores, total_indices)`:
728
+ A tuple of `(total_scores, total_indices)` where:
729
+ - **total_scores** (`List[List[float]]`): the retrieval scores from either FAISS (`IndexFlatL2` by default) or ElasticSearch of the retrieved examples per query
730
+ - **total_indices** (`List[List[int]]`): the indices of the retrieved examples per query
731
+ """
732
+ self._check_index_is_initialized(index_name)
733
+ return self._indexes[index_name].search_batch(queries, k, **kwargs)
734
+
735
+ def get_nearest_examples(
736
+ self, index_name: str, query: Union[str, np.array], k: int = 10, **kwargs
737
+ ) -> NearestExamplesResults:
738
+ """Find the nearest examples in the dataset to the query.
739
+
740
+ Args:
741
+ index_name (`str`):
742
+ The index_name/identifier of the index.
743
+ query (`Union[str, np.ndarray]`):
744
+ The query as a string if `index_name` is a text index or as a numpy array if `index_name` is a vector index.
745
+ k (`int`):
746
+ The number of examples to retrieve.
747
+
748
+ Returns:
749
+ `(scores, examples)`:
750
+ A tuple of `(scores, examples)` where:
751
+ - **scores** (`List[float]`): the retrieval scores from either FAISS (`IndexFlatL2` by default) or ElasticSearch of the retrieved examples
752
+ - **examples** (`dict`): the retrieved examples
753
+ """
754
+ self._check_index_is_initialized(index_name)
755
+ scores, indices = self.search(index_name, query, k, **kwargs)
756
+ top_indices = [i for i in indices if i >= 0]
757
+ return NearestExamplesResults(scores[: len(top_indices)], self[top_indices])
758
+
759
+ def get_nearest_examples_batch(
760
+ self, index_name: str, queries: Union[List[str], np.array], k: int = 10, **kwargs
761
+ ) -> BatchedNearestExamplesResults:
762
+ """Find the nearest examples in the dataset to the query.
763
+
764
+ Args:
765
+ index_name (`str`):
766
+ The `index_name`/identifier of the index.
767
+ queries (`Union[List[str], np.ndarray]`):
768
+ The queries as a list of strings if `index_name` is a text index or as a numpy array if `index_name` is a vector index.
769
+ k (`int`):
770
+ The number of examples to retrieve per query.
771
+
772
+ Returns:
773
+ `(total_scores, total_examples)`:
774
+ A tuple of `(total_scores, total_examples)` where:
775
+ - **total_scores** (`List[List[float]]`): the retrieval scores from either FAISS (`IndexFlatL2` by default) or ElasticSearch of the retrieved examples per query
776
+ - **total_examples** (`List[dict]`): the retrieved examples per query
777
+ """
778
+ self._check_index_is_initialized(index_name)
779
+ total_scores, total_indices = self.search_batch(index_name, queries, k, **kwargs)
780
+ total_scores = [
781
+ scores_i[: len([i for i in indices_i if i >= 0])]
782
+ for scores_i, indices_i in zip(total_scores, total_indices)
783
+ ]
784
+ total_samples = [self[[i for i in indices if i >= 0]] for indices in total_indices]
785
+ return BatchedNearestExamplesResults(total_scores, total_samples)
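The retrieval surface above all keys off `index_name`. A minimal usage sketch, assuming `faiss` is installed; the toy dataset, column name, and vectors below are illustrative, not from this diff:

```python
import numpy as np
from datasets import Dataset

# Hypothetical toy dataset; "embeddings" is an illustrative column of float32 vectors.
ds = Dataset.from_dict(
    {
        "text": ["a", "b", "c"],
        "embeddings": [np.random.rand(8).astype(np.float32) for _ in range(3)],
    }
)

# Build the index; index_name defaults to the column name.
ds.add_faiss_index(column="embeddings")

# Query with a vector of the same dimension.
query = np.random.rand(8).astype(np.float32)
scores, examples = ds.get_nearest_examples("embeddings", query, k=2)

ds.drop_index("embeddings")
```

The same `index_name` routes `search`, `search_batch`, and the `get_nearest_examples*` variants to the underlying `FaissIndex` or `ElasticSearchIndex`.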
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/splits.py ADDED
@@ -0,0 +1,635 @@
1
+ # Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Lint as: python3
16
+ """Splits related API."""
17
+
18
+ import abc
19
+ import collections
20
+ import copy
21
+ import dataclasses
22
+ import re
23
+ from dataclasses import dataclass
24
+ from typing import Dict, List, Optional, Union
25
+
26
+ from .arrow_reader import FileInstructions, make_file_instructions
27
+ from .naming import _split_re
28
+ from .utils.py_utils import NonMutableDict, asdict
29
+
30
+
31
+ @dataclass
32
+ class SplitInfo:
33
+ name: str = dataclasses.field(default="", metadata={"include_in_asdict_even_if_is_default": True})
34
+ num_bytes: int = dataclasses.field(default=0, metadata={"include_in_asdict_even_if_is_default": True})
35
+ num_examples: int = dataclasses.field(default=0, metadata={"include_in_asdict_even_if_is_default": True})
36
+ shard_lengths: Optional[List[int]] = None
37
+
38
+ # Deprecated
39
+ # For backward compatibility, this field needs to always be included in files like
40
+ # dataset_infos.json and dataset_info.json files
41
+ # To do so, we always include it in the output of datasets.utils.py_utils.asdict(split_info)
42
+ dataset_name: Optional[str] = dataclasses.field(
43
+ default=None, metadata={"include_in_asdict_even_if_is_default": True}
44
+ )
45
+
46
+ @property
47
+ def file_instructions(self):
48
+ """Returns the list of dict(filename, take, skip)."""
49
+ # `self.dataset_name` is assigned in `SplitDict.add()`.
50
+ instructions = make_file_instructions(
51
+ name=self.dataset_name,
52
+ split_infos=[self],
53
+ instruction=str(self.name),
54
+ )
55
+ return instructions.file_instructions
56
+
57
+
58
+ @dataclass
59
+ class SubSplitInfo:
60
+ """Wrapper around a sub split info.
61
+ This class exposes info on the subsplit:
62
+ ```
63
+ ds, info = datasets.load_dataset(..., split='train[75%:]', with_info=True)
64
+ info.splits['train[75%:]'].num_examples
65
+ ```
66
+ """
67
+
68
+ instructions: FileInstructions
69
+
70
+ @property
71
+ def num_examples(self):
72
+ """Returns the number of example in the subsplit."""
73
+ return self.instructions.num_examples
74
+
75
+ @property
76
+ def file_instructions(self):
77
+ """Returns the list of dict(filename, take, skip)."""
78
+ return self.instructions.file_instructions
79
+
80
+
81
+ class SplitBase(metaclass=abc.ABCMeta):
82
+ # pylint: disable=line-too-long
83
+ """Abstract base class for Split compositionality.
84
+
85
+ See the
86
+ [guide on splits](../loading#slice-splits)
87
+ for more information.
88
+
89
+ There are three parts to the composition:
90
+ 1) The splits are composed (defined, merged, split,...) together before
91
+ calling the `.as_dataset()` function. This is done with the `__add__`,
92
+ `__getitem__`, which return a tree of `SplitBase` (whose leaves
93
+ are the `NamedSplit` objects)
94
+
95
+ ```
96
+ split = datasets.Split.TRAIN + datasets.Split.TEST.subsplit(datasets.percent[:50])
97
+ ```
98
+
99
+ 2) The `SplitBase` is forwarded to the `.as_dataset()` function
100
+ to be resolved into actual read instruction. This is done by the
101
+ `.get_read_instruction()` method which takes the real dataset splits
102
+ (name, number of shards,...) and parses the tree to return a
103
+ `SplitReadInstruction()` object
104
+
105
+ ```
106
+ read_instruction = split.get_read_instruction(self.info.splits)
107
+ ```
108
+
109
+ 3) The `SplitReadInstruction` is then used in the `tf.data.Dataset` pipeline
110
+ to define which files to read and how to skip examples within file.
111
+
112
+ """
113
+
114
+ # pylint: enable=line-too-long
115
+
116
+ @abc.abstractmethod
117
+ def get_read_instruction(self, split_dict):
118
+ """Parse the descriptor tree and compile all read instructions together.
119
+
120
+ Args:
121
+ split_dict: `dict`, The `dict[split_name, SplitInfo]` of the dataset
122
+
123
+ Returns:
124
+ split_read_instruction: `SplitReadInstruction`
125
+ """
126
+ raise NotImplementedError("Abstract method")
127
+
128
+ def __eq__(self, other):
129
+ """Equality: datasets.Split.TRAIN == 'train'."""
130
+ if isinstance(other, (NamedSplit, str)):
131
+ return False
132
+ raise NotImplementedError("Equality is not implemented between merged/sub splits.")
133
+
134
+ def __ne__(self, other):
135
+ """InEquality: datasets.Split.TRAIN != 'test'."""
136
+ return not self.__eq__(other)
137
+
138
+ def __add__(self, other):
139
+ """Merging: datasets.Split.TRAIN + datasets.Split.TEST."""
140
+ return _SplitMerged(self, other)
141
+
142
+ def subsplit(self, arg=None, k=None, percent=None, weighted=None): # pylint: disable=redefined-outer-name
143
+ """Divides this split into subsplits.
144
+
145
+ There are 3 ways to define subsplits, which correspond to the 3
146
+ arguments `k` (get `k` even subsplits), `percent` (get a slice of the
147
+ dataset with `datasets.percent`), and `weighted` (get subsplits with proportions
148
+ specified by `weighted`).
149
+
150
+ Example::
151
+
152
+ ```
153
+ # 50% train, 50% test
154
+ train, test = split.subsplit(k=2)
155
+ # 50% train, 25% test, 25% validation
156
+ train, test, validation = split.subsplit(weighted=[2, 1, 1])
157
+ # Extract last 20%
158
+ subsplit = split.subsplit(datasets.percent[-20:])
159
+ ```
160
+
161
+ Warning: k and weighted will be converted into percent which means that
162
+ values below the percent will be rounded up or down. The final split may be
163
+ bigger to deal with remainders. For instance:
164
+
165
+ ```
166
+ train, test, valid = split.subsplit(k=3) # 33%, 33%, 34%
167
+ s1, s2, s3, s4 = split.subsplit(weighted=[2, 2, 1, 1]) # 33%, 33%, 16%, 18%
168
+ ```
169
+
170
+ Args:
171
+ arg: If no kwargs are given, `arg` will be interpreted as one of
172
+ `k`, `percent`, or `weighted` depending on the type.
173
+ For example:
174
+ ```
175
+ split.subsplit(10) # Equivalent to split.subsplit(k=10)
176
+ split.subsplit(datasets.percent[:-20]) # percent=datasets.percent[:-20]
177
+ split.subsplit([1, 1, 2]) # weighted=[1, 1, 2]
178
+ ```
179
+ k: `int` If set, subdivide the split into `k` equal parts.
180
+ percent: `datasets.percent slice`, return a single subsplit corresponding to
181
+ a slice of the original split. For example:
182
+ `split.subsplit(datasets.percent[-20:]) # Last 20% of the dataset`.
183
+ weighted: `list[int]`, return a list of subsplits whose proportions match
184
+ the normalized sum of the list. For example:
185
+ `split.subsplit(weighted=[1, 1, 2]) # 25%, 25%, 50%`.
186
+
187
+ Returns:
188
+ A subsplit or list of subsplits extracted from this split object.
189
+ """
190
+ # Note that the percent kwargs redefine the outer name datasets.percent. This
191
+ # is done for consistency (.subsplit(percent=datasets.percent[:40]))
192
+ if sum(bool(x) for x in (arg, k, percent, weighted)) != 1:
193
+ raise ValueError("Only one argument of subsplit should be set.")
194
+
195
+ # Auto deduce k
196
+ if isinstance(arg, int):
197
+ k = arg
198
+ elif isinstance(arg, slice):
199
+ percent = arg
200
+ elif isinstance(arg, list):
201
+ weighted = arg
202
+
203
+ if not (k or percent or weighted):
204
+ raise ValueError(
205
+ f"Invalid split argument {arg}. Only list, slice and int supported. "
206
+ "One of k, weighted or percent should be set to a non empty value."
207
+ )
208
+
209
+ def assert_slices_coverage(slices):
210
+ # Ensure that the expanded slices cover all percents.
211
+ assert sum((list(range(*s.indices(100))) for s in slices), []) == list(range(100))
212
+
213
+ if k:
214
+ if not 0 < k <= 100:
215
+ raise ValueError(f"Subsplit k should be between 0 and 100, got {k}")
216
+ shift = 100 // k
217
+ slices = [slice(i * shift, (i + 1) * shift) for i in range(k)]
218
+ # Round up last element to ensure all elements are taken
219
+ slices[-1] = slice(slices[-1].start, 100)
220
+ # Internal check to ensure full coverage
221
+ assert_slices_coverage(slices)
222
+ return tuple(_SubSplit(self, s) for s in slices)
223
+ elif percent:
224
+ return _SubSplit(self, percent)
225
+ elif weighted:
226
+ # Normalize the weighted sum
227
+ total = sum(weighted)
228
+ weighted = [100 * x // total for x in weighted]
229
+ # Create the slice for each of the elements
230
+ start = 0
231
+ stop = 0
232
+ slices = []
233
+ for v in weighted:
234
+ stop += v
235
+ slices.append(slice(start, stop))
236
+ start = stop
237
+ # Round up last element to ensure all elements are taken
238
+ slices[-1] = slice(slices[-1].start, 100)
239
+ # Internal check to ensure full coverage
240
+ assert_slices_coverage(slices)
241
+ return tuple(_SubSplit(self, s) for s in slices)
242
+ else:
243
+ # Should not be possible
244
+ raise ValueError("Could not determine the split")
245
+
246
+
247
+ # 2 requirements:
248
+ # 1. datasets.percent be sliceable
249
+ # 2. datasets.percent be documented
250
+ #
251
+ # Instances are not documented, so we want datasets.percent to be a class, but to
252
+ # have it be sliceable, we need this metaclass.
253
+ class PercentSliceMeta(type):
254
+ def __getitem__(cls, slice_value):
255
+ if not isinstance(slice_value, slice):
256
+ raise ValueError(f"datasets.percent should only be called with slice, not {slice_value}")
257
+ return slice_value
258
+
259
+
260
+ class PercentSlice(metaclass=PercentSliceMeta):
261
+ # pylint: disable=line-too-long
262
+ """Syntactic sugar for defining slice subsplits: `datasets.percent[75:-5]`.
263
+
264
+ See the
265
+ [guide on splits](../loading#slice-splits)
266
+ for more information.
267
+ """
268
+
269
+ # pylint: enable=line-too-long
270
+ pass
271
+
272
+
273
+ percent = PercentSlice # pylint: disable=invalid-name
274
+
275
+
276
+ class _SplitMerged(SplitBase):
277
+ """Represent two split descriptors merged together."""
278
+
279
+ def __init__(self, split1, split2):
280
+ self._split1 = split1
281
+ self._split2 = split2
282
+
283
+ def get_read_instruction(self, split_dict):
284
+ read_instruction1 = self._split1.get_read_instruction(split_dict)
285
+ read_instruction2 = self._split2.get_read_instruction(split_dict)
286
+ return read_instruction1 + read_instruction2
287
+
288
+ def __repr__(self):
289
+ return f"({repr(self._split1)} + {repr(self._split2)})"
290
+
291
+
292
+ class _SubSplit(SplitBase):
293
+ """Represent a sub split of a split descriptor."""
294
+
295
+ def __init__(self, split, slice_value):
296
+ self._split = split
297
+ self._slice_value = slice_value
298
+
299
+ def get_read_instruction(self, split_dict):
300
+ return self._split.get_read_instruction(split_dict)[self._slice_value]
301
+
302
+ def __repr__(self):
303
+ slice_str = "{start}:{stop}"
304
+ if self._slice_value.step is not None:
305
+ slice_str += ":{step}"
306
+ slice_str = slice_str.format(
307
+ start="" if self._slice_value.start is None else self._slice_value.start,
308
+ stop="" if self._slice_value.stop is None else self._slice_value.stop,
309
+ step=self._slice_value.step,
310
+ )
311
+ return f"{repr(self._split)}(datasets.percent[{slice_str}])"
312
+
313
+
314
+ class NamedSplit(SplitBase):
315
+ """Descriptor corresponding to a named split (train, test, ...).
316
+
317
+ Example:
318
+ Each descriptor can be composed with others using addition or slicing:
319
+
320
+ ```py
321
+ split = datasets.Split.TRAIN.subsplit(datasets.percent[0:25]) + datasets.Split.TEST
322
+ ```
323
+
324
+ The resulting split will correspond to 25% of the train split merged with
325
+ 100% of the test split.
326
+
327
+ A split cannot be added twice, so the following will fail:
328
+
329
+ ```py
330
+ split = (
331
+ datasets.Split.TRAIN.subsplit(datasets.percent[:25]) +
332
+ datasets.Split.TRAIN.subsplit(datasets.percent[75:])
333
+ ) # Error
334
+ split = datasets.Split.TEST + datasets.Split.ALL # Error
335
+ ```
336
+
337
+ Slices can be applied only once, so the following are valid:
338
+
339
+ ```py
340
+ split = (
341
+ datasets.Split.TRAIN.subsplit(datasets.percent[:25]) +
342
+ datasets.Split.TEST.subsplit(datasets.percent[:50])
343
+ )
344
+ split = (datasets.Split.TRAIN + datasets.Split.TEST).subsplit(datasets.percent[:50])
345
+ ```
346
+
347
+ But this is not valid:
348
+
349
+ ```py
350
+ train = datasets.Split.TRAIN
351
+ test = datasets.Split.TEST
352
+ split = train.subsplit(datasets.percent[:25]).subsplit(datasets.percent[:25])
353
+ split = (train.subsplit(datasets.percent[:25]) + test).subsplit(datasets.percent[:50])
354
+ ```
355
+ """
356
+
357
+ def __init__(self, name):
358
+ self._name = name
359
+ split_names_from_instruction = [split_instruction.split("[")[0] for split_instruction in name.split("+")]
360
+ for split_name in split_names_from_instruction:
361
+ if not re.match(_split_re, split_name):
362
+ raise ValueError(f"Split name should match '{_split_re}' but got '{split_name}'.")
363
+
364
+ def __str__(self):
365
+ return self._name
366
+
367
+ def __repr__(self):
368
+ return f"NamedSplit({self._name!r})"
369
+
370
+ def __eq__(self, other):
371
+ """Equality: datasets.Split.TRAIN == 'train'."""
372
+ if isinstance(other, NamedSplit):
373
+ return self._name == other._name # pylint: disable=protected-access
374
+ elif isinstance(other, SplitBase):
375
+ return False
376
+ elif isinstance(other, str): # Other should be string
377
+ return self._name == other
378
+ else:
379
+ raise ValueError(f"Equality not supported between split {self} and {other}")
380
+
381
+ def __lt__(self, other):
382
+ return self._name < other._name # pylint: disable=protected-access
383
+
384
+ def __hash__(self):
385
+ return hash(self._name)
386
+
387
+ def get_read_instruction(self, split_dict):
388
+ return SplitReadInstruction(split_dict[self._name])
389
+
390
+
391
+ class NamedSplitAll(NamedSplit):
392
+ """Split corresponding to the union of all defined dataset splits."""
393
+
394
+ def __init__(self):
395
+ super().__init__("all")
396
+
397
+ def __repr__(self):
398
+ return "NamedSplitAll()"
399
+
400
+ def get_read_instruction(self, split_dict):
401
+ # Merge all dataset split together
402
+ read_instructions = [SplitReadInstruction(s) for s in split_dict.values()]
403
+ return sum(read_instructions, SplitReadInstruction())
404
+
405
+
406
+ class Split:
407
+ # pylint: disable=line-too-long
408
+ """`Enum` for dataset splits.
409
+
410
+ Datasets are typically split into different subsets to be used at various
411
+ stages of training and evaluation.
412
+
413
+ - `TRAIN`: the training data.
414
+ - `VALIDATION`: the validation data. If present, this is typically used as
415
+ evaluation data while iterating on a model (e.g. changing hyperparameters,
416
+ model architecture, etc.).
417
+ - `TEST`: the testing data. This is the data to report metrics on. Typically
418
+ you do not want to use this during model iteration as you may overfit to it.
419
+ - `ALL`: the union of all defined dataset splits.
420
+
421
+ All splits, including compositions, inherit from `datasets.SplitBase`.
422
+
423
+ See the [guide](../load_hub#splits) on splits for more information.
424
+
425
+ Example:
426
+
427
+ ```py
428
+ >>> datasets.SplitGenerator(
429
+ ... name=datasets.Split.TRAIN,
430
+ ... gen_kwargs={"split_key": "train", "files": dl_manager.download_and extract(url)},
431
+ ... ),
432
+ ... datasets.SplitGenerator(
433
+ ... name=datasets.Split.VALIDATION,
434
+ ... gen_kwargs={"split_key": "validation", "files": dl_manager.download_and extract(url)},
435
+ ... ),
436
+ ... datasets.SplitGenerator(
437
+ ... name=datasets.Split.TEST,
438
+ ... gen_kwargs={"split_key": "test", "files": dl_manager.download_and extract(url)},
439
+ ... )
440
+ ```
441
+ """
442
+
443
+ # pylint: enable=line-too-long
444
+ TRAIN = NamedSplit("train")
445
+ TEST = NamedSplit("test")
446
+ VALIDATION = NamedSplit("validation")
447
+ ALL = NamedSplitAll()
448
+
449
+ def __new__(cls, name):
450
+ """Create a custom split with datasets.Split('custom_name')."""
451
+ return NamedSplitAll() if name == "all" else NamedSplit(name)
452
+
453
+
454
+ # Similar to SplitInfo, but contain an additional slice info
455
+ SlicedSplitInfo = collections.namedtuple(
456
+ "SlicedSplitInfo",
457
+ [
458
+ "split_info",
459
+ "slice_value",
460
+ ],
461
+ ) # noqa: E231
462
+
463
+
464
+ class SplitReadInstruction:
465
+ """Object containing the reading instruction for the dataset.
466
+
467
+ Similarly to `SplitDescriptor` nodes, this object can be composed with itself,
468
+ but the resolution happens instantaneously, instead of keeping track of the
469
+ tree, such that all instructions are compiled and flattened in a single
470
+ SplitReadInstruction object containing the list of files and slice to use.
471
+
472
+ Once resolved, the instructions can be accessed with:
473
+
474
+ ```
475
+ read_instructions.get_list_sliced_split_info() # List of splits to use
476
+ ```
477
+
478
+ """
479
+
480
+ def __init__(self, split_info=None):
481
+ self._splits = NonMutableDict(error_msg="Overlap between splits. Split {key} has been added with itself.")
482
+
483
+ if split_info:
484
+ self.add(SlicedSplitInfo(split_info=split_info, slice_value=None))
485
+
486
+ def add(self, sliced_split):
487
+ """Add a SlicedSplitInfo the read instructions."""
488
+ # TODO(epot): Check that the number of examples per shard % 100 == 0
489
+ # Otherwise the slices value may be unbalanced and not exactly reflect the
490
+ # requested slice.
491
+ self._splits[sliced_split.split_info.name] = sliced_split
492
+
493
+ def __add__(self, other):
494
+ """Merging split together."""
495
+ # Will raise an error if a split has already been added (NonMutableDict)
496
+ # TODO(epot): If a split is already added but there is no overlap between
497
+ # the slices, should merge the slices (ex: [:10] + [80:])
498
+ split_instruction = SplitReadInstruction()
499
+ split_instruction._splits.update(self._splits) # pylint: disable=protected-access
500
+ split_instruction._splits.update(other._splits) # pylint: disable=protected-access
501
+ return split_instruction
502
+
503
+ def __getitem__(self, slice_value):
504
+ """Sub-splits."""
505
+ # Will raise an error if a split has already been sliced
506
+ split_instruction = SplitReadInstruction()
507
+ for v in self._splits.values():
508
+ if v.slice_value is not None:
509
+ raise ValueError(f"Trying to slice Split {v.split_info.name} which has already been sliced")
510
+ v = v._asdict()
511
+ v["slice_value"] = slice_value
512
+ split_instruction.add(SlicedSplitInfo(**v))
513
+ return split_instruction
514
+
515
+ def get_list_sliced_split_info(self):
516
+ return list(self._splits.values())
517
+
518
+
519
+ class SplitDict(dict):
520
+ """Split info object."""
521
+
522
+ def __init__(self, *args, dataset_name=None, **kwargs):
523
+ super().__init__(*args, **kwargs)
524
+ self.dataset_name = dataset_name
525
+
526
+ def __getitem__(self, key: Union[SplitBase, str]):
527
+ # 1st case: The key exists: `info.splits['train']`
528
+ if str(key) in self:
529
+ return super().__getitem__(str(key))
530
+ # 2nd case: Uses instructions: `info.splits['train[50%]']`
531
+ else:
532
+ instructions = make_file_instructions(
533
+ name=self.dataset_name,
534
+ split_infos=self.values(),
535
+ instruction=key,
536
+ )
537
+ return SubSplitInfo(instructions)
538
+
539
+ def __setitem__(self, key: Union[SplitBase, str], value: SplitInfo):
540
+ if key != value.name:
541
+ raise ValueError(f"Cannot add elem. (key mismatch: '{key}' != '{value.name}')")
542
+ super().__setitem__(key, value)
543
+
544
+ def add(self, split_info: SplitInfo):
545
+ """Add the split info."""
546
+ if split_info.name in self:
547
+ raise ValueError(f"Split {split_info.name} already present")
548
+ split_info.dataset_name = self.dataset_name
549
+ super().__setitem__(split_info.name, split_info)
550
+
551
+ @property
552
+ def total_num_examples(self):
553
+ """Return the total number of examples."""
554
+ return sum(s.num_examples for s in self.values())
555
+
556
+ @classmethod
557
+ def from_split_dict(cls, split_infos: Union[List, Dict], dataset_name: Optional[str] = None):
558
+ """Returns a new SplitDict initialized from a Dict or List of `split_infos`."""
559
+ if isinstance(split_infos, dict):
560
+ split_infos = list(split_infos.values())
561
+
562
+ if dataset_name is None:
563
+ dataset_name = split_infos[0].get("dataset_name") if split_infos else None
564
+
565
+ split_dict = cls(dataset_name=dataset_name)
566
+
567
+ for split_info in split_infos:
568
+ if isinstance(split_info, dict):
569
+ split_info = SplitInfo(**split_info)
570
+ split_dict.add(split_info)
571
+
572
+ return split_dict
573
+
574
+ def to_split_dict(self):
575
+ """Returns a list of SplitInfo protos that we have."""
576
+ out = []
577
+ for split_name, split_info in self.items():
578
+ split_info = copy.deepcopy(split_info)
579
+ split_info.name = split_name
580
+ out.append(split_info)
581
+ return out
582
+
583
+ def copy(self):
584
+ return SplitDict.from_split_dict(self.to_split_dict(), self.dataset_name)
585
+
586
+ def _to_yaml_list(self) -> list:
587
+ out = [asdict(s) for s in self.to_split_dict()]
588
+ # we don't need the shard lengths in YAML, since it depends on max_shard_size and num_proc
589
+ for split_info_dict in out:
590
+ split_info_dict.pop("shard_lengths", None)
591
+ # we don't need the dataset_name attribute that is deprecated
592
+ for split_info_dict in out:
593
+ split_info_dict.pop("dataset_name", None)
594
+ return out
595
+
596
+ @classmethod
597
+ def _from_yaml_list(cls, yaml_data: list) -> "SplitDict":
598
+ return cls.from_split_dict(yaml_data)
599
+
600
+
601
+ @dataclass
602
+ class SplitGenerator:
603
+ """Defines the split information for the generator.
604
+
605
+ This should be used as returned value of
606
+ `GeneratorBasedBuilder._split_generators`.
607
+ See `GeneratorBasedBuilder._split_generators` for more info and example
608
+ of usage.
609
+
610
+ Args:
611
+ name (`str`):
612
+ Name of the `Split` for which the generator will
613
+ create the examples.
614
+ **gen_kwargs (additional keyword arguments):
615
+ Keyword arguments to forward to the `DatasetBuilder._generate_examples` method
616
+ of the builder.
617
+
618
+ Example:
619
+
620
+ ```py
621
+ >>> datasets.SplitGenerator(
622
+ ... name=datasets.Split.TRAIN,
623
+ ... gen_kwargs={"split_key": "train", "files": dl_manager.download_and_extract(url)},
624
+ ... )
625
+ ```
626
+ """
627
+
628
+ name: str
629
+ gen_kwargs: Dict = dataclasses.field(default_factory=dict)
630
+ split_info: SplitInfo = dataclasses.field(init=False)
631
+
632
+ def __post_init__(self):
633
+ self.name = str(self.name) # Make sure we convert NamedSplits in strings
634
+ NamedSplit(self.name) # check that it's a valid split name
635
+ self.split_info = SplitInfo(name=self.name)
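For orientation, a small sketch of the composition behavior defined above (inferred from the classes in this file; only the standard `datasets.Split` export is assumed):

```python
import datasets

# NamedSplit compares equal to its plain string name.
assert datasets.Split.TRAIN == "train"

# __add__ builds a _SplitMerged node; the leaves stay NamedSplit objects.
merged = datasets.Split.TRAIN + datasets.Split.TEST
print(repr(merged))  # (NamedSplit('train') + NamedSplit('test'))

# Split.__new__ turns a custom name into a NamedSplit ("all" gives NamedSplitAll).
custom = datasets.Split("my_split")
print(custom)  # my_split
```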
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/table.py ADDED
@@ -0,0 +1,2422 @@
1
+ import copy
2
+ import os
3
+ from functools import partial
4
+ from itertools import groupby
5
+ from typing import TYPE_CHECKING, Any, Callable, Iterator, List, Optional, Tuple, TypeVar, Union
6
+
7
+ import numpy as np
8
+ import pyarrow as pa
9
+ import pyarrow.compute as pc
10
+ import pyarrow.types
11
+
12
+ from . import config
13
+ from .utils.logging import get_logger
14
+
15
+
16
+ if TYPE_CHECKING:
17
+ from .features.features import Features, FeatureType
18
+
19
+
20
+ logger = get_logger(__name__)
21
+
22
+
23
+ def inject_arrow_table_documentation(arrow_table_method):
24
+ def wrapper(fn):
25
+ fn.__doc__ = arrow_table_method.__doc__ + (fn.__doc__ if fn.__doc__ is not None else "")
26
+ fn.__doc__ = fn.__doc__.replace("pyarrow.Table", "Table")
27
+ if hasattr(arrow_table_method, "__annotations__"):
28
+ fn.__annotations__ = arrow_table_method.__annotations__
29
+ return fn
30
+
31
+ return wrapper
32
+
33
+
34
+ def _in_memory_arrow_table_from_file(filename: str) -> pa.Table:
35
+ in_memory_stream = pa.input_stream(filename)
36
+ opened_stream = pa.ipc.open_stream(in_memory_stream)
37
+ pa_table = opened_stream.read_all()
38
+ return pa_table
39
+
40
+
41
+ def _in_memory_arrow_table_from_buffer(buffer: pa.Buffer) -> pa.Table:
42
+ stream = pa.BufferReader(buffer)
43
+ opened_stream = pa.ipc.open_stream(stream)
44
+ table = opened_stream.read_all()
45
+ return table
46
+
47
+
48
+ def _memory_mapped_record_batch_reader_from_file(filename: str) -> pa.RecordBatchStreamReader:
49
+ memory_mapped_stream = pa.memory_map(filename)
50
+ return pa.ipc.open_stream(memory_mapped_stream)
51
+
52
+
53
+ def read_schema_from_file(filename: str) -> pa.Schema:
54
+ """
55
+ Infer the arrow table schema from a file without loading the whole file into memory.
56
+ Useful especially for very big files.
57
+ """
58
+ with pa.memory_map(filename) as memory_mapped_stream:
59
+ schema = pa.ipc.open_stream(memory_mapped_stream).schema
60
+ return schema
61
+
62
+
63
+ def _memory_mapped_arrow_table_from_file(filename: str) -> pa.Table:
64
+ opened_stream = _memory_mapped_record_batch_reader_from_file(filename)
65
+ pa_table = opened_stream.read_all()
66
+ return pa_table
67
+
68
+
69
+ def _deepcopy(x, memo: dict):
70
+ """deepcopy a regular class instance"""
71
+ cls = x.__class__
72
+ result = cls.__new__(cls)
73
+ memo[id(x)] = result
74
+ for k, v in x.__dict__.items():
75
+ setattr(result, k, copy.deepcopy(v, memo))
76
+ return result
77
+
78
+
79
+ def _interpolation_search(arr: List[int], x: int) -> int:
80
+ """
81
+ Return the position i of a sorted array so that arr[i] <= x < arr[i+1]
82
+
83
+ Args:
84
+ arr (`List[int]`): non-empty sorted list of integers
85
+ x (`int`): query
86
+
87
+ Returns:
88
+ `int`: the position i so that arr[i] <= x < arr[i+1]
89
+
90
+ Raises:
91
+ `IndexError`: if the array is empty or if the query is outside the array values
92
+ """
93
+ i, j = 0, len(arr) - 1
94
+ while i < j and arr[i] <= x < arr[j]:
95
+ k = i + ((j - i) * (x - arr[i]) // (arr[j] - arr[i]))
96
+ if arr[k] <= x < arr[k + 1]:
97
+ return k
98
+ elif arr[k] < x:
99
+ i, j = k + 1, j
100
+ else:
101
+ i, j = i, k
102
+ raise IndexError(f"Invalid query '{x}' for size {arr[-1] if len(arr) else 'none'}.")
103
+
104
+
105
+ class IndexedTableMixin:
106
+ def __init__(self, table: pa.Table):
107
+ self._schema: pa.Schema = table.schema
108
+ self._batches: List[pa.RecordBatch] = [
109
+ recordbatch for recordbatch in table.to_batches() if len(recordbatch) > 0
110
+ ]
111
+ self._offsets: np.ndarray = np.cumsum([0] + [len(b) for b in self._batches], dtype=np.int64)
112
+
113
+ def fast_gather(self, indices: Union[List[int], np.ndarray]) -> pa.Table:
114
+ """
115
+ Create a pa.Table by gathering the records at the specified indices. Should be faster
116
+ than pa.concat_tables(table.fast_slice(int(i) % table.num_rows, 1) for i in indices) since NumPy can compute
117
+ the binary searches in parallel, using highly optimized C code.
118
+ """
119
+ if not len(indices):
120
+ raise ValueError("Indices must be non-empty")
121
+ batch_indices = np.searchsorted(self._offsets, indices, side="right") - 1
122
+ return pa.Table.from_batches(
123
+ [
124
+ self._batches[batch_idx].slice(i - self._offsets[batch_idx], 1)
125
+ for batch_idx, i in zip(batch_indices, indices)
126
+ ],
127
+ schema=self._schema,
128
+ )
129
+
130
+ def fast_slice(self, offset=0, length=None) -> pa.Table:
131
+ """
132
+ Slice the Table using interpolation search.
133
+ The behavior is the same as `pyarrow.Table.slice` but it's significantly faster.
134
+
135
+ Interpolation search is used to find the start and end indexes of the batches we want to keep.
136
+ The batches to keep are then concatenated to form the sliced Table.
137
+ """
138
+ if offset < 0:
139
+ raise IndexError("Offset must be non-negative")
140
+ elif offset >= self._offsets[-1] or (length is not None and length <= 0):
141
+ return pa.Table.from_batches([], schema=self._schema)
142
+ i = _interpolation_search(self._offsets, offset)
143
+ if length is None or length + offset >= self._offsets[-1]:
144
+ batches = self._batches[i:]
145
+ batches[0] = batches[0].slice(offset - self._offsets[i])
146
+ else:
147
+ j = _interpolation_search(self._offsets, offset + length - 1)
148
+ batches = self._batches[i : j + 1]
149
+ batches[-1] = batches[-1].slice(0, offset + length - self._offsets[j])
150
+ batches[0] = batches[0].slice(offset - self._offsets[i])
151
+ return pa.Table.from_batches(batches, schema=self._schema)
152
+
153
+
154
+ class Table(IndexedTableMixin):
155
+ """
156
+ Wraps a pyarrow Table by using composition.
157
+ This is the base class for `InMemoryTable`, `MemoryMappedTable` and `ConcatenationTable`.
158
+
159
+ It implements all the basic attributes/methods of the pyarrow Table class except
160
+ the Table transforms: `slice, filter, flatten, combine_chunks, cast, add_column,
161
+ append_column, remove_column, set_column, rename_columns` and `drop`.
162
+
163
+ The implementation of these methods differs for the subclasses.
164
+ """
165
+
166
+ def __init__(self, table: pa.Table):
167
+ super().__init__(table)
168
+ self.table = table
169
+
170
+ def __deepcopy__(self, memo: dict):
171
+ # arrow tables are immutable, so there's no need to copy self.table
172
+ # moreover calling deepcopy on a pyarrow table seems to make pa.total_allocated_bytes() decrease for some reason
173
+ # by adding it to the memo, self.table won't be copied
174
+ memo[id(self.table)] = self.table
175
+ # same for the recordbatches used by the index
176
+ memo[id(self._batches)] = list(self._batches)
177
+ return _deepcopy(self, memo)
178
+
179
+ def validate(self, *args, **kwargs):
180
+ """
181
+ Perform validation checks. An exception is raised if validation fails.
182
+
183
+ By default only cheap validation checks are run. Pass `full=True`
184
+ for thorough validation checks (potentially `O(n)`).
185
+
186
+ Args:
187
+ full (`bool`, defaults to `False`):
188
+ If `True`, run expensive checks, otherwise cheap checks only.
189
+
190
+ Raises:
191
+ `pa.lib.ArrowInvalid`: if validation fails
192
+ """
193
+ return self.table.validate(*args, **kwargs)
194
+
195
+ def equals(self, *args, **kwargs):
196
+ """
197
+ Check if contents of two tables are equal.
198
+
199
+ Args:
200
+ other ([`~datasets.table.Table`]):
201
+ Table to compare against.
202
+ check_metadata (`bool`, defaults to `False`):
203
+ Whether schema metadata equality should be checked as well.
204
+
205
+ Returns:
206
+ `bool`
207
+ """
208
+ args = tuple(arg.table if isinstance(arg, Table) else arg for arg in args)
209
+ kwargs = {k: v.table if isinstance(v, Table) else v for k, v in kwargs.items()}
210
+ return self.table.equals(*args, **kwargs)
211
+
212
+ def to_batches(self, *args, **kwargs):
213
+ """
214
+ Convert Table to list of (contiguous) `RecordBatch` objects.
215
+
216
+ Args:
217
+ max_chunksize (`int`, defaults to `None`):
218
+ Maximum size for `RecordBatch` chunks. Individual chunks may be
219
+ smaller depending on the chunk layout of individual columns.
220
+
221
+ Returns:
222
+ `List[pyarrow.RecordBatch]`
223
+ """
224
+ return self.table.to_batches(*args, **kwargs)
225
+
226
+ def to_pydict(self, *args, **kwargs):
227
+ """
228
+ Convert the Table to a `dict` or `OrderedDict`.
229
+
230
+ Returns:
231
+ `dict`
232
+ """
233
+ return self.table.to_pydict(*args, **kwargs)
234
+
235
+ def to_pylist(self, *args, **kwargs):
236
+ """
237
+ Convert the Table to a list
238
+
239
+ Returns:
240
+ `list`
241
+ """
242
+ return self.table.to_pylist(*args, **kwargs)
243
+
244
+ def to_pandas(self, *args, **kwargs):
245
+ """
246
+ Convert to a pandas-compatible NumPy array or DataFrame, as appropriate.
247
+
248
+ Args:
249
+ memory_pool (`MemoryPool`, defaults to `None`):
250
+ Arrow MemoryPool to use for allocations. Uses the default memory
251
+ pool if not passed.
252
+ strings_to_categorical (`bool`, defaults to `False`):
253
+ Encode string (UTF8) and binary types to `pandas.Categorical`.
254
+ categories (`list`, defaults to `empty`):
255
+ List of fields that should be returned as `pandas.Categorical`. Only
256
+ applies to table-like data structures.
257
+ zero_copy_only (`bool`, defaults to `False`):
258
+ Raise an `ArrowException` if this function call would require copying
259
+ the underlying data.
260
+ integer_object_nulls (`bool`, defaults to `False`):
261
+ Cast integers with nulls to objects.
262
+ date_as_object (`bool`, defaults to `True`):
263
+ Cast dates to objects. If `False`, convert to `datetime64[ns]` dtype.
264
+ timestamp_as_object (`bool`, defaults to `False`):
265
+ Cast non-nanosecond timestamps (`np.datetime64`) to objects. This is
266
+ useful if you have timestamps that don't fit in the normal date
267
+ range of nanosecond timestamps (1678 CE-2262 CE).
268
+ If `False`, all timestamps are converted to `datetime64[ns]` dtype.
269
+ use_threads (`bool`, defaults to `True`):
270
+ Whether to parallelize the conversion using multiple threads.
271
+ deduplicate_objects (`bool`, defaults to `False`):
272
+ Do not create multiple copies of Python objects when created, to save
273
+ on memory use. Conversion will be slower.
274
+ ignore_metadata (`bool`, defaults to `False`):
275
+ If `True`, do not use the 'pandas' metadata to reconstruct the
276
+ DataFrame index, if present.
277
+ safe (`bool`, defaults to `True`):
278
+ For certain data types, a cast is needed in order to store the
279
+ data in a pandas DataFrame or Series (e.g. timestamps are always
280
+ stored as nanoseconds in pandas). This option controls whether it
281
+ is a safe cast or not.
282
+ split_blocks (`bool`, defaults to `False`):
283
+ If `True`, generate one internal "block" for each column when
284
+ creating a pandas.DataFrame from a `RecordBatch` or `Table`. While this
285
+ can temporarily reduce memory, note that various pandas operations
286
+ can trigger "consolidation" which may balloon memory use.
287
+ self_destruct (`bool`, defaults to `False`):
288
+ EXPERIMENTAL: If `True`, attempt to deallocate the originating Arrow
289
+ memory while converting the Arrow object to pandas. If you use the
290
+ object after calling `to_pandas` with this option it will crash your
291
+ program.
292
+ types_mapper (`function`, defaults to `None`):
293
+ A function mapping a pyarrow DataType to a pandas `ExtensionDtype`.
294
+ This can be used to override the default pandas type for conversion
295
+ of built-in pyarrow types or in absence of `pandas_metadata` in the
296
+ Table schema. The function receives a pyarrow DataType and is
297
+ expected to return a pandas `ExtensionDtype` or `None` if the
298
+ default conversion should be used for that type. If you have
299
+ a dictionary mapping, you can pass `dict.get` as function.
300
+
301
+ Returns:
302
+ `pandas.Series` or `pandas.DataFrame`: `pandas.Series` or `pandas.DataFrame` depending on type of object
303
+ """
304
+ return self.table.to_pandas(*args, **kwargs)
305
+
306
+ def to_string(self, *args, **kwargs):
307
+ return self.table.to_string(*args, **kwargs)
308
+
309
+ def to_reader(self, max_chunksize: Optional[int] = None):
310
+ """
311
+ Convert the Table to a RecordBatchReader.
312
+
313
+ Note that this method is zero-copy, it merely exposes the same data under a different API.
314
+
315
+ Args:
316
+ max_chunksize (`int`, defaults to `None`):
317
+ Maximum size for RecordBatch chunks. Individual chunks may be smaller depending
318
+ on the chunk layout of individual columns.
319
+
320
+ Returns:
321
+ `pyarrow.RecordBatchReader`
322
+ """
323
+ return self.table.to_reader(max_chunksize=max_chunksize)
324
+
325
+ def field(self, *args, **kwargs):
326
+ """
327
+ Select a schema field by its column name or numeric index.
328
+
329
+ Args:
330
+ i (`Union[int, str]`):
331
+ The index or name of the field to retrieve.
332
+
333
+ Returns:
334
+ `pyarrow.Field`
335
+ """
336
+ return self.table.field(*args, **kwargs)
337
+
338
+ def column(self, *args, **kwargs):
339
+ """
340
+ Select a column by its column name, or numeric index.
341
+
342
+ Args:
343
+ i (`Union[int, str]`):
344
+ The index or name of the column to retrieve.
345
+
346
+ Returns:
347
+ `pyarrow.ChunkedArray`
348
+ """
349
+ return self.table.column(*args, **kwargs)
350
+
351
+ def itercolumns(self, *args, **kwargs):
352
+ """
353
+ Iterator over all columns in their numerical order.
354
+
355
+ Yields:
356
+ `pyarrow.ChunkedArray`
357
+ """
358
+ return self.table.itercolumns(*args, **kwargs)
359
+
360
+ @property
361
+ def schema(self):
362
+ """
363
+ Schema of the table and its columns.
364
+
365
+ Returns:
366
+ `pyarrow.Schema`
367
+ """
368
+ return self.table.schema
369
+
370
+ @property
371
+ def columns(self):
372
+ """
373
+ List of all columns in numerical order.
374
+
375
+ Returns:
376
+ `List[pa.ChunkedArray]`
377
+ """
378
+ return self.table.columns
379
+
380
+ @property
381
+ def num_columns(self):
382
+ """
383
+ Number of columns in this table.
384
+
385
+ Returns:
386
+ int
387
+ """
388
+ return self.table.num_columns
389
+
390
+ @property
391
+ def num_rows(self):
392
+ """
393
+ Number of rows in this table.
394
+
395
+ Due to the definition of a table, all columns have the same number of
396
+ rows.
397
+
398
+ Returns:
399
+ int
400
+ """
401
+ return self.table.num_rows
402
+
403
+ @property
404
+ def shape(self):
405
+ """
406
+ Dimensions of the table: (#rows, #columns).
407
+
408
+ Returns:
409
+ `(int, int)`: Number of rows and number of columns.
410
+ """
411
+ return self.table.shape
412
+
413
+ @property
414
+ def nbytes(self):
415
+ """
416
+ Total number of bytes consumed by the elements of the table.
417
+ """
418
+ return self.table.nbytes
419
+
420
+ @property
421
+ def column_names(self):
422
+ """
423
+ Names of the table's columns.
424
+ """
425
+ return self.table.column_names
426
+
427
+ def __eq__(self, other):
428
+ return self.equals(other)
429
+
430
+ def __getitem__(self, i):
431
+ return self.table[i]
432
+
433
+ def __len__(self):
434
+ return len(self.table)
435
+
436
+ def __repr__(self):
437
+ return self.table.__repr__().replace("pyarrow.Table", self.__class__.__name__)
438
+
439
+ def __str__(self):
440
+ return self.table.__str__().replace("pyarrow.Table", self.__class__.__name__)
441
+
442
+ def slice(self, *args, **kwargs):
443
+ """
444
+ Compute zero-copy slice of this Table.
445
+
446
+ Args:
447
+ offset (`int`, defaults to `0`):
448
+ Offset from start of table to slice.
449
+ length (`int`, defaults to `None`):
450
+ Length of slice (default is until end of table starting from
451
+ offset).
452
+
453
+ Returns:
454
+ `datasets.table.Table`
455
+ """
456
+ raise NotImplementedError()
457
+
458
+ def filter(self, *args, **kwargs):
459
+ """
460
+ Select records from a Table. See `pyarrow.compute.filter` for full usage.
461
+ """
462
+ raise NotImplementedError()
463
+
464
+ def flatten(self, *args, **kwargs):
465
+ """
466
+ Flatten this Table. Each column with a struct type is flattened
467
+ into one column per struct field. Other columns are left unchanged.
468
+
469
+ Args:
470
+ memory_pool (`MemoryPool`, defaults to `None`):
471
+ For memory allocations, if required, otherwise use default pool.
472
+
473
+ Returns:
474
+ `datasets.table.Table`
475
+ """
476
+ raise NotImplementedError()
477
+
478
+ def combine_chunks(self, *args, **kwargs):
479
+ """
480
+ Make a new table by combining the chunks this table has.
481
+
482
+ All the underlying chunks in the `ChunkedArray` of each column are
483
+ concatenated into zero or one chunk.
484
+
485
+ Args:
486
+ memory_pool (`MemoryPool`, defaults to `None`):
487
+ For memory allocations, if required, otherwise use default pool.
488
+
489
+ Returns:
490
+ `datasets.table.Table`
491
+ """
492
+ raise NotImplementedError()
493
+
494
+ def cast(self, *args, **kwargs):
495
+ """
496
+ Cast table values to another schema.
497
+
498
+ Args:
499
+ target_schema (`Schema`):
500
+ Schema to cast to, the names and order of fields must match.
501
+ safe (`bool`, defaults to `True`):
502
+ Check for overflows or other unsafe conversions.
503
+
504
+ Returns:
505
+ `datasets.table.Table`
506
+ """
507
+ raise NotImplementedError()
508
+
509
+ def replace_schema_metadata(self, *args, **kwargs):
510
+ """
511
+ EXPERIMENTAL: Create shallow copy of table by replacing schema
512
+        key-value metadata with the indicated new metadata (which may be `None`,
+        which deletes any existing metadata).
514
+
515
+ Args:
516
+ metadata (`dict`, defaults to `None`):
517
+
518
+ Returns:
519
+ `datasets.table.Table`: shallow_copy
520
+ """
521
+ raise NotImplementedError()
522
+
523
+ def add_column(self, *args, **kwargs):
524
+ """
525
+ Add column to Table at position.
526
+
527
+ A new table is returned with the column added, the original table
528
+ object is left unchanged.
529
+
530
+ Args:
531
+ i (`int`):
532
+ Index to place the column at.
533
+ field_ (`Union[str, pyarrow.Field]`):
534
+ If a string is passed then the type is deduced from the column
535
+ data.
536
+ column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
537
+ Column data.
538
+
539
+ Returns:
540
+ `datasets.table.Table`: New table with the passed column added.
541
+ """
542
+ raise NotImplementedError()
543
+
544
+ def append_column(self, *args, **kwargs):
545
+ """
546
+ Append column at end of columns.
547
+
548
+ Args:
549
+ field_ (`Union[str, pyarrow.Field]`):
550
+ If a string is passed then the type is deduced from the column
551
+ data.
552
+ column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
553
+ Column data.
554
+
555
+ Returns:
556
+ `datasets.table.Table`: New table with the passed column added.
557
+ """
558
+ raise NotImplementedError()
559
+
560
+ def remove_column(self, *args, **kwargs):
561
+ """
562
+ Create new Table with the indicated column removed.
563
+
564
+ Args:
565
+ i (`int`):
566
+ Index of column to remove.
567
+
568
+ Returns:
569
+ `datasets.table.Table`: New table without the column.
570
+ """
571
+ raise NotImplementedError()
572
+
573
+ def set_column(self, *args, **kwargs):
574
+ """
575
+ Replace column in Table at position.
576
+
577
+ Args:
578
+ i (`int`):
579
+ Index to place the column at.
580
+ field_ (`Union[str, pyarrow.Field]`):
581
+ If a string is passed then the type is deduced from the column
582
+ data.
583
+ column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
584
+ Column data.
585
+
586
+ Returns:
587
+ `datasets.table.Table`: New table with the passed column set.
588
+ """
589
+ raise NotImplementedError()
590
+
591
+ def rename_columns(self, *args, **kwargs):
592
+ """
593
+ Create new table with columns renamed to provided names.
594
+ """
595
+ raise NotImplementedError()
596
+
597
+ def drop(self, *args, **kwargs):
598
+ """
599
+ Drop one or more columns and return a new table.
600
+
601
+ Args:
602
+ columns (`List[str]`):
603
+ List of field names referencing existing columns.
604
+
605
+ Raises:
606
+            `KeyError`: if any of the passed column names do not exist.
607
+
608
+ Returns:
609
+ `datasets.table.Table`: New table without the columns.
610
+ """
611
+ raise NotImplementedError()
612
+
613
+ def select(self, *args, **kwargs):
614
+ """
615
+ Select columns of the table.
616
+
617
+ Returns a new table with the specified columns, and metadata preserved.
618
+
619
+ Args:
620
+ columns (:obj:`Union[List[str], List[int]]`):
621
+ The column names or integer indices to select.
622
+
623
+ Returns:
624
+ `datasets.table.Table`: table with only a subset of the columns
625
+ """
626
+ raise NotImplementedError()
627
+
628
+
629
+ class TableBlock(Table):
630
+ """
631
+    `TableBlock` is the allowed class inside a `ConcatenationTable`.
+    Only `MemoryMappedTable` and `InMemoryTable` are `TableBlock`s.
+    This is because we don't want a `ConcatenationTable` made out of other `ConcatenationTable`s.
634
+ """
635
+
636
+ pass
637
+
638
+
639
+ class InMemoryTable(TableBlock):
640
+ """
641
+ The table is said in-memory when it is loaded into the user's RAM.
642
+
643
+ Pickling it does copy all the data using memory.
644
+ Its implementation is simple and uses the underlying pyarrow Table methods directly.
645
+
646
+ This is different from the `MemoryMapped` table, for which pickling doesn't copy all the
647
+ data in memory. For a `MemoryMapped`, unpickling instead reloads the table from the disk.
648
+
649
+ `InMemoryTable` must be used when data fit in memory, while `MemoryMapped` are reserved for
650
+ data bigger than memory or when you want the memory footprint of your application to
651
+ stay low.
652
+ """
653
+
654
+ @classmethod
655
+ def from_file(cls, filename: str):
656
+ table = _in_memory_arrow_table_from_file(filename)
657
+ return cls(table)
658
+
659
+ @classmethod
660
+ def from_buffer(cls, buffer: pa.Buffer):
661
+ table = _in_memory_arrow_table_from_buffer(buffer)
662
+ return cls(table)
663
+
664
+ @classmethod
665
+ def from_pandas(cls, *args, **kwargs):
666
+ """
667
+ Convert pandas.DataFrame to an Arrow Table.
668
+
669
+ The column types in the resulting Arrow Table are inferred from the
670
+ dtypes of the pandas.Series in the DataFrame. In the case of non-object
671
+ Series, the NumPy dtype is translated to its Arrow equivalent. In the
672
+ case of `object`, we need to guess the datatype by looking at the
673
+ Python objects in this Series.
674
+
675
+ Be aware that Series of the `object` dtype don't carry enough
676
+ information to always lead to a meaningful Arrow type. In the case that
677
+ we cannot infer a type, e.g. because the DataFrame is of length 0 or
678
+ the Series only contains `None/nan` objects, the type is set to
679
+ null. This behavior can be avoided by constructing an explicit schema
680
+ and passing it to this function.
681
+
682
+ Args:
683
+ df (`pandas.DataFrame`):
684
+ schema (`pyarrow.Schema`, *optional*):
685
+ The expected schema of the Arrow Table. This can be used to
686
+ indicate the type of columns if we cannot infer it automatically.
687
+ If passed, the output will have exactly this schema. Columns
688
+ specified in the schema that are not found in the DataFrame columns
689
+ or its index will raise an error. Additional columns or index
690
+ levels in the DataFrame which are not specified in the schema will
691
+ be ignored.
692
+ preserve_index (`bool`, *optional*):
693
+ Whether to store the index as an additional column in the resulting
694
+ `Table`. The default of None will store the index as a column,
695
+ except for RangeIndex which is stored as metadata only. Use
696
+ `preserve_index=True` to force it to be stored as a column.
697
+            nthreads (`int`, defaults to `None` (may use up to system CPU count threads)):
698
+ If greater than 1, convert columns to Arrow in parallel using
699
+ indicated number of threads.
700
+ columns (`List[str]`, *optional*):
701
+                List of columns to be converted. If `None`, use all columns.
+            safe (`bool`, defaults to `True`):
+                Check for overflows or other unsafe conversions.
704
+
705
+ Returns:
706
+ `datasets.table.Table`:
707
+
708
+ Examples:
709
+ ```python
710
+ >>> import pandas as pd
711
+ >>> import pyarrow as pa
712
+ >>> df = pd.DataFrame({
713
+ ... 'int': [1, 2],
714
+ ... 'str': ['a', 'b']
715
+ ... })
716
+ >>> pa.Table.from_pandas(df)
717
+ <pyarrow.lib.Table object at 0x7f05d1fb1b40>
718
+ ```
719
+ """
720
+ return cls(pa.Table.from_pandas(*args, **kwargs))
721
+
722
+ @classmethod
723
+ def from_arrays(cls, *args, **kwargs):
724
+ """
725
+ Construct a Table from Arrow arrays.
726
+
727
+ Args:
728
+ arrays (`List[Union[pyarrow.Array, pyarrow.ChunkedArray]]`):
729
+ Equal-length arrays that should form the table.
730
+ names (`List[str]`, *optional*):
731
+ Names for the table columns. If not passed, schema must be passed.
732
+ schema (`Schema`, defaults to `None`):
733
+ Schema for the created table. If not passed, names must be passed.
734
+ metadata (`Union[dict, Mapping]`, defaults to `None`):
735
+ Optional metadata for the schema (if inferred).
736
+
737
+ Returns:
738
+ `datasets.table.Table`
739
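+
+        Examples:
+        (Illustrative sketch; assumes `pyarrow` is imported as `pa`.)
+
+        ```python
+        >>> t = InMemoryTable.from_arrays(
+        ...     [pa.array([1, 2]), pa.array(["a", "b"])], names=["x", "y"]
+        ... )
+        >>> t.column_names
+        ['x', 'y']
+        ```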
+ """
740
+ return cls(pa.Table.from_arrays(*args, **kwargs))
741
+
742
+ @classmethod
743
+ def from_pydict(cls, *args, **kwargs):
744
+ """
745
+ Construct a Table from Arrow arrays or columns.
746
+
747
+ Args:
748
+ mapping (`Union[dict, Mapping]`):
749
+ A mapping of strings to Arrays or Python lists.
750
+ schema (`Schema`, defaults to `None`):
751
+ If not passed, will be inferred from the Mapping values
752
+ metadata (`Union[dict, Mapping]`, defaults to `None`):
753
+ Optional metadata for the schema (if inferred).
754
+
755
+ Returns:
756
+ `datasets.table.Table`
757
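+
+        Examples:
+        (Illustrative sketch.)
+
+        ```python
+        >>> t = InMemoryTable.from_pydict({"a": [1, 2], "b": ["x", "y"]})
+        >>> t.num_rows
+        2
+        ```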
+ """
758
+ return cls(pa.Table.from_pydict(*args, **kwargs))
759
+
760
+ @classmethod
761
+ def from_pylist(cls, mapping, *args, **kwargs):
762
+ """
763
+ Construct a Table from list of rows / dictionaries.
764
+
765
+ Args:
766
+ mapping (`List[dict]`):
767
+                A list of dicts, each mapping column names to row values.
768
+ schema (`Schema`, defaults to `None`):
769
+ If not passed, will be inferred from the Mapping values
770
+ metadata (`Union[dict, Mapping]`, defaults to `None`):
771
+ Optional metadata for the schema (if inferred).
772
+
773
+ Returns:
774
+ `datasets.table.Table`
775
+ """
776
+ return cls(pa.Table.from_pylist(mapping, *args, **kwargs))
777
+
778
+ @classmethod
779
+ def from_batches(cls, *args, **kwargs):
780
+ """
781
+ Construct a Table from a sequence or iterator of Arrow `RecordBatches`.
782
+
783
+ Args:
784
+ batches (`Union[Sequence[pyarrow.RecordBatch], Iterator[pyarrow.RecordBatch]]`):
785
+ Sequence of `RecordBatch` to be converted, all schemas must be equal.
786
+ schema (`Schema`, defaults to `None`):
787
+ If not passed, will be inferred from the first `RecordBatch`.
788
+
789
+ Returns:
790
+ `datasets.table.Table`:
791
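+
+        Examples:
+        (Illustrative sketch; assumes `pyarrow` is imported as `pa`.)
+
+        ```python
+        >>> batch = pa.RecordBatch.from_pydict({"a": [1, 2]})
+        >>> t = InMemoryTable.from_batches([batch])
+        >>> len(t)
+        2
+        ```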
+ """
792
+ return cls(pa.Table.from_batches(*args, **kwargs))
793
+
794
+ def slice(self, offset=0, length=None):
795
+ """
796
+ Compute zero-copy slice of this Table.
797
+
798
+ Args:
799
+ offset (`int`, defaults to `0`):
800
+ Offset from start of table to slice.
801
+ length (`int`, defaults to `None`):
802
+ Length of slice (default is until end of table starting from
803
+ offset).
804
+
805
+ Returns:
806
+ `datasets.table.Table`
807
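+
+        Examples:
+        (Illustrative sketch; slicing is zero-copy.)
+
+        ```python
+        >>> t = InMemoryTable.from_pydict({"a": [1, 2, 3, 4]})
+        >>> t.slice(1, 2).num_rows
+        2
+        ```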
+ """
808
+ # Use fast slicing here
809
+ return InMemoryTable(self.fast_slice(offset=offset, length=length))
810
+
811
+ def filter(self, *args, **kwargs):
812
+ """
813
+ Select records from a Table. See `pyarrow.compute.filter` for full usage.
814
+ """
815
+ return InMemoryTable(self.table.filter(*args, **kwargs))
816
+
817
+ def flatten(self, *args, **kwargs):
818
+ """
819
+ Flatten this Table. Each column with a struct type is flattened
820
+ into one column per struct field. Other columns are left unchanged.
821
+
822
+ Args:
823
+ memory_pool (`MemoryPool`, defaults to `None`):
824
+ For memory allocations, if required, otherwise use default pool.
825
+
826
+ Returns:
827
+ `datasets.table.Table`
828
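+
+        Examples:
+        (Illustrative sketch: a struct column `s` is split into one column per field.)
+
+        ```python
+        >>> t = InMemoryTable.from_pydict({"s": [{"x": 1, "y": 2}]})
+        >>> t.flatten().column_names
+        ['s.x', 's.y']
+        ```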
+ """
829
+ return InMemoryTable(table_flatten(self.table, *args, **kwargs))
830
+
831
+ def combine_chunks(self, *args, **kwargs):
832
+ """
833
+ Make a new table by combining the chunks this table has.
834
+
835
+ All the underlying chunks in the `ChunkedArray` of each column are
836
+ concatenated into zero or one chunk.
837
+
838
+ Args:
839
+ memory_pool (`MemoryPool`, defaults to `None`):
840
+ For memory allocations, if required, otherwise use default pool.
841
+
842
+ Returns:
843
+ `datasets.table.Table`
844
+ """
845
+ return InMemoryTable(self.table.combine_chunks(*args, **kwargs))
846
+
847
+ def cast(self, *args, **kwargs):
848
+ """
849
+ Cast table values to another schema.
850
+
851
+ Args:
852
+ target_schema (`Schema`):
853
+ Schema to cast to, the names and order of fields must match.
854
+ safe (`bool`, defaults to `True`):
855
+ Check for overflows or other unsafe conversions.
856
+
857
+ Returns:
858
+ `datasets.table.Table`
859
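+
+        Examples:
+        (Illustrative sketch; assumes `pyarrow` is imported as `pa`.)
+
+        ```python
+        >>> t = InMemoryTable.from_pydict({"a": [1, 2]})
+        >>> t2 = t.cast(pa.schema({"a": pa.string()}))  # ints are cast to strings
+        ```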
+ """
860
+ return InMemoryTable(table_cast(self.table, *args, **kwargs))
861
+
862
+ def replace_schema_metadata(self, *args, **kwargs):
863
+ """
864
+ EXPERIMENTAL: Create shallow copy of table by replacing schema
865
+ key-value metadata with the indicated new metadata (which may be `None`,
866
+ which deletes any existing metadata).
867
+
868
+ Args:
869
+ metadata (`dict`, defaults to `None`):
870
+
871
+ Returns:
872
+ `datasets.table.Table`: shallow_copy
873
+ """
874
+ return InMemoryTable(self.table.replace_schema_metadata(*args, **kwargs))
875
+
876
+ def add_column(self, *args, **kwargs):
877
+ """
878
+ Add column to Table at position.
879
+
880
+ A new table is returned with the column added, the original table
881
+ object is left unchanged.
882
+
883
+ Args:
884
+ i (`int`):
885
+ Index to place the column at.
886
+ field_ (`Union[str, pyarrow.Field]`):
887
+ If a string is passed then the type is deduced from the column
888
+ data.
889
+ column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
890
+ Column data.
891
+
892
+ Returns:
893
+ `datasets.table.Table`: New table with the passed column added.
894
+ """
895
+ return InMemoryTable(self.table.add_column(*args, **kwargs))
896
+
897
+ def append_column(self, *args, **kwargs):
898
+ """
899
+ Append column at end of columns.
900
+
901
+ Args:
902
+ field_ (`Union[str, pyarrow.Field]`):
903
+ If a string is passed then the type is deduced from the column
904
+ data.
905
+ column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
906
+ Column data.
907
+
908
+ Returns:
909
+ `datasets.table.Table`:
910
+ New table with the passed column added.
911
+ """
912
+ return InMemoryTable(self.table.append_column(*args, **kwargs))
913
+
914
+ def remove_column(self, *args, **kwargs):
915
+ """
916
+ Create new Table with the indicated column removed.
917
+
918
+ Args:
919
+ i (`int`):
920
+ Index of column to remove.
921
+
922
+ Returns:
923
+ `datasets.table.Table`:
924
+ New table without the column.
925
+ """
926
+ return InMemoryTable(self.table.remove_column(*args, **kwargs))
927
+
928
+ def set_column(self, *args, **kwargs):
929
+ """
930
+ Replace column in Table at position.
931
+
932
+ Args:
933
+ i (`int`):
934
+ Index to place the column at.
935
+ field_ (`Union[str, pyarrow.Field]`):
936
+ If a string is passed then the type is deduced from the column
937
+ data.
938
+ column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
939
+ Column data.
940
+
941
+ Returns:
942
+ `datasets.table.Table`:
943
+ New table with the passed column set.
944
+ """
945
+ return InMemoryTable(self.table.set_column(*args, **kwargs))
946
+
947
+ def rename_columns(self, *args, **kwargs):
948
+ """
949
+ Create new table with columns renamed to provided names.
950
+ """
951
+ return InMemoryTable(self.table.rename_columns(*args, **kwargs))
952
+
953
+ def drop(self, *args, **kwargs):
954
+ """
955
+ Drop one or more columns and return a new table.
956
+
957
+ Args:
958
+ columns (`List[str]`):
959
+ List of field names referencing existing columns.
960
+
961
+ Raises:
962
+            `KeyError`: if any of the passed column names do not exist.
963
+
964
+ Returns:
965
+ `datasets.table.Table`:
966
+ New table without the columns.
967
+ """
968
+ return InMemoryTable(self.table.drop(*args, **kwargs))
969
+
970
+ def select(self, *args, **kwargs):
971
+ """
972
+ Select columns of the table.
973
+
974
+ Returns a new table with the specified columns, and metadata preserved.
975
+
976
+ Args:
977
+ columns (:obj:`Union[List[str], List[int]]`):
978
+ The column names or integer indices to select.
979
+
980
+ Returns:
981
+ :class:`datasets.table.Table`: New table with the specified columns, and metadata preserved.
982
+ """
983
+ return InMemoryTable(self.table.select(*args, **kwargs))
984
+
985
+
986
+ # The MemoryMappedTable needs replays to properly reload tables from the disk
987
+ Replay = Tuple[str, tuple, dict]
988
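+
+ # A replay records one transform as `(method_name, args, kwargs)`; for example
+ # `("slice", (0, 100), {})` or `("drop", (["col"],), {})`. They are re-applied
+ # in order by `MemoryMappedTable._apply_replays` after the arrow file is reloaded.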
+
989
+
990
+ class MemoryMappedTable(TableBlock):
991
+ """
992
+    The table is said to be memory mapped when it doesn't use the user's RAM but loads the data
+    from the disk instead.
+
+    Pickling it doesn't copy the data into memory.
+    Instead, only the path to the memory mapped arrow file is pickled, as well as the list
+    of transforms to "replay" when reloading the table from the disk.
+
+    Its implementation requires storing a history of all the transforms that were applied
+    to the underlying pyarrow Table, so that they can be "replayed" when reloading the Table
+    from the disk.
+
+    This is different from the `InMemoryTable`, for which pickling does copy all the
+    data in memory.
+
+    `InMemoryTable` must be used when data fit in memory, while `MemoryMappedTable` is reserved
+    for data bigger than memory or for when you want the memory footprint of your application
+    to stay low.
1009
+ """
1010
+
1011
+ def __init__(self, table: pa.Table, path: str, replays: Optional[List[Replay]] = None):
1012
+ super().__init__(table)
1013
+ self.path = os.path.abspath(path)
1014
+ self.replays: List[Replay] = replays if replays is not None else []
1015
+
1016
+ @classmethod
1017
+ def from_file(cls, filename: str, replays=None):
1018
+ table = _memory_mapped_arrow_table_from_file(filename)
1019
+ table = cls._apply_replays(table, replays)
1020
+ return cls(table, filename, replays)
1021
+
1022
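+    # Illustrative sketch (assumes "data.arrow" is an existing arrow IPC file):
+    #   t = MemoryMappedTable.from_file("data.arrow")
+    #   payload = pickle.dumps(t)   # small: only the path and the replays are pickled
+    #   t2 = pickle.loads(payload)  # reloads the arrow file and re-applies the replays
+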
+ def __getstate__(self):
1023
+ return {"path": self.path, "replays": self.replays}
1024
+
1025
+ def __setstate__(self, state):
1026
+ path = state["path"]
1027
+ replays = state["replays"]
1028
+ table = _memory_mapped_arrow_table_from_file(path)
1029
+ table = self._apply_replays(table, replays)
1030
+ MemoryMappedTable.__init__(self, table, path=path, replays=replays)
1031
+
1032
+ @staticmethod
1033
+ def _apply_replays(table: pa.Table, replays: Optional[List[Replay]] = None) -> pa.Table:
1034
+ if replays is not None:
1035
+ for name, args, kwargs in replays:
1036
+ if name == "cast":
1037
+ table = table_cast(table, *args, **kwargs)
1038
+ elif name == "flatten":
1039
+ table = table_flatten(table, *args, **kwargs)
1040
+ else:
1041
+ table = getattr(table, name)(*args, **kwargs)
1042
+ return table
1043
+
1044
+ def _append_replay(self, replay: Replay) -> List[Replay]:
1045
+ replays = copy.deepcopy(self.replays)
1046
+ replays.append(replay)
1047
+ return replays
1048
+
1049
+ def slice(self, offset=0, length=None):
1050
+ """
1051
+ Compute zero-copy slice of this Table.
1052
+
1053
+ Args:
1054
+ offset (`int`, defaults to `0`):
1055
+ Offset from start of table to slice.
1056
+ length (`int`, defaults to `None`):
1057
+ Length of slice (default is until end of table starting from
1058
+ offset).
1059
+
1060
+ Returns:
1061
+ `datasets.table.Table`
1062
+ """
1063
+ replay = ("slice", (offset, length), {})
1064
+ replays = self._append_replay(replay)
1065
+ # Use fast slicing here
1066
+ return MemoryMappedTable(self.fast_slice(offset=offset, length=length), self.path, replays)
1067
+
1068
+ def filter(self, *args, **kwargs):
1069
+ """
1070
+ Select records from a Table. See `pyarrow.compute.filter` for full usage.
1071
+ """
1072
+ replay = ("filter", copy.deepcopy(args), copy.deepcopy(kwargs))
1073
+ replays = self._append_replay(replay)
1074
+ return MemoryMappedTable(self.table.filter(*args, **kwargs), self.path, replays)
1075
+
1076
+ def flatten(self, *args, **kwargs):
1077
+ """
1078
+ Flatten this Table. Each column with a struct type is flattened
1079
+ into one column per struct field. Other columns are left unchanged.
1080
+
1081
+ Args:
1082
+ memory_pool (`MemoryPool`, defaults to `None`):
1083
+ For memory allocations, if required, otherwise use default pool.
1084
+
1085
+ Returns:
1086
+ `datasets.table.Table`
1087
+ """
1088
+ replay = ("flatten", copy.deepcopy(args), copy.deepcopy(kwargs))
1089
+ replays = self._append_replay(replay)
1090
+ return MemoryMappedTable(table_flatten(self.table, *args, **kwargs), self.path, replays)
1091
+
1092
+ def combine_chunks(self, *args, **kwargs):
1093
+ """
1094
+ Make a new table by combining the chunks this table has.
1095
+
1096
+ All the underlying chunks in the ChunkedArray of each column are
1097
+ concatenated into zero or one chunk.
1098
+
1099
+ Args:
1100
+ memory_pool (`MemoryPool`, defaults to `None`):
1101
+ For memory allocations, if required, otherwise use default pool.
1102
+
1103
+ Returns:
1104
+ `datasets.table.Table`
1105
+ """
1106
+ replay = ("combine_chunks", copy.deepcopy(args), copy.deepcopy(kwargs))
1107
+ replays = self._append_replay(replay)
1108
+ return MemoryMappedTable(self.table.combine_chunks(*args, **kwargs), self.path, replays)
1109
+
1110
+ def cast(self, *args, **kwargs):
1111
+ """
1112
+        Cast table values to another schema.
1113
+
1114
+ Args:
1115
+ target_schema (`Schema`):
1116
+ Schema to cast to, the names and order of fields must match.
1117
+ safe (`bool`, defaults to `True`):
1118
+ Check for overflows or other unsafe conversions.
1119
+
1120
+ Returns:
1121
+ `datasets.table.Table`
1122
+ """
1123
+ replay = ("cast", copy.deepcopy(args), copy.deepcopy(kwargs))
1124
+ replays = self._append_replay(replay)
1125
+ return MemoryMappedTable(table_cast(self.table, *args, **kwargs), self.path, replays)
1126
+
1127
+ def replace_schema_metadata(self, *args, **kwargs):
1128
+ """
1129
+ EXPERIMENTAL: Create shallow copy of table by replacing schema
1130
+        key-value metadata with the indicated new metadata (which may be `None`,
+        which deletes any existing metadata).
1132
+
1133
+ Args:
1134
+ metadata (`dict`, defaults to `None`):
1135
+
1136
+ Returns:
1137
+ `datasets.table.Table`: shallow_copy
1138
+ """
1139
+ replay = ("replace_schema_metadata", copy.deepcopy(args), copy.deepcopy(kwargs))
1140
+ replays = self._append_replay(replay)
1141
+ return MemoryMappedTable(self.table.replace_schema_metadata(*args, **kwargs), self.path, replays)
1142
+
1143
+ def add_column(self, *args, **kwargs):
1144
+ """
1145
+ Add column to Table at position.
1146
+
1147
+ A new table is returned with the column added, the original table
1148
+ object is left unchanged.
1149
+
1150
+ Args:
1151
+ i (`int`):
1152
+ Index to place the column at.
1153
+ field_ (`Union[str, pyarrow.Field]`):
1154
+ If a string is passed then the type is deduced from the column
1155
+ data.
1156
+ column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
1157
+ Column data.
1158
+
1159
+ Returns:
1160
+ `datasets.table.Table`: New table with the passed column added.
1161
+ """
1162
+ replay = ("add_column", copy.deepcopy(args), copy.deepcopy(kwargs))
1163
+ replays = self._append_replay(replay)
1164
+ return MemoryMappedTable(self.table.add_column(*args, **kwargs), self.path, replays)
1165
+
1166
+ def append_column(self, *args, **kwargs):
1167
+ """
1168
+ Append column at end of columns.
1169
+
1170
+ Args:
1171
+ field_ (`Union[str, pyarrow.Field]`):
1172
+ If a string is passed then the type is deduced from the column
1173
+ data.
1174
+ column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
1175
+ Column data.
1176
+
1177
+ Returns:
1178
+ `datasets.table.Table`:
1179
+ New table with the passed column added.
1180
+ """
1181
+ replay = ("append_column", copy.deepcopy(args), copy.deepcopy(kwargs))
1182
+ replays = self._append_replay(replay)
1183
+ return MemoryMappedTable(self.table.append_column(*args, **kwargs), self.path, replays)
1184
+
1185
+ def remove_column(self, *args, **kwargs):
1186
+ """
1187
+ Create new Table with the indicated column removed.
1188
+
1189
+ Args:
1190
+ i (`int`):
1191
+ Index of column to remove.
1192
+
1193
+ Returns:
1194
+ `datasets.table.Table`:
1195
+ New table without the column.
1196
+ """
1197
+ replay = ("remove_column", copy.deepcopy(args), copy.deepcopy(kwargs))
1198
+ replays = self._append_replay(replay)
1199
+ return MemoryMappedTable(self.table.remove_column(*args, **kwargs), self.path, replays)
1200
+
1201
+ def set_column(self, *args, **kwargs):
1202
+ """
1203
+ Replace column in Table at position.
1204
+
1205
+ Args:
1206
+ i (`int`):
1207
+ Index to place the column at.
1208
+ field_ (`Union[str, pyarrow.Field]`):
1209
+ If a string is passed then the type is deduced from the column
1210
+ data.
1211
+ column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
1212
+ Column data.
1213
+
1214
+ Returns:
1215
+ `datasets.table.Table`:
1216
+ New table with the passed column set.
1217
+ """
1218
+ replay = ("set_column", copy.deepcopy(args), copy.deepcopy(kwargs))
1219
+ replays = self._append_replay(replay)
1220
+ return MemoryMappedTable(self.table.set_column(*args, **kwargs), self.path, replays)
1221
+
1222
+ def rename_columns(self, *args, **kwargs):
1223
+ """
1224
+ Create new table with columns renamed to provided names.
1225
+ """
1226
+ replay = ("rename_columns", copy.deepcopy(args), copy.deepcopy(kwargs))
1227
+ replays = self._append_replay(replay)
1228
+ return MemoryMappedTable(self.table.rename_columns(*args, **kwargs), self.path, replays)
1229
+
1230
+ def drop(self, *args, **kwargs):
1231
+ """
1232
+ Drop one or more columns and return a new table.
1233
+
1234
+ Args:
1235
+ columns (`List[str]`):
1236
+ List of field names referencing existing columns.
1237
+
1238
+ Raises:
1239
+            `KeyError`: if any of the passed column names do not exist.
1240
+
1241
+ Returns:
1242
+ `datasets.table.Table`:
1243
+ New table without the columns.
1244
+ """
1245
+ replay = ("drop", copy.deepcopy(args), copy.deepcopy(kwargs))
1246
+ replays = self._append_replay(replay)
1247
+ return MemoryMappedTable(self.table.drop(*args, **kwargs), self.path, replays)
1248
+
1249
+ def select(self, *args, **kwargs):
1250
+ """
1251
+ Select columns of the table.
1252
+
1253
+ Returns a new table with the specified columns, and metadata preserved.
1254
+
1255
+ Args:
1256
+ columns (:obj:`Union[List[str], List[int]]`):
1257
+ The column names or integer indices to select.
1258
+
1259
+ Returns:
1260
+ :class:`datasets.table.Table`: New table with the specified columns, and metadata preserved.
1261
+ """
1262
+ replay = ("select", copy.deepcopy(args), copy.deepcopy(kwargs))
1263
+ replays = self._append_replay(replay)
1264
+ return MemoryMappedTable(self.table.select(*args, **kwargs), self.path, replays)
1265
+
1266
+
1267
+ # A ConcatenationTable is the concatenation of several tables.
1268
+ # The ``blocks`` attribute stores a list of lists of blocks.
1269
+ # The first axis concatenates the tables along the axis 0 (it appends rows),
1270
+ # while the second axis concatenates tables along the axis 1 (it appends columns).
1271
+ TableBlockContainer = TypeVar("TableBlockContainer", TableBlock, List[TableBlock], List[List[TableBlock]])
1272
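+
+ # Illustrative sketch of the layout: rows of blocks are concatenated vertically
+ # (axis 0) and the blocks inside each row horizontally (axis 1), e.g.
+ #   blocks = [
+ #       [block_rows0_cols_ab, block_rows0_cols_cd],
+ #       [block_rows1_cols_ab, block_rows1_cols_cd],
+ #   ]
+ # where each hypothetical `block_*` is an `InMemoryTable` or a `MemoryMappedTable`.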
+
1273
+
1274
+ class ConcatenationTable(Table):
1275
+ """
1276
+    The table comes from the concatenation of several tables called blocks.
+    It enables concatenation along both axis 0 (appending rows) and axis 1 (appending columns).
+
+    The underlying tables are called "blocks" and can be either `InMemoryTable`
+    or `MemoryMappedTable` objects.
+    This makes it possible to combine tables that come from memory with tables that are memory mapped.
+    When a `ConcatenationTable` is pickled, each block is pickled:
+    - the `InMemoryTable` objects are pickled by copying all the data in memory.
+    - the `MemoryMappedTable` objects are pickled without copying the data into memory.
+    Instead, only the path to the memory mapped arrow file is pickled, as well as the list
+    of transforms to "replay" when reloading the table from the disk.
+
+    Its implementation requires storing each block separately.
+    The `blocks` attribute stores a list of lists of blocks.
+    The first axis concatenates the tables along axis 0 (it appends rows),
+    while the second axis concatenates tables along axis 1 (it appends columns).
+
+    If some columns are missing when concatenating on axis 0, they are filled with null values.
+    This is done using `pyarrow.concat_tables(tables, promote=True)`.
+
+    You can access the fully combined table via the `ConcatenationTable.table` attribute,
+    and the blocks via the `ConcatenationTable.blocks` attribute.
1298
+ """
1299
+
1300
+ def __init__(self, table: pa.Table, blocks: List[List[TableBlock]]):
1301
+ super().__init__(table)
1302
+ self.blocks = blocks
1303
+ # Check that all the blocks have the right type.
1304
+ # Only InMemoryTable and MemoryMappedTable are allowed.
1305
+ for subtables in blocks:
1306
+ for subtable in subtables:
1307
+ if not isinstance(subtable, TableBlock):
1308
+ raise TypeError(
1309
+ "The blocks of a ConcatenationTable must be InMemoryTable or MemoryMappedTable objects"
1310
+ f", but got {_short_str(subtable)}."
1311
+ )
1312
+
1313
+ def __getstate__(self):
1314
+ return {"blocks": self.blocks, "schema": self.table.schema}
1315
+
1316
+ def __setstate__(self, state):
1317
+ blocks = state["blocks"]
1318
+ schema = state["schema"]
1319
+ table = self._concat_blocks_horizontally_and_vertically(blocks)
1320
+ if schema is not None and table.schema != schema:
1321
+ # We fix the columns by concatenating with an empty table with the right columns
1322
+ empty_table = pa.Table.from_batches([], schema=schema)
1323
+ # we set promote=True to fill missing columns with null values
1324
+ if config.PYARROW_VERSION.major < 14:
1325
+ table = pa.concat_tables([table, empty_table], promote=True)
1326
+ else:
1327
+ table = pa.concat_tables([table, empty_table], promote_options="default")
1328
+ ConcatenationTable.__init__(self, table, blocks=blocks)
1329
+
1330
+ @staticmethod
1331
+ def _concat_blocks(blocks: List[Union[TableBlock, pa.Table]], axis: int = 0) -> pa.Table:
1332
+ pa_tables = [table.table if hasattr(table, "table") else table for table in blocks]
1333
+ if axis == 0:
1334
+ # we set promote=True to fill missing columns with null values
1335
+ if config.PYARROW_VERSION.major < 14:
1336
+ return pa.concat_tables(pa_tables, promote=True)
1337
+ else:
1338
+ return pa.concat_tables(pa_tables, promote_options="default")
1339
+ elif axis == 1:
1340
+ for i, table in enumerate(pa_tables):
1341
+ if i == 0:
1342
+ pa_table = table
1343
+ else:
1344
+ for name, col in zip(table.column_names, table.columns):
1345
+ pa_table = pa_table.append_column(name, col)
1346
+ return pa_table
1347
+ else:
1348
+ raise ValueError("'axis' must be either 0 or 1")
1349
+
1350
+ @classmethod
1351
+ def _concat_blocks_horizontally_and_vertically(cls, blocks: List[List[TableBlock]]) -> pa.Table:
1352
+ pa_tables_to_concat_vertically = []
1353
+ for i, tables in enumerate(blocks):
1354
+ if not tables:
1355
+ continue
1356
+ pa_table_horizontally_concatenated = cls._concat_blocks(tables, axis=1)
1357
+ pa_tables_to_concat_vertically.append(pa_table_horizontally_concatenated)
1358
+ return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)
1359
+
1360
+ @classmethod
1361
+ def _merge_blocks(cls, blocks: TableBlockContainer, axis: Optional[int] = None) -> TableBlockContainer:
1362
+ if axis is not None:
1363
+ merged_blocks = []
1364
+ for is_in_memory, block_group in groupby(blocks, key=lambda x: isinstance(x, InMemoryTable)):
1365
+ if is_in_memory:
1366
+ block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
1367
+ merged_blocks += list(block_group)
1368
+ else: # both
1369
+ merged_blocks = [cls._merge_blocks(row_block, axis=1) for row_block in blocks]
1370
+ if all(len(row_block) == 1 for row_block in merged_blocks):
1371
+ merged_blocks = cls._merge_blocks(
1372
+ [block for row_block in merged_blocks for block in row_block], axis=0
1373
+ )
1374
+ return merged_blocks
1375
+
1376
+ @classmethod
1377
+ def _consolidate_blocks(cls, blocks: TableBlockContainer) -> TableBlockContainer:
1378
+ if isinstance(blocks, TableBlock):
1379
+ return blocks
1380
+ elif isinstance(blocks[0], TableBlock):
1381
+ return cls._merge_blocks(blocks, axis=0)
1382
+ else:
1383
+ return cls._merge_blocks(blocks)
1384
+
1385
+ @classmethod
1386
+ def from_blocks(cls, blocks: TableBlockContainer) -> "ConcatenationTable":
1387
+ blocks = cls._consolidate_blocks(blocks)
1388
+ if isinstance(blocks, TableBlock):
1389
+ table = blocks
1390
+ return cls(table.table, [[table]])
1391
+ elif isinstance(blocks[0], TableBlock):
1392
+ table = cls._concat_blocks(blocks, axis=0)
1393
+ blocks = [[t] for t in blocks]
1394
+ return cls(table, blocks)
1395
+ else:
1396
+ table = cls._concat_blocks_horizontally_and_vertically(blocks)
1397
+ return cls(table, blocks)
1398
+
1399
+ @classmethod
1400
+ def from_tables(cls, tables: List[Union[pa.Table, Table]], axis: int = 0) -> "ConcatenationTable":
1401
+ """Create `ConcatenationTable` from list of tables.
1402
+
1403
+ Args:
1404
+ tables (list of `Table` or list of `pyarrow.Table`):
1405
+ List of tables.
1406
+ axis (`{0, 1}`, defaults to `0`, meaning over rows):
1407
+ Axis to concatenate over, where `0` means over rows (vertically) and `1` means over columns
1408
+ (horizontally).
1409
+
1410
+ <Added version="1.6.0"/>
1411
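+
+        Examples:
+        (Illustrative sketch; assumes `pyarrow` is imported as `pa`.)
+
+        ```python
+        >>> t1 = pa.Table.from_pydict({"a": [1, 2]})
+        >>> t2 = pa.Table.from_pydict({"a": [3, 4]})
+        >>> ConcatenationTable.from_tables([t1, t2]).num_rows
+        4
+        ```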
+ """
1412
+
1413
+ def to_blocks(table: Union[pa.Table, Table]) -> List[List[TableBlock]]:
1414
+ if isinstance(table, pa.Table):
1415
+ return [[InMemoryTable(table)]]
1416
+ elif isinstance(table, ConcatenationTable):
1417
+ return copy.deepcopy(table.blocks)
1418
+ else:
1419
+ return [[table]]
1420
+
1421
+ def _slice_row_block(row_block: List[TableBlock], length: int) -> Tuple[List[TableBlock], List[TableBlock]]:
1422
+ sliced = [table.slice(0, length) for table in row_block]
1423
+ remainder = [table.slice(length, len(row_block[0]) - length) for table in row_block]
1424
+ return sliced, remainder
1425
+
1426
+ def _split_both_like(
1427
+ result: List[List[TableBlock]], blocks: List[List[TableBlock]]
1428
+ ) -> Tuple[List[List[TableBlock]], List[List[TableBlock]]]:
1429
+ """
1430
+            Make sure each row_block contains the same num_rows to be able to concatenate them on axis=1.
1431
+
1432
+ To do so, we modify both blocks sets to have the same row_blocks boundaries.
1433
+ For example, if `result` has 2 row_blocks of 3 rows and `blocks` has 3 row_blocks of 2 rows,
1434
+ we modify both to have 4 row_blocks of size 2, 1, 1 and 2:
1435
+
1436
+ [ x x x | x x x ]
1437
+ + [ y y | y y | y y ]
1438
+ -----------------------------
1439
+ = [ x x | x | x | x x ]
1440
+ [ y y | y | y | y y ]
1441
+
1442
+ """
1443
+ result, blocks = list(result), list(blocks)
1444
+ new_result, new_blocks = [], []
1445
+ while result and blocks:
1446
+ # we slice the longest row block to save two row blocks of same length
1447
+ # and we replace the long row block by its remainder if necessary
1448
+ if len(result[0][0]) > len(blocks[0][0]):
1449
+ new_blocks.append(blocks[0])
1450
+ sliced, result[0] = _slice_row_block(result[0], len(blocks.pop(0)[0]))
1451
+ new_result.append(sliced)
1452
+ elif len(result[0][0]) < len(blocks[0][0]):
1453
+ new_result.append(result[0])
1454
+ sliced, blocks[0] = _slice_row_block(blocks[0], len(result.pop(0)[0]))
1455
+ new_blocks.append(sliced)
1456
+ else:
1457
+ new_result.append(result.pop(0))
1458
+ new_blocks.append(blocks.pop(0))
1459
+ if result or blocks:
1460
+ raise ValueError("Failed to concatenate on axis=1 because tables don't have the same number of rows")
1461
+ return new_result, new_blocks
1462
+
1463
+ def _extend_blocks(
1464
+ result: List[List[TableBlock]], blocks: List[List[TableBlock]], axis: int = 0
1465
+ ) -> List[List[TableBlock]]:
1466
+ if axis == 0:
1467
+ result.extend(blocks)
1468
+ elif axis == 1:
1469
+                # We make sure each row_block has the same num_rows
1470
+ result, blocks = _split_both_like(result, blocks)
1471
+ for i, row_block in enumerate(blocks):
1472
+ result[i].extend(row_block)
1473
+ return result
1474
+
1475
+ blocks = to_blocks(tables[0])
1476
+ for table in tables[1:]:
1477
+ table_blocks = to_blocks(table)
1478
+ blocks = _extend_blocks(blocks, table_blocks, axis=axis)
1479
+ return cls.from_blocks(blocks)
1480
+
1481
+ @property
1482
+ def _slices(self):
1483
+ offset = 0
1484
+ for tables in self.blocks:
1485
+ length = len(tables[0])
1486
+ yield (offset, length)
1487
+ offset += length
1488
+
1489
+ def slice(self, offset=0, length=None):
1490
+ """
1491
+ Compute zero-copy slice of this Table.
1492
+
1493
+ Args:
1494
+ offset (`int`, defaults to `0`):
1495
+ Offset from start of table to slice.
1496
+ length (`int`, defaults to `None`):
1497
+ Length of slice (default is until end of table starting from
1498
+ offset).
1499
+
1500
+ Returns:
1501
+ `datasets.table.Table`
1502
+ """
1503
+ table = self.table.slice(offset, length=length)
1504
+ length = length if length is not None else self.num_rows - offset
1505
+ blocks = []
1506
+ for tables in self.blocks:
1507
+ n_rows = len(tables[0])
1508
+ if length == 0:
1509
+ break
1510
+ elif n_rows <= offset:
1511
+ offset = offset - n_rows
1512
+ elif n_rows <= offset + length:
1513
+ blocks.append([t.slice(offset) for t in tables])
1514
+ length, offset = length + offset - n_rows, 0
1515
+ else:
1516
+ blocks.append([t.slice(offset, length) for t in tables])
1517
+ length, offset = 0, 0
1518
+ return ConcatenationTable(table, blocks)
1519
+
1520
+ def filter(self, mask, *args, **kwargs):
1521
+ """
1522
+ Select records from a Table. See `pyarrow.compute.filter` for full usage.
1523
+ """
1524
+ table = self.table.filter(mask, *args, **kwargs)
1525
+ blocks = []
1526
+ for (offset, length), tables in zip(self._slices, self.blocks):
1527
+ submask = mask.slice(offset, length)
1528
+ blocks.append([t.filter(submask, *args, **kwargs) for t in tables])
1529
+ return ConcatenationTable(table, blocks)
1530
+
1531
+ def flatten(self, *args, **kwargs):
1532
+ """
1533
+ Flatten this Table. Each column with a struct type is flattened
1534
+ into one column per struct field. Other columns are left unchanged.
1535
+
1536
+ Args:
1537
+ memory_pool (`MemoryPool`, defaults to `None`):
1538
+ For memory allocations, if required, otherwise use default pool.
1539
+
1540
+ Returns:
1541
+ `datasets.table.Table`
1542
+ """
1543
+ table = table_flatten(self.table, *args, **kwargs)
1544
+ blocks = []
1545
+ for tables in self.blocks:
1546
+ blocks.append([t.flatten(*args, **kwargs) for t in tables])
1547
+ return ConcatenationTable(table, blocks)
1548
+
1549
+ def combine_chunks(self, *args, **kwargs):
1550
+ """
1551
+ Make a new table by combining the chunks this table has.
1552
+
1553
+ All the underlying chunks in the `ChunkedArray` of each column are
1554
+ concatenated into zero or one chunk.
1555
+
1556
+ Args:
1557
+ memory_pool (`MemoryPool`, defaults to `None`):
1558
+ For memory allocations, if required, otherwise use default pool.
1559
+
1560
+ Returns:
1561
+ `datasets.table.Table`
1562
+ """
1563
+ table = self.table.combine_chunks(*args, **kwargs)
1564
+ blocks = []
1565
+ for tables in self.blocks:
1566
+ blocks.append([t.combine_chunks(*args, **kwargs) for t in tables])
1567
+ return ConcatenationTable(table, blocks)
1568
+
1569
+ def cast(self, target_schema, *args, **kwargs):
1570
+ """
1571
+ Cast table values to another schema.
1572
+
1573
+ Args:
1574
+ target_schema (`Schema`):
1575
+ Schema to cast to, the names and order of fields must match.
1576
+ safe (`bool`, defaults to `True`):
1577
+ Check for overflows or other unsafe conversions.
1578
+
1579
+ Returns:
1580
+ `datasets.table.Table`
1581
+ """
1582
+ from .features import Features
1583
+
1584
+ table = table_cast(self.table, target_schema, *args, **kwargs)
1585
+ target_features = Features.from_arrow_schema(target_schema)
1586
+ blocks = []
1587
+ for subtables in self.blocks:
1588
+ new_tables = []
1589
+ fields = list(target_schema)
1590
+ for subtable in subtables:
1591
+ subfields = []
1592
+ for name in subtable.column_names:
1593
+ subfields.append(fields.pop(next(i for i, field in enumerate(fields) if field.name == name)))
1594
+ subfeatures = Features({subfield.name: target_features[subfield.name] for subfield in subfields})
1595
+ subschema = subfeatures.arrow_schema
1596
+ new_tables.append(subtable.cast(subschema, *args, **kwargs))
1597
+ blocks.append(new_tables)
1598
+ return ConcatenationTable(table, blocks)
1599
+
1600
+ def replace_schema_metadata(self, *args, **kwargs):
1601
+ """
1602
+ EXPERIMENTAL: Create shallow copy of table by replacing schema
1603
+ key-value metadata with the indicated new metadata (which may be `None`,
1604
+ which deletes any existing metadata).
1605
+
1606
+ Args:
1607
+ metadata (`dict`, defaults to `None`):
1608
+
1609
+ Returns:
1610
+ `datasets.table.Table`: shallow_copy
1611
+ """
1612
+ table = self.table.replace_schema_metadata(*args, **kwargs)
1613
+ blocks = []
1614
+ for tables in self.blocks:
1615
+ blocks.append([t.replace_schema_metadata(*args, **kwargs) for t in tables])
1616
+        return ConcatenationTable(table, blocks)
1617
+
1618
+ def add_column(self, *args, **kwargs):
1619
+ """
1620
+ Add column to Table at position.
1621
+
1622
+ A new table is returned with the column added, the original table
1623
+ object is left unchanged.
1624
+
1625
+ Args:
1626
+ i (`int`):
1627
+ Index to place the column at.
1628
+ field_ (`Union[str, pyarrow.Field]`):
1629
+ If a string is passed then the type is deduced from the column
1630
+ data.
1631
+ column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
1632
+ Column data.
1633
+
1634
+ Returns:
1635
+ `datasets.table.Table`: New table with the passed column added.
1636
+ """
1637
+ raise NotImplementedError()
1638
+
1639
+ def append_column(self, *args, **kwargs):
1640
+ """
1641
+ Append column at end of columns.
1642
+
1643
+ Args:
1644
+ field_ (`Union[str, pyarrow.Field]`):
1645
+ If a string is passed then the type is deduced from the column
1646
+ data.
1647
+ column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
1648
+ Column data.
1649
+
1650
+ Returns:
1651
+ `datasets.table.Table`:
1652
+ New table with the passed column added.
1653
+ """
1654
+ raise NotImplementedError()
1655
+
1656
+ def remove_column(self, i, *args, **kwargs):
1657
+ """
1658
+ Create new Table with the indicated column removed.
1659
+
1660
+ Args:
1661
+ i (`int`):
1662
+ Index of column to remove.
1663
+
1664
+ Returns:
1665
+ `datasets.table.Table`:
1666
+ New table without the column.
1667
+ """
1668
+ table = self.table.remove_column(i, *args, **kwargs)
1669
+ name = self.table.column_names[i]
1670
+ blocks = []
1671
+ for tables in self.blocks:
1672
+ blocks.append(
1673
+ [
1674
+ t.remove_column(t.column_names.index(name), *args, **kwargs) if name in t.column_names else t
1675
+ for t in tables
1676
+ ]
1677
+ )
1678
+ return ConcatenationTable(table, blocks)
1679
+
1680
+ def set_column(self, *args, **kwargs):
1681
+ """
1682
+ Replace column in Table at position.
1683
+
1684
+ Args:
1685
+ i (`int`):
1686
+ Index to place the column at.
1687
+ field_ (`Union[str, pyarrow.Field]`):
1688
+ If a string is passed then the type is deduced from the column
1689
+ data.
1690
+ column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
1691
+ Column data.
1692
+
1693
+ Returns:
1694
+ `datasets.table.Table`:
1695
+ New table with the passed column set.
1696
+ """
1697
+ raise NotImplementedError()
1698
+
1699
+ def rename_columns(self, names, *args, **kwargs):
1700
+ """
1701
+ Create new table with columns renamed to provided names.
1702
+ """
1703
+ table = self.table.rename_columns(names, *args, **kwargs)
1704
+ names = dict(zip(self.table.column_names, names))
1705
+ blocks = []
1706
+ for tables in self.blocks:
1707
+ blocks.append(
1708
+ [t.rename_columns([names[name] for name in t.column_names], *args, **kwargs) for t in tables]
1709
+ )
1710
+ return ConcatenationTable(table, blocks)
1711
+
1712
+ def drop(self, columns, *args, **kwargs):
1713
+ """
1714
+ Drop one or more columns and return a new table.
1715
+
1716
+ Args:
1717
+ columns (`List[str]`):
1718
+ List of field names referencing existing columns.
1719
+
1720
+ Raises:
1721
+            `KeyError`: if any of the passed column names do not exist.
1722
+
1723
+ Returns:
1724
+ `datasets.table.Table`:
1725
+ New table without the columns.
1726
+ """
1727
+ table = self.table.drop(columns, *args, **kwargs)
1728
+ blocks = []
1729
+ for tables in self.blocks:
1730
+ blocks.append([t.drop([c for c in columns if c in t.column_names], *args, **kwargs) for t in tables])
1731
+ return ConcatenationTable(table, blocks)
1732
+
1733
+ def select(self, columns, *args, **kwargs):
1734
+ """
1735
+ Select columns of the table.
1736
+
1737
+ Returns a new table with the specified columns, and metadata preserved.
1738
+
1739
+ Args:
1740
+ columns (:obj:`Union[List[str], List[int]]`):
1741
+ The column names or integer indices to select.
1742
+
1743
+ Returns:
1744
+ :class:`datasets.table.Table`: New table with the specified columns, and metadata preserved.
1745
+ """
1746
+ table = self.table.select(columns, *args, **kwargs)
1747
+ blocks = []
1748
+ for tables in self.blocks:
1749
+ blocks.append([t.select([c for c in columns if c in t.column_names], *args, **kwargs) for t in tables])
1750
+ return ConcatenationTable(table, blocks)
1751
+
1752
+
1753
+ def concat_tables(tables: List[Table], axis: int = 0) -> Table:
1754
+ """
1755
+ Concatenate tables.
1756
+
1757
+ Args:
1758
+ tables (list of `Table`):
1759
+ List of tables to be concatenated.
1760
+ axis (`{0, 1}`, defaults to `0`, meaning over rows):
1761
+ Axis to concatenate over, where `0` means over rows (vertically) and `1` means over columns
1762
+ (horizontally).
1763
+
1764
+ <Added version="1.6.0"/>
1765
+ Returns:
1766
+ `datasets.table.Table`:
1767
+ If the number of input tables is > 1, then the returned table is a `datasets.table.ConcatenationTable`.
1768
+ Otherwise if there's only one table, it is returned as is.
1769
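+
+    Examples:
+    (Illustrative sketch.)
+
+    ```python
+    >>> t1 = InMemoryTable.from_pydict({"a": [1, 2]})
+    >>> t2 = InMemoryTable.from_pydict({"a": [3, 4]})
+    >>> concat_tables([t1, t2]).num_rows
+    4
+    ```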
+ """
1770
+ tables = list(tables)
1771
+ if len(tables) == 1:
1772
+ return tables[0]
1773
+ return ConcatenationTable.from_tables(tables, axis=axis)
1774
+
1775
+
1776
+ def list_table_cache_files(table: Table) -> List[str]:
1777
+ """
1778
+ Get the cache files that are loaded by the table.
1779
+    Cache files are used when parts of the table come from the disk via memory mapping.
1780
+
1781
+ Returns:
1782
+ `List[str]`:
1783
+ A list of paths to the cache files loaded by the table.
1784
+ """
1785
+ if isinstance(table, ConcatenationTable):
1786
+ cache_files = []
1787
+ for subtables in table.blocks:
1788
+ for subtable in subtables:
1789
+ cache_files += list_table_cache_files(subtable)
1790
+ return cache_files
1791
+ elif isinstance(table, MemoryMappedTable):
1792
+ return [table.path]
1793
+ else:
1794
+ return []
1795
+
1796
+
1797
+ def _wrap_for_chunked_arrays(func):
1798
+ """Apply the function on each chunk of a `pyarrow.ChunkedArray`, or on the array directly"""
1799
+
1800
+ def wrapper(array, *args, **kwargs):
1801
+ if isinstance(array, pa.ChunkedArray):
1802
+ return pa.chunked_array([func(chunk, *args, **kwargs) for chunk in array.chunks])
1803
+ else:
1804
+ return func(array, *args, **kwargs)
1805
+
1806
+ return wrapper
1807
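+
+ # Illustrative sketch: the decorated function is written for a single `pa.Array`
+ # but transparently accepts a `pa.ChunkedArray` as well, e.g.
+ #   @_wrap_for_chunked_arrays
+ #   def add_one(array):
+ #       return pc.add(array, 1)  # applied chunk by chunk for chunked inputs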
+
1808
+
1809
+ def _are_list_values_of_length(array: pa.ListArray, length: int) -> bool:
1810
+ """Check if all the sub-lists of a `pa.ListArray` have the specified length."""
1811
+ return pc.all(pc.equal(array.value_lengths(), length)).as_py() or array.null_count == len(array)
1812
+
1813
+
1814
+ def _combine_list_array_offsets_with_mask(array: pa.ListArray) -> pa.Array:
1815
+ """Add the null bitmap to the offsets of a `pa.ListArray`."""
1816
+ offsets = array.offsets
1817
+ if array.null_count > 0:
1818
+ offsets = pa.concat_arrays(
1819
+ [
1820
+ pc.replace_with_mask(offsets[:-1], array.is_null(), pa.nulls(len(array), pa.int32())),
1821
+ offsets[-1:],
1822
+ ]
1823
+ )
1824
+ return offsets
1825
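+
+ # For example, for the list array [[0, 1], None, [2]] the offsets [0, 2, 2, 3]
+ # become [0, null, 2, 3]: a null offset marks the corresponding list entry as
+ # null when rebuilding with `pa.ListArray.from_arrays`.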
+
1826
+
1827
+ def _storage_type(type: pa.DataType) -> pa.DataType:
1828
+ """Convert a (possibly nested) `pa.ExtensionType` to its storage type."""
1829
+ if isinstance(type, pa.ExtensionType):
1830
+ return _storage_type(type.storage_type)
1831
+ elif isinstance(type, pa.StructType):
1832
+ return pa.struct([pa.field(field.name, _storage_type(field.type)) for field in type])
1833
+ elif isinstance(type, pa.ListType):
1834
+ return pa.list_(_storage_type(type.value_type))
1835
+ elif isinstance(type, pa.FixedSizeListType):
1836
+ return pa.list_(_storage_type(type.value_type), type.list_size)
1837
+ return type
1838
+
1839
+
1840
+ def _short_str(value: Any) -> str:
1841
+ out = str(value)
1842
+ if len(out) > 3000:
1843
+ out = out[:1500] + "\n...\n" + out[-1500:]
1844
+ return out
1845
+
1846
+
1847
+ @_wrap_for_chunked_arrays
1848
+ def array_cast(
1849
+ array: pa.Array, pa_type: pa.DataType, allow_primitive_to_str: bool = True, allow_decimal_to_str: bool = True
1850
+ ) -> Union[pa.Array, pa.FixedSizeListArray, pa.ListArray, pa.StructArray, pa.ExtensionArray]:
1851
+ """Improved version of `pa.Array.cast`
1852
+
1853
+ It supports casting `pa.StructArray` objects to re-order the fields.
1854
+    It also lets you control certain aspects of the casting, e.g. whether
+    to disable casting primitives (`booleans`, `floats` or `ints`) or
+    decimals to strings.
1857
+
1858
+ Args:
1859
+ array (`pa.Array`):
1860
+ PyArrow array to cast
1861
+ pa_type (`pa.DataType`):
1862
+ Target PyArrow type
1863
+ allow_primitive_to_str (`bool`, defaults to `True`):
1864
+ Whether to allow casting primitives to strings.
1865
+ Defaults to `True`.
1866
+ allow_decimal_to_str (`bool`, defaults to `True`):
1867
+ Whether to allow casting decimals to strings.
1868
+ Defaults to `True`.
1869
+
1870
+ Raises:
1871
+ `pa.ArrowInvalidError`: if the arrow data casting fails
1872
+        `TypeError`: if the target type is not supported, e.g.
1873
+
1874
+ - if a field is missing
1875
+ - if casting from primitives to strings and `allow_primitive_to_str` is `False`
1876
+ - if casting from decimals to strings and `allow_decimal_to_str` is `False`
1877
+
1878
+ Returns:
1879
+        `pyarrow.Array`: the casted array
1880
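+
+    Examples:
+    (Illustrative sketch; assumes `pyarrow` is imported as `pa`.)
+
+    ```python
+    >>> array_cast(pa.array([1, 2]), pa.string()).type
+    DataType(string)
+    >>> # the same call raises TypeError when allow_primitive_to_str=False
+    ```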
+ """
1881
+ _c = partial(array_cast, allow_primitive_to_str=allow_primitive_to_str, allow_decimal_to_str=allow_decimal_to_str)
1882
+ if isinstance(array, pa.ExtensionArray):
1883
+ array = array.storage
1884
+ if isinstance(pa_type, pa.ExtensionType):
1885
+ return pa_type.wrap_array(_c(array, pa_type.storage_type))
1886
+ elif array.type == pa_type:
1887
+ return array
1888
+ elif pa.types.is_struct(array.type):
1889
+ if pa.types.is_struct(pa_type) and ({field.name for field in pa_type} == {field.name for field in array.type}):
1890
+ if array.type.num_fields == 0:
1891
+ return array
1892
+ arrays = [_c(array.field(field.name), field.type) for field in pa_type]
1893
+ return pa.StructArray.from_arrays(arrays, fields=list(pa_type), mask=array.is_null())
1894
+ elif pa.types.is_list(array.type):
1895
+ if pa.types.is_fixed_size_list(pa_type):
1896
+ if _are_list_values_of_length(array, pa_type.list_size):
1897
+ if array.null_count > 0:
1898
+ # Ensure each null value in the array translates to [null] * pa_type.list_size in the array's values array
1899
+ array_type = array.type
1900
+ storage_type = _storage_type(array_type)
1901
+ if array_type != storage_type:
1902
+ # Temporarily convert to the storage type to support extension types in the slice operation
1903
+ array = _c(array, storage_type)
1904
+ array = pc.list_slice(array, 0, pa_type.list_size, return_fixed_size_list=True)
1905
+ array = _c(array, array_type)
1906
+ else:
1907
+ array = pc.list_slice(array, 0, pa_type.list_size, return_fixed_size_list=True)
1908
+ array_values = array.values
1909
+ if config.PYARROW_VERSION.major < 15:
1910
+ return pa.Array.from_buffers(
1911
+ pa_type,
1912
+ len(array),
1913
+ [array.is_valid().buffers()[1]],
1914
+ children=[_c(array_values, pa_type.value_type)],
1915
+ )
1916
+ else:
1917
+ return pa.FixedSizeListArray.from_arrays(
1918
+ _c(array_values, pa_type.value_type), pa_type.list_size, mask=array.is_null()
1919
+ )
1920
+ else:
1921
+ array_values = array.values[
1922
+                    array.offset * pa_type.list_size : (array.offset + len(array)) * pa_type.list_size
1923
+ ]
1924
+ return pa.FixedSizeListArray.from_arrays(_c(array_values, pa_type.value_type), pa_type.list_size)
1925
+ elif pa.types.is_list(pa_type):
1926
+ # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
1927
+ array_offsets = _combine_list_array_offsets_with_mask(array)
1928
+ return pa.ListArray.from_arrays(array_offsets, _c(array.values, pa_type.value_type))
1929
+ elif pa.types.is_fixed_size_list(array.type):
1930
+ if pa.types.is_fixed_size_list(pa_type):
1931
+ if pa_type.list_size == array.type.list_size:
1932
+ array_values = array.values[
1933
+ array.offset * array.type.list_size : (array.offset + len(array)) * array.type.list_size
1934
+ ]
1935
+ if config.PYARROW_VERSION.major < 15:
1936
+ return pa.Array.from_buffers(
1937
+ pa_type,
1938
+ len(array),
1939
+ [array.is_valid().buffers()[1]],
1940
+ children=[_c(array_values, pa_type.value_type)],
1941
+ )
1942
+ else:
1943
+ return pa.FixedSizeListArray.from_arrays(
1944
+ _c(array_values, pa_type.value_type), pa_type.list_size, mask=array.is_null()
1945
+ )
1946
+ elif pa.types.is_list(pa_type):
1947
+ array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size
1948
+ return pa.ListArray.from_arrays(array_offsets, _c(array.values, pa_type.value_type), mask=array.is_null())
1949
+ else:
1950
+ if pa.types.is_string(pa_type):
1951
+ if not allow_primitive_to_str and pa.types.is_primitive(array.type):
1952
+ raise TypeError(
1953
+ f"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)} "
1954
+ f"since allow_primitive_to_str is set to {allow_primitive_to_str} "
1955
+ )
1956
+ if not allow_decimal_to_str and pa.types.is_decimal(array.type):
1957
+ raise TypeError(
1958
+ f"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)} "
1959
+ f"and allow_decimal_to_str is set to {allow_decimal_to_str}"
1960
+ )
1961
+ if pa.types.is_null(pa_type) and not pa.types.is_null(array.type):
1962
+ raise TypeError(f"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)}")
1963
+ return array.cast(pa_type)
1964
+ raise TypeError(f"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)}")
1965
+
1966
+
1967
+ @_wrap_for_chunked_arrays
1968
+ def cast_array_to_feature(
1969
+ array: pa.Array, feature: "FeatureType", allow_primitive_to_str: bool = True, allow_decimal_to_str: bool = True
1970
+ ) -> pa.Array:
1971
+ """Cast an array to the arrow type that corresponds to the requested feature type.
1972
+ For custom features like [`Audio`] or [`Image`], it takes into account the "cast_storage" methods
1973
+ they defined to enable casting from other arrow types.
1974
+
1975
+ Args:
1976
+ array (`pa.Array`):
1977
+ The PyArrow array to cast.
1978
+ feature (`datasets.features.FeatureType`):
1979
+ The target feature type.
1980
+ allow_primitive_to_str (`bool`, defaults to `True`):
1981
+ Whether to allow casting primitives to strings.
1982
+ Defaults to `True`.
1983
+ allow_decimal_to_str (`bool`, defaults to `True`):
1984
+ Whether to allow casting decimals to strings.
1985
+ Defaults to `True`.
1986
+
1987
+ Raises:
1988
+ `pa.ArrowInvalidError`: if the arrow data casting fails
1989
+        `TypeError`: if the target type is not supported, e.g.
1990
+
1991
+ - if a field is missing
1992
+        - if casting from primitives to strings and `allow_primitive_to_str` is `False`
+        - if casting from decimals to strings and `allow_decimal_to_str` is `False`
1994
+
1995
+ Returns:
1996
+ array (`pyarrow.Array`): the casted array
1997
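+
+    Examples:
+    (Illustrative sketch; assumes `pyarrow` is imported as `pa` and that `Value`
+    is available from `datasets.features`.)
+
+    ```python
+    >>> from datasets.features import Value
+    >>> cast_array_to_feature(pa.array([1, 2]), Value("string")).type
+    DataType(string)
+    ```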
+ """
1998
+ from .features.features import Sequence, get_nested_type
1999
+
2000
+ _c = partial(
2001
+ cast_array_to_feature,
2002
+ allow_primitive_to_str=allow_primitive_to_str,
2003
+ allow_decimal_to_str=allow_decimal_to_str,
2004
+ )
2005
+
2006
+ if isinstance(array, pa.ExtensionArray):
2007
+ array = array.storage
2008
+ if hasattr(feature, "cast_storage"):
2009
+ return feature.cast_storage(array)
2010
+
2011
+ elif pa.types.is_struct(array.type):
2012
+ # feature must be a dict or Sequence(subfeatures_dict)
2013
+ if isinstance(feature, Sequence) and isinstance(feature.feature, dict):
2014
+ feature = {
2015
+ name: Sequence(subfeature, length=feature.length) for name, subfeature in feature.feature.items()
2016
+ }
2017
+ if isinstance(feature, dict) and {field.name for field in array.type} == set(feature):
2018
+ if array.type.num_fields == 0:
2019
+ return array
2020
+ arrays = [_c(array.field(name), subfeature) for name, subfeature in feature.items()]
2021
+ return pa.StructArray.from_arrays(arrays, names=list(feature), mask=array.is_null())
2022
+ elif pa.types.is_list(array.type):
2023
+ # feature must be either [subfeature] or Sequence(subfeature)
2024
+ if isinstance(feature, list):
2025
+ casted_array_values = _c(array.values, feature[0])
2026
+ if casted_array_values.type == array.values.type:
2027
+ return array
2028
+ else:
2029
+ # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
2030
+ array_offsets = _combine_list_array_offsets_with_mask(array)
2031
+ return pa.ListArray.from_arrays(array_offsets, casted_array_values)
2032
+ elif isinstance(feature, Sequence):
2033
+ if feature.length > -1:
2034
+ if _are_list_values_of_length(array, feature.length):
2035
+ if array.null_count > 0:
2036
+ # Ensure each null value in the array translates to [null] * pa_type.list_size in the array's values array
2037
+ array_type = array.type
2038
+ storage_type = _storage_type(array_type)
2039
+ if array_type != storage_type:
2040
+ # Temporarily convert to the storage type to support extension types in the slice operation
2041
+ array = array_cast(
2042
+ array,
2043
+ storage_type,
2044
+ allow_primitive_to_str=allow_primitive_to_str,
2045
+ allow_decimal_to_str=allow_decimal_to_str,
2046
+ )
2047
+ array = pc.list_slice(array, 0, feature.length, return_fixed_size_list=True)
2048
+ array = array_cast(
2049
+ array,
2050
+ array_type,
2051
+ allow_primitive_to_str=allow_primitive_to_str,
2052
+ allow_decimal_to_str=allow_decimal_to_str,
2053
+ )
2054
+ else:
2055
+ array = pc.list_slice(array, 0, feature.length, return_fixed_size_list=True)
2056
+ array_values = array.values
2057
+ casted_array_values = _c(array_values, feature.feature)
2058
+ if config.PYARROW_VERSION.major < 15:
2059
+ return pa.Array.from_buffers(
2060
+ pa.list_(casted_array_values.type, feature.length),
2061
+ len(array),
2062
+ [array.is_valid().buffers()[1]],
2063
+ children=[casted_array_values],
2064
+ )
2065
+ else:
2066
+ return pa.FixedSizeListArray.from_arrays(
2067
+ casted_array_values, feature.length, mask=array.is_null()
2068
+ )
2069
+ else:
2070
+ array_values = array.values[
2071
+ array.offset * feature.length : (array.offset + len(array)) * feature.length
2072
+ ]
2073
+ return pa.FixedSizeListArray.from_arrays(_c(array_values, feature.feature), feature.length)
2074
+ else:
2075
+ casted_array_values = _c(array.values, feature.feature)
2076
+ if casted_array_values.type == array.values.type:
2077
+ return array
2078
+ else:
2079
+ # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
2080
+ array_offsets = _combine_list_array_offsets_with_mask(array)
2081
+ return pa.ListArray.from_arrays(array_offsets, casted_array_values)
2082
+ elif pa.types.is_fixed_size_list(array.type):
2083
+ # feature must be either [subfeature] or Sequence(subfeature)
2084
+ if isinstance(feature, list):
2085
+ array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size
2086
+ return pa.ListArray.from_arrays(array_offsets, _c(array.values, feature[0]), mask=array.is_null())
2087
+ elif isinstance(feature, Sequence):
2088
+ if feature.length > -1:
2089
+ if feature.length == array.type.list_size:
2090
+ array_values = array.values[
2091
+ array.offset * array.type.list_size : (array.offset + len(array)) * array.type.list_size
2092
+ ]
2093
+ casted_array_values = _c(array_values, feature.feature)
2094
+ if config.PYARROW_VERSION.major < 15:
2095
+ return pa.Array.from_buffers(
2096
+ pa.list_(casted_array_values.type, feature.length),
2097
+ len(array),
2098
+ [array.is_valid().buffers()[1]],
2099
+ children=[casted_array_values],
2100
+ )
2101
+ else:
2102
+ return pa.FixedSizeListArray.from_arrays(
2103
+ casted_array_values, feature.length, mask=array.is_null()
2104
+ )
2105
+ else:
2106
+ array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size
2107
+ return pa.ListArray.from_arrays(array_offsets, _c(array.values, feature.feature), mask=array.is_null())
2108
+ if pa.types.is_null(array.type):
2109
+ return array_cast(
2110
+ array,
2111
+ get_nested_type(feature),
2112
+ allow_primitive_to_str=allow_primitive_to_str,
2113
+ allow_decimal_to_str=allow_decimal_to_str,
2114
+ )
2115
+ elif not isinstance(feature, (Sequence, dict, list, tuple)):
2116
+ return array_cast(
2117
+ array,
2118
+ feature(),
2119
+ allow_primitive_to_str=allow_primitive_to_str,
2120
+ allow_decimal_to_str=allow_decimal_to_str,
2121
+ )
2122
+ raise TypeError(f"Couldn't cast array of type\n{_short_str(array.type)}\nto\n{_short_str(feature)}")
2123
+
2124
+
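A quick usage sketch for the function above (illustrative only, not part of the uploaded file; assumes `datasets` and `pyarrow` are installed and that this module is importable as `datasets.table`):

```python
import pyarrow as pa
from datasets.features import Value
from datasets.table import cast_array_to_feature

arr = pa.array([1, 2, None])

# Primitive -> string casting is allowed by default (allow_primitive_to_str=True).
casted = cast_array_to_feature(arr, Value("string"))
assert casted.type == pa.string()

# Disabling it makes the same cast raise a TypeError, as in the guard above.
try:
    cast_array_to_feature(arr, Value("string"), allow_primitive_to_str=False)
except TypeError as e:
    print(e)
```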
+ @_wrap_for_chunked_arrays
+ def embed_array_storage(array: pa.Array, feature: "FeatureType"):
+     """Embed data into an array's storage.
+     For custom features like Audio or Image, it takes into account the "embed_storage" methods
+     they define to embed external data (e.g. an image file) into an array.
+
+     <Added version="2.4.0"/>
+
+     Args:
+         array (`pa.Array`):
+             The PyArrow array in which to embed data.
+         feature (`datasets.features.FeatureType`):
+             Array features.
+
+     Raises:
+         `TypeError`: if the target type is not supported, e.g.
+
+             - if a field is missing
+
+     Returns:
+         array (`pyarrow.Array`): the casted array
+     """
+     from .features import Sequence
+
+     _e = embed_array_storage
+
+     if isinstance(array, pa.ExtensionArray):
+         array = array.storage
+     if hasattr(feature, "embed_storage"):
+         return feature.embed_storage(array)
+     elif pa.types.is_struct(array.type):
+         # feature must be a dict or Sequence(subfeatures_dict)
+         if isinstance(feature, Sequence) and isinstance(feature.feature, dict):
+             feature = {
+                 name: Sequence(subfeature, length=feature.length) for name, subfeature in feature.feature.items()
+             }
+         if isinstance(feature, dict):
+             arrays = [_e(array.field(name), subfeature) for name, subfeature in feature.items()]
+             return pa.StructArray.from_arrays(arrays, names=list(feature), mask=array.is_null())
+     elif pa.types.is_list(array.type):
+         # feature must be either [subfeature] or Sequence(subfeature)
+         # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
+         array_offsets = _combine_list_array_offsets_with_mask(array)
+         if isinstance(feature, list):
+             return pa.ListArray.from_arrays(array_offsets, _e(array.values, feature[0]))
+         if isinstance(feature, Sequence) and feature.length == -1:
+             return pa.ListArray.from_arrays(array_offsets, _e(array.values, feature.feature))
+     elif pa.types.is_fixed_size_list(array.type):
+         # feature must be Sequence(subfeature)
+         if isinstance(feature, Sequence) and feature.length > -1:
+             array_values = array.values[
+                 array.offset * array.type.list_size : (array.offset + len(array)) * array.type.list_size
+             ]
+             embedded_array_values = _e(array_values, feature.feature)
+             if config.PYARROW_VERSION.major < 15:
+                 return pa.Array.from_buffers(
+                     pa.list_(array_values.type, feature.length),
+                     len(array),
+                     [array.is_valid().buffers()[1]],
+                     children=[embedded_array_values],
+                 )
+             else:
+                 return pa.FixedSizeListArray.from_arrays(embedded_array_values, feature.length, mask=array.is_null())
+     if not isinstance(feature, (Sequence, dict, list, tuple)):
+         return array
+     raise TypeError(f"Couldn't embed array of type\n{_short_str(array.type)}\nwith\n{_short_str(feature)}")
+
+
+ class CastError(ValueError):
+     """When it's not possible to cast an Arrow table to a specific schema or set of features"""
+
+     def __init__(self, *args, table_column_names: List[str], requested_column_names: List[str]) -> None:
+         super().__init__(*args)
+         self.table_column_names = table_column_names
+         self.requested_column_names = requested_column_names
+
+     def __reduce__(self):
+         # Fix unpickling: TypeError: __init__() missing 2 required keyword-only arguments: 'table_column_names' and 'requested_column_names'
+         return partial(
+             CastError, table_column_names=self.table_column_names, requested_column_names=self.requested_column_names
+         ), ()
+
+     def details(self):
+         new_columns = set(self.table_column_names) - set(self.requested_column_names)
+         missing_columns = set(self.requested_column_names) - set(self.table_column_names)
+         if new_columns and missing_columns:
+             return f"there are {len(new_columns)} new columns ({_short_str(new_columns)}) and {len(missing_columns)} missing columns ({_short_str(missing_columns)})."
+         elif new_columns:
+             return f"there are {len(new_columns)} new columns ({_short_str(new_columns)})"
+         else:
+             return f"there are {len(missing_columns)} missing columns ({_short_str(missing_columns)})"
+
+
+ def cast_table_to_features(table: pa.Table, features: "Features"):
+     """Cast a table to the arrow schema that corresponds to the requested features.
+
+     Args:
+         table (`pyarrow.Table`):
+             PyArrow table to cast.
+         features ([`Features`]):
+             Target features.
+
+     Returns:
+         table (`pyarrow.Table`): the casted table
+     """
+     if sorted(table.column_names) != sorted(features):
+         raise CastError(
+             f"Couldn't cast\n{_short_str(table.schema)}\nto\n{_short_str(features)}\nbecause column names don't match",
+             table_column_names=table.column_names,
+             requested_column_names=list(features),
+         )
+     arrays = [cast_array_to_feature(table[name], feature) for name, feature in features.items()]
+     return pa.Table.from_arrays(arrays, schema=features.arrow_schema)
+
+
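A minimal sketch of `cast_table_to_features` and the `CastError` path above (illustrative, not part of the uploaded file; assumes `datasets` and `pyarrow` are installed):

```python
import pyarrow as pa
from datasets import Features, Value
from datasets.table import CastError, cast_table_to_features

table = pa.table({"a": [1, 2], "b": ["x", "y"]})
features = Features({"a": Value("float64"), "b": Value("string")})
print(cast_table_to_features(table, features).schema)

# Column names must match exactly, otherwise a CastError is raised:
try:
    cast_table_to_features(table, Features({"a": Value("float64")}))
except CastError as e:
    print(e.details())  # e.g. "there are 1 new columns (...)"
```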
+ def cast_table_to_schema(table: pa.Table, schema: pa.Schema):
+     """Cast a table to the arrow schema. Different from `cast_table_to_features`, this method can preserve nullability.
+
+     Args:
+         table (`pa.Table`):
+             PyArrow table to cast.
+         schema (`pa.Schema`):
+             Target PyArrow schema.
+
+     Returns:
+         `pa.Table`: the casted table
+     """
+     from .features import Features
+
+     features = Features.from_arrow_schema(schema)
+     if sorted(table.column_names) != sorted(features):
+         raise CastError(
+             f"Couldn't cast\n{_short_str(table.schema)}\nto\n{_short_str(features)}\nbecause column names don't match",
+             table_column_names=table.column_names,
+             requested_column_names=list(features),
+         )
+     arrays = [cast_array_to_feature(table[name], feature) for name, feature in features.items()]
+     return pa.Table.from_arrays(arrays, schema=schema)
+
+
+ def embed_table_storage(table: pa.Table):
+     """Embed external data into a table's storage.
+
+     <Added version="2.4.0"/>
+
+     Args:
+         table (`pyarrow.Table`):
+             PyArrow table in which to embed data.
+
+     Returns:
+         table (`pyarrow.Table`): the table with embedded data
+     """
+     from .features.features import Features, require_storage_embed
+
+     features = Features.from_arrow_schema(table.schema)
+     arrays = [
+         embed_array_storage(table[name], feature) if require_storage_embed(feature) else table[name]
+         for name, feature in features.items()
+     ]
+     return pa.Table.from_arrays(arrays, schema=features.arrow_schema)
+
+
+ def table_cast(table: pa.Table, schema: pa.Schema):
+     """Improved version of `pa.Table.cast`.
+
+     It supports casting to feature types stored in the schema metadata.
+
+     Args:
+         table (`pyarrow.Table`):
+             PyArrow table to cast.
+         schema (`pyarrow.Schema`):
+             Target PyArrow schema.
+
+     Returns:
+         table (`pyarrow.Table`): the casted table
+     """
+     if table.schema != schema:
+         return cast_table_to_schema(table, schema)
+     elif table.schema.metadata != schema.metadata:
+         return table.replace_schema_metadata(schema.metadata)
+     else:
+         return table
+
+
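A short sketch of `table_cast` (illustrative, not part of the uploaded file; assumes the surrounding `datasets` package is importable):

```python
import pyarrow as pa
from datasets import Features, Value
from datasets.table import table_cast

table = pa.table({"id": [1, 2, 3]})
schema = Features({"id": Value("int32")}).arrow_schema

# Schemas differ (int64 vs int32), so this dispatches to cast_table_to_schema.
casted = table_cast(table, schema)
print(casted.schema)  # id: int32, plus the Hugging Face feature metadata
```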
+ def table_flatten(table: pa.Table):
+     """Improved version of `pa.Table.flatten`.
+
+     It behaves like `pa.Table.flatten` in the sense that it does a one-step flatten of struct columns
+     into one column per struct field, but it also updates the metadata and skips decodable features
+     unless the `decode` attribute of these features is set to False.
+
+     Args:
+         table (`pa.Table`):
+             PyArrow table to flatten.
+
+     Returns:
+         `Table`: the flattened table
+     """
+     from .features import Features
+
+     features = Features.from_arrow_schema(table.schema)
+     if any(hasattr(subfeature, "flatten") and subfeature.flatten() == subfeature for subfeature in features.values()):
+         flat_arrays = []
+         flat_column_names = []
+         for field in table.schema:
+             array = table.column(field.name)
+             subfeature = features[field.name]
+             if pa.types.is_struct(field.type) and (
+                 not hasattr(subfeature, "flatten") or subfeature.flatten() != subfeature
+             ):
+                 flat_arrays.extend(array.flatten())
+                 flat_column_names.extend([f"{field.name}.{subfield.name}" for subfield in field.type])
+             else:
+                 flat_arrays.append(array)
+                 flat_column_names.append(field.name)
+         flat_table = pa.Table.from_arrays(
+             flat_arrays,
+             names=flat_column_names,
+         )
+     else:
+         flat_table = table.flatten()
+     # Preserve complex types in the metadata
+     flat_features = features.flatten(max_depth=2)
+     flat_features = Features({column_name: flat_features[column_name] for column_name in flat_table.column_names})
+     return flat_table.replace_schema_metadata(flat_features.arrow_schema.metadata)
+
+
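A minimal sketch of `table_flatten` on a struct column (illustrative, not part of the uploaded file):

```python
import pyarrow as pa
from datasets.table import table_flatten

# A struct column "point" with fields x and y...
table = pa.table({"point": [{"x": 1, "y": 2}, {"x": 3, "y": 4}]})

# ...is flattened into one column per field, with feature metadata kept in sync.
flat = table_flatten(table)
print(flat.column_names)  # ['point.x', 'point.y']
```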
+ def table_visitor(table: pa.Table, function: Callable[[pa.Array, "FeatureType"], None]):
+     """Visit all arrays in a table and apply a function to them.
+
+     Args:
+         table (`pyarrow.Table`):
+             PyArrow table to visit.
+         function (`Callable[[pa.Array, FeatureType], None]`):
+             Function to apply to each array; it is called with the array and its feature type.
+     """
+     from .features import Features, Sequence
+
+     features = Features.from_arrow_schema(table.schema)
+
+     def _visit(array, feature):
+         if isinstance(array, pa.ChunkedArray):
+             for chunk in array.chunks:
+                 _visit(chunk, feature)
+         else:
+             if isinstance(array, pa.ExtensionArray):
+                 array = array.storage
+             function(array, feature)
+             if pa.types.is_struct(array.type) and not hasattr(feature, "cast_storage"):
+                 if isinstance(feature, Sequence) and isinstance(feature.feature, dict):
+                     feature = {
+                         name: Sequence(subfeature, length=feature.length)
+                         for name, subfeature in feature.feature.items()
+                     }
+                 for name, subfeature in feature.items():
+                     _visit(array.field(name), subfeature)
+             elif pa.types.is_list(array.type):
+                 if isinstance(feature, list):
+                     _visit(array.values, feature[0])
+                 elif isinstance(feature, Sequence):
+                     _visit(array.values, feature.feature)
+
+     for name, feature in features.items():
+         _visit(table[name], feature)
+
+
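A tiny sketch of `table_visitor` (illustrative, not part of the uploaded file); note that, per the body above, the visitor receives both the array and its feature:

```python
import pyarrow as pa
from datasets.table import table_visitor

table = pa.table({"ids": [[1, 2], [3]], "name": ["a", "b"]})

def show(array, feature):
    # Called once per array, including nested list values.
    print(f"{feature}: {array.type}")

table_visitor(table, show)
```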
+ def table_iter(table: Table, batch_size: int, drop_last_batch=False) -> Iterator[pa.Table]:
+     """Iterate over sub-tables of size `batch_size`.
+
+     Args:
+         table (`pyarrow.Table`):
+             PyArrow table to iterate over.
+         batch_size (`int`):
+             Size of each sub-table to yield.
+         drop_last_batch (`bool`, defaults to `False`):
+             Drop the last batch if it is smaller than `batch_size`.
+     """
+     chunks_buffer = []
+     chunks_buffer_size = 0
+     for chunk in table.to_reader(max_chunksize=batch_size):
+         if len(chunk) == 0:
+             continue
+         elif chunks_buffer_size + len(chunk) < batch_size:
+             chunks_buffer.append(chunk)
+             chunks_buffer_size += len(chunk)
+             continue
+         elif chunks_buffer_size + len(chunk) == batch_size:
+             chunks_buffer.append(chunk)
+             yield pa.Table.from_batches(chunks_buffer)
+             chunks_buffer = []
+             chunks_buffer_size = 0
+         else:
+             cropped_chunk_length = batch_size - chunks_buffer_size
+             chunks_buffer.append(chunk.slice(0, cropped_chunk_length))
+             yield pa.Table.from_batches(chunks_buffer)
+             chunks_buffer = [chunk.slice(cropped_chunk_length, len(chunk) - cropped_chunk_length)]
+             chunks_buffer_size = len(chunk) - cropped_chunk_length
+     if not drop_last_batch and chunks_buffer:
+         yield pa.Table.from_batches(chunks_buffer)
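A short usage sketch of the chunk-buffering iteration above (illustrative, not part of the uploaded file):

```python
import pyarrow as pa
from datasets.table import table_iter

table = pa.table({"x": list(range(10))})
for batch in table_iter(table, batch_size=4):
    print(len(batch))  # 4, 4, 2

# With drop_last_batch=True the trailing 2-row batch is skipped.
```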
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/frozenlist/__init__.py ADDED
@@ -0,0 +1,98 @@
+ import os
+ import sys
+ import types
+ from collections.abc import MutableSequence
+ from functools import total_ordering
+ from typing import Any, Type
+
+ __version__ = "1.5.0"
+
+ __all__ = ("FrozenList", "PyFrozenList")  # type: Tuple[str, ...]
+
+
+ NO_EXTENSIONS = bool(os.environ.get("FROZENLIST_NO_EXTENSIONS"))  # type: bool
+
+
+ @total_ordering
+ class FrozenList(MutableSequence):
+     __slots__ = ("_frozen", "_items")
+
+     if sys.version_info >= (3, 9):
+         __class_getitem__ = classmethod(types.GenericAlias)
+     else:
+
+         @classmethod
+         def __class_getitem__(
+             cls: Type["FrozenList"],
+             cls_item: Any,
+         ) -> Type["FrozenList"]:
+             return cls
+
+     def __init__(self, items=None):
+         self._frozen = False
+         if items is not None:
+             items = list(items)
+         else:
+             items = []
+         self._items = items
+
+     @property
+     def frozen(self):
+         return self._frozen
+
+     def freeze(self):
+         self._frozen = True
+
+     def __getitem__(self, index):
+         return self._items[index]
+
+     def __setitem__(self, index, value):
+         if self._frozen:
+             raise RuntimeError("Cannot modify frozen list.")
+         self._items[index] = value
+
+     def __delitem__(self, index):
+         if self._frozen:
+             raise RuntimeError("Cannot modify frozen list.")
+         del self._items[index]
+
+     def __len__(self):
+         return self._items.__len__()
+
+     def __iter__(self):
+         return self._items.__iter__()
+
+     def __reversed__(self):
+         return self._items.__reversed__()
+
+     def __eq__(self, other):
+         return list(self) == other
+
+     def __le__(self, other):
+         return list(self) <= other
+
+     def insert(self, pos, item):
+         if self._frozen:
+             raise RuntimeError("Cannot modify frozen list.")
+         self._items.insert(pos, item)
+
+     def __repr__(self):
+         return f"<FrozenList(frozen={self._frozen}, {self._items!r})>"
+
+     def __hash__(self):
+         if self._frozen:
+             return hash(tuple(self))
+         else:
+             raise RuntimeError("Cannot hash unfrozen list.")
+
+
+ PyFrozenList = FrozenList
+
+
+ if not NO_EXTENSIONS:
+     try:
+         from ._frozenlist import FrozenList as CFrozenList  # type: ignore
+     except ImportError:  # pragma: no cover
+         pass
+     else:
+         FrozenList = CFrozenList  # type: ignore
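A quick usage sketch of the class above (illustrative, not part of the uploaded file):

```python
from frozenlist import FrozenList

fl = FrozenList([1, 2])
fl.append(3)          # mutable while unfrozen
fl.freeze()
print(fl.frozen)      # True
print(hash(fl) == hash((1, 2, 3)))  # hashable once frozen
try:
    fl.append(4)
except RuntimeError as e:
    print(e)          # Cannot modify frozen list.
```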
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/frozenlist/__init__.pyi ADDED
@@ -0,0 +1,47 @@
+ from typing import (
+     Generic,
+     Iterable,
+     Iterator,
+     List,
+     MutableSequence,
+     Optional,
+     TypeVar,
+     Union,
+     overload,
+ )
+
+ _T = TypeVar("_T")
+ _Arg = Union[List[_T], Iterable[_T]]
+
+ class FrozenList(MutableSequence[_T], Generic[_T]):
+     def __init__(self, items: Optional[_Arg[_T]] = None) -> None: ...
+     @property
+     def frozen(self) -> bool: ...
+     def freeze(self) -> None: ...
+     @overload
+     def __getitem__(self, i: int) -> _T: ...
+     @overload
+     def __getitem__(self, s: slice) -> FrozenList[_T]: ...
+     @overload
+     def __setitem__(self, i: int, o: _T) -> None: ...
+     @overload
+     def __setitem__(self, s: slice, o: Iterable[_T]) -> None: ...
+     @overload
+     def __delitem__(self, i: int) -> None: ...
+     @overload
+     def __delitem__(self, i: slice) -> None: ...
+     def __len__(self) -> int: ...
+     def __iter__(self) -> Iterator[_T]: ...
+     def __reversed__(self) -> Iterator[_T]: ...
+     def __eq__(self, other: object) -> bool: ...
+     def __le__(self, other: FrozenList[_T]) -> bool: ...
+     def __ne__(self, other: object) -> bool: ...
+     def __lt__(self, other: FrozenList[_T]) -> bool: ...
+     def __ge__(self, other: FrozenList[_T]) -> bool: ...
+     def __gt__(self, other: FrozenList[_T]) -> bool: ...
+     def insert(self, pos: int, item: _T) -> None: ...
+     def __repr__(self) -> str: ...
+     def __hash__(self) -> int: ...
+
+ # types for C accelerators are the same
+ CFrozenList = PyFrozenList = FrozenList
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/frozenlist/_frozenlist.pyx ADDED
@@ -0,0 +1,123 @@
+ import sys
+ import types
+ from collections.abc import MutableSequence
+
+
+ cdef class FrozenList:
+
+     if sys.version_info >= (3, 9):
+         __class_getitem__ = classmethod(types.GenericAlias)
+     else:
+         @classmethod
+         def __class_getitem__(cls, cls_item):
+             return cls
+
+     cdef readonly bint frozen
+     cdef list _items
+
+     def __init__(self, items=None):
+         self.frozen = False
+         if items is not None:
+             items = list(items)
+         else:
+             items = []
+         self._items = items
+
+     cdef object _check_frozen(self):
+         if self.frozen:
+             raise RuntimeError("Cannot modify frozen list.")
+
+     cdef inline object _fast_len(self):
+         return len(self._items)
+
+     def freeze(self):
+         self.frozen = True
+
+     def __getitem__(self, index):
+         return self._items[index]
+
+     def __setitem__(self, index, value):
+         self._check_frozen()
+         self._items[index] = value
+
+     def __delitem__(self, index):
+         self._check_frozen()
+         del self._items[index]
+
+     def __len__(self):
+         return self._fast_len()
+
+     def __iter__(self):
+         return self._items.__iter__()
+
+     def __reversed__(self):
+         return self._items.__reversed__()
+
+     def __richcmp__(self, other, op):
+         if op == 0:  # <
+             return list(self) < other
+         if op == 1:  # <=
+             return list(self) <= other
+         if op == 2:  # ==
+             return list(self) == other
+         if op == 3:  # !=
+             return list(self) != other
+         if op == 4:  # >
+             return list(self) > other
+         if op == 5:  # >=
+             return list(self) >= other
+
+     def insert(self, pos, item):
+         self._check_frozen()
+         self._items.insert(pos, item)
+
+     def __contains__(self, item):
+         return item in self._items
+
+     def __iadd__(self, items):
+         self._check_frozen()
+         self._items += list(items)
+         return self
+
+     def index(self, item):
+         return self._items.index(item)
+
+     def remove(self, item):
+         self._check_frozen()
+         self._items.remove(item)
+
+     def clear(self):
+         self._check_frozen()
+         self._items.clear()
+
+     def extend(self, items):
+         self._check_frozen()
+         self._items += list(items)
+
+     def reverse(self):
+         self._check_frozen()
+         self._items.reverse()
+
+     def pop(self, index=-1):
+         self._check_frozen()
+         return self._items.pop(index)
+
+     def append(self, item):
+         self._check_frozen()
+         return self._items.append(item)
+
+     def count(self, item):
+         return self._items.count(item)
+
+     def __repr__(self):
+         return '<FrozenList(frozen={}, {!r})>'.format(self.frozen,
+                                                       self._items)
+
+     def __hash__(self):
+         if self.frozen:
+             return hash(tuple(self._items))
+         else:
+             raise RuntimeError("Cannot hash unfrozen list.")
+
+
+ MutableSequence.register(FrozenList)
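The package's `__init__.py` (above) falls back to this Cython class unless extensions are disabled. A small sketch of that selection (illustrative, not part of the uploaded file):

```python
import os

# Setting this before the first import of frozenlist forces the pure-Python
# class (see the NO_EXTENSIONS check in frozenlist/__init__.py).
os.environ["FROZENLIST_NO_EXTENSIONS"] = "1"

from frozenlist import FrozenList, PyFrozenList
print(FrozenList is PyFrozenList)  # True when the C extension is skipped
```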
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/frozenlist/py.typed ADDED
@@ -0,0 +1 @@
+ Marker
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_models.py ADDED
@@ -0,0 +1,516 @@
+ from __future__ import annotations
+
+ import base64
+ import ssl
+ import typing
+ import urllib.parse
+
+ # Functions for typechecking...
+
+
+ ByteOrStr = typing.Union[bytes, str]
+ HeadersAsSequence = typing.Sequence[typing.Tuple[ByteOrStr, ByteOrStr]]
+ HeadersAsMapping = typing.Mapping[ByteOrStr, ByteOrStr]
+ HeaderTypes = typing.Union[HeadersAsSequence, HeadersAsMapping, None]
+
+ Extensions = typing.MutableMapping[str, typing.Any]
+
+
+ def enforce_bytes(value: bytes | str, *, name: str) -> bytes:
+     """
+     Any arguments that are ultimately represented as bytes can be specified
+     either as bytes or as strings.
+
+     However we enforce that any string arguments must only contain characters in
+     the plain ASCII range. chr(0)...chr(127). If you need to use characters
+     outside that range then be precise, and use a byte-wise argument.
+     """
+     if isinstance(value, str):
+         try:
+             return value.encode("ascii")
+         except UnicodeEncodeError:
+             raise TypeError(f"{name} strings may not include unicode characters.")
+     elif isinstance(value, bytes):
+         return value
+
+     seen_type = type(value).__name__
+     raise TypeError(f"{name} must be bytes or str, but got {seen_type}.")
+
+
+ def enforce_url(value: URL | bytes | str, *, name: str) -> URL:
+     """
+     Type check for URL parameters.
+     """
+     if isinstance(value, (bytes, str)):
+         return URL(value)
+     elif isinstance(value, URL):
+         return value
+
+     seen_type = type(value).__name__
+     raise TypeError(f"{name} must be a URL, bytes, or str, but got {seen_type}.")
+
+
+ def enforce_headers(
+     value: HeadersAsMapping | HeadersAsSequence | None = None, *, name: str
+ ) -> list[tuple[bytes, bytes]]:
+     """
+     Convenience function that ensures all items in request or response headers
+     are either bytes or strings in the plain ASCII range.
+     """
+     if value is None:
+         return []
+     elif isinstance(value, typing.Mapping):
+         return [
+             (
+                 enforce_bytes(k, name="header name"),
+                 enforce_bytes(v, name="header value"),
+             )
+             for k, v in value.items()
+         ]
+     elif isinstance(value, typing.Sequence):
+         return [
+             (
+                 enforce_bytes(k, name="header name"),
+                 enforce_bytes(v, name="header value"),
+             )
+             for k, v in value
+         ]
+
+     seen_type = type(value).__name__
+     raise TypeError(
+         f"{name} must be a mapping or sequence of two-tuples, but got {seen_type}."
+     )
+
+
+ def enforce_stream(
+     value: bytes | typing.Iterable[bytes] | typing.AsyncIterable[bytes] | None,
+     *,
+     name: str,
+ ) -> typing.Iterable[bytes] | typing.AsyncIterable[bytes]:
+     if value is None:
+         return ByteStream(b"")
+     elif isinstance(value, bytes):
+         return ByteStream(value)
+     return value
+
+
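A small sketch of the enforcement helpers above (illustrative, not part of the uploaded file; it imports from the private `httpcore._models` module shown here):

```python
from httpcore._models import enforce_bytes, enforce_headers

print(enforce_bytes("GET", name="method"))  # b'GET'
print(enforce_headers({"Accept": "text/html"}, name="headers"))
# [(b'Accept', b'text/html')]

try:
    enforce_bytes("café", name="method")  # non-ASCII str -> TypeError
except TypeError as e:
    print(e)
```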
+ # * https://tools.ietf.org/html/rfc3986#section-3.2.3
+ # * https://url.spec.whatwg.org/#url-miscellaneous
+ # * https://url.spec.whatwg.org/#scheme-state
+ DEFAULT_PORTS = {
+     b"ftp": 21,
+     b"http": 80,
+     b"https": 443,
+     b"ws": 80,
+     b"wss": 443,
+ }
+
+
+ def include_request_headers(
+     headers: list[tuple[bytes, bytes]],
+     *,
+     url: "URL",
+     content: None | bytes | typing.Iterable[bytes] | typing.AsyncIterable[bytes],
+ ) -> list[tuple[bytes, bytes]]:
+     headers_set = set(k.lower() for k, v in headers)
+
+     if b"host" not in headers_set:
+         default_port = DEFAULT_PORTS.get(url.scheme)
+         if url.port is None or url.port == default_port:
+             header_value = url.host
+         else:
+             header_value = b"%b:%d" % (url.host, url.port)
+         headers = [(b"Host", header_value)] + headers
+
+     if (
+         content is not None
+         and b"content-length" not in headers_set
+         and b"transfer-encoding" not in headers_set
+     ):
+         if isinstance(content, bytes):
+             content_length = str(len(content)).encode("ascii")
+             headers += [(b"Content-Length", content_length)]
+         else:
+             headers += [(b"Transfer-Encoding", b"chunked")]  # pragma: nocover
+
+     return headers
+
+
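A quick sketch of the header derivation above (illustrative, not part of the uploaded file): the default port is elided from `Host`, and byte content gets a `Content-Length`.

```python
from httpcore._models import URL, include_request_headers

url = URL("https://www.example.com/")
headers = include_request_headers([], url=url, content=b"hello")
print(headers)
# [(b'Host', b'www.example.com'), (b'Content-Length', b'5')]
```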
+ # Interfaces for byte streams...
+
+
+ class ByteStream:
+     """
+     A container for non-streaming content, and that supports both sync and async
+     stream iteration.
+     """
+
+     def __init__(self, content: bytes) -> None:
+         self._content = content
+
+     def __iter__(self) -> typing.Iterator[bytes]:
+         yield self._content
+
+     async def __aiter__(self) -> typing.AsyncIterator[bytes]:
+         yield self._content
+
+     def __repr__(self) -> str:
+         return f"<{self.__class__.__name__} [{len(self._content)} bytes]>"
+
+
+ class Origin:
+     def __init__(self, scheme: bytes, host: bytes, port: int) -> None:
+         self.scheme = scheme
+         self.host = host
+         self.port = port
+
+     def __eq__(self, other: typing.Any) -> bool:
+         return (
+             isinstance(other, Origin)
+             and self.scheme == other.scheme
+             and self.host == other.host
+             and self.port == other.port
+         )
+
+     def __str__(self) -> str:
+         scheme = self.scheme.decode("ascii")
+         host = self.host.decode("ascii")
+         port = str(self.port)
+         return f"{scheme}://{host}:{port}"
+
+
+ class URL:
+     """
+     Represents the URL against which an HTTP request may be made.
+
+     The URL may either be specified as a plain string, for convenience:
+
+     ```python
+     url = httpcore.URL("https://www.example.com/")
+     ```
+
+     Or be constructed with explicitly pre-parsed components:
+
+     ```python
+     url = httpcore.URL(scheme=b'https', host=b'www.example.com', port=None, target=b'/')
+     ```
+
+     Using this second more explicit style allows integrations that are using
+     `httpcore` to pass through URLs that have already been parsed in order to use
+     libraries such as `rfc-3986` rather than relying on the stdlib. It also ensures
+     that URL parsing is treated identically at both the networking level and at any
+     higher layers of abstraction.
+
+     The four components are important here, as they allow the URL to be precisely
+     specified in a pre-parsed format. They also allow certain types of request to
+     be created that could not otherwise be expressed.
+
+     For example, an HTTP request to `http://www.example.com/` forwarded via a proxy
+     at `http://localhost:8080`...
+
+     ```python
+     # Constructs an HTTP request with a complete URL as the target:
+     # GET https://www.example.com/ HTTP/1.1
+     url = httpcore.URL(
+         scheme=b'http',
+         host=b'localhost',
+         port=8080,
+         target=b'https://www.example.com/'
+     )
+     request = httpcore.Request(
+         method="GET",
+         url=url
+     )
+     ```
+
+     Another example is constructing an `OPTIONS *` request...
+
+     ```python
+     # Constructs an 'OPTIONS *' HTTP request:
+     # OPTIONS * HTTP/1.1
+     url = httpcore.URL(scheme=b'https', host=b'www.example.com', target=b'*')
+     request = httpcore.Request(method="OPTIONS", url=url)
+     ```
+
+     This kind of request is not possible to formulate with a URL string,
+     because the `/` delimiter is always used to demark the target from the
+     host/port portion of the URL.
+
+     For convenience, string-like arguments may be specified either as strings or
+     as bytes. However, once a request is being issued over the wire, the URL
+     components are always ultimately required to be a bytewise representation.
+
+     In order to avoid any ambiguity over character encodings, when strings are used
+     as arguments, they must be strictly limited to the ASCII range `chr(0)`-`chr(127)`.
+     If you require a bytewise representation that is outside this range you must
+     handle the character encoding directly, and pass a bytes instance.
+     """
+
+     def __init__(
+         self,
+         url: bytes | str = "",
+         *,
+         scheme: bytes | str = b"",
+         host: bytes | str = b"",
+         port: int | None = None,
+         target: bytes | str = b"",
+     ) -> None:
+         """
+         Parameters:
+             url: The complete URL as a string or bytes.
+             scheme: The URL scheme as a string or bytes.
+                 Typically either `"http"` or `"https"`.
+             host: The URL host as a string or bytes. Such as `"www.example.com"`.
+             port: The port to connect to. Either an integer or `None`.
+             target: The target of the HTTP request. Such as `"/items?search=red"`.
+         """
+         if url:
+             parsed = urllib.parse.urlparse(enforce_bytes(url, name="url"))
+             self.scheme = parsed.scheme
+             self.host = parsed.hostname or b""
+             self.port = parsed.port
+             self.target = (parsed.path or b"/") + (
+                 b"?" + parsed.query if parsed.query else b""
+             )
+         else:
+             self.scheme = enforce_bytes(scheme, name="scheme")
+             self.host = enforce_bytes(host, name="host")
+             self.port = port
+             self.target = enforce_bytes(target, name="target")
+
+     @property
+     def origin(self) -> Origin:
+         default_port = {
+             b"http": 80,
+             b"https": 443,
+             b"ws": 80,
+             b"wss": 443,
+             b"socks5": 1080,
+             b"socks5h": 1080,
+         }[self.scheme]
+         return Origin(
+             scheme=self.scheme, host=self.host, port=self.port or default_port
+         )
+
+     def __eq__(self, other: typing.Any) -> bool:
+         return (
+             isinstance(other, URL)
+             and other.scheme == self.scheme
+             and other.host == self.host
+             and other.port == self.port
+             and other.target == self.target
+         )
+
+     def __bytes__(self) -> bytes:
+         if self.port is None:
+             return b"%b://%b%b" % (self.scheme, self.host, self.target)
+         return b"%b://%b:%d%b" % (self.scheme, self.host, self.port, self.target)
+
+     def __repr__(self) -> str:
+         return (
+             f"{self.__class__.__name__}(scheme={self.scheme!r}, "
+             f"host={self.host!r}, port={self.port!r}, target={self.target!r})"
+         )
+
+
+ class Request:
+     """
+     An HTTP request.
+     """
+
+     def __init__(
+         self,
+         method: bytes | str,
+         url: URL | bytes | str,
+         *,
+         headers: HeaderTypes = None,
+         content: bytes
+         | typing.Iterable[bytes]
+         | typing.AsyncIterable[bytes]
+         | None = None,
+         extensions: Extensions | None = None,
+     ) -> None:
+         """
+         Parameters:
+             method: The HTTP request method, either as a string or bytes.
+                 For example: `GET`.
+             url: The request URL, either as a `URL` instance, or as a string or bytes.
+                 For example: `"https://www.example.com".`
+             headers: The HTTP request headers.
+             content: The content of the request body.
+             extensions: A dictionary of optional extra information included on
+                 the request. Possible keys include `"timeout"`, and `"trace"`.
+         """
+         self.method: bytes = enforce_bytes(method, name="method")
+         self.url: URL = enforce_url(url, name="url")
+         self.headers: list[tuple[bytes, bytes]] = enforce_headers(
+             headers, name="headers"
+         )
+         self.stream: typing.Iterable[bytes] | typing.AsyncIterable[bytes] = (
+             enforce_stream(content, name="content")
+         )
+         self.extensions = {} if extensions is None else extensions
+
+         if "target" in self.extensions:
+             self.url = URL(
+                 scheme=self.url.scheme,
+                 host=self.url.host,
+                 port=self.url.port,
+                 target=self.extensions["target"],
+             )
+
+     def __repr__(self) -> str:
+         return f"<{self.__class__.__name__} [{self.method!r}]>"
+
+
+ class Response:
+     """
+     An HTTP response.
+     """
+
+     def __init__(
+         self,
+         status: int,
+         *,
+         headers: HeaderTypes = None,
+         content: bytes
+         | typing.Iterable[bytes]
+         | typing.AsyncIterable[bytes]
+         | None = None,
+         extensions: Extensions | None = None,
+     ) -> None:
+         """
+         Parameters:
+             status: The HTTP status code of the response. For example `200`.
+             headers: The HTTP response headers.
+             content: The content of the response body.
+             extensions: A dictionary of optional extra information included on
+                 the response. Possible keys include `"http_version"`,
+                 `"reason_phrase"`, and `"network_stream"`.
+         """
+         self.status: int = status
+         self.headers: list[tuple[bytes, bytes]] = enforce_headers(
+             headers, name="headers"
+         )
+         self.stream: typing.Iterable[bytes] | typing.AsyncIterable[bytes] = (
+             enforce_stream(content, name="content")
+         )
+         self.extensions = {} if extensions is None else extensions
+
+         self._stream_consumed = False
+
+     @property
+     def content(self) -> bytes:
+         if not hasattr(self, "_content"):
+             if isinstance(self.stream, typing.Iterable):
+                 raise RuntimeError(
+                     "Attempted to access 'response.content' on a streaming response. "
+                     "Call 'response.read()' first."
+                 )
+             else:
+                 raise RuntimeError(
+                     "Attempted to access 'response.content' on a streaming response. "
+                     "Call 'await response.aread()' first."
+                 )
+         return self._content
+
+     def __repr__(self) -> str:
+         return f"<{self.__class__.__name__} [{self.status}]>"
+
+     # Sync interface...
+
+     def read(self) -> bytes:
+         if not isinstance(self.stream, typing.Iterable):  # pragma: nocover
+             raise RuntimeError(
+                 "Attempted to read an asynchronous response using 'response.read()'. "
+                 "You should use 'await response.aread()' instead."
+             )
+         if not hasattr(self, "_content"):
+             self._content = b"".join([part for part in self.iter_stream()])
+         return self._content
+
+     def iter_stream(self) -> typing.Iterator[bytes]:
+         if not isinstance(self.stream, typing.Iterable):  # pragma: nocover
+             raise RuntimeError(
+                 "Attempted to stream an asynchronous response using 'for ... in "
+                 "response.iter_stream()'. "
+                 "You should use 'async for ... in response.aiter_stream()' instead."
+             )
+         if self._stream_consumed:
+             raise RuntimeError(
+                 "Attempted to call 'for ... in response.iter_stream()' more than once."
+             )
+         self._stream_consumed = True
+         for chunk in self.stream:
+             yield chunk
+
+     def close(self) -> None:
+         if not isinstance(self.stream, typing.Iterable):  # pragma: nocover
+             raise RuntimeError(
+                 "Attempted to close an asynchronous response using 'response.close()'. "
+                 "You should use 'await response.aclose()' instead."
+             )
+         if hasattr(self.stream, "close"):
+             self.stream.close()
+
+     # Async interface...
+
+     async def aread(self) -> bytes:
+         if not isinstance(self.stream, typing.AsyncIterable):  # pragma: nocover
+             raise RuntimeError(
+                 "Attempted to read a synchronous response using "
+                 "'await response.aread()'. "
+                 "You should use 'response.read()' instead."
+             )
+         if not hasattr(self, "_content"):
+             self._content = b"".join([part async for part in self.aiter_stream()])
+         return self._content
+
+     async def aiter_stream(self) -> typing.AsyncIterator[bytes]:
+         if not isinstance(self.stream, typing.AsyncIterable):  # pragma: nocover
+             raise RuntimeError(
+                 "Attempted to stream a synchronous response using 'async for ... in "
+                 "response.aiter_stream()'. "
+                 "You should use 'for ... in response.iter_stream()' instead."
+             )
+         if self._stream_consumed:
+             raise RuntimeError(
+                 "Attempted to call 'async for ... in response.aiter_stream()' "
+                 "more than once."
+             )
+         self._stream_consumed = True
+         async for chunk in self.stream:
+             yield chunk
+
+     async def aclose(self) -> None:
+         if not isinstance(self.stream, typing.AsyncIterable):  # pragma: nocover
+             raise RuntimeError(
+                 "Attempted to close a synchronous response using "
+                 "'await response.aclose()'. "
+                 "You should use 'response.close()' instead."
+             )
+         if hasattr(self.stream, "aclose"):
+             await self.stream.aclose()
+
+
+ class Proxy:
+     def __init__(
+         self,
+         url: URL | bytes | str,
+         auth: tuple[bytes | str, bytes | str] | None = None,
+         headers: HeadersAsMapping | HeadersAsSequence | None = None,
+         ssl_context: ssl.SSLContext | None = None,
+     ):
+         self.url = enforce_url(url, name="url")
+         self.headers = enforce_headers(headers, name="headers")
+         self.ssl_context = ssl_context
+
+         if auth is not None:
+             username = enforce_bytes(auth[0], name="auth")
+             password = enforce_bytes(auth[1], name="auth")
+             userpass = username + b":" + password
+             authorization = b"Basic " + base64.b64encode(userpass)
+             self.auth: tuple[bytes, bytes] | None = (username, password)
+             self.headers = [(b"Proxy-Authorization", authorization)] + self.headers
+         else:
+             self.auth = None
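A short sketch of the model classes above (illustrative, not part of the uploaded file; uses only the public `httpcore` names defined here):

```python
import httpcore

request = httpcore.Request("GET", "https://www.example.com/")
print(request.method, bytes(request.url))  # b'GET' b'https://www.example.com/'

response = httpcore.Response(200, content=b"Hello, world!")
print(response.read())    # b'Hello, world!'
print(response.content)   # cached after read()
```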
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_ssl.py ADDED
@@ -0,0 +1,9 @@
+ import ssl
+
+ import certifi
+
+
+ def default_ssl_context() -> ssl.SSLContext:
+     context = ssl.create_default_context()
+     context.load_verify_locations(certifi.where())
+     return context
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_utils.py ADDED
@@ -0,0 +1,37 @@
+ from __future__ import annotations
+
+ import select
+ import socket
+ import sys
+
+
+ def is_socket_readable(sock: socket.socket | None) -> bool:
+     """
+     Return whether a socket, as identified by its file descriptor, is readable.
+     "A socket is readable" means that the read buffer isn't empty, i.e. that calling
+     .recv() on it would immediately return some data.
+     """
+     # NOTE: we want to check for readability without actually attempting to read, because
+     # we don't want to block forever if it's not readable.
+
+     # In the case that the socket no longer exists, or cannot return a file
+     # descriptor, we treat it as being readable, as if the next read operation
+     # on it is ready to return the terminating `b""`.
+     sock_fd = None if sock is None else sock.fileno()
+     if sock_fd is None or sock_fd < 0:  # pragma: nocover
+         return True
+
+     # The implementation below was stolen from:
+     # https://github.com/python-trio/trio/blob/20ee2b1b7376db637435d80e266212a35837ddcc/trio/_socket.py#L471-L478
+     # See also: https://github.com/encode/httpcore/pull/193#issuecomment-703129316
+
+     # Use select.select on Windows or when poll is unavailable, and select.poll
+     # everywhere else. (E.g. when eventlet is in use. See #327)
+     if (
+         sys.platform == "win32" or getattr(select, "poll", None) is None
+     ):  # pragma: nocover
+         rready, _, _ = select.select([sock_fd], [], [], 0)
+         return bool(rready)
+     p = select.poll()
+     p.register(sock_fd, select.POLLIN)
+     return bool(p.poll(0))
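A small demo of the readability check above (illustrative, not part of the uploaded file; assumes a platform where `socket.socketpair()` is available):

```python
import socket
from httpcore._utils import is_socket_readable

a, b = socket.socketpair()
print(is_socket_readable(a))  # False: nothing buffered yet
b.send(b"ping")
print(is_socket_readable(a))  # True: recv() would return data immediately
a.close()
b.close()
```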
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/py.typed ADDED
File without changes
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/jinja2-3.1.6.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
+ pip
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/jinja2-3.1.6.dist-info/METADATA ADDED
@@ -0,0 +1,84 @@
+ Metadata-Version: 2.4
+ Name: Jinja2
+ Version: 3.1.6
+ Summary: A very fast and expressive template engine.
+ Maintainer-email: Pallets <contact@palletsprojects.com>
+ Requires-Python: >=3.7
+ Description-Content-Type: text/markdown
+ Classifier: Development Status :: 5 - Production/Stable
+ Classifier: Environment :: Web Environment
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: BSD License
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python
+ Classifier: Topic :: Internet :: WWW/HTTP :: Dynamic Content
+ Classifier: Topic :: Text Processing :: Markup :: HTML
+ Classifier: Typing :: Typed
+ License-File: LICENSE.txt
+ Requires-Dist: MarkupSafe>=2.0
+ Requires-Dist: Babel>=2.7 ; extra == "i18n"
+ Project-URL: Changes, https://jinja.palletsprojects.com/changes/
+ Project-URL: Chat, https://discord.gg/pallets
+ Project-URL: Documentation, https://jinja.palletsprojects.com/
+ Project-URL: Donate, https://palletsprojects.com/donate
+ Project-URL: Source, https://github.com/pallets/jinja/
+ Provides-Extra: i18n
+
+ # Jinja
+
+ Jinja is a fast, expressive, extensible templating engine. Special
+ placeholders in the template allow writing code similar to Python
+ syntax. Then the template is passed data to render the final document.
+
+ It includes:
+
+ - Template inheritance and inclusion.
+ - Define and import macros within templates.
+ - HTML templates can use autoescaping to prevent XSS from untrusted
+ user input.
+ - A sandboxed environment can safely render untrusted templates.
+ - AsyncIO support for generating templates and calling async
+ functions.
+ - I18N support with Babel.
+ - Templates are compiled to optimized Python code just-in-time and
+ cached, or can be compiled ahead-of-time.
+ - Exceptions point to the correct line in templates to make debugging
+ easier.
+ - Extensible filters, tests, functions, and even syntax.
+
+ Jinja's philosophy is that while application logic belongs in Python if
+ possible, it shouldn't make the template designer's job difficult by
+ restricting functionality too much.
+
+
+ ## In A Nutshell
+
+ ```jinja
+ {% extends "base.html" %}
+ {% block title %}Members{% endblock %}
+ {% block content %}
+ <ul>
+ {% for user in users %}
+ <li><a href="{{ user.url }}">{{ user.username }}</a></li>
+ {% endfor %}
+ </ul>
+ {% endblock %}
+ ```
+
+ ## Donate
+
+ The Pallets organization develops and supports Jinja and other popular
+ packages. In order to grow the community of contributors and users, and
+ allow the maintainers to devote more time to the projects, [please
+ donate today][].
+
+ [please donate today]: https://palletsprojects.com/donate
+
+ ## Contributing
+
+ See our [detailed contributing documentation][contrib] for many ways to
+ contribute, including reporting issues, requesting features, asking or answering
+ questions, and making PRs.
+
+ [contrib]: https://palletsprojects.com/contributing/
+
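As a quick illustration of the templating engine described in the metadata above (not part of the uploaded file; uses the standard Jinja2 API):

```python
from jinja2 import Environment

env = Environment(autoescape=True)
template = env.from_string("Hello, {{ name }}!")
print(template.render(name="<World>"))  # Hello, &lt;World&gt;!
```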
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/jinja2-3.1.6.dist-info/RECORD ADDED
@@ -0,0 +1,57 @@
+ jinja2-3.1.6.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
+ jinja2-3.1.6.dist-info/METADATA,sha256=aMVUj7Z8QTKhOJjZsx7FDGvqKr3ZFdkh8hQ1XDpkmcg,2871
+ jinja2-3.1.6.dist-info/RECORD,,
+ jinja2-3.1.6.dist-info/WHEEL,sha256=_2ozNFCLWc93bK4WKHCO-eDUENDlo-dgc9cU3qokYO4,82
+ jinja2-3.1.6.dist-info/entry_points.txt,sha256=OL85gYU1eD8cuPlikifFngXpeBjaxl6rIJ8KkC_3r-I,58
+ jinja2-3.1.6.dist-info/licenses/LICENSE.txt,sha256=O0nc7kEF6ze6wQ-vG-JgQI_oXSUrjp3y4JefweCUQ3s,1475
+ jinja2/__init__.py,sha256=xxepO9i7DHsqkQrgBEduLtfoz2QCuT6_gbL4XSN1hbU,1928
+ jinja2/__pycache__/__init__.cpython-312.pyc,,
+ jinja2/__pycache__/_identifier.cpython-312.pyc,,
+ jinja2/__pycache__/async_utils.cpython-312.pyc,,
+ jinja2/__pycache__/bccache.cpython-312.pyc,,
+ jinja2/__pycache__/compiler.cpython-312.pyc,,
+ jinja2/__pycache__/constants.cpython-312.pyc,,
+ jinja2/__pycache__/debug.cpython-312.pyc,,
+ jinja2/__pycache__/defaults.cpython-312.pyc,,
+ jinja2/__pycache__/environment.cpython-312.pyc,,
+ jinja2/__pycache__/exceptions.cpython-312.pyc,,
+ jinja2/__pycache__/ext.cpython-312.pyc,,
+ jinja2/__pycache__/filters.cpython-312.pyc,,
+ jinja2/__pycache__/idtracking.cpython-312.pyc,,
+ jinja2/__pycache__/lexer.cpython-312.pyc,,
+ jinja2/__pycache__/loaders.cpython-312.pyc,,
+ jinja2/__pycache__/meta.cpython-312.pyc,,
+ jinja2/__pycache__/nativetypes.cpython-312.pyc,,
+ jinja2/__pycache__/nodes.cpython-312.pyc,,
+ jinja2/__pycache__/optimizer.cpython-312.pyc,,
+ jinja2/__pycache__/parser.cpython-312.pyc,,
+ jinja2/__pycache__/runtime.cpython-312.pyc,,
+ jinja2/__pycache__/sandbox.cpython-312.pyc,,
+ jinja2/__pycache__/tests.cpython-312.pyc,,
+ jinja2/__pycache__/utils.cpython-312.pyc,,
+ jinja2/__pycache__/visitor.cpython-312.pyc,,
+ jinja2/_identifier.py,sha256=_zYctNKzRqlk_murTNlzrju1FFJL7Va_Ijqqd7ii2lU,1958
+ jinja2/async_utils.py,sha256=vK-PdsuorOMnWSnEkT3iUJRIkTnYgO2T6MnGxDgHI5o,2834
+ jinja2/bccache.py,sha256=gh0qs9rulnXo0PhX5jTJy2UHzI8wFnQ63o_vw7nhzRg,14061
+ jinja2/compiler.py,sha256=9RpCQl5X88BHllJiPsHPh295Hh0uApvwFJNQuutULeM,74131
+ jinja2/constants.py,sha256=GMoFydBF_kdpaRKPoM5cl5MviquVRLVyZtfp5-16jg0,1433
+ jinja2/debug.py,sha256=CnHqCDHd-BVGvti_8ZsTolnXNhA3ECsY-6n_2pwU8Hw,6297
+ jinja2/defaults.py,sha256=boBcSw78h-lp20YbaXSJsqkAI2uN_mD_TtCydpeq5wU,1267
+ jinja2/environment.py,sha256=9nhrP7Ch-NbGX00wvyr4yy-uhNHq2OCc60ggGrni_fk,61513
+ jinja2/exceptions.py,sha256=ioHeHrWwCWNaXX1inHmHVblvc4haO7AXsjCp3GfWvx0,5071
+ jinja2/ext.py,sha256=5PF5eHfh8mXAIxXHHRB2xXbXohi8pE3nHSOxa66uS7E,31875
+ jinja2/filters.py,sha256=PQ_Egd9n9jSgtnGQYyF4K5j2nYwhUIulhPnyimkdr-k,55212
+ jinja2/idtracking.py,sha256=-ll5lIp73pML3ErUYiIJj7tdmWxcH_IlDv3yA_hiZYo,10555
+ jinja2/lexer.py,sha256=LYiYio6br-Tep9nPcupWXsPEtjluw3p1mU-lNBVRUfk,29786
+ jinja2/loaders.py,sha256=wIrnxjvcbqh5VwW28NSkfotiDq8qNCxIOSFbGUiSLB4,24055
+ jinja2/meta.py,sha256=OTDPkaFvU2Hgvx-6akz7154F8BIWaRmvJcBFvwopHww,4397
+ jinja2/nativetypes.py,sha256=7GIGALVJgdyL80oZJdQUaUfwSt5q2lSSZbXt0dNf_M4,4210
+ jinja2/nodes.py,sha256=m1Duzcr6qhZI8JQ6VyJgUNinjAf5bQzijSmDnMsvUx8,34579
+ jinja2/optimizer.py,sha256=rJnCRlQ7pZsEEmMhsQDgC_pKyDHxP5TPS6zVPGsgcu8,1651
+ jinja2/parser.py,sha256=lLOFy3sEmHc5IaEHRiH1sQVnId2moUQzhyeJZTtdY30,40383
+ jinja2/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ jinja2/runtime.py,sha256=gDk-GvdriJXqgsGbHgrcKTP0Yp6zPXzhzrIpCFH3jAU,34249
+ jinja2/sandbox.py,sha256=Mw2aitlY2I8la7FYhcX2YG9BtUYcLnD0Gh3d29cDWrY,15009
+ jinja2/tests.py,sha256=VLsBhVFnWg-PxSBz1MhRnNWgP1ovXk3neO1FLQMeC9Q,5926
+ jinja2/utils.py,sha256=rRp3o9e7ZKS4fyrWRbELyLcpuGVTFcnooaOa1qx_FIk,24129
+ jinja2/visitor.py,sha256=EcnL1PIwf_4RVCOMxsRNuR8AXHbS1qfAdMOE2ngKJz4,3557
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/jinja2-3.1.6.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: flit 3.11.0
+ Root-Is-Purelib: true
+ Tag: py3-none-any
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/jinja2-3.1.6.dist-info/entry_points.txt ADDED
@@ -0,0 +1,3 @@
+ [babel.extractors]
+ jinja2=jinja2.ext:babel_extract[i18n]
+
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml-6.0.2.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
+ pip
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml-6.0.2.dist-info/METADATA ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ Metadata-Version: 2.4
+ Name: lxml
+ Version: 6.0.2
+ Summary: Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API.
+ Home-page: https://lxml.de/
+ Author: lxml dev team
+ Author-email: lxml@lxml.de
+ Maintainer: lxml dev team
+ Maintainer-email: lxml@lxml.de
+ License: BSD-3-Clause
+ Project-URL: Source, https://github.com/lxml/lxml
+ Project-URL: Bug Tracker, https://bugs.launchpad.net/lxml
+ Classifier: Development Status :: 5 - Production/Stable
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Information Technology
+ Classifier: Programming Language :: Cython
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Programming Language :: Python :: 3.14
+ Classifier: Programming Language :: C
+ Classifier: Operating System :: OS Independent
+ Classifier: Topic :: Text Processing :: Markup :: HTML
+ Classifier: Topic :: Text Processing :: Markup :: XML
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Requires-Python: >=3.8
+ License-File: LICENSE.txt
+ License-File: LICENSES.txt
+ Provides-Extra: source
+ Provides-Extra: cssselect
+ Requires-Dist: cssselect>=0.7; extra == "cssselect"
+ Provides-Extra: html5
+ Requires-Dist: html5lib; extra == "html5"
+ Provides-Extra: htmlsoup
+ Requires-Dist: BeautifulSoup4; extra == "htmlsoup"
+ Provides-Extra: html-clean
+ Requires-Dist: lxml_html_clean; extra == "html-clean"
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: home-page
+ Dynamic: license
+ Dynamic: license-file
+ Dynamic: maintainer
+ Dynamic: maintainer-email
+ Dynamic: project-url
+ Dynamic: provides-extra
+ Dynamic: requires-python
+ Dynamic: summary
+
+ lxml is a Pythonic, mature binding for the libxml2 and libxslt libraries.
+ It provides safe and convenient access to these libraries using the
+ ElementTree API.
+
+ It extends the ElementTree API significantly to offer support for XPath,
+ RelaxNG, XML Schema, XSLT, C14N and much more.
+
+ To contact the project, go to the `project home page <https://lxml.de/>`_
+ or see our bug tracker at https://launchpad.net/lxml
+
+ In case you want to use the current in-development version of lxml,
+ you can get it from the github repository at
+ https://github.com/lxml/lxml . Note that this requires Cython to
+ build the sources, see the build instructions on the project home page.
+
+
+ After an official release of a new stable series, bug fixes may become available at
+ https://github.com/lxml/lxml/tree/lxml-6.0 .
+ Running ``pip install https://github.com/lxml/lxml/archive/refs/heads/lxml-6.0.tar.gz``
+ will install the unreleased branch state as soon as a maintenance branch has been established.
+ Note that this requires Cython to be installed at an appropriate version for the build.
+
+ 6.0.2 (2025-09-21)
+ ==================
+
+ Bugs fixed
+ ----------
+
+ * LP#2125278: Compilation with libxml2 2.15.0 failed.
+   Original patch by Xi Ruoyao.
+
+ * Setting ``decompress=True`` in the parser had no effect in libxml2 2.15.
+
+ * Binary wheels on Linux and macOS use the library version libxml2 2.14.6.
+   See https://gitlab.gnome.org/GNOME/libxml2/-/releases/v2.14.6
+
+ * Test failures in libxml2 2.15.0 were fixed.
+
+ Other changes
+ -------------
+
+ * Binary wheels for Py3.9-3.11 on the ``riscv64`` architecture were added.
+
+ * Error constants were updated to match libxml2 2.15.0.
+
+ * Built using Cython 3.1.4.
+
+
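The description above advertises the ElementTree API plus XPath support; a short illustrative sketch (not part of the wheel metadata) showing both access styles:

    from lxml import etree

    root = etree.fromstring("<doc><item id='1'>alpha</item><item id='2'>beta</item></doc>")
    print(root.findtext("item"))                 # ElementTree-compatible -> alpha
    print(root.xpath("//item[@id='2']/text()"))  # lxml XPath extension -> ['beta']

The Provides-Extra/Requires-Dist pairs listed earlier map to optional installs such as ``pip install lxml[cssselect]``.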
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml-6.0.2.dist-info/RECORD ADDED
@@ -0,0 +1,204 @@
+ lxml-6.0.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
+ lxml-6.0.2.dist-info/METADATA,sha256=0qIHkwlNTTMz4-c5e8ZnbbGgt_vpYZHCEoqXyckR95Q,3622
+ lxml-6.0.2.dist-info/RECORD,,
+ lxml-6.0.2.dist-info/WHEEL,sha256=1rk9WkINO5IYd_dGyocTHV6htge3I27wu_Vax8WCadA,152
+ lxml-6.0.2.dist-info/licenses/LICENSE.txt,sha256=j8K1aBM1FuRoRdIUeRet7uFkjnCumrXtbFQXr-9M6FU,1507
+ lxml-6.0.2.dist-info/licenses/LICENSES.txt,sha256=QdSd1AaqDhVIptXyGjDWv2OLPNlutyid00jYPtLkA5I,1514
+ lxml-6.0.2.dist-info/top_level.txt,sha256=NjD988wqaKq512nshNdLt-uDxsjkp4Bh51m6N-dhUrk,5
+ lxml/ElementInclude.py,sha256=PSLeZFvCa76WHJulPLxcZXJtCI2-4dK2CtqPRiYOAQg,8560
+ lxml/__init__.py,sha256=rgOcPyZUNBFL30ylxIxd8fHHWi6TwyIUCi8Av84XWwo,574
+ lxml/__pycache__/ElementInclude.cpython-312.pyc,,
+ lxml/__pycache__/__init__.cpython-312.pyc,,
+ lxml/__pycache__/_elementpath.cpython-312.pyc,,
+ lxml/__pycache__/builder.cpython-312.pyc,,
+ lxml/__pycache__/cssselect.cpython-312.pyc,,
+ lxml/__pycache__/doctestcompare.cpython-312.pyc,,
+ lxml/__pycache__/pyclasslookup.cpython-312.pyc,,
+ lxml/__pycache__/sax.cpython-312.pyc,,
+ lxml/__pycache__/usedoctest.cpython-312.pyc,,
+ lxml/_elementpath.cpython-312-x86_64-linux-gnu.so,sha256=1mB7tnIOx_08TqlYHQQSYJX5SXE4lQZrrnexJZBuvi8,217352
+ lxml/_elementpath.py,sha256=b80hM3ndAkTtRX6v54za3LkkAqCcd0700BbMPZHnTBU,10959
+ lxml/apihelpers.pxi,sha256=9S6bzp-VKCUPZv0f6-el5PsbPFN4FJqSnMCIYilS0eU,63881
+ lxml/builder.cpython-312-x86_64-linux-gnu.so,sha256=iSov_1syOR8dCLyAPsAlfGOkc67Yl1GX7I93Af993ZI,129080
+ lxml/builder.py,sha256=KI1HxHTd4wJqqVfmTRtSbXBQdl2T-P36ih4hT-J3MNw,8485
+ lxml/classlookup.pxi,sha256=Tax8Vhbm5C6UCjgmRFsYjW0pFHxIuTthH1MOgASDLgc,22435
+ lxml/cleanup.pxi,sha256=ZNEpbv7qx_ICPzsxhCaMUHCOfiznOoZ_u3jlYXHAuh4,8454
+ lxml/cssselect.py,sha256=_wZdX-B9p5MeIYABmENIYRWEkwXwX-7jO8Dkf-1rUZU,3306
+ lxml/debug.pxi,sha256=KTcpR8-slUYvmIPbE35GoHDNTb-gjTEvD7bw6LltM4c,1125
+ lxml/docloader.pxi,sha256=bYSZAxxbBEfVzfLXTUWFRfOyUTfV23L7i9hR2dgtSNY,5772
+ lxml/doctestcompare.py,sha256=40EDnkwpcvW86qNa86990OXF42xdHaosSZoiBsEjkzU,17731
+ lxml/dtd.pxi,sha256=IAKkmA4ZoC68sqAWcTqoS8jEGYcPQrVMCZgn4iLBYko,15281
+ lxml/etree.cpython-312-x86_64-linux-gnu.so,sha256=4SybuGGBSJ2dF8AZo5PSuo8BaiLbT3eF8sofIH2RT_U,5395056
+ lxml/etree.h,sha256=_NkGkD3C_jpE4UegvQ6Y32_ycTbUCLyOBz9xfWRPkug,9792
+ lxml/etree.pyx,sha256=2qCb8ZNjsdoB0fUELYwAM4ldLQZWS5_gt-OxKEUM-vs,138014
+ lxml/etree_api.h,sha256=dNCm28ubaVS8SbhLuxs9JvYWg41NoR_yD3qTRr7hliA,17372
+ lxml/extensions.pxi,sha256=xKLad35EQgpsDhs07tw31aKJBBMWIK9rMc0JTXETAUA,32022
+ lxml/html/ElementSoup.py,sha256=s_dLobLMuKn2DhexR-iDXdZrMFg1RjLy1feHsIeZMpw,320
+ lxml/html/__init__.py,sha256=CC5WdsvSptZhr9MZya1qsL6JKVbviYdrHOhXrGhmORg,64425
+ lxml/html/__pycache__/ElementSoup.cpython-312.pyc,,
+ lxml/html/__pycache__/__init__.cpython-312.pyc,,
+ lxml/html/__pycache__/_diffcommand.cpython-312.pyc,,
+ lxml/html/__pycache__/_difflib.cpython-312.pyc,,
+ lxml/html/__pycache__/_html5builder.cpython-312.pyc,,
+ lxml/html/__pycache__/_setmixin.cpython-312.pyc,,
+ lxml/html/__pycache__/builder.cpython-312.pyc,,
+ lxml/html/__pycache__/clean.cpython-312.pyc,,
+ lxml/html/__pycache__/defs.cpython-312.pyc,,
+ lxml/html/__pycache__/diff.cpython-312.pyc,,
+ lxml/html/__pycache__/formfill.cpython-312.pyc,,
+ lxml/html/__pycache__/html5parser.cpython-312.pyc,,
+ lxml/html/__pycache__/soupparser.cpython-312.pyc,,
+ lxml/html/__pycache__/usedoctest.cpython-312.pyc,,
+ lxml/html/_diffcommand.py,sha256=kz_7EP9PmYWuczlZcGiw74_rG0eTKvQ2lrO0rkiwlYE,2081
+ lxml/html/_difflib.cpython-312-x86_64-linux-gnu.so,sha256=XuPeciCf-4e7FpclT9B1viDjUaTJVJg4zkeEW_zXauo,570296
+ lxml/html/_difflib.py,sha256=GgH_jVrZQC8tI8WV_lFZQsXFJ3mOTAPup1zjBJNvkPo,84954
+ lxml/html/_html5builder.py,sha256=NLaT-Ev-aBgJpeQl-6ZbJChLZK5GV-znDkHOJD5VQC4,3230
+ lxml/html/_setmixin.py,sha256=8IFIOLmVz0G-XzsD2tCEkSFWO-dgPBHgvHufC8ni67s,1188
+ lxml/html/builder.py,sha256=Uz3r5uiuCdoN0UPa7ngoLMwAadVIhslzGvlRPGigY_M,6187
+ lxml/html/clean.py,sha256=FghSJy4jt2RaBy6dgusowkU18hxpZ4XLE5ceCK9qxyA,503
+ lxml/html/defs.py,sha256=l_6nh4DHvrsVyWVqWCUUx14QiahRyZv4Melqy_thf6Q,4250
+ lxml/html/diff.cpython-312-x86_64-linux-gnu.so,sha256=iWcPoTRaf2StqEyPKB6xz1j15rvZDLvW_a-KwYLJLyY,377848
+ lxml/html/diff.py,sha256=Za0By-yeYlQEjUu7m7xKB288kKiy8VBS5gT0RPOaFY0,32989
+ lxml/html/formfill.py,sha256=umgk0BbkAI1W6q9musFbL-cDnI_aap2NsLBJqk0UmVI,9681
+ lxml/html/html5parser.py,sha256=dnyC4cqHxywjZSzk0mu2L7THTZjxhg4yF4pncjusa_w,8634
+ lxml/html/soupparser.py,sha256=xo8VvNeOEb-SChuXLKCRECh8J7HBiJLE9sAbEskoUUQ,10197
+ lxml/html/usedoctest.py,sha256=tPlmVz4KK1GRKV5DJLrdVECeqsT9PlDzSqqTodVi5s0,249
+ lxml/includes/__init__.pxd,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ lxml/includes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ lxml/includes/__pycache__/__init__.cpython-312.pyc,,
+ lxml/includes/c14n.pxd,sha256=DBQcOJ0c_YS245ohMb8fmuEC1kFyv1LrNY_8Mf-syZg,1110
+ lxml/includes/config.pxd,sha256=H6Mrl8It21hzRI2hzMId9W48QqkYYkoLT4dniLNmdTw,96
+ lxml/includes/dtdvalid.pxd,sha256=Nv0OykjYehv2lO-Zj--q6jS3TAC_dvQVPSgPMuse1NM,689
+ lxml/includes/etree_defs.h,sha256=h_UjJTmNUqPyKNNrWB9hxmt6v4CF7_83XVY8dOfxqW0,14524
+ lxml/includes/etreepublic.pxd,sha256=Bn4d3JkWPqXputXqI-eJ0xmPrwNFPTfDCa7axgjB7FM,10184
+ lxml/includes/extlibs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ lxml/includes/extlibs/__pycache__/__init__.cpython-312.pyc,,
+ lxml/includes/extlibs/libcharset.h,sha256=GA0FumrbNI4VDGlzq3lf5CLaCwXgn4unw2l0btGQFwI,1510
+ lxml/includes/extlibs/localcharset.h,sha256=Z_AagaQeq0aDE7NPsVOqEf4nO4KcUp46ggo4d0ONIOQ,6338
+ lxml/includes/extlibs/zconf.h,sha256=ROVD_0UUx6mgHWSAGcLJqB0RBcv6PHfx-vbNhur6ir0,16464
+ lxml/includes/extlibs/zlib.h,sha256=ilV5r3LqT0J_8ApBUPDMs_xcHkN59ybhARM7Grn8YAw,96829
+ lxml/includes/htmlparser.pxd,sha256=9uASkP5dU7OE2lCOLT-z2e01qSbFlp4ehgwdostF_qk,2802
+ lxml/includes/libexslt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ lxml/includes/libexslt/__pycache__/__init__.cpython-312.pyc,,
+ lxml/includes/libexslt/exslt.h,sha256=eSW5tMJAewSUANLqk7AGEiU8b2BbCNRyauHnez7nKSU,3114
+ lxml/includes/libexslt/exsltconfig.h,sha256=QHxzEbRlv_h0USBvpr0Zrl0Muzlc71VCrvgR6lqnLEY,1172
+ lxml/includes/libexslt/exsltexports.h,sha256=1Jm9KTXm2FUUJIZ6V6-Uw55yG0BMULX3_goyxDd2LL8,1077
+ lxml/includes/libxml/HTMLparser.h,sha256=sU4xGqj-vBtEvzlxA3hBPWJboifvkc4F1hynKXmsl3k,9569
+ lxml/includes/libxml/HTMLtree.h,sha256=Q7UBKFbQ8fx4d_dMnmR6ay8JmfOhopFkDp2B63YkLDU,3517
+ lxml/includes/libxml/SAX.h,sha256=SFnG27EFrYGUB9HDL_xSIGBwEns5pl07rApXWThFZFM,386
+ lxml/includes/libxml/SAX2.h,sha256=RfFP5o3le-Rg8bnA2GW7L7L9_pfXCs3TieODcv1DTWY,4240
+ lxml/includes/libxml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ lxml/includes/libxml/__pycache__/__init__.cpython-312.pyc,,
+ lxml/includes/libxml/c14n.h,sha256=BSBXw6nIZutC8mWvbRrLLmoWjw3wRt-nM93vjXGMCm8,2742
+ lxml/includes/libxml/catalog.h,sha256=H9ssTCaBjtDqc-AZqCk1R7h8F2iD9szqLjJyHpaczXg,4633
+ lxml/includes/libxml/chvalid.h,sha256=TZcceNp6Cw0QlYwIqK9GxyYqL5UiAjpQyjt_yrZGTQE,5087
+ lxml/includes/libxml/debugXML.h,sha256=XXRNI39gJW7bGRC4SzE4ad-SJ906BsUGz3AwOtkKuS4,1667
+ lxml/includes/libxml/dict.h,sha256=SweaPGMtTTf4je6dNTIoEzcfEvpsAT9_PhR7FC0K-rQ,1770
+ lxml/includes/libxml/encoding.h,sha256=haL7ratww2wkIERGmtwUqU2BbTVe52FZFU7MmrOpsPk,9623
+ lxml/includes/libxml/entities.h,sha256=LEOCA826-0f8dhRJzC_2hvUVsSH7lKQjrea9hSTdBbo,4419
+ lxml/includes/libxml/globals.h,sha256=NH8zyRI5cXJJGp5k2aLxOm-reJEGOFX6LYP82GBXRlY,583
+ lxml/includes/libxml/hash.h,sha256=KIIpAYKBfGUU3ydWhGehUyfuauZz_Ps0gyambzQo_rc,7017
+ lxml/includes/libxml/list.h,sha256=oh7iJNQajRA_cHsNk9CcFPYkaW2smf4J_MpedPPjC4k,3128
+ lxml/includes/libxml/nanoftp.h,sha256=22PBtWhJueYLFvwukt4oFooRct_xJA83hbluHRBNXUM,302
+ lxml/includes/libxml/nanohttp.h,sha256=bLbzYjAyAKmP3ComMOPH6XaUImu6bNAESF1HrVtRve0,2124
+ lxml/includes/libxml/parser.h,sha256=Uq7-ce55UUAsvo4n6CiBlNQpmowewvWhOsQtgGM1UQ8,48498
+ lxml/includes/libxml/parserInternals.h,sha256=8_Wr6UgRzm8BRn1RPLxyBkw6BagAdDvVqMA_e181_EI,14539
+ lxml/includes/libxml/relaxng.h,sha256=VXZ74r5Yja06KqypdBHc8neDwPxQ2aMrsWHSdRt5oi4,5991
+ lxml/includes/libxml/schemasInternals.h,sha256=V8M4In3zf24EX55Yt4dcfxwp7NpHGYViKnLKwtyrPJ4,26233
+ lxml/includes/libxml/schematron.h,sha256=8EhPDhvtlMxl9e0C5rSbEruOvzJS5BC_OOFbq9RXZnY,4255
+ lxml/includes/libxml/threads.h,sha256=mT3CgK4lXK7-NDnUOFXqYuCK6fyY70S3BsHF-TnT45k,1619
+ lxml/includes/libxml/tree.h,sha256=zTRLt6h5x6ApyeXgs90CKQZSAl2hKm7b5NxtPKUQFAE,36106
+ lxml/includes/libxml/uri.h,sha256=J9teJHme5z883c4twF5oImEYY-E3xSvhdSGpyRVtvIg,2855
+ lxml/includes/libxml/valid.h,sha256=By61IbPvk_eLux7a8x0mOaly7oclFaSGaFE8b2xZcUE,13226
+ lxml/includes/libxml/xinclude.h,sha256=K3I5jhw2zAMj26LuRNZc15Bwv2JE2hWxwVn4TCqv2b4,3258
+ lxml/includes/libxml/xlink.h,sha256=TVLOkISrcKDelo9n_XIUyPiStDYa8NxuF2dz70aBFCI,5062
+ lxml/includes/libxml/xmlIO.h,sha256=FvbuMYTy1-S5PScabE03wz0oWKf626pmXvOPZNuLm-w,11948
+ lxml/includes/libxml/xmlautomata.h,sha256=7Sc3YgPz1ZIBKCHPSxs5oAwJEZWQ1RT2kyUw85pUtmU,4004
+ lxml/includes/libxml/xmlerror.h,sha256=mMfltMxUza6kiSBfP2QfnY3UlMP_rEXKfX0wruBLl4A,37561
+ lxml/includes/libxml/xmlexports.h,sha256=IyV3AMeQVbOl0wkjlnNX4B8WUZ-5GNKQmxZc6-maWUU,2025
+ lxml/includes/libxml/xmlmemory.h,sha256=m7wGvVMxNzZiuOAo3vkjxaVWstc8aQLzb6obbjPsebE,4658
+ lxml/includes/libxml/xmlmodule.h,sha256=ERUHUmDdZRmh6NjLYWUpse51rLWR8qNjPHOtdgmlLF0,1198
+ lxml/includes/libxml/xmlreader.h,sha256=BAHinlSOTXX3DEax9BniaIIPAXJyLGfzym9R-27LCcU,12387
+ lxml/includes/libxml/xmlregexp.h,sha256=_q6C1XRy8DS3kSmLbEKpvkKQciTgjTJgGc_zUQ6m22M,2632
+ lxml/includes/libxml/xmlsave.h,sha256=zcEQr9sO5CsFrnoOLshhdsqMEr8k4AeFhbkYyNfO9Fs,2934
+ lxml/includes/libxml/xmlschemas.h,sha256=5AfLnYUcfmxHRzg0dVpdHig--4ui1-XDwDgpKGDKCiU,7067
+ lxml/includes/libxml/xmlschemastypes.h,sha256=MYwlGmoKAo3lHRaaKgnCXiLmPT9KRjdxyCJ7TEyZ6jM,4583
+ lxml/includes/libxml/xmlstring.h,sha256=d5PpqxP1I1sfmCUHvVJtjoC9h7hLHcAAQ5ok_Rtf50I,5271
+ lxml/includes/libxml/xmlunicode.h,sha256=8sq3wEW2AiyTCuc3ZceOEkce7lfrI7VnkRfwEQgc6pU,278
+ lxml/includes/libxml/xmlversion.h,sha256=oVpaE_xbttaeZNFKSuSfcLOceWz7LQgKP71Z1msXZNo,5112
+ lxml/includes/libxml/xmlwriter.h,sha256=BEUwYNKx3xymDE9vepksEK7yVq9SXYm1d2pQnzlPy90,20688
+ lxml/includes/libxml/xpath.h,sha256=CQv6X_pRhuXoCVpqoDXYB7FfusLK7AuPxCNigwhNYAA,16156
+ lxml/includes/libxml/xpathInternals.h,sha256=mc9B5tdpfssyz_NPUzww6dKuWCtBybBiBRJkTe4AE4U,18504
+ lxml/includes/libxml/xpointer.h,sha256=DAxMsfPp2SSZgXFrPbxBA84RwTMRf35Qg_LBbUzPQhA,1026
+ lxml/includes/libxslt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ lxml/includes/libxslt/__pycache__/__init__.cpython-312.pyc,,
+ lxml/includes/libxslt/attributes.h,sha256=qKwzfGf7r89esLC65s96iYJWRA-s-Ezss2_V6Mmo1hk,957
+ lxml/includes/libxslt/documents.h,sha256=kBihgH5pqRvFalhm_fOFHtJTFhTpBcm681yT5dxgwfw,2704
+ lxml/includes/libxslt/extensions.h,sha256=W5UMyJqUP_1zt6sXZ0mgc0gAIwDJrZ8gjByhyrWqvd8,6899
+ lxml/includes/libxslt/extra.h,sha256=6X3Wu3NdPtrlqz-Koo7dB-rccnnszi6j3zg599gTByg,1640
+ lxml/includes/libxslt/functions.h,sha256=fc4CZj-9KeBHzO9-WWU_bNqmaEZAz3n7NNwClIBXk14,1972
+ lxml/includes/libxslt/imports.h,sha256=18kIjoGqdFXR63Ce3ZtzxsTiYV3XGKpchYakMUPDuUI,1840
+ lxml/includes/libxslt/keys.h,sha256=16v25VEluS7jYhgg6gYFwVxgGMn-1ctnlhhWWT4RcBY,1155
+ lxml/includes/libxslt/namespaces.h,sha256=VofSn2Kkn-a5JyRKCmY3jPp7amQy3n09vzy0KUQt4q0,1666
+ lxml/includes/libxslt/numbersInternals.h,sha256=Eg5gYZ5p3h0_e5wyI61S-0E6_ArVJzv0yr63j6BU2fc,2019
+ lxml/includes/libxslt/pattern.h,sha256=tJ-BPfs9UYgiZMMoQZbhij3g7xVppYq7TrrOu25eR7Q,2110
+ lxml/includes/libxslt/preproc.h,sha256=D_LjEdHhsdyBnEAvflnwFgoR4hGUb72kgEhXkkmPRsw,896
+ lxml/includes/libxslt/security.h,sha256=fUD1cy_WxFCTvTNAF0WOQIU4p5CNWn1LHFyZJd-Fx5U,2652
+ lxml/includes/libxslt/templates.h,sha256=bnt6Jqui6KU5pNUdMNPbQZkZ5d-VTWqC0TMGkOlVoIo,2268
+ lxml/includes/libxslt/transform.h,sha256=ICT7meUV0OTAx27WaKVrKj-aUmR9LSpTNaOAJd2UStg,6311
+ lxml/includes/libxslt/variables.h,sha256=cQAgPe4QCcK2uKbWg7Iz-9peM9xWGm7m3M6jQm0sjIA,3143
+ lxml/includes/libxslt/xslt.h,sha256=wmFx2Q31Pd8Iq2phAQpY9J3QQatb8lWg3gABtqKFgEw,1964
+ lxml/includes/libxslt/xsltInternals.h,sha256=2EbEKYmnYZq0HjGnUMAlpqnqZJurRXzjlgk5Js1WYaY,57949
+ lxml/includes/libxslt/xsltconfig.h,sha256=cV5scdRK6xmOHeOg3OCw6hBfcQ_nrtNs_tKefX67304,2910
+ lxml/includes/libxslt/xsltexports.h,sha256=1-luH-0bCIgBAlKAXhV-dqHBfwOAQNDamiYbxIlTf0k,1124
+ lxml/includes/libxslt/xsltlocale.h,sha256=ppxGEmJfZIJgwRQzCM0_77p9WNekEWq1NrdYZrQl4IE,942
+ lxml/includes/libxslt/xsltutils.h,sha256=1eguYgR9-jeNOVlBUktHboaq-VLX6JXraO80TfbARKM,9085
+ lxml/includes/lxml-version.h,sha256=KZfk_lJnXSnxkyRdUV5taHsWJe4xbC6UEYfYldlfouI,71
+ lxml/includes/relaxng.pxd,sha256=HzHlQ6mCcf_tj_JZ9NAVJTVAv8ScCkE8Ifq15y3bS0c,2615
+ lxml/includes/schematron.pxd,sha256=Hob7xh-K-MKqp7WiG8thMagf5EkQzmgfi4ds0EF91JA,1604
+ lxml/includes/tree.pxd,sha256=XApzMRy_LSqCtQ-OTS-vNSW7CT_OWstybfIT2H84LsA,20179
+ lxml/includes/uri.pxd,sha256=3vOXw6AbSPxAM9uo71T1qnfx-wd9ezXLDQtWsb2zX0I,145
+ lxml/includes/xinclude.pxd,sha256=CuO_XZNB6E2JK1qXXWn11APrjFQV5kA6SMyb77WZn0A,804
+ lxml/includes/xmlerror.pxd,sha256=OQqayytkV0NigAPbsQCCcvmy7luRe0XhVzpTdzJjP3g,58837
+ lxml/includes/xmlparser.pxd,sha256=eDGyU5kZyNVksK0dUhMIi7rnE-LSevXsqyl72v99Ess,13730
+ lxml/includes/xmlschema.pxd,sha256=OLZPd2WDJyopiXJJyo-dAyyYHaeSYFiMAI4tqIiv-Ik,1702
+ lxml/includes/xpath.pxd,sha256=e8-ZYUbRG7N1mHETAlknJ_QqAteOosrYLRgpH-OsTkg,5603
+ lxml/includes/xslt.pxd,sha256=4yl3pOu7pAvsx5Tc-W4IWCoB8wgtSSR62HI1jqu6jko,8241
+ lxml/isoschematron/__init__.py,sha256=uauerYeKTlWFCJSqieIHhF5l6rYV2myeEJ0Imd1LzRc,13274
+ lxml/isoschematron/__pycache__/__init__.cpython-312.pyc,,
+ lxml/isoschematron/resources/rng/iso-schematron.rng,sha256=VsWxPyi3iViJDDbjJJw0wWkEHkLrz9zoCA8zJLor9N4,18337
+ lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl,sha256=ObebsB8Wt-d3uIA_U5NU85TpnQ3PxPX38TdOAqosMac,3172
+ lxml/isoschematron/resources/xsl/XSD2Schtrn.xsl,sha256=QweRrIIM-zFcgg98GXA2CaWfIbgVE0XKEeYSfvv67A0,4563
+ lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_abstract_expand.xsl,sha256=xSZ_Ekq_I-62ZpiE5AqYYHwFW_qh855zt9V4_s7rbkY,11703
+ lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_dsdl_include.xsl,sha256=x42QJ-dxQ1waPzydsCoQnp2Xj15y53nW43O7BuoDRHk,39957
+ lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_message.xsl,sha256=Tr9BnO6pzjVWwhqJfm10UlvAy95EgfSCz2iMlrVGT6Q,2015
+ lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_skeleton_for_xslt1.xsl,sha256=ue8q_88X4e_jsJizo31GRNBxNhdxkEE9fY20oq0Iqwk,71764
+ lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_svrl_for_xslt1.xsl,sha256=BBAdsVSi5zAzeGepuN6gS1saQINDqITXKplmmj4dTWg,20382
+ lxml/isoschematron/resources/xsl/iso-schematron-xslt1/readme.txt,sha256=OGLiFswuLJEW5EPYKOeoauuCJFEtVa6jyzBE1OcJI98,3310
+ lxml/iterparse.pxi,sha256=JXvYhSOCaRjT_hYbRGMlJt2rlqx0TiRpN4FE1jQc63w,16521
+ lxml/lxml.etree.h,sha256=_NkGkD3C_jpE4UegvQ6Y32_ycTbUCLyOBz9xfWRPkug,9792
+ lxml/lxml.etree_api.h,sha256=dAbJPd53D_9CIGzePAUB3otgyhG4o2cSdA4-6apdzRA,17377
+ lxml/nsclasses.pxi,sha256=5pzNBhBtlqObPdThL9QIGRs1Dxj1qnr0PyXuTCURqTg,9129
+ lxml/objectify.cpython-312-x86_64-linux-gnu.so,sha256=TYF3CoGF-cenIwFh_1nY0sr2UI2wdsS8tZO2Wi0evyg,2933112
+ lxml/objectify.pyx,sha256=I4bQQXmQssBtk5bTrid-eVURBLKRTM5iQZiviugIrts,75823
+ lxml/objectpath.pxi,sha256=s5TNG2-EbaWWKLFAiX303B95zK_Ui8ausB__3QvFFGw,11450
+ lxml/parser.pxi,sha256=VZfychEJ3-XPE3x6oGOEzn6HVAr74R7lXfDSVF-hq-U,85411
+ lxml/parsertarget.pxi,sha256=v1PidxRaG5giwXcTDkpBI7PDFmsZuOcK0y9LdkQaY8M,6326
+ lxml/proxy.pxi,sha256=8IVvYF2KTuzl7Hb3XGHEmcxfSLbUZkA2Q1Y50hLsyzE,23929
+ lxml/public-api.pxi,sha256=XoP6_cJOEoQIItvE1RiYCKYD1ry4AobaOr4XLo0KSE4,6666
+ lxml/pyclasslookup.py,sha256=gLD1HM2HtITYYiGzjEOewSwbB7XkVx_NZv_quCt79Oc,92
+ lxml/readonlytree.pxi,sha256=ddRYczhHieJ4XUvWvTPW9N9oQ8vuKtv7lC1mtE1qvH8,18976
+ lxml/relaxng.pxi,sha256=3OQ-fZMzP-KF5vM6HTozT_9ee3J0DJnpj9RcHC8LoMw,6339
+ lxml/sax.cpython-312-x86_64-linux-gnu.so,sha256=UQn-l56AOOT5UUJ395Fil7It-Im_brnlsMYfmUpwQe0,190272
+ lxml/sax.py,sha256=yrNvKD6rlon48jrR-1qpFXER8j4psYC2R5yt0u6TWLs,9706
+ lxml/saxparser.pxi,sha256=TmkdM5h9xII9iKRaBk_1NGk2KTfeesl5Ha8bpFQGqLc,33529
+ lxml/schematron.pxi,sha256=F2OHKZUl57-byBk_wWtPTnHZ1fwlj0FtwG3VuGtG-UY,6064
+ lxml/serializer.pxi,sha256=iIXfechFHfvFs2sTk7wMIy3sDJxmaMPbNO33mkLLBUE,68063
+ lxml/usedoctest.py,sha256=qRgZKQVcAZcl-zN0AIXVJnOsETUXz2nPXkxuzs1lGgk,230
+ lxml/xinclude.pxi,sha256=7eBrI_OK47mmrHQ0ixbixRI8pKqQ1nwkMV-OmKUVlD4,2456
+ lxml/xmlerror.pxi,sha256=i1kR42WB2BAxtrmh7m2ADlH-jffVQ-blW3pW0Ps4s-g,50061
+ lxml/xmlid.pxi,sha256=5zf9oR6bsCtavGiOmilNyHqYwgG_bnrIabSd2SURtm0,6073
+ lxml/xmlschema.pxi,sha256=mumNoHni5S3BQPtcmOHRd61KRaVWu4eOie2wQeB0e6E,8490
+ lxml/xpath.pxi,sha256=aqW24V817dUxps4Gnc8h7Tm3QVlITKvxU5_9WgJUIFg,19132
+ lxml/xslt.pxi,sha256=wxdbuvNFVA8eP57tHmBYWER__ceFhf6HGdsbBHbx_0A,36315
+ lxml/xsltext.pxi,sha256=TImDiAPlAezC07P7RY1N9YChA7AuKFH-G53hXdel9yc,11088
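Each RECORD row above pairs an installed path with an unpadded urlsafe-base64 SHA-256 digest and a file size in bytes; rows for *.pyc files and for RECORD itself intentionally leave both fields empty. A sketch of the digest computation an installer-style integrity check would use (the sample path and expected value are taken from the rows above):

    import base64
    import hashlib
    from pathlib import Path

    def record_digest(path: Path) -> str:
        # urlsafe base64 of the SHA-256 digest, with '=' padding stripped
        raw = hashlib.sha256(path.read_bytes()).digest()
        return base64.urlsafe_b64encode(raw).rstrip(b"=").decode("ascii")

    # e.g., run from site-packages:
    # record_digest(Path("lxml/__init__.py")) == "rgOcPyZUNBFL30ylxIxd8fHHWi6TwyIUCi8Av84XWwo"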
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml-6.0.2.dist-info/WHEEL ADDED
@@ -0,0 +1,6 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.9.0)
+ Root-Is-Purelib: false
+ Tag: cp312-cp312-manylinux_2_26_x86_64
+ Tag: cp312-cp312-manylinux_2_28_x86_64
+
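Root-Is-Purelib: false marks this as a platform-specific wheel, and the Tag lines are the compatibility tags an installer matches against the running interpreter. A hedged sketch of that check using the third-party packaging library; the tag string is copied from the WHEEL file above:

    from packaging.tags import parse_tag, sys_tags

    wheel_tags = parse_tag("cp312-cp312-manylinux_2_28_x86_64")
    # True on CPython 3.12 with glibc >= 2.28 on x86_64:
    print(any(tag in wheel_tags for tag in sys_tags()))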
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml-6.0.2.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ lxml
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia_curand_cu12-10.3.9.90.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
+ pip
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia_curand_cu12-10.3.9.90.dist-info/License.txt ADDED
@@ -0,0 +1,1568 @@
1
+ End User License Agreement
2
+ --------------------------
3
+
4
+
5
+ Preface
6
+ -------
7
+
8
+ The Software License Agreement in Chapter 1 and the Supplement
9
+ in Chapter 2 contain license terms and conditions that govern
10
+ the use of NVIDIA software. By accepting this agreement, you
11
+ agree to comply with all the terms and conditions applicable
12
+ to the product(s) included herein.
13
+
14
+
15
+ NVIDIA Driver
16
+
17
+
18
+ Description
19
+
20
+ This package contains the operating system driver and
21
+ fundamental system software components for NVIDIA GPUs.
22
+
23
+
24
+ NVIDIA CUDA Toolkit
25
+
26
+
27
+ Description
28
+
29
+ The NVIDIA CUDA Toolkit provides command-line and graphical
30
+ tools for building, debugging and optimizing the performance
31
+ of applications accelerated by NVIDIA GPUs, runtime and math
32
+ libraries, and documentation including programming guides,
33
+ user manuals, and API references.
34
+
35
+
36
+ Default Install Location of CUDA Toolkit
37
+
38
+ Windows platform:
39
+
40
+ %ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v#.#
41
+
42
+ Linux platform:
43
+
44
+ /usr/local/cuda-#.#
45
+
46
+ Mac platform:
47
+
48
+ /Developer/NVIDIA/CUDA-#.#
49
+
50
+
51
+ NVIDIA CUDA Samples
52
+
53
+
54
+ Description
55
+
56
+ This package includes more than 100 CUDA examples that demonstrate
57
+ various CUDA programming principles, and efficient CUDA
58
+ implementation of algorithms in specific application domains.
59
+
60
+
61
+ Default Install Location of CUDA Samples
62
+
63
+ Windows platform:
64
+
65
+ %ProgramData%\NVIDIA Corporation\CUDA Samples\v#.#
66
+
67
+ Linux platform:
68
+
69
+ /usr/local/cuda-#.#/samples
70
+
71
+ and
72
+
73
+ $HOME/NVIDIA_CUDA-#.#_Samples
74
+
75
+ Mac platform:
76
+
77
+ /Developer/NVIDIA/CUDA-#.#/samples
78
+
79
+
80
+ NVIDIA Nsight Visual Studio Edition (Windows only)
81
+
82
+
83
+ Description
84
+
85
+ NVIDIA Nsight Development Platform, Visual Studio Edition is a
86
+ development environment integrated into Microsoft Visual
87
+ Studio that provides tools for debugging, profiling, analyzing
88
+ and optimizing your GPU computing and graphics applications.
89
+
90
+
91
+ Default Install Location of Nsight Visual Studio Edition
92
+
93
+ Windows platform:
94
+
95
+ %ProgramFiles(x86)%\NVIDIA Corporation\Nsight Visual Studio Edition #.#
96
+
97
+
98
+ 1. License Agreement for NVIDIA Software Development Kits
99
+ ---------------------------------------------------------
100
+
101
+
102
+ Release Date: July 26, 2018
103
+ ---------------------------
104
+
105
+
106
+ Important Notice: Read before downloading, installing,
107
+ copying or using the licensed software:
108
+ -------------------------------------------------------
109
+
110
+ This license agreement, including exhibits attached
111
+ (“Agreement”) is a legal agreement between you and NVIDIA
112
+ Corporation ("NVIDIA") and governs your use of a NVIDIA
113
+ software development kit (“SDK”).
114
+
115
+ Each SDK has its own set of software and materials, but here
116
+ is a description of the types of items that may be included in
117
+ a SDK: source code, header files, APIs, data sets and assets
118
+ (examples include images, textures, models, scenes, videos,
119
+ native API input/output files), binary software, sample code,
120
+ libraries, utility programs, programming code and
121
+ documentation.
122
+
123
+ This Agreement can be accepted only by an adult of legal age
124
+ of majority in the country in which the SDK is used.
125
+
126
+ If you are entering into this Agreement on behalf of a company
127
+ or other legal entity, you represent that you have the legal
128
+ authority to bind the entity to this Agreement, in which case
129
+ “you” will mean the entity you represent.
130
+
131
+ If you don’t have the required age or authority to accept
132
+ this Agreement, or if you don’t accept all the terms and
133
+ conditions of this Agreement, do not download, install or use
134
+ the SDK.
135
+
136
+ You agree to use the SDK only for purposes that are permitted
137
+ by (a) this Agreement, and (b) any applicable law, regulation
138
+ or generally accepted practices or guidelines in the relevant
139
+ jurisdictions.
140
+
141
+
142
+ 1.1. License
143
+
144
+
145
+ 1.1.1. License Grant
146
+
147
+ Subject to the terms of this Agreement, NVIDIA hereby grants
148
+ you a non-exclusive, non-transferable license, without the
149
+ right to sublicense (except as expressly provided in this
150
+ Agreement) to:
151
+
152
+ 1. Install and use the SDK,
153
+
154
+ 2. Modify and create derivative works of sample source code
155
+ delivered in the SDK, and
156
+
157
+ 3. Distribute those portions of the SDK that are identified
158
+ in this Agreement as distributable, as incorporated in
159
+ object code format into a software application that meets
160
+ the distribution requirements indicated in this Agreement.
161
+
162
+
163
+ 1.1.2. Distribution Requirements
164
+
165
+ These are the distribution requirements for you to exercise
166
+ the distribution grant:
167
+
168
+ 1. Your application must have material additional
169
+ functionality, beyond the included portions of the SDK.
170
+
171
+ 2. The distributable portions of the SDK shall only be
172
+ accessed by your application.
173
+
174
+ 3. The following notice shall be included in modifications
175
+ and derivative works of sample source code distributed:
176
+ “This software contains source code provided by NVIDIA
177
+ Corporation.”
178
+
179
+ 4. Unless a developer tool is identified in this Agreement
180
+ as distributable, it is delivered for your internal use
181
+ only.
182
+
183
+ 5. The terms under which you distribute your application
184
+ must be consistent with the terms of this Agreement,
185
+ including (without limitation) terms relating to the
186
+ license grant and license restrictions and protection of
187
+ NVIDIA’s intellectual property rights. Additionally, you
188
+ agree that you will protect the privacy, security and
189
+ legal rights of your application users.
190
+
191
+ 6. You agree to notify NVIDIA in writing of any known or
192
+ suspected distribution or use of the SDK not in compliance
193
+ with the requirements of this Agreement, and to enforce
194
+ the terms of your agreements with respect to distributed
195
+ SDK.
196
+
197
+
198
+ 1.1.3. Authorized Users
199
+
200
+ You may allow employees and contractors of your entity or of
201
+ your subsidiary(ies) to access and use the SDK from your
202
+ secure network to perform work on your behalf.
203
+
204
+ If you are an academic institution you may allow users
205
+ enrolled or employed by the academic institution to access and
206
+ use the SDK from your secure network.
207
+
208
+ You are responsible for the compliance with the terms of this
209
+ Agreement by your authorized users. If you become aware that
210
+ your authorized users didn’t follow the terms of this
211
+ Agreement, you agree to take reasonable steps to resolve the
212
+ non-compliance and prevent new occurrences.
213
+
214
+
215
+ 1.1.4. Pre-Release SDK
216
+
217
+ The SDK versions identified as alpha, beta, preview or
218
+ otherwise as pre-release, may not be fully functional, may
219
+ contain errors or design flaws, and may have reduced or
220
+ different security, privacy, accessibility, availability, and
221
+ reliability standards relative to commercial versions of
222
+ NVIDIA software and materials. Use of a pre-release SDK may
223
+ result in unexpected results, loss of data, project delays or
224
+ other unpredictable damage or loss.
225
+
226
+ You may use a pre-release SDK at your own risk, understanding
227
+ that pre-release SDKs are not intended for use in production
228
+ or business-critical systems.
229
+
230
+ NVIDIA may choose not to make available a commercial version
231
+ of any pre-release SDK. NVIDIA may also choose to abandon
232
+ development and terminate the availability of a pre-release
233
+ SDK at any time without liability.
234
+
235
+
236
+ 1.1.5. Updates
237
+
238
+ NVIDIA may, at its option, make available patches, workarounds
239
+ or other updates to this SDK. Unless the updates are provided
240
+ with their separate governing terms, they are deemed part of
241
+ the SDK licensed to you as provided in this Agreement. You
242
+ agree that the form and content of the SDK that NVIDIA
243
+ provides may change without prior notice to you. While NVIDIA
244
+ generally maintains compatibility between versions, NVIDIA may
245
+ in some cases make changes that introduce incompatibilities in
246
+ future versions of the SDK.
247
+
248
+
249
+ 1.1.6. Third Party Licenses
250
+
251
+ The SDK may come bundled with, or otherwise include or be
252
+ distributed with, third party software licensed by a NVIDIA
253
+ supplier and/or open source software provided under an open
254
+ source license. Use of third party software is subject to the
255
+ third-party license terms, or in the absence of third party
256
+ terms, the terms of this Agreement. Copyright to third party
257
+ software is held by the copyright holders indicated in the
258
+ third-party software or license.
259
+
260
+
261
+ 1.1.7. Reservation of Rights
262
+
263
+ NVIDIA reserves all rights, title, and interest in and to the
264
+ SDK, not expressly granted to you under this Agreement.
265
+
266
+
267
+ 1.2. Limitations
268
+
269
+ The following license limitations apply to your use of the
270
+ SDK:
271
+
272
+ 1. You may not reverse engineer, decompile or disassemble,
273
+ or remove copyright or other proprietary notices from any
274
+ portion of the SDK or copies of the SDK.
275
+
276
+ 2. Except as expressly provided in this Agreement, you may
277
+ not copy, sell, rent, sublicense, transfer, distribute,
278
+ modify, or create derivative works of any portion of the
279
+ SDK. For clarity, you may not distribute or sublicense the
280
+ SDK as a stand-alone product.
281
+
282
+ 3. Unless you have an agreement with NVIDIA for this
283
+ purpose, you may not indicate that an application created
284
+ with the SDK is sponsored or endorsed by NVIDIA.
285
+
286
+ 4. You may not bypass, disable, or circumvent any
287
+ encryption, security, digital rights management or
288
+ authentication mechanism in the SDK.
289
+
290
+ 5. You may not use the SDK in any manner that would cause it
291
+ to become subject to an open source software license. As
292
+ examples, licenses that require as a condition of use,
293
+ modification, and/or distribution that the SDK be:
294
+
295
+ a. Disclosed or distributed in source code form;
296
+
297
+ b. Licensed for the purpose of making derivative works;
298
+ or
299
+
300
+ c. Redistributable at no charge.
301
+
302
+ 6. Unless you have an agreement with NVIDIA for this
303
+ purpose, you may not use the SDK with any system or
304
+ application where the use or failure of the system or
305
+ application can reasonably be expected to threaten or
306
+ result in personal injury, death, or catastrophic loss.
307
+ Examples include use in avionics, navigation, military,
308
+ medical, life support or other life critical applications.
309
+ NVIDIA does not design, test or manufacture the SDK for
310
+ these critical uses and NVIDIA shall not be liable to you
311
+ or any third party, in whole or in part, for any claims or
312
+ damages arising from such uses.
313
+
314
+ 7. You agree to defend, indemnify and hold harmless NVIDIA
315
+ and its affiliates, and their respective employees,
316
+ contractors, agents, officers and directors, from and
317
+ against any and all claims, damages, obligations, losses,
318
+ liabilities, costs or debt, fines, restitutions and
319
+ expenses (including but not limited to attorney’s fees
320
+ and costs incident to establishing the right of
321
+ indemnification) arising out of or related to your use of
322
+ the SDK outside of the scope of this Agreement, or not in
323
+ compliance with its terms.
324
+
325
+
326
+ 1.3. Ownership
327
+
328
+ 1. NVIDIA or its licensors hold all rights, title and
329
+ interest in and to the SDK and its modifications and
330
+ derivative works, including their respective intellectual
331
+ property rights, subject to your rights described in this
332
+ section. This SDK may include software and materials from
333
+ NVIDIA’s licensors, and these licensors are intended
334
+ third party beneficiaries that may enforce this Agreement
335
+ with respect to their intellectual property rights.
336
+
337
+ 2. You hold all rights, title and interest in and to your
338
+ applications and your derivative works of the sample
339
+ source code delivered in the SDK, including their
340
+ respective intellectual property rights, subject to
341
+ NVIDIA’s rights described in this section.
342
+
343
+ 3. You may, but don’t have to, provide to NVIDIA
344
+ suggestions, feature requests or other feedback regarding
345
+ the SDK, including possible enhancements or modifications
346
+ to the SDK. For any feedback that you voluntarily provide,
347
+ you hereby grant NVIDIA and its affiliates a perpetual,
348
+ non-exclusive, worldwide, irrevocable license to use,
349
+ reproduce, modify, license, sublicense (through multiple
350
+ tiers of sublicensees), and distribute (through multiple
351
+ tiers of distributors) it without the payment of any
352
+ royalties or fees to you. NVIDIA will use feedback at its
353
+ choice. NVIDIA is constantly looking for ways to improve
354
+ its products, so you may send feedback to NVIDIA through
355
+ the developer portal at https://developer.nvidia.com.
356
+
357
+
358
+ 1.4. No Warranties
359
+
360
+ THE SDK IS PROVIDED BY NVIDIA “AS IS” AND “WITH ALL
361
+ FAULTS.” TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND
362
+ ITS AFFILIATES EXPRESSLY DISCLAIM ALL WARRANTIES OF ANY KIND
363
+ OR NATURE, WHETHER EXPRESS, IMPLIED OR STATUTORY, INCLUDING,
364
+ BUT NOT LIMITED TO, ANY WARRANTIES OF MERCHANTABILITY, FITNESS
365
+ FOR A PARTICULAR PURPOSE, TITLE, NON-INFRINGEMENT, OR THE
366
+ ABSENCE OF ANY DEFECTS THEREIN, WHETHER LATENT OR PATENT. NO
367
+ WARRANTY IS MADE ON THE BASIS OF TRADE USAGE, COURSE OF
368
+ DEALING OR COURSE OF TRADE.
369
+
370
+
371
+ 1.5. Limitation of Liability
372
+
373
+ TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS
374
+ AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL,
375
+ PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, LOSS
376
+ OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF
377
+ PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION
378
+ WITH THIS AGREEMENT OR THE USE OR PERFORMANCE OF THE SDK,
379
+ WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH
380
+ OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE),
381
+ PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF
382
+ LIABILITY. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES
383
+ TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS
384
+ AGREEMENT EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE
385
+ NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS
386
+ LIMIT.
387
+
388
+ These exclusions and limitations of liability shall apply
389
+ regardless if NVIDIA or its affiliates have been advised of
390
+ the possibility of such damages, and regardless of whether a
391
+ remedy fails its essential purpose. These exclusions and
392
+ limitations of liability form an essential basis of the
393
+ bargain between the parties, and, absent any of these
394
+ exclusions or limitations of liability, the provisions of this
395
+ Agreement, including, without limitation, the economic terms,
396
+ would be substantially different.
397
+
398
+
399
+ 1.6. Termination
400
+
401
+ 1. This Agreement will continue to apply until terminated by
402
+ either you or NVIDIA as described below.
403
+
404
+ 2. If you want to terminate this Agreement, you may do so by
405
+ ceasing to use the SDK.
406
+
407
+ 3. NVIDIA may, at any time, terminate this Agreement if:
408
+
409
+ a. (i) you fail to comply with any term of this
410
+ Agreement and the non-compliance is not fixed within
411
+ thirty (30) days following notice from NVIDIA (or
412
+ immediately if you violate NVIDIA’s intellectual
413
+ property rights);
414
+
415
+ b. (ii) you commence or participate in any legal
416
+ proceeding against NVIDIA with respect to the SDK; or
417
+
418
+ c. (iii) NVIDIA decides to no longer provide the SDK in
419
+ a country or, in NVIDIA’s sole discretion, the
420
+ continued use of it is no longer commercially viable.
421
+
422
+ 4. Upon any termination of this Agreement, you agree to
423
+ promptly discontinue use of the SDK and destroy all copies
424
+ in your possession or control. Your prior distributions in
425
+ accordance with this Agreement are not affected by the
426
+ termination of this Agreement. Upon written request, you
427
+ will certify in writing that you have complied with your
428
+ commitments under this section. Upon any termination of
429
+ this Agreement all provisions survive except for the
430
+ license grant provisions.
431
+
432
+
433
+ 1.7. General
434
+
435
+ If you wish to assign this Agreement or your rights and
436
+ obligations, including by merger, consolidation, dissolution
437
+ or operation of law, contact NVIDIA to ask for permission. Any
438
+ attempted assignment not approved by NVIDIA in writing shall
439
+ be void and of no effect. NVIDIA may assign, delegate or
440
+ transfer this Agreement and its rights and obligations, and if
441
+ to a non-affiliate you will be notified.
442
+
443
+ You agree to cooperate with NVIDIA and provide reasonably
444
+ requested information to verify your compliance with this
445
+ Agreement.
446
+
447
+ This Agreement will be governed in all respects by the laws of
448
+ the United States and of the State of Delaware as those laws
449
+ are applied to contracts entered into and performed entirely
450
+ within Delaware by Delaware residents, without regard to the
451
+ conflicts of laws principles. The United Nations Convention on
452
+ Contracts for the International Sale of Goods is specifically
453
+ disclaimed. You agree to all terms of this Agreement in the
454
+ English language.
455
+
456
+ The state or federal courts residing in Santa Clara County,
457
+ California shall have exclusive jurisdiction over any dispute
458
+ or claim arising out of this Agreement. Notwithstanding this,
459
+ you agree that NVIDIA shall still be allowed to apply for
460
+ injunctive remedies or an equivalent type of urgent legal
461
+ relief in any jurisdiction.
462
+
463
+ If any court of competent jurisdiction determines that any
464
+ provision of this Agreement is illegal, invalid or
465
+ unenforceable, such provision will be construed as limited to
466
+ the extent necessary to be consistent with and fully
467
+ enforceable under the law and the remaining provisions will
468
+ remain in full force and effect. Unless otherwise specified,
469
+ remedies are cumulative.
470
+
471
+ Each party acknowledges and agrees that the other is an
472
+ independent contractor in the performance of this Agreement.
473
+
474
+ The SDK has been developed entirely at private expense and is
475
+ “commercial items” consisting of “commercial computer
476
+ software” and “commercial computer software
477
+ documentation” provided with RESTRICTED RIGHTS. Use,
478
+ duplication or disclosure by the U.S. Government or a U.S.
479
+ Government subcontractor is subject to the restrictions in
480
+ this Agreement pursuant to DFARS 227.7202-3(a) or as set forth
481
+ in subparagraphs (c)(1) and (2) of the Commercial Computer
482
+ Software - Restricted Rights clause at FAR 52.227-19, as
483
+ applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas
484
+ Expressway, Santa Clara, CA 95051.
485
+
486
+ The SDK is subject to United States export laws and
487
+ regulations. You agree that you will not ship, transfer or
488
+ export the SDK into any country, or use the SDK in any manner,
489
+ prohibited by the United States Bureau of Industry and
490
+ Security or economic sanctions regulations administered by the
491
+ U.S. Department of Treasury’s Office of Foreign Assets
492
+ Control (OFAC), or any applicable export laws, restrictions or
493
+ regulations. These laws include restrictions on destinations,
494
+ end users and end use. By accepting this Agreement, you
495
+ confirm that you are not a resident or citizen of any country
496
+ currently embargoed by the U.S. and that you are not otherwise
497
+ prohibited from receiving the SDK.
498
+
499
+ Any notice delivered by NVIDIA to you under this Agreement
500
+ will be delivered via mail, email or fax. You agree that any
501
+ notices that NVIDIA sends you electronically will satisfy any
502
+ legal communication requirements. Please direct your legal
503
+ notices or other correspondence to NVIDIA Corporation, 2788
504
+ San Tomas Expressway, Santa Clara, California 95051, United
505
+ States of America, Attention: Legal Department.
506
+
507
+ This Agreement and any exhibits incorporated into this
508
+ Agreement constitute the entire agreement of the parties with
509
+ respect to the subject matter of this Agreement and supersede
510
+ all prior negotiations or documentation exchanged between the
511
+ parties relating to this SDK license. Any additional and/or
512
+ conflicting terms on documents issued by you are null, void,
513
+ and invalid. Any amendment or waiver under this Agreement
514
+ shall be in writing and signed by representatives of both
515
+ parties.
516
+
517
+
518
+ 2. CUDA Toolkit Supplement to Software License Agreement for
519
+ NVIDIA Software Development Kits
520
+ ------------------------------------------------------------
521
+
522
+
523
+ Release date: August 16, 2018
524
+ -----------------------------
525
+
526
+ The terms in this supplement govern your use of the NVIDIA
527
+ CUDA Toolkit SDK under the terms of your license agreement
528
+ (“Agreement”) as modified by this supplement. Capitalized
529
+ terms used but not defined below have the meaning assigned to
530
+ them in the Agreement.
531
+
532
+ This supplement is an exhibit to the Agreement and is
533
+ incorporated as an integral part of the Agreement. In the
534
+ event of conflict between the terms in this supplement and the
535
+ terms in the Agreement, the terms in this supplement govern.
536
+
537
+
538
+ 2.1. License Scope
539
+
540
+ The SDK is licensed for you to develop applications only for
541
+ use in systems with NVIDIA GPUs.
542
+
543
+
544
+ 2.2. Distribution
545
+
546
+ The portions of the SDK that are distributable under the
547
+ Agreement are listed in Attachment A.
548
+
549
+
550
+ 2.3. Operating Systems
551
+
552
+ Those portions of the SDK designed exclusively for use on the
553
+ Linux or FreeBSD operating systems, or other operating systems
554
+ derived from the source code to these operating systems, may
555
+ be copied and redistributed for use in accordance with this
556
+ Agreement, provided that the object code files are not
557
+ modified in any way (except for unzipping of compressed
558
+ files).
559
+
560
+
561
+ 2.4. Audio and Video Encoders and Decoders
562
+
563
+ You acknowledge and agree that it is your sole responsibility
564
+ to obtain any additional third-party licenses required to
565
+ make, have made, use, have used, sell, import, and offer for
566
+ sale your products or services that include or incorporate any
567
+ third-party software and content relating to audio and/or
568
+ video encoders and decoders from, including but not limited
569
+ to, Microsoft, Thomson, Fraunhofer IIS, Sisvel S.p.A.,
570
+ MPEG-LA, and Coding Technologies. NVIDIA does not grant to you
571
+ under this Agreement any necessary patent or other rights with
572
+ respect to any audio and/or video encoders and decoders.
573
+
574
+
575
+ 2.5. Licensing
576
+
577
+ If the distribution terms in this Agreement are not suitable
578
+ for your organization, or for any questions regarding this
579
+ Agreement, please contact NVIDIA at
580
+ nvidia-compute-license-questions@nvidia.com.
581
+
582
+
583
+ 2.6. Attachment A
584
+
585
+ The following portions of the SDK are distributable under the
586
+ Agreement:
587
+
588
+ Component
589
+
590
+ CUDA Runtime
591
+
592
+ Windows
593
+
594
+ cudart.dll, cudart_static.lib, cudadevrt.lib
595
+
596
+ Mac OSX
597
+
598
+ libcudart.dylib, libcudart_static.a, libcudadevrt.a
599
+
600
+ Linux
601
+
602
+ libcudart.so, libcudart_static.a, libcudadevrt.a
603
+
604
+ Android
605
+
606
+ libcudart.so, libcudart_static.a, libcudadevrt.a
607
+
608
+ Component
609
+
610
+ CUDA FFT Library
611
+
612
+ Windows
613
+
614
+ cufft.dll, cufftw.dll, cufft.lib, cufftw.lib
615
+
616
+ Mac OSX
617
+
618
+ libcufft.dylib, libcufft_static.a, libcufftw.dylib,
619
+ libcufftw_static.a
620
+
621
+ Linux
622
+
623
+ libcufft.so, libcufft_static.a, libcufftw.so,
624
+ libcufftw_static.a
625
+
626
+ Android
627
+
628
+ libcufft.so, libcufft_static.a, libcufftw.so,
629
+ libcufftw_static.a
630
+
631
+ Component
632
+
633
+ CUDA BLAS Library
634
+
635
+ Windows
636
+
637
+ cublas.dll, cublasLt.dll
638
+
639
+ Mac OSX
640
+
641
+ libcublas.dylib, libcublasLt.dylib, libcublas_static.a,
642
+ libcublasLt_static.a
643
+
644
+ Linux
645
+
646
+ libcublas.so, libcublasLt.so, libcublas_static.a,
647
+ libcublasLt_static.a
648
+
649
+ Android
650
+
651
+ libcublas.so, libcublasLt.so, libcublas_static.a,
652
+ libcublasLt_static.a
653
+
654
+ Component
655
+
656
+ NVIDIA "Drop-in" BLAS Library
657
+
658
+ Windows
659
+
660
+ nvblas.dll
661
+
662
+ Mac OSX
663
+
664
+ libnvblas.dylib
665
+
666
+ Linux
667
+
668
+ libnvblas.so
669
+
670
+ Component
671
+
672
+ CUDA Sparse Matrix Library
673
+
674
+ Windows
675
+
676
+ cusparse.dll, cusparse.lib
677
+
678
+ Mac OSX
679
+
680
+ libcusparse.dylib, libcusparse_static.a
681
+
682
+ Linux
683
+
684
+ libcusparse.so, libcusparse_static.a
685
+
686
+ Android
687
+
688
+ libcusparse.so, libcusparse_static.a
689
+
690
+ Component
691
+
692
+ CUDA Linear Solver Library
693
+
694
+ Windows
695
+
696
+ cusolver.dll, cusolver.lib
697
+
698
+ Mac OSX
699
+
700
+ libcusolver.dylib, libcusolver_static.a
701
+
702
+ Linux
703
+
704
+ libcusolver.so, libcusolver_static.a
705
+
706
+ Android
707
+
708
+ libcusolver.so, libcusolver_static.a
709
+
710
+ Component
711
+
712
+ CUDA Random Number Generation Library
713
+
714
+ Windows
715
+
716
+ curand.dll, curand.lib
717
+
718
+ Mac OSX
719
+
720
+ libcurand.dylib, libcurand_static.a
721
+
722
+ Linux
723
+
724
+ libcurand.so, libcurand_static.a
725
+
726
+ Android
727
+
728
+ libcurand.so, libcurand_static.a
729
+
730
+ Component
731
+
732
+ CUDA Accelerated Graph Library
733
+
734
+ Component
735
+
736
+ NVIDIA Performance Primitives Library
737
+
738
+ Windows
739
+
740
+ nppc.dll, nppc.lib, nppial.dll, nppial.lib, nppicc.dll,
741
+ nppicc.lib, nppicom.dll, nppicom.lib, nppidei.dll,
742
+ nppidei.lib, nppif.dll, nppif.lib, nppig.dll, nppig.lib,
743
+ nppim.dll, nppim.lib, nppist.dll, nppist.lib, nppisu.dll,
744
+ nppisu.lib, nppitc.dll, nppitc.lib, npps.dll, npps.lib
745
+
746
+ Mac OSX
747
+
748
+ libnppc.dylib, libnppc_static.a, libnppial.dylib,
749
+ libnppial_static.a, libnppicc.dylib, libnppicc_static.a,
750
+ libnppicom.dylib, libnppicom_static.a, libnppidei.dylib,
751
+ libnppidei_static.a, libnppif.dylib, libnppif_static.a,
752
+ libnppig.dylib, libnppig_static.a, libnppim.dylib,
753
+ libnppisu_static.a, libnppitc.dylib, libnppitc_static.a,
754
+ libnpps.dylib, libnpps_static.a
755
+
756
+ Linux
757
+
758
+ libnppc.so, libnppc_static.a, libnppial.so,
759
+ libnppial_static.a, libnppicc.so, libnppicc_static.a,
760
+ libnppicom.so, libnppicom_static.a, libnppidei.so,
761
+ libnppidei_static.a, libnppif.so, libnppif_static.a
762
+ libnppig.so, libnppig_static.a, libnppim.so,
763
+ libnppim_static.a, libnppist.so, libnppist_static.a,
764
+ libnppisu.so, libnppisu_static.a, libnppitc.so
765
+ libnppitc_static.a, libnpps.so, libnpps_static.a
766
+
767
+ Android
768
+
769
+ libnppc.so, libnppc_static.a, libnppial.so,
770
+ libnppial_static.a, libnppicc.so, libnppicc_static.a,
771
+ libnppicom.so, libnppicom_static.a, libnppidei.so,
772
+ libnppidei_static.a, libnppif.so, libnppif_static.a
773
+ libnppig.so, libnppig_static.a, libnppim.so,
774
+ libnppim_static.a, libnppist.so, libnppist_static.a,
775
+ libnppisu.so, libnppisu_static.a, libnppitc.so
776
+ libnppitc_static.a, libnpps.so, libnpps_static.a
777
+
778
+ Component
779
+
780
+ NVIDIA JPEG Library
781
+
782
+ Linux
783
+
784
+ libnvjpeg.so, libnvjpeg_static.a
785
+
786
+ Component
787
+
788
+ Internal common library required for statically linking to
789
+ cuBLAS, cuSPARSE, cuFFT, cuRAND, nvJPEG and NPP
790
+
791
+ Mac OSX
792
+
793
+ libculibos.a
794
+
795
+ Linux
796
+
797
+ libculibos.a
798
+
799
+ Component
800
+
801
+ NVIDIA Runtime Compilation Library and Header
802
+
803
+ All
804
+
805
+ nvrtc.h
806
+
807
+ Windows
808
+
809
+ nvrtc.dll, nvrtc-builtins.dll
810
+
811
+ Mac OSX
812
+
813
+ libnvrtc.dylib, libnvrtc-builtins.dylib
814
+
815
+ Linux
816
+
817
+ libnvrtc.so, libnvrtc-builtins.so
818
+
819
+ Component
820
+
821
+ NVIDIA Optimizing Compiler Library
822
+
823
+ Windows
824
+
825
+ nvvm.dll
826
+
827
+ Mac OSX
828
+
829
+ libnvvm.dylib
830
+
831
+ Linux
832
+
833
+ libnvvm.so
834
+
835
+ Component
836
+
837
+ NVIDIA Common Device Math Functions Library
838
+
839
+ Windows
840
+
841
+ libdevice.10.bc
842
+
843
+ Mac OSX
844
+
845
+ libdevice.10.bc
846
+
847
+ Linux
848
+
849
+ libdevice.10.bc
850
+
851
+ Component
852
+
853
+ CUDA Occupancy Calculation Header Library
854
+
855
+ All
856
+
857
+ cuda_occupancy.h
858
+
859
+ Component
860
+
861
+ CUDA Half Precision Headers
862
+
863
+ All
864
+
865
+ cuda_fp16.h, cuda_fp16.hpp
866
+
867
+ Component
868
+
869
+ CUDA Profiling Tools Interface (CUPTI) Library
870
+
871
+ Windows
872
+
873
+ cupti.dll
874
+
875
+ Mac OSX
876
+
877
+ libcupti.dylib
878
+
879
+ Linux
880
+
881
+ libcupti.so
882
+
883
+ Component
884
+
885
+ NVIDIA Tools Extension Library
+
+ Windows
+
+ nvToolsExt.dll, nvToolsExt.lib
+
+ Mac OSX
+
+ libnvToolsExt.dylib
+
+ Linux
+
+ libnvToolsExt.so
+
+ Component
+
+ NVIDIA CUDA Driver Libraries
+
+ Linux
+
+ libcuda.so, libnvidia-fatbinaryloader.so,
+ libnvidia-ptxjitcompiler.so
+
+ The NVIDIA CUDA Driver Libraries are only distributable in
+ applications that meet this criteria:
+
+ 1. The application was developed starting from a NVIDIA CUDA
+ container obtained from Docker Hub or the NVIDIA GPU
+ Cloud, and
+
+ 2. The resulting application is packaged as a Docker
+ container and distributed to users on Docker Hub or the
+ NVIDIA GPU Cloud only.
+
+
+ 2.7. Attachment B
+
+
+ Additional Licensing Obligations
+
+ The following third party components included in the SOFTWARE
+ are licensed to Licensee pursuant to the following terms and
+ conditions:
+
+ 1. Licensee's use of the GDB third party component is
+ subject to the terms and conditions of GNU GPL v3:
+
+ This product includes copyrighted third-party software licensed
+ under the terms of the GNU General Public License v3 ("GPL v3").
+ All third-party software packages are copyright by their respective
+ authors. GPL v3 terms and conditions are hereby incorporated into
+ the Agreement by this reference: http://www.gnu.org/licenses/gpl.txt
+
+ Consistent with these licensing requirements, the software
+ listed below is provided under the terms of the specified
+ open source software licenses. To obtain source code for
+ software provided under licenses that require
+ redistribution of source code, including the GNU General
+ Public License (GPL) and GNU Lesser General Public License
+ (LGPL), contact oss-requests@nvidia.com. This offer is
+ valid for a period of three (3) years from the date of the
+ distribution of this product by NVIDIA CORPORATION.
+
+ Component License
+ CUDA-GDB GPL v3
+
+ 2. Licensee represents and warrants that any and all third
+ party licensing and/or royalty payment obligations in
+ connection with Licensee's use of the H.264 video codecs
+ are solely the responsibility of Licensee.
+
+ 3. Licensee's use of the Thrust library is subject to the
+ terms and conditions of the Apache License Version 2.0.
+ All third-party software packages are copyright by their
+ respective authors. Apache License Version 2.0 terms and
+ conditions are hereby incorporated into the Agreement by
+ this reference.
+ http://www.apache.org/licenses/LICENSE-2.0.html
+
+ In addition, Licensee acknowledges the following notice:
+ Thrust includes source code from the Boost Iterator,
+ Tuple, System, and Random Number libraries.
+
+ Boost Software License - Version 1.0 - August 17th, 2003
+ . . . .
+
+ Permission is hereby granted, free of charge, to any person or
+ organization obtaining a copy of the software and accompanying
+ documentation covered by this license (the "Software") to use,
+ reproduce, display, distribute, execute, and transmit the Software,
+ and to prepare derivative works of the Software, and to permit
+ third-parties to whom the Software is furnished to do so, all
+ subject to the following:
+
+ The copyright notices in the Software and this entire statement,
+ including the above license grant, this restriction and the following
+ disclaimer, must be included in all copies of the Software, in whole
+ or in part, and all derivative works of the Software, unless such
+ copies or derivative works are solely in the form of machine-executable
+ object code generated by a source language processor.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE AND
+ NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
+ ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE FOR ANY DAMAGES OR
+ OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ OTHER DEALINGS IN THE SOFTWARE.
+
+ 4. Licensee's use of the LLVM third party component is
+ subject to the following terms and conditions:
+
+ ======================================================
+ LLVM Release License
+ ======================================================
+ University of Illinois/NCSA
+ Open Source License
+
+ Copyright (c) 2003-2010 University of Illinois at Urbana-Champaign.
+ All rights reserved.
+
+ Developed by:
+
+ LLVM Team
+
+ University of Illinois at Urbana-Champaign
+
+ http://llvm.org
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to
+ deal with the Software without restriction, including without limitation the
+ rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ sell copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimers.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimers in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the names of the LLVM Team, University of Illinois at Urbana-
+ Champaign, nor the names of its contributors may be used to endorse or
+ promote products derived from this Software without specific prior
+ written permission.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ DEALINGS WITH THE SOFTWARE.
+
+ 5. Licensee's use (e.g. nvprof) of the PCRE third party
+ component is subject to the following terms and
+ conditions:
+
+ ------------
+ PCRE LICENCE
+ ------------
+ PCRE is a library of functions to support regular expressions whose syntax
+ and semantics are as close as possible to those of the Perl 5 language.
+ Release 8 of PCRE is distributed under the terms of the "BSD" licence, as
+ specified below. The documentation for PCRE, supplied in the "doc"
+ directory, is distributed under the same terms as the software itself. The
+ basic library functions are written in C and are freestanding. Also
+ included in the distribution is a set of C++ wrapper functions, and a just-
+ in-time compiler that can be used to optimize pattern matching. These are
+ both optional features that can be omitted when the library is built.
+
+ THE BASIC LIBRARY FUNCTIONS
+ ---------------------------
+ Written by: Philip Hazel
+ Email local part: ph10
+ Email domain: cam.ac.uk
+ University of Cambridge Computing Service,
+ Cambridge, England.
+ Copyright (c) 1997-2012 University of Cambridge
+ All rights reserved.
+
+ PCRE JUST-IN-TIME COMPILATION SUPPORT
+ -------------------------------------
+ Written by: Zoltan Herczeg
+ Email local part: hzmester
+ Emain domain: freemail.hu
+ Copyright(c) 2010-2012 Zoltan Herczeg
+ All rights reserved.
+
+ STACK-LESS JUST-IN-TIME COMPILER
+ --------------------------------
+ Written by: Zoltan Herczeg
+ Email local part: hzmester
+ Emain domain: freemail.hu
+ Copyright(c) 2009-2012 Zoltan Herczeg
+ All rights reserved.
+
+ THE C++ WRAPPER FUNCTIONS
+ -------------------------
+ Contributed by: Google Inc.
+ Copyright (c) 2007-2012, Google Inc.
+ All rights reserved.
+
+ THE "BSD" LICENCE
+ -----------------
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the name of Google
+ Inc. nor the names of their contributors may be used to endorse or
+ promote products derived from this software without specific prior
+ written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+
+ 6. Some of the cuBLAS library routines were written by or
+ derived from code written by Vasily Volkov and are subject
+ to the Modified Berkeley Software Distribution License as
+ follows:
+
+ Copyright (c) 2007-2009, Regents of the University of California
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials provided
+ with the distribution.
+ * Neither the name of the University of California, Berkeley nor
+ the names of its contributors may be used to endorse or promote
+ products derived from this software without specific prior
+ written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR
+ IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+
+ 7. Some of the cuBLAS library routines were written by or
+ derived from code written by Davide Barbieri and are
+ subject to the Modified Berkeley Software Distribution
+ License as follows:
+
+ Copyright (c) 2008-2009 Davide Barbieri @ University of Rome Tor Vergata.
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials provided
+ with the distribution.
+ * The name of the author may not be used to endorse or promote
+ products derived from this software without specific prior
+ written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR
+ IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+
+ 8. Some of the cuBLAS library routines were derived from
+ code developed by the University of Tennessee and are
+ subject to the Modified Berkeley Software Distribution
+ License as follows:
+
+ Copyright (c) 2010 The University of Tennessee.
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer listed in this license in the documentation and/or
+ other materials provided with the distribution.
+ * Neither the name of the copyright holders nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ 9. Some of the cuBLAS library routines were written by or
+ derived from code written by Jonathan Hogg and are subject
+ to the Modified Berkeley Software Distribution License as
+ follows:
+
+ Copyright (c) 2012, The Science and Technology Facilities Council (STFC).
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials provided
+ with the distribution.
+ * Neither the name of the STFC nor the names of its contributors
+ may be used to endorse or promote products derived from this
+ software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE STFC BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ 10. Some of the cuBLAS library routines were written by or
+ derived from code written by Ahmad M. Abdelfattah, David
+ Keyes, and Hatem Ltaief, and are subject to the Apache
+ License, Version 2.0, as follows:
+
+ -- (C) Copyright 2013 King Abdullah University of Science and Technology
+ Authors:
+ Ahmad Abdelfattah (ahmad.ahmad@kaust.edu.sa)
+ David Keyes (david.keyes@kaust.edu.sa)
+ Hatem Ltaief (hatem.ltaief@kaust.edu.sa)
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the King Abdullah University of Science and
+ Technology nor the names of its contributors may be used to endorse
+ or promote products derived from this software without specific prior
+ written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE
+
+ 11. Some of the cuSPARSE library routines were written by or
+ derived from code written by Li-Wen Chang and are subject
+ to the NCSA Open Source License as follows:
+
+ Copyright (c) 2012, University of Illinois.
+
+ All rights reserved.
+
+ Developed by: IMPACT Group, University of Illinois, http://impact.crhc.illinois.edu
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal with the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimers in the documentation and/or other materials provided
+ with the distribution.
+ * Neither the names of IMPACT Group, University of Illinois, nor
+ the names of its contributors may be used to endorse or promote
+ products derived from this Software without specific prior
+ written permission.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+ IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+ SOFTWARE.
+
+ 12. Some of the cuRAND library routines were written by or
+ derived from code written by Mutsuo Saito and Makoto
+ Matsumoto and are subject to the following license:
+
+ Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima
+ University. All rights reserved.
+
+ Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima
+ University and University of Tokyo. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials provided
+ with the distribution.
+ * Neither the name of the Hiroshima University nor the names of
+ its contributors may be used to endorse or promote products
+ derived from this software without specific prior written
+ permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ 13. Some of the cuRAND library routines were derived from
+ code developed by D. E. Shaw Research and are subject to
+ the following license:
+
+ Copyright 2010-2011, D. E. Shaw Research.
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions, and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions, and the following
+ disclaimer in the documentation and/or other materials provided
+ with the distribution.
+ * Neither the name of D. E. Shaw Research nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ 14. Some of the Math library routines were written by or
+ derived from code developed by Norbert Juffa and are
+ subject to the following license:
+
+ Copyright (c) 2015-2017, Norbert Juffa
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ 15. Licensee's use of the lz4 third party component is
+ subject to the following terms and conditions:
+
+ Copyright (C) 2011-2013, Yann Collet.
+ BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following disclaimer
+ in the documentation and/or other materials provided with the
+ distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ 16. The NPP library uses code from the Boost Math Toolkit,
+ and is subject to the following license:
+
+ Boost Software License - Version 1.0 - August 17th, 2003
+ . . . .
+
+ Permission is hereby granted, free of charge, to any person or
+ organization obtaining a copy of the software and accompanying
+ documentation covered by this license (the "Software") to use,
+ reproduce, display, distribute, execute, and transmit the Software,
+ and to prepare derivative works of the Software, and to permit
+ third-parties to whom the Software is furnished to do so, all
+ subject to the following:
+
+ The copyright notices in the Software and this entire statement,
+ including the above license grant, this restriction and the following
+ disclaimer, must be included in all copies of the Software, in whole
+ or in part, and all derivative works of the Software, unless such
+ copies or derivative works are solely in the form of machine-executable
+ object code generated by a source language processor.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE AND
+ NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
+ ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE FOR ANY DAMAGES OR
+ OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ OTHER DEALINGS IN THE SOFTWARE.
+
+ 17. Portions of the Nsight Eclipse Edition is subject to the
+ following license:
+
+ The Eclipse Foundation makes available all content in this plug-in
+ ("Content"). Unless otherwise indicated below, the Content is provided
+ to you under the terms and conditions of the Eclipse Public License
+ Version 1.0 ("EPL"). A copy of the EPL is available at http://
+ www.eclipse.org/legal/epl-v10.html. For purposes of the EPL, "Program"
+ will mean the Content.
+
+ If you did not receive this Content directly from the Eclipse
+ Foundation, the Content is being redistributed by another party
+ ("Redistributor") and different terms and conditions may apply to your
+ use of any object code in the Content. Check the Redistributor's
+ license that was provided with the Content. If no such license exists,
+ contact the Redistributor. Unless otherwise indicated below, the terms
+ and conditions of the EPL still apply to any source code in the
+ Content and such source code may be obtained at http://www.eclipse.org.
+
+ 18. Some of the cuBLAS library routines uses code from
+ OpenAI, which is subject to the following license:
+
+ License URL
+ https://github.com/openai/openai-gemm/blob/master/LICENSE
+
+ License Text
+ The MIT License
+
+ Copyright (c) 2016 OpenAI (http://openai.com), 2016 Google Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ 19. Licensee's use of the Visual Studio Setup Configuration
+ Samples is subject to the following license:
+
+ The MIT License (MIT)
+ Copyright (C) Microsoft Corporation. All rights reserved.
+
+ Permission is hereby granted, free of charge, to any person
+ obtaining a copy of this software and associated documentation
+ files (the "Software"), to deal in the Software without restriction,
+ including without limitation the rights to use, copy, modify, merge,
+ publish, distribute, sublicense, and/or sell copies of the Software,
+ and to permit persons to whom the Software is furnished to do so,
+ subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included
+ in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ 20. Licensee's use of linmath.h header for CPU functions for
+ GL vector/matrix operations from lunarG is subject to the
+ Apache License Version 2.0.
+
+ 21. The DX12-CUDA sample uses the d3dx12.h header, which is
+ subject to the MIT license.
+
+ -----------------
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia_curand_cu12-10.3.9.90.dist-info/METADATA ADDED
@@ -0,0 +1,44 @@
+ Metadata-Version: 2.2
+ Name: nvidia-curand-cu12
+ Version: 10.3.9.90
+ Summary: CURAND native runtime libraries
+ Home-page: https://developer.nvidia.com/cuda-zone
+ Author: Nvidia CUDA Installer Team
+ Author-email: compute_installer@nvidia.com
+ License: NVIDIA Proprietary Software
+ Keywords: cuda,nvidia,runtime,machine learning,deep learning
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Education
+ Classifier: Intended Audience :: Science/Research
+ Classifier: License :: Other/Proprietary License
+ Classifier: Natural Language :: English
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.5
+ Classifier: Programming Language :: Python :: 3.6
+ Classifier: Programming Language :: Python :: 3.7
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3 :: Only
+ Classifier: Topic :: Scientific/Engineering
+ Classifier: Topic :: Scientific/Engineering :: Mathematics
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Classifier: Topic :: Software Development
+ Classifier: Topic :: Software Development :: Libraries
+ Classifier: Operating System :: Microsoft :: Windows
+ Classifier: Operating System :: POSIX :: Linux
+ Requires-Python: >=3
+ License-File: License.txt
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: home-page
+ Dynamic: keywords
+ Dynamic: license
+ Dynamic: requires-python
+ Dynamic: summary
+
+ CURAND native runtime libraries
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia_curand_cu12-10.3.9.90.dist-info/RECORD ADDED
@@ -0,0 +1,32 @@
+ nvidia/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ nvidia/__pycache__/__init__.cpython-312.pyc,,
+ nvidia/curand/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ nvidia/curand/__pycache__/__init__.cpython-312.pyc,,
+ nvidia/curand/include/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ nvidia/curand/include/__pycache__/__init__.cpython-312.pyc,,
+ nvidia/curand/include/curand.h,sha256=strQ9idlRTQoBJy_hAbAT4pgkW6BKYg8p_nUjbb8BVw,44075
+ nvidia/curand/include/curand_discrete.h,sha256=2qD3BkI622XEu0444wVP7HeYkKAx0Rjr2HDhqU4SA7E,3486
+ nvidia/curand/include/curand_discrete2.h,sha256=ZrQTO5R9x83AMX88uq7M8M94DLSC5VEz0PAkfcwtQeg,10883
+ nvidia/curand/include/curand_globals.h,sha256=bES1Kx0NrATXk1DReMMkqWrB062nOnaAp39y22wViXU,3717
+ nvidia/curand/include/curand_kernel.h,sha256=SjfAeh13ybXIxiekcgczzua02kIAqETopJKRhYvCat8,53133
+ nvidia/curand/include/curand_lognormal.h,sha256=-X-iNkJSzWpAYYjogm689EJTZfzore9sxU7ObddljLk,28142
+ nvidia/curand/include/curand_mrg32k3a.h,sha256=ZVVREjGNsJQJ-3IzZZ_LKGtGteslicb8E0Aly49BKPs,170296
+ nvidia/curand/include/curand_mtgp32.h,sha256=Qhrmx0pHWF-P2Uu5bKwYE9ymEWq3c7qBzCITVMaKMfI,7845
+ nvidia/curand/include/curand_mtgp32_host.h,sha256=SXqzmSQkzTLSRJ4pojTg_TNCC3T-G89HdBK-boSDqr4,18274
+ nvidia/curand/include/curand_mtgp32_kernel.h,sha256=ajZnXr5ZXnQExElf6LPpigrrKPTmMIZbRyTEnJ-BDhw,13731
+ nvidia/curand/include/curand_mtgp32dc_p_11213.h,sha256=7_gGYUH47UugIAEt60vYH5nFa-QUwTpDwSEgLg9cZts,276889
+ nvidia/curand/include/curand_normal.h,sha256=lnmYVk2fn0oEVWOytdKhXrHL36GLCjMnB8OnZeCaYcA,26953
+ nvidia/curand/include/curand_normal_static.h,sha256=5K4iTC9AuSWCe1LVxuj_0y3BVjtp0bxO6hndv2rbmiw,4727
+ nvidia/curand/include/curand_philox4x32_x.h,sha256=T21IP-Rdg3_tSVU9Je4dLKuwEqE4ovfwi7r1hOY92Dw,7166
+ nvidia/curand/include/curand_poisson.h,sha256=KrhXOmO_D7aclnj8geIyHqdpSQwWHurS9V_pVtgzodM,25461
+ nvidia/curand/include/curand_precalc.h,sha256=I6NZdgT42fMm9qSCtP-rlOAqt4Zsqgal0ajktcPmEak,1392393
+ nvidia/curand/include/curand_uniform.h,sha256=gpmRgQu5r6ppgLTg60NXoDdVJS6wMUy6jC5bh8l04e8,17472
+ nvidia/curand/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ nvidia/curand/lib/__pycache__/__init__.cpython-312.pyc,,
+ nvidia/curand/lib/libcurand.so.10,sha256=-b6gOKJwO3IVcf1FopmomBQf2MsmSlkSY1yVEW9ZYP4,136749240
+ nvidia_curand_cu12-10.3.9.90.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
+ nvidia_curand_cu12-10.3.9.90.dist-info/License.txt,sha256=rW9YU_ugyg0VnQ9Y1JrkmDDC-Mk_epJki5zpCttMbM0,59262
+ nvidia_curand_cu12-10.3.9.90.dist-info/METADATA,sha256=fU3xSITD3i7JIsVG2ZXO5i-aDlIls-ry2JUVICEsv28,1684
+ nvidia_curand_cu12-10.3.9.90.dist-info/RECORD,,
+ nvidia_curand_cu12-10.3.9.90.dist-info/WHEEL,sha256=VtFLEVB-VX8niQT4kQ5pcQOOqiKvUvqfZe5V14HmU88,109
+ nvidia_curand_cu12-10.3.9.90.dist-info/top_level.txt,sha256=fTkAtiFuL16nUrB9ytDDtpytz2t0B4NvYTnRzwAhO14,7
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia_curand_cu12-10.3.9.90.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (75.8.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-manylinux_2_27_x86_64
+
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia_curand_cu12-10.3.9.90.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ nvidia