Add files using upload-large-folder tool
Browse files. This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full list.
- Prism/LLaDA/LLaDA_Baseline/dllm_eval/decontamination/archiver.py +174 -0
- Prism/LLaDA/LLaDA_Baseline/dllm_eval/decontamination/decontaminate.py +166 -0
- Prism/LLaDA/LLaDA_Prism/.venv/bin/Activate.ps1 +247 -0
- Prism/LLaDA/LLaDA_Prism/.venv/bin/accelerate-launch +8 -0
- Prism/LLaDA/LLaDA_Prism/.venv/bin/activate.csh +27 -0
- Prism/LLaDA/LLaDA_Prism/.venv/bin/activate.fish +69 -0
- Prism/LLaDA/LLaDA_Prism/.venv/bin/f2py +8 -0
- Prism/LLaDA/LLaDA_Prism/.venv/bin/get_objgraph +54 -0
- Prism/LLaDA/LLaDA_Prism/.venv/bin/hf +8 -0
- Prism/LLaDA/LLaDA_Prism/.venv/bin/httpx +8 -0
- Prism/LLaDA/LLaDA_Prism/.venv/bin/markdown-it +8 -0
- Prism/LLaDA/LLaDA_Prism/.venv/bin/pip +8 -0
- Prism/LLaDA/LLaDA_Prism/.venv/bin/torchfrtrace +8 -0
- Prism/LLaDA/LLaDA_Prism/.venv/bin/torchrun +8 -0
- Prism/LLaDA/LLaDA_Prism/.venv/bin/typer +8 -0
- Prism/LLaDA/LLaDA_Prism/.venv/bin/undill +22 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs-25.1.0.dist-info/INSTALLER +1 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs-25.1.0.dist-info/METADATA +232 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs-25.1.0.dist-info/RECORD +55 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs-25.1.0.dist-info/WHEEL +4 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/colorama/__init__.py +7 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/colorama/ansi.py +102 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/colorama/ansitowin32.py +277 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/colorama/initialise.py +121 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/colorama/win32.py +180 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/colorama/winterm.py +195 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/cuda/ccuda.pxd +15 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/cuda/ccudart.cpython-312-x86_64-linux-gnu.so +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/cuda/cnvrtc.pxd +15 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/cuda/cnvrtc.pyx +7 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/cuda/cuda.cpp +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/cuda/cuda.cpython-312-x86_64-linux-gnu.so +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/cuda/cudart.pyx +22 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/cuda/nvrtc.pyx +22 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/arrow_dataset.py +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/arrow_reader.py +663 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/builder.bak.py +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/builder.py +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/config.py +272 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/dataset_dict.py +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/distributed.py +39 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/exceptions.py +196 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/inspect.py +582 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/keyhash.py +104 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/load.py +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/naming.py +84 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/streaming.py +142 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore-1.0.7.dist-info/INSTALLER +1 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore-1.0.7.dist-info/METADATA +616 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore-1.0.7.dist-info/RECORD +68 -0
Prism/LLaDA/LLaDA_Baseline/dllm_eval/decontamination/archiver.py
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import datetime
|
| 2 |
+
import io
|
| 3 |
+
import json
|
| 4 |
+
import mmap
|
| 5 |
+
import os
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Any
|
| 8 |
+
|
| 9 |
+
import jsonlines
|
| 10 |
+
import tqdm
|
| 11 |
+
import zstandard
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def json_serial(obj: Any) -> str:
    """JSON serializer for objects not serializable by default json code"""
    if isinstance(obj, datetime.datetime):
        return obj.isoformat()
    raise TypeError("Type %s not serializable" % type(obj))
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
# Modified version of lm_dataformat Archive for single file.
class Archive:
    """Write zstd-compressed JSON-lines records to a single file.

    Each record has the shape ``{"text": ..., "meta": ...}`` and is terminated
    by a newline inside the compressed stream.
    """

    def __init__(self, file_path: str, compression_level: int = 3) -> None:
        self.file_path = file_path
        dir_name = os.path.dirname(file_path)
        if dir_name:
            os.makedirs(dir_name, exist_ok=True)
        self.fh = open(self.file_path, "wb")
        self.cctx = zstandard.ZstdCompressor(level=compression_level)
        self.compressor = self.cctx.stream_writer(self.fh)

    def add_data(self, data, meta=None) -> None:
        """Append one record; ``meta`` defaults to a fresh empty dict."""
        if meta is None:
            meta = {}
        self.compressor.write(
            json.dumps({"text": data, "meta": meta}, default=json_serial).encode(
                "UTF-8"
            )
            + b"\n"
        )

    def commit(self) -> None:
        """Finish the zstd frame and close the underlying file."""
        self.compressor.flush(zstandard.FLUSH_FRAME)
        self.fh.flush()
        self.fh.close()

    # Backward-compatible addition: context-manager support so the file is
    # committed (and the handle closed) even if the writing code raises.
    def __enter__(self) -> "Archive":
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        self.commit()
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# Modified version of lm_dataformat Reader with self.fh set, allowing peeking for tqdm.
class Reader:
    """Iterate records from a zstd-compressed jsonl archive."""

    def __init__(self) -> None:
        pass

    def read(
        self,
        file,
        get_meta: bool = False,
        autojoin_paragraphs: bool = True,
        para_joiner: str = "\n\n",
    ):
        """Yield text (or ``(text, meta)`` when ``get_meta``) from *file*."""
        with open(file, "rb") as fh:
            self.fh = fh
            dctx = zstandard.ZstdDecompressor()
            buffered = io.BufferedReader(dctx.stream_reader(fh))
            for record in jsonlines.Reader(buffered):
                if isinstance(record, str):
                    # naive jsonl where each object is just the string itself,
                    # with no meta. For legacy compatibility.
                    assert not get_meta
                    yield record
                    continue

                text = record["text"]
                if autojoin_paragraphs and isinstance(text, list):
                    text = para_joiner.join(text)

                if get_meta:
                    yield text, record.get("meta", {})
                else:
                    yield text
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
class TextArchive:
    """Append UTF-8 text lines to a plain file, one record per line."""

    def __init__(self, file_path, mode: str = "rb+") -> None:
        self.file_path = file_path
        parent = os.path.dirname(file_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

        # "rb+" requires the file to exist, so create it first if needed.
        if not os.path.exists(file_path):
            Path(file_path).touch()

        self.fh = open(self.file_path, mode)

    def add_data(self, data) -> None:
        """Write one line (newline appended)."""
        self.fh.write(data.encode("UTF-8") + b"\n")

    def commit(self) -> None:
        """Flush and close the file."""
        self.fh.flush()
        self.fh.close()
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
class TextReader:
    """Read newline-delimited text files, using mmap for throughput.

    All generators yield each line with its trailing newline stripped
    (``line[:-1]`` — assumes every line, including the last, ends in ``\n``).
    """

    def __init__(self, file_path) -> None:
        self.file_path = file_path

    # Optimized mmap read with infrequent tqdm updates to maintain speed
    # Tested up to 250MB/s.
    def read_tqdm(self, update_frequency: int = 10000):
        """Yield lines while updating a byte-based tqdm bar every
        *update_frequency* lines (batched so progress calls stay cheap)."""
        current_file_position = 0
        line_counter = 0
        with (
            open(self.file_path, "r", encoding="utf-8") as fh,
            tqdm.tqdm(
                total=os.path.getsize(self.file_path),
                dynamic_ncols=True,
                unit="byte",
                unit_scale=1,
            ) as progress,
        ):
            with mmap.mmap(fh.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj:
                for raw_line in iter(mmap_obj.readline, b""):
                    line = raw_line.decode("utf-8")
                    line_counter += 1
                    if line_counter == update_frequency:
                        new_file_pos = mmap_obj.tell()
                        bytes_read = new_file_pos - current_file_position
                        current_file_position = new_file_pos
                        progress.update(bytes_read)
                        line_counter = 0
                    yield line[:-1]

    def read_and_tell(self):
        """Yield ``(line, raw_bytes_read)`` where the second element is the
        number of bytes consumed from the file for that line."""
        current_file_position = 0
        with open(self.file_path, "r", encoding="utf8") as fh:
            with mmap.mmap(fh.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj:
                for raw_line in iter(mmap_obj.readline, b""):
                    line = raw_line.decode("utf-8")
                    new_file_pos = mmap_obj.tell()
                    raw_bytes_read = new_file_pos - current_file_position
                    current_file_position = new_file_pos
                    yield line[:-1], raw_bytes_read

    def read(self):
        """Yield lines via mmap, no progress reporting."""
        with open(self.file_path, "r", encoding="utf8") as fh:
            with mmap.mmap(fh.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj:
                for raw_line in iter(mmap_obj.readline, b""):
                    yield raw_line.decode("utf-8")[:-1]

    def read_slow(self):
        """Plain buffered readline loop; slower fallback without mmap."""
        with open(self.file_path, "r", encoding="utf8") as fh:
            while True:
                line = fh.readline()
                # readline() returns "" at EOF — it never returns -1, so the
                # original `line == -1 or line == ""` check contained a dead
                # comparison; behavior is unchanged by dropping it.
                if line == "":
                    break
                yield line[:-1]
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
# Optimized for speed. Decompresses the archive in shell before
# using the mmap'd TextReader.
class ZStdTextReader:
    """Read a ``*.zst`` text file by shelling out to the zstd CLI first."""

    def __init__(self, file) -> None:
        self.file = file

    def read_tqdm(self):
        """Decompress ``self.file``, stream its lines, then remove the
        decompressed temp file."""
        import subprocess

        # zstd -d writes to the input path minus its ".zst" suffix.
        decompressed_file = self.file[:-4]
        print("Decompressing file, please wait...")
        # Argument-list subprocess call instead of os.system(f"zstd -d {path}"):
        # immune to spaces/shell metacharacters in the path, and check=True
        # surfaces a failed decompression immediately instead of a confusing
        # missing-file error later.  linux decompress is faster.
        subprocess.run(["zstd", "-d", self.file], check=True)
        try:
            reader = TextReader(decompressed_file)
            yield from reader.read_tqdm()
        finally:
            # Clean up even if the consumer abandons or aborts the generator.
            os.remove(decompressed_file)
|
Prism/LLaDA/LLaDA_Baseline/dllm_eval/decontamination/decontaminate.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import collections
|
| 2 |
+
import glob
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
import pickle
|
| 6 |
+
import random
|
| 7 |
+
import time
|
| 8 |
+
|
| 9 |
+
from .archiver import ZStdTextReader
|
| 10 |
+
from .janitor import Janitor, word_ngrams
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# Was used for testing the evaluator decoupled from the full logic below
def get_train_overlap_stub(docs: dict, ngrams_path: str, ngrams_n_size: str):
    """Pretend 10% of *docs* are contaminated and return random doc indices."""
    simulated_overlap = 0.1
    sample_size = int(len(docs) * simulated_overlap)
    return random.sample(range(len(docs)), sample_size)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# Returns a dictionary containing all overlapping documents in each
# task. In the standard use case, an overlap occurs when any of the 13-grams
# found in the task document exist in the training set documents.
#
# To generate 13-grams for the pile see scripts/clean_training_data. The final output of these
# scripts are an info.json file containing the n_gram_size (13) and a bunch of "ngrams_{x}.bkt.txt.sorted.zst"
# files. These should exist in the "ngrams_path" provided to this function.


# Algorithm:
# 1. Build lookups for each dataset {ngram: list(document_ids)}
# 2. Merge into an overall lookup {ngram: [(task_name, task_set, doc_ids),]}
# 3. Full scan the 13-grams from the training set against the merged lookup,
#    saving matches in the "duplicates" dictionary {(task_name, task_set): set(doc_ids)}
# 4. Strip the task_set from the dictionary keys and return
#
# We cache the task+set lookups as well as the overlaps.
def get_train_overlap(docs_by_task_set: dict, ngrams_path: str, limit: int) -> dict:
    """Find which task documents share an n-gram with the training set.

    Args:
        docs_by_task_set: ``{(task_name, task_set): [document strings]}``.
        ngrams_path: directory holding ``info.json`` and ``*.sorted.zst``
            training-set n-gram files.
        limit: document limit, used only to namespace the cache file names.

    Returns:
        ``{task_name: set(doc_ids)}`` of contaminated document indices.
        Lookups and overlap results are cached under ``data/<task_name>/``.
    """
    # return get_train_overlap_stub(docs, ngrams_path, ngrams_n_size)

    info_dict_path = os.path.join(ngrams_path, "info.json")
    # `with` instead of json.load(open(...)) so the handle is closed promptly.
    with open(info_dict_path, "r", encoding="utf-8") as info_fh:
        info_dict = json.load(info_fh)
    ngrams_n_size = info_dict["ngram_size"]

    janitor = Janitor()

    # Build lookup for each dataset first in case we use different task combinations later
    print("Building Lookups...")
    start = time.perf_counter()

    def get_overlaps_dump_path(task_name, task_set, ngrams_n_size, limit) -> str:
        return f"data/{task_name}/{task_set}_{ngrams_n_size}grams_limit{limit}.overlaps"

    lookups = {}
    duplicates = {}  # {(task_name, task_set): set(doc_ids)}
    sets_to_decontaminate = len(docs_by_task_set.keys())

    for (task_name, task_set), docs in docs_by_task_set.items():
        # makedirs (vs the original os.mkdir) also creates the intermediate
        # "data" directory and doesn't raise when the path already exists.
        os.makedirs(f"data/{task_name}", exist_ok=True)

        # Check if we've decontaminated this combination before
        overlaps_dump_path = get_overlaps_dump_path(
            task_name, task_set, ngrams_n_size, limit
        )
        if os.path.exists(overlaps_dump_path):
            with open(overlaps_dump_path, "rb") as overlaps_fh:
                duplicates[(task_name, task_set)] = pickle.load(overlaps_fh)
            sets_to_decontaminate -= 1
            continue
        else:
            duplicates[(task_name, task_set)] = set()

        # Build/load the task lookup {ngram: set(documents)}.
        task_set_lookup_path = (
            f"data/{task_name}/{task_set}_{ngrams_n_size}grams_limit{limit}.lookup"
        )
        if os.path.exists(task_set_lookup_path):
            print(f"{task_set_lookup_path} available, loading...")
            with open(task_set_lookup_path, "rb") as lookup_fh:
                lookups[(task_name, task_set)] = pickle.load(lookup_fh)
        else:
            print(f"{task_set_lookup_path} not available, building...")
            lookup = collections.defaultdict(set)

            for doc_id, document in enumerate(docs):
                ngrams = word_ngrams(janitor.normalize_string(document), ngrams_n_size)
                for ngram in ngrams:
                    lookup[ngram].add(doc_id)

            with open(task_set_lookup_path, "wb") as lookup_fh:
                pickle.dump(lookup, lookup_fh)
            lookups[(task_name, task_set)] = lookup

    elapsed = time.perf_counter() - start
    print(f"Building lookups took {elapsed:0.5f} seconds.")

    matched_ngrams = []

    if sets_to_decontaminate > 0:
        print("Merging lookups...")
        start = time.perf_counter()
        merged_lookup = collections.defaultdict(list)
        for (task_name, task_set), lookup in lookups.items():
            for ngram, doc_ids in lookup.items():
                merged_lookup[ngram].append((task_name, task_set, doc_ids))

        elapsed = time.perf_counter() - start
        print(f"Merging lookups took {elapsed:0.5f} seconds.")

        print(f"{ngrams_n_size} grams files found in {ngrams_path}:")
        files = glob.glob(os.path.join(ngrams_path, "*.sorted.zst"))
        print(files)

        for file in files:
            start = time.perf_counter()
            print(f"Scanning {file}")
            reader = ZStdTextReader(file)
            total_ngrams = 0
            unique_ngrams = 0
            matching_unique = 0
            non_matching_unique = 0

            current_ngram = ""
            for line in reader.read_tqdm():  # Scan training set ngrams file
                total_ngrams += 1
                [ngram, document_id] = line.rsplit(" ", 1)
                if (
                    ngram != current_ngram
                ):  # Only need to match the ngram once in training set
                    unique_ngrams += 1
                    current_ngram = ngram
                    if ngram in merged_lookup:
                        matched_ngrams.append(ngram)  # For logging
                        matching_unique += 1
                        for task_name, task_set, doc_ids in merged_lookup[ngram]:
                            task_doc_set = duplicates[(task_name, task_set)]
                            for doc_id in doc_ids:  # Record contamination across all relevant task/set combos
                                task_doc_set.add(doc_id)
                        del merged_lookup[ngram]  # No point matching again
                    else:
                        non_matching_unique += 1

            print(f"Total Ngrams: {total_ngrams}")
            print(f"Unique Ngrams: {unique_ngrams}")
            print(f"Unique Matching: {matching_unique}")
            print(f"Unique Non Matching: {non_matching_unique}")
            print("Matched ngrams:")
            for ngram in matched_ngrams:
                print(ngram)

            elapsed = time.perf_counter() - start
            print(f"Read took {elapsed:0.5f} seconds.")
            print(f"Speed: {(os.path.getsize(file) / 1000000.0) / elapsed}MB/second")

        print(duplicates)

        # Dump overlaps separately
        for (task_name, task_set), doc_ids in duplicates.items():
            overlaps_dump_path = get_overlaps_dump_path(
                task_name, task_set, ngrams_n_size, limit
            )
            with open(overlaps_dump_path, "wb") as overlaps_fh:
                pickle.dump(doc_ids, overlaps_fh)

    # Strip task set and return
    return {task_name: doc_ids for (task_name, task_set), doc_ids in duplicates.items()}
|
Prism/LLaDA/LLaDA_Prism/.venv/bin/Activate.ps1
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<#
.Synopsis
Activate a Python virtual environment for the current PowerShell session.

.Description
Pushes the python executable for a virtual environment to the front of the
$Env:PATH environment variable and sets the prompt to signify that you are
in a Python virtual environment. Makes use of the command line switches as
well as the `pyvenv.cfg` file values present in the virtual environment.

.Parameter VenvDir
Path to the directory that contains the virtual environment to activate. The
default value for this is the parent of the directory that the Activate.ps1
script is located within.

.Parameter Prompt
The prompt prefix to display when this virtual environment is activated. By
default, this prompt is the name of the virtual environment folder (VenvDir)
surrounded by parentheses and followed by a single space (ie. '(.venv) ').

.Example
Activate.ps1
Activates the Python virtual environment that contains the Activate.ps1 script.

.Example
Activate.ps1 -Verbose
Activates the Python virtual environment that contains the Activate.ps1 script,
and shows extra information about the activation as it executes.

.Example
Activate.ps1 -VenvDir C:\Users\MyUser\Common\.venv
Activates the Python virtual environment located in the specified location.

.Example
Activate.ps1 -Prompt "MyPython"
Activates the Python virtual environment that contains the Activate.ps1 script,
and prefixes the current prompt with the specified string (surrounded in
parentheses) while the virtual environment is active.

.Notes
On Windows, it may be required to enable this Activate.ps1 script by setting the
execution policy for the user. You can do this by issuing the following PowerShell
command:

PS C:\> Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser

For more information on Execution Policies:
https://go.microsoft.com/fwlink/?LinkID=135170

#>
Param(
    [Parameter(Mandatory = $false)]
    [String]
    $VenvDir,
    [Parameter(Mandatory = $false)]
    [String]
    $Prompt
)

<# Function declarations --------------------------------------------------- #>

<#
.Synopsis
Remove all shell session elements added by the Activate script, including the
addition of the virtual environment's Python executable from the beginning of
the PATH variable.

.Parameter NonDestructive
If present, do not remove this function from the global namespace for the
session.

#>
function global:deactivate ([switch]$NonDestructive) {
    # Revert to original values

    # The prior prompt:
    if (Test-Path -Path Function:_OLD_VIRTUAL_PROMPT) {
        Copy-Item -Path Function:_OLD_VIRTUAL_PROMPT -Destination Function:prompt
        Remove-Item -Path Function:_OLD_VIRTUAL_PROMPT
    }

    # The prior PYTHONHOME:
    if (Test-Path -Path Env:_OLD_VIRTUAL_PYTHONHOME) {
        Copy-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME -Destination Env:PYTHONHOME
        Remove-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME
    }

    # The prior PATH:
    if (Test-Path -Path Env:_OLD_VIRTUAL_PATH) {
        Copy-Item -Path Env:_OLD_VIRTUAL_PATH -Destination Env:PATH
        Remove-Item -Path Env:_OLD_VIRTUAL_PATH
    }

    # Just remove the VIRTUAL_ENV altogether:
    if (Test-Path -Path Env:VIRTUAL_ENV) {
        Remove-Item -Path env:VIRTUAL_ENV
    }

    # Just remove VIRTUAL_ENV_PROMPT altogether.
    if (Test-Path -Path Env:VIRTUAL_ENV_PROMPT) {
        Remove-Item -Path env:VIRTUAL_ENV_PROMPT
    }

    # Just remove the _PYTHON_VENV_PROMPT_PREFIX altogether:
    if (Get-Variable -Name "_PYTHON_VENV_PROMPT_PREFIX" -ErrorAction SilentlyContinue) {
        Remove-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Scope Global -Force
    }

    # Leave deactivate function in the global namespace if requested:
    if (-not $NonDestructive) {
        Remove-Item -Path function:deactivate
    }
}

<#
.Description
Get-PyVenvConfig parses the values from the pyvenv.cfg file located in the
given folder, and returns them in a map.

For each line in the pyvenv.cfg file, if that line can be parsed into exactly
two strings separated by `=` (with any amount of whitespace surrounding the =)
then it is considered a `key = value` line. The left hand string is the key,
the right hand is the value.

If the value starts with a `'` or a `"` then the first and last character is
stripped from the value before being captured.

.Parameter ConfigDir
Path to the directory that contains the `pyvenv.cfg` file.
#>
function Get-PyVenvConfig(
    [String]
    $ConfigDir
) {
    Write-Verbose "Given ConfigDir=$ConfigDir, obtain values in pyvenv.cfg"

    # Ensure the file exists, and issue a warning if it doesn't (but still allow the function to continue).
    $pyvenvConfigPath = Join-Path -Resolve -Path $ConfigDir -ChildPath 'pyvenv.cfg' -ErrorAction Continue

    # An empty map will be returned if no config file is found.
    $pyvenvConfig = @{ }

    if ($pyvenvConfigPath) {

        Write-Verbose "File exists, parse `key = value` lines"
        $pyvenvConfigContent = Get-Content -Path $pyvenvConfigPath

        $pyvenvConfigContent | ForEach-Object {
            $keyval = $PSItem -split "\s*=\s*", 2
            if ($keyval[0] -and $keyval[1]) {
                $val = $keyval[1]

                # Remove extraneous quotations around a string value.
                if ("'""".Contains($val.Substring(0, 1))) {
                    $val = $val.Substring(1, $val.Length - 2)
                }

                $pyvenvConfig[$keyval[0]] = $val
                Write-Verbose "Adding Key: '$($keyval[0])'='$val'"
            }
        }
    }
    return $pyvenvConfig
}


<# Begin Activate script --------------------------------------------------- #>

# Determine the containing directory of this script
$VenvExecPath = Split-Path -Parent $MyInvocation.MyCommand.Definition
$VenvExecDir = Get-Item -Path $VenvExecPath

Write-Verbose "Activation script is located in path: '$VenvExecPath'"
Write-Verbose "VenvExecDir Fullname: '$($VenvExecDir.FullName)"
Write-Verbose "VenvExecDir Name: '$($VenvExecDir.Name)"

# Set values required in priority: CmdLine, ConfigFile, Default
# First, get the location of the virtual environment, it might not be
# VenvExecDir if specified on the command line.
if ($VenvDir) {
    Write-Verbose "VenvDir given as parameter, using '$VenvDir' to determine values"
}
else {
    Write-Verbose "VenvDir not given as a parameter, using parent directory name as VenvDir."
    $VenvDir = $VenvExecDir.Parent.FullName.TrimEnd("\\/")
    Write-Verbose "VenvDir=$VenvDir"
}

# Next, read the `pyvenv.cfg` file to determine any required value such
# as `prompt`.
$pyvenvCfg = Get-PyVenvConfig -ConfigDir $VenvDir

# Next, set the prompt from the command line, or the config file, or
# just use the name of the virtual environment folder.
if ($Prompt) {
    Write-Verbose "Prompt specified as argument, using '$Prompt'"
}
else {
    Write-Verbose "Prompt not specified as argument to script, checking pyvenv.cfg value"
    if ($pyvenvCfg -and $pyvenvCfg['prompt']) {
        Write-Verbose "  Setting based on value in pyvenv.cfg='$($pyvenvCfg['prompt'])'"
        $Prompt = $pyvenvCfg['prompt'];
    }
    else {
        Write-Verbose "  Setting prompt based on parent's directory's name. (Is the directory name passed to venv module when creating the virtual environment)"
        Write-Verbose "  Got leaf-name of $VenvDir='$(Split-Path -Path $venvDir -Leaf)'"
        $Prompt = Split-Path -Path $venvDir -Leaf
    }
}

Write-Verbose "Prompt = '$Prompt'"
Write-Verbose "VenvDir='$VenvDir'"

# Deactivate any currently active virtual environment, but leave the
# deactivate function in place.
deactivate -nondestructive

# Now set the environment variable VIRTUAL_ENV, used by many tools to determine
# that there is an activated venv.
$env:VIRTUAL_ENV = $VenvDir

if (-not $Env:VIRTUAL_ENV_DISABLE_PROMPT) {

    Write-Verbose "Setting prompt to '$Prompt'"

    # Set the prompt to include the env name
    # Make sure _OLD_VIRTUAL_PROMPT is global
    function global:_OLD_VIRTUAL_PROMPT { "" }
    Copy-Item -Path function:prompt -Destination function:_OLD_VIRTUAL_PROMPT
    New-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Description "Python virtual environment prompt prefix" -Scope Global -Option ReadOnly -Visibility Public -Value $Prompt

    function global:prompt {
        Write-Host -NoNewline -ForegroundColor Green "($_PYTHON_VENV_PROMPT_PREFIX) "
        _OLD_VIRTUAL_PROMPT
    }
    $env:VIRTUAL_ENV_PROMPT = $Prompt
}

# Clear PYTHONHOME
if (Test-Path -Path Env:PYTHONHOME) {
    Copy-Item -Path Env:PYTHONHOME -Destination Env:_OLD_VIRTUAL_PYTHONHOME
    Remove-Item -Path Env:PYTHONHOME
}

# Add the venv to the PATH
Copy-Item -Path Env:PATH -Destination Env:_OLD_VIRTUAL_PATH
$Env:PATH = "$VenvExecDir$([System.IO.Path]::PathSeparator)$Env:PATH"
|
Prism/LLaDA/LLaDA_Prism/.venv/bin/accelerate-launch
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/gfs/space/private/fengzl/world_model/Prism/LLaDA/LLaDA_Prism/.venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from accelerate.commands.launch import main
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(main())
|
Prism/LLaDA/LLaDA_Prism/.venv/bin/activate.csh
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This file must be used with "source bin/activate.csh" *from csh*.
|
| 2 |
+
# You cannot run it directly.
|
| 3 |
+
|
| 4 |
+
# Created by Davide Di Blasi <davidedb@gmail.com>.
|
| 5 |
+
# Ported to Python 3.3 venv by Andrew Svetlov <andrew.svetlov@gmail.com>
|
| 6 |
+
|
| 7 |
+
alias deactivate 'test $?_OLD_VIRTUAL_PATH != 0 && setenv PATH "$_OLD_VIRTUAL_PATH" && unset _OLD_VIRTUAL_PATH; rehash; test $?_OLD_VIRTUAL_PROMPT != 0 && set prompt="$_OLD_VIRTUAL_PROMPT" && unset _OLD_VIRTUAL_PROMPT; unsetenv VIRTUAL_ENV; unsetenv VIRTUAL_ENV_PROMPT; test "\!:*" != "nondestructive" && unalias deactivate'
|
| 8 |
+
|
| 9 |
+
# Unset irrelevant variables.
|
| 10 |
+
deactivate nondestructive
|
| 11 |
+
|
| 12 |
+
setenv VIRTUAL_ENV /gfs/space/private/fengzl/world_model/Prism/LLaDA/LLaDA_Prism/.venv
|
| 13 |
+
|
| 14 |
+
set _OLD_VIRTUAL_PATH="$PATH"
|
| 15 |
+
setenv PATH "$VIRTUAL_ENV/"bin":$PATH"
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
set _OLD_VIRTUAL_PROMPT="$prompt"
|
| 19 |
+
|
| 20 |
+
if (! "$?VIRTUAL_ENV_DISABLE_PROMPT") then
|
| 21 |
+
set prompt = '(.venv) '"$prompt"
|
| 22 |
+
setenv VIRTUAL_ENV_PROMPT '(.venv) '
|
| 23 |
+
endif
|
| 24 |
+
|
| 25 |
+
alias pydoc python -m pydoc
|
| 26 |
+
|
| 27 |
+
rehash
|
Prism/LLaDA/LLaDA_Prism/.venv/bin/activate.fish
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This file must be used with "source <venv>/bin/activate.fish" *from fish*
|
| 2 |
+
# (https://fishshell.com/). You cannot run it directly.
|
| 3 |
+
|
| 4 |
+
function deactivate -d "Exit virtual environment and return to normal shell environment"
|
| 5 |
+
# reset old environment variables
|
| 6 |
+
if test -n "$_OLD_VIRTUAL_PATH"
|
| 7 |
+
set -gx PATH $_OLD_VIRTUAL_PATH
|
| 8 |
+
set -e _OLD_VIRTUAL_PATH
|
| 9 |
+
end
|
| 10 |
+
if test -n "$_OLD_VIRTUAL_PYTHONHOME"
|
| 11 |
+
set -gx PYTHONHOME $_OLD_VIRTUAL_PYTHONHOME
|
| 12 |
+
set -e _OLD_VIRTUAL_PYTHONHOME
|
| 13 |
+
end
|
| 14 |
+
|
| 15 |
+
if test -n "$_OLD_FISH_PROMPT_OVERRIDE"
|
| 16 |
+
set -e _OLD_FISH_PROMPT_OVERRIDE
|
| 17 |
+
# prevents error when using nested fish instances (Issue #93858)
|
| 18 |
+
if functions -q _old_fish_prompt
|
| 19 |
+
functions -e fish_prompt
|
| 20 |
+
functions -c _old_fish_prompt fish_prompt
|
| 21 |
+
functions -e _old_fish_prompt
|
| 22 |
+
end
|
| 23 |
+
end
|
| 24 |
+
|
| 25 |
+
set -e VIRTUAL_ENV
|
| 26 |
+
set -e VIRTUAL_ENV_PROMPT
|
| 27 |
+
if test "$argv[1]" != "nondestructive"
|
| 28 |
+
# Self-destruct!
|
| 29 |
+
functions -e deactivate
|
| 30 |
+
end
|
| 31 |
+
end
|
| 32 |
+
|
| 33 |
+
# Unset irrelevant variables.
|
| 34 |
+
deactivate nondestructive
|
| 35 |
+
|
| 36 |
+
set -gx VIRTUAL_ENV /gfs/space/private/fengzl/world_model/Prism/LLaDA/LLaDA_Prism/.venv
|
| 37 |
+
|
| 38 |
+
set -gx _OLD_VIRTUAL_PATH $PATH
|
| 39 |
+
set -gx PATH "$VIRTUAL_ENV/"bin $PATH
|
| 40 |
+
|
| 41 |
+
# Unset PYTHONHOME if set.
|
| 42 |
+
if set -q PYTHONHOME
|
| 43 |
+
set -gx _OLD_VIRTUAL_PYTHONHOME $PYTHONHOME
|
| 44 |
+
set -e PYTHONHOME
|
| 45 |
+
end
|
| 46 |
+
|
| 47 |
+
if test -z "$VIRTUAL_ENV_DISABLE_PROMPT"
|
| 48 |
+
# fish uses a function instead of an env var to generate the prompt.
|
| 49 |
+
|
| 50 |
+
# Save the current fish_prompt function as the function _old_fish_prompt.
|
| 51 |
+
functions -c fish_prompt _old_fish_prompt
|
| 52 |
+
|
| 53 |
+
# With the original prompt function renamed, we can override with our own.
|
| 54 |
+
function fish_prompt
|
| 55 |
+
# Save the return status of the last command.
|
| 56 |
+
set -l old_status $status
|
| 57 |
+
|
| 58 |
+
# Output the venv prompt; color taken from the blue of the Python logo.
|
| 59 |
+
printf "%s%s%s" (set_color 4B8BBE) '(.venv) ' (set_color normal)
|
| 60 |
+
|
| 61 |
+
# Restore the return status of the previous command.
|
| 62 |
+
echo "exit $old_status" | .
|
| 63 |
+
# Output the original/"old" prompt.
|
| 64 |
+
_old_fish_prompt
|
| 65 |
+
end
|
| 66 |
+
|
| 67 |
+
set -gx _OLD_FISH_PROMPT_OVERRIDE "$VIRTUAL_ENV"
|
| 68 |
+
set -gx VIRTUAL_ENV_PROMPT '(.venv) '
|
| 69 |
+
end
|
Prism/LLaDA/LLaDA_Prism/.venv/bin/f2py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/gfs/space/private/fengzl/world_model/Prism/LLaDA/LLaDA_Prism/.venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from numpy.f2py.f2py2e import main
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(main())
|
Prism/LLaDA/LLaDA_Prism/.venv/bin/get_objgraph
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/gfs/space/private/fengzl/world_model/Prism/LLaDA/LLaDA_Prism/.venv/bin/python
|
| 2 |
+
#
|
| 3 |
+
# Author: Mike McKerns (mmckerns @caltech and @uqfoundation)
|
| 4 |
+
# Copyright (c) 2008-2016 California Institute of Technology.
|
| 5 |
+
# Copyright (c) 2016-2024 The Uncertainty Quantification Foundation.
|
| 6 |
+
# License: 3-clause BSD. The full license text is available at:
|
| 7 |
+
# - https://github.com/uqfoundation/dill/blob/master/LICENSE
|
| 8 |
+
"""
|
| 9 |
+
display the reference paths for objects in ``dill.types`` or a .pkl file
|
| 10 |
+
|
| 11 |
+
Notes:
|
| 12 |
+
the generated image is useful in showing the pointer references in
|
| 13 |
+
objects that are or can be pickled. Any object in ``dill.objects``
|
| 14 |
+
listed in ``dill.load_types(picklable=True, unpicklable=True)`` works.
|
| 15 |
+
|
| 16 |
+
Examples::
|
| 17 |
+
|
| 18 |
+
$ get_objgraph ArrayType
|
| 19 |
+
Image generated as ArrayType.png
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
import dill as pickle
|
| 23 |
+
#pickle.debug.trace(True)
|
| 24 |
+
#import pickle
|
| 25 |
+
|
| 26 |
+
# get all objects for testing
|
| 27 |
+
from dill import load_types
|
| 28 |
+
load_types(pickleable=True,unpickleable=True)
|
| 29 |
+
from dill import objects
|
| 30 |
+
|
| 31 |
+
if __name__ == "__main__":
|
| 32 |
+
import sys
|
| 33 |
+
if len(sys.argv) != 2:
|
| 34 |
+
print ("Please provide exactly one file or type name (e.g. 'IntType')")
|
| 35 |
+
msg = "\n"
|
| 36 |
+
for objtype in list(objects.keys())[:40]:
|
| 37 |
+
msg += objtype + ', '
|
| 38 |
+
print (msg + "...")
|
| 39 |
+
else:
|
| 40 |
+
objtype = str(sys.argv[-1])
|
| 41 |
+
try:
|
| 42 |
+
obj = objects[objtype]
|
| 43 |
+
except KeyError:
|
| 44 |
+
obj = pickle.load(open(objtype,'rb'))
|
| 45 |
+
import os
|
| 46 |
+
objtype = os.path.splitext(objtype)[0]
|
| 47 |
+
try:
|
| 48 |
+
import objgraph
|
| 49 |
+
objgraph.show_refs(obj, filename=objtype+'.png')
|
| 50 |
+
except ImportError:
|
| 51 |
+
print ("Please install 'objgraph' to view object graphs")
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# EOF
|
Prism/LLaDA/LLaDA_Prism/.venv/bin/hf
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/gfs/space/private/fengzl/world_model/Prism/LLaDA/LLaDA_Prism/.venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from huggingface_hub.cli.hf import main
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(main())
|
Prism/LLaDA/LLaDA_Prism/.venv/bin/httpx
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/gfs/space/private/fengzl/world_model/Prism/LLaDA/LLaDA_Prism/.venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from httpx import main
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(main())
|
Prism/LLaDA/LLaDA_Prism/.venv/bin/markdown-it
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/gfs/space/private/fengzl/world_model/Prism/LLaDA/LLaDA_Prism/.venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from markdown_it.cli.parse import main
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(main())
|
Prism/LLaDA/LLaDA_Prism/.venv/bin/pip
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/gfs/space/private/fengzl/world_model/Prism/LLaDA/LLaDA_Prism/.venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from pip._internal.cli.main import main
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(main())
|
Prism/LLaDA/LLaDA_Prism/.venv/bin/torchfrtrace
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/gfs/space/private/fengzl/world_model/Prism/LLaDA/LLaDA_Prism/.venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from torch.distributed.flight_recorder.fr_trace import main
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(main())
|
Prism/LLaDA/LLaDA_Prism/.venv/bin/torchrun
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/gfs/space/private/fengzl/world_model/Prism/LLaDA/LLaDA_Prism/.venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from torch.distributed.run import main
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(main())
|
Prism/LLaDA/LLaDA_Prism/.venv/bin/typer
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/gfs/space/private/fengzl/world_model/Prism/LLaDA/LLaDA_Prism/.venv/bin/python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import re
|
| 4 |
+
import sys
|
| 5 |
+
from typer.cli import main
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
| 8 |
+
sys.exit(main())
|
Prism/LLaDA/LLaDA_Prism/.venv/bin/undill
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/gfs/space/private/fengzl/world_model/Prism/LLaDA/LLaDA_Prism/.venv/bin/python
|
| 2 |
+
#
|
| 3 |
+
# Author: Mike McKerns (mmckerns @caltech and @uqfoundation)
|
| 4 |
+
# Copyright (c) 2008-2016 California Institute of Technology.
|
| 5 |
+
# Copyright (c) 2016-2024 The Uncertainty Quantification Foundation.
|
| 6 |
+
# License: 3-clause BSD. The full license text is available at:
|
| 7 |
+
# - https://github.com/uqfoundation/dill/blob/master/LICENSE
|
| 8 |
+
"""
|
| 9 |
+
unpickle the contents of a pickled object file
|
| 10 |
+
|
| 11 |
+
Examples::
|
| 12 |
+
|
| 13 |
+
$ undill hello.pkl
|
| 14 |
+
['hello', 'world']
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
if __name__ == '__main__':
|
| 18 |
+
import sys
|
| 19 |
+
import dill
|
| 20 |
+
for file in sys.argv[1:]:
|
| 21 |
+
print (dill.load(open(file,'rb')))
|
| 22 |
+
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs-25.1.0.dist-info/INSTALLER
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
pip
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs-25.1.0.dist-info/METADATA
ADDED
|
@@ -0,0 +1,232 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: attrs
|
| 3 |
+
Version: 25.1.0
|
| 4 |
+
Summary: Classes Without Boilerplate
|
| 5 |
+
Project-URL: Documentation, https://www.attrs.org/
|
| 6 |
+
Project-URL: Changelog, https://www.attrs.org/en/stable/changelog.html
|
| 7 |
+
Project-URL: GitHub, https://github.com/python-attrs/attrs
|
| 8 |
+
Project-URL: Funding, https://github.com/sponsors/hynek
|
| 9 |
+
Project-URL: Tidelift, https://tidelift.com/subscription/pkg/pypi-attrs?utm_source=pypi-attrs&utm_medium=pypi
|
| 10 |
+
Author-email: Hynek Schlawack <hs@ox.cx>
|
| 11 |
+
License-Expression: MIT
|
| 12 |
+
License-File: LICENSE
|
| 13 |
+
Keywords: attribute,boilerplate,class
|
| 14 |
+
Classifier: Development Status :: 5 - Production/Stable
|
| 15 |
+
Classifier: Programming Language :: Python :: 3.8
|
| 16 |
+
Classifier: Programming Language :: Python :: 3.9
|
| 17 |
+
Classifier: Programming Language :: Python :: 3.10
|
| 18 |
+
Classifier: Programming Language :: Python :: 3.11
|
| 19 |
+
Classifier: Programming Language :: Python :: 3.12
|
| 20 |
+
Classifier: Programming Language :: Python :: 3.13
|
| 21 |
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
| 22 |
+
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
| 23 |
+
Classifier: Typing :: Typed
|
| 24 |
+
Requires-Python: >=3.8
|
| 25 |
+
Provides-Extra: benchmark
|
| 26 |
+
Requires-Dist: cloudpickle; (platform_python_implementation == 'CPython') and extra == 'benchmark'
|
| 27 |
+
Requires-Dist: hypothesis; extra == 'benchmark'
|
| 28 |
+
Requires-Dist: mypy>=1.11.1; (platform_python_implementation == 'CPython' and python_version >= '3.10') and extra == 'benchmark'
|
| 29 |
+
Requires-Dist: pympler; extra == 'benchmark'
|
| 30 |
+
Requires-Dist: pytest-codspeed; extra == 'benchmark'
|
| 31 |
+
Requires-Dist: pytest-mypy-plugins; (platform_python_implementation == 'CPython' and python_version >= '3.10') and extra == 'benchmark'
|
| 32 |
+
Requires-Dist: pytest-xdist[psutil]; extra == 'benchmark'
|
| 33 |
+
Requires-Dist: pytest>=4.3.0; extra == 'benchmark'
|
| 34 |
+
Provides-Extra: cov
|
| 35 |
+
Requires-Dist: cloudpickle; (platform_python_implementation == 'CPython') and extra == 'cov'
|
| 36 |
+
Requires-Dist: coverage[toml]>=5.3; extra == 'cov'
|
| 37 |
+
Requires-Dist: hypothesis; extra == 'cov'
|
| 38 |
+
Requires-Dist: mypy>=1.11.1; (platform_python_implementation == 'CPython' and python_version >= '3.10') and extra == 'cov'
|
| 39 |
+
Requires-Dist: pympler; extra == 'cov'
|
| 40 |
+
Requires-Dist: pytest-mypy-plugins; (platform_python_implementation == 'CPython' and python_version >= '3.10') and extra == 'cov'
|
| 41 |
+
Requires-Dist: pytest-xdist[psutil]; extra == 'cov'
|
| 42 |
+
Requires-Dist: pytest>=4.3.0; extra == 'cov'
|
| 43 |
+
Provides-Extra: dev
|
| 44 |
+
Requires-Dist: cloudpickle; (platform_python_implementation == 'CPython') and extra == 'dev'
|
| 45 |
+
Requires-Dist: hypothesis; extra == 'dev'
|
| 46 |
+
Requires-Dist: mypy>=1.11.1; (platform_python_implementation == 'CPython' and python_version >= '3.10') and extra == 'dev'
|
| 47 |
+
Requires-Dist: pre-commit-uv; extra == 'dev'
|
| 48 |
+
Requires-Dist: pympler; extra == 'dev'
|
| 49 |
+
Requires-Dist: pytest-mypy-plugins; (platform_python_implementation == 'CPython' and python_version >= '3.10') and extra == 'dev'
|
| 50 |
+
Requires-Dist: pytest-xdist[psutil]; extra == 'dev'
|
| 51 |
+
Requires-Dist: pytest>=4.3.0; extra == 'dev'
|
| 52 |
+
Provides-Extra: docs
|
| 53 |
+
Requires-Dist: cogapp; extra == 'docs'
|
| 54 |
+
Requires-Dist: furo; extra == 'docs'
|
| 55 |
+
Requires-Dist: myst-parser; extra == 'docs'
|
| 56 |
+
Requires-Dist: sphinx; extra == 'docs'
|
| 57 |
+
Requires-Dist: sphinx-notfound-page; extra == 'docs'
|
| 58 |
+
Requires-Dist: sphinxcontrib-towncrier; extra == 'docs'
|
| 59 |
+
Requires-Dist: towncrier<24.7; extra == 'docs'
|
| 60 |
+
Provides-Extra: tests
|
| 61 |
+
Requires-Dist: cloudpickle; (platform_python_implementation == 'CPython') and extra == 'tests'
|
| 62 |
+
Requires-Dist: hypothesis; extra == 'tests'
|
| 63 |
+
Requires-Dist: mypy>=1.11.1; (platform_python_implementation == 'CPython' and python_version >= '3.10') and extra == 'tests'
|
| 64 |
+
Requires-Dist: pympler; extra == 'tests'
|
| 65 |
+
Requires-Dist: pytest-mypy-plugins; (platform_python_implementation == 'CPython' and python_version >= '3.10') and extra == 'tests'
|
| 66 |
+
Requires-Dist: pytest-xdist[psutil]; extra == 'tests'
|
| 67 |
+
Requires-Dist: pytest>=4.3.0; extra == 'tests'
|
| 68 |
+
Provides-Extra: tests-mypy
|
| 69 |
+
Requires-Dist: mypy>=1.11.1; (platform_python_implementation == 'CPython' and python_version >= '3.10') and extra == 'tests-mypy'
|
| 70 |
+
Requires-Dist: pytest-mypy-plugins; (platform_python_implementation == 'CPython' and python_version >= '3.10') and extra == 'tests-mypy'
|
| 71 |
+
Description-Content-Type: text/markdown
|
| 72 |
+
|
| 73 |
+
<p align="center">
|
| 74 |
+
<a href="https://www.attrs.org/">
|
| 75 |
+
<img src="https://raw.githubusercontent.com/python-attrs/attrs/main/docs/_static/attrs_logo.svg" width="35%" alt="attrs" />
|
| 76 |
+
</a>
|
| 77 |
+
</p>
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
*attrs* is the Python package that will bring back the **joy** of **writing classes** by relieving you from the drudgery of implementing object protocols (aka [dunder methods](https://www.attrs.org/en/latest/glossary.html#term-dunder-methods)).
|
| 81 |
+
[Trusted by NASA](https://docs.github.com/en/account-and-profile/setting-up-and-managing-your-github-profile/customizing-your-profile/personalizing-your-profile#list-of-qualifying-repositories-for-mars-2020-helicopter-contributor-achievement) for Mars missions since 2020!
|
| 82 |
+
|
| 83 |
+
Its main goal is to help you to write **concise** and **correct** software without slowing down your code.
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
## Sponsors
|
| 87 |
+
|
| 88 |
+
*attrs* would not be possible without our [amazing sponsors](https://github.com/sponsors/hynek).
|
| 89 |
+
Especially those generously supporting us at the *The Organization* tier and higher:
|
| 90 |
+
|
| 91 |
+
<!-- sponsor-break-begin -->
|
| 92 |
+
|
| 93 |
+
<p align="center">
|
| 94 |
+
|
| 95 |
+
<!-- [[[cog
|
| 96 |
+
import pathlib, tomllib
|
| 97 |
+
|
| 98 |
+
for sponsor in tomllib.loads(pathlib.Path("pyproject.toml").read_text())["tool"]["sponcon"]["sponsors"]:
|
| 99 |
+
print(f'<a href="{sponsor["url"]}"><img title="{sponsor["title"]}" src="https://www.attrs.org/en/25.1.0/_static/sponsors/{sponsor["img"]}" width="190" /></a>')
|
| 100 |
+
]]] -->
|
| 101 |
+
<a href="https://www.variomedia.de/"><img title="Variomedia AG" src="https://www.attrs.org/en/25.1.0/_static/sponsors/Variomedia.svg" width="190" /></a>
|
| 102 |
+
<a href="https://tidelift.com/?utm_source=lifter&utm_medium=referral&utm_campaign=hynek"><img title="Tidelift" src="https://www.attrs.org/en/25.1.0/_static/sponsors/Tidelift.svg" width="190" /></a>
|
| 103 |
+
<a href="https://klaviyo.com/"><img title="Klaviyo" src="https://www.attrs.org/en/25.1.0/_static/sponsors/Klaviyo.svg" width="190" /></a>
|
| 104 |
+
<a href="https://www.emsys-renewables.com/"><img title="emsys renewables" src="https://www.attrs.org/en/25.1.0/_static/sponsors/emsys-renewables.svg" width="190" /></a>
|
| 105 |
+
<a href="https://filepreviews.io/"><img title="FilePreviews" src="https://www.attrs.org/en/25.1.0/_static/sponsors/FilePreviews.svg" width="190" /></a>
|
| 106 |
+
<a href="https://privacy-solutions.org/"><img title="Privacy Solutions" src="https://www.attrs.org/en/25.1.0/_static/sponsors/Privacy-Solutions.svg" width="190" /></a>
|
| 107 |
+
<a href="https://polar.sh/"><img title="Polar" src="https://www.attrs.org/en/25.1.0/_static/sponsors/Polar.svg" width="190" /></a>
|
| 108 |
+
<!-- [[[end]]] -->
|
| 109 |
+
|
| 110 |
+
</p>
|
| 111 |
+
|
| 112 |
+
<!-- sponsor-break-end -->
|
| 113 |
+
|
| 114 |
+
<p align="center">
|
| 115 |
+
<strong>Please consider <a href="https://github.com/sponsors/hynek">joining them</a> to help make <em>attrs</em>’s maintenance more sustainable!</strong>
|
| 116 |
+
</p>
|
| 117 |
+
|
| 118 |
+
<!-- teaser-end -->
|
| 119 |
+
|
| 120 |
+
## Example
|
| 121 |
+
|
| 122 |
+
*attrs* gives you a class decorator and a way to declaratively define the attributes on that class:
|
| 123 |
+
|
| 124 |
+
<!-- code-begin -->
|
| 125 |
+
|
| 126 |
+
```pycon
|
| 127 |
+
>>> from attrs import asdict, define, make_class, Factory
|
| 128 |
+
|
| 129 |
+
>>> @define
|
| 130 |
+
... class SomeClass:
|
| 131 |
+
... a_number: int = 42
|
| 132 |
+
... list_of_numbers: list[int] = Factory(list)
|
| 133 |
+
...
|
| 134 |
+
... def hard_math(self, another_number):
|
| 135 |
+
... return self.a_number + sum(self.list_of_numbers) * another_number
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
>>> sc = SomeClass(1, [1, 2, 3])
|
| 139 |
+
>>> sc
|
| 140 |
+
SomeClass(a_number=1, list_of_numbers=[1, 2, 3])
|
| 141 |
+
|
| 142 |
+
>>> sc.hard_math(3)
|
| 143 |
+
19
|
| 144 |
+
>>> sc == SomeClass(1, [1, 2, 3])
|
| 145 |
+
True
|
| 146 |
+
>>> sc != SomeClass(2, [3, 2, 1])
|
| 147 |
+
True
|
| 148 |
+
|
| 149 |
+
>>> asdict(sc)
|
| 150 |
+
{'a_number': 1, 'list_of_numbers': [1, 2, 3]}
|
| 151 |
+
|
| 152 |
+
>>> SomeClass()
|
| 153 |
+
SomeClass(a_number=42, list_of_numbers=[])
|
| 154 |
+
|
| 155 |
+
>>> C = make_class("C", ["a", "b"])
|
| 156 |
+
>>> C("foo", "bar")
|
| 157 |
+
C(a='foo', b='bar')
|
| 158 |
+
```
|
| 159 |
+
|
| 160 |
+
After *declaring* your attributes, *attrs* gives you:
|
| 161 |
+
|
| 162 |
+
- a concise and explicit overview of the class's attributes,
|
| 163 |
+
- a nice human-readable `__repr__`,
|
| 164 |
+
- equality-checking methods,
|
| 165 |
+
- an initializer,
|
| 166 |
+
- and much more,
|
| 167 |
+
|
| 168 |
+
*without* writing dull boilerplate code again and again and *without* runtime performance penalties.
|
| 169 |
+
|
| 170 |
+
---
|
| 171 |
+
|
| 172 |
+
This example uses *attrs*'s modern APIs that have been introduced in version 20.1.0, and the *attrs* package import name that has been added in version 21.3.0.
|
| 173 |
+
The classic APIs (`@attr.s`, `attr.ib`, plus their serious-business aliases) and the `attr` package import name will remain **indefinitely**.
|
| 174 |
+
|
| 175 |
+
Check out [*On The Core API Names*](https://www.attrs.org/en/latest/names.html) for an in-depth explanation!
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
### Hate Type Annotations!?
|
| 179 |
+
|
| 180 |
+
No problem!
|
| 181 |
+
Types are entirely **optional** with *attrs*.
|
| 182 |
+
Simply assign `attrs.field()` to the attributes instead of annotating them with types:
|
| 183 |
+
|
| 184 |
+
```python
|
| 185 |
+
from attrs import define, field
|
| 186 |
+
|
| 187 |
+
@define
|
| 188 |
+
class SomeClass:
|
| 189 |
+
a_number = field(default=42)
|
| 190 |
+
list_of_numbers = field(factory=list)
|
| 191 |
+
```
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
## Data Classes
|
| 195 |
+
|
| 196 |
+
On the tin, *attrs* might remind you of `dataclasses` (and indeed, `dataclasses` [are a descendant](https://hynek.me/articles/import-attrs/) of *attrs*).
|
| 197 |
+
In practice it does a lot more and is more flexible.
|
| 198 |
+
For instance, it allows you to define [special handling of NumPy arrays for equality checks](https://www.attrs.org/en/stable/comparison.html#customization), allows more ways to [plug into the initialization process](https://www.attrs.org/en/stable/init.html#hooking-yourself-into-initialization), has a replacement for `__init_subclass__`, and allows for stepping through the generated methods using a debugger.
|
| 199 |
+
|
| 200 |
+
For more details, please refer to our [comparison page](https://www.attrs.org/en/stable/why.html#data-classes), but generally speaking, we are more likely to commit crimes against nature to make things work that one would expect to work, but that are quite complicated in practice.
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
## Project Information
|
| 204 |
+
|
| 205 |
+
- [**Changelog**](https://www.attrs.org/en/stable/changelog.html)
|
| 206 |
+
- [**Documentation**](https://www.attrs.org/)
|
| 207 |
+
- [**PyPI**](https://pypi.org/project/attrs/)
|
| 208 |
+
- [**Source Code**](https://github.com/python-attrs/attrs)
|
| 209 |
+
- [**Contributing**](https://github.com/python-attrs/attrs/blob/main/.github/CONTRIBUTING.md)
|
| 210 |
+
- [**Third-party Extensions**](https://github.com/python-attrs/attrs/wiki/Extensions-to-attrs)
|
| 211 |
+
- **Get Help**: use the `python-attrs` tag on [Stack Overflow](https://stackoverflow.com/questions/tagged/python-attrs)
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
### *attrs* for Enterprise
|
| 215 |
+
|
| 216 |
+
Available as part of the [Tidelift Subscription](https://tidelift.com/?utm_source=lifter&utm_medium=referral&utm_campaign=hynek).
|
| 217 |
+
|
| 218 |
+
The maintainers of *attrs* and thousands of other packages are working with Tidelift to deliver commercial support and maintenance for the open source packages you use to build your applications.
|
| 219 |
+
Save time, reduce risk, and improve code health, while paying the maintainers of the exact packages you use.
|
| 220 |
+
|
| 221 |
+
## Release Information
|
| 222 |
+
|
| 223 |
+
### Changes
|
| 224 |
+
|
| 225 |
+
- This release only ensures correct PyPI licensing metadata.
|
| 226 |
+
[#1386](https://github.com/python-attrs/attrs/issues/1386)
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
---
|
| 231 |
+
|
| 232 |
+
[Full changelog →](https://www.attrs.org/en/stable/changelog.html)
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs-25.1.0.dist-info/RECORD
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
attr/__init__.py,sha256=fOYIvt1eGSqQre4uCS3sJWKZ0mwAuC8UD6qba5OS9_U,2057
|
| 2 |
+
attr/__init__.pyi,sha256=QIXnnHPoucmDWkbpNsWTP-cgJ1bn8le7DjyRa_wYdew,11281
|
| 3 |
+
attr/__pycache__/__init__.cpython-312.pyc,,
|
| 4 |
+
attr/__pycache__/_cmp.cpython-312.pyc,,
|
| 5 |
+
attr/__pycache__/_compat.cpython-312.pyc,,
|
| 6 |
+
attr/__pycache__/_config.cpython-312.pyc,,
|
| 7 |
+
attr/__pycache__/_funcs.cpython-312.pyc,,
|
| 8 |
+
attr/__pycache__/_make.cpython-312.pyc,,
|
| 9 |
+
attr/__pycache__/_next_gen.cpython-312.pyc,,
|
| 10 |
+
attr/__pycache__/_version_info.cpython-312.pyc,,
|
| 11 |
+
attr/__pycache__/converters.cpython-312.pyc,,
|
| 12 |
+
attr/__pycache__/exceptions.cpython-312.pyc,,
|
| 13 |
+
attr/__pycache__/filters.cpython-312.pyc,,
|
| 14 |
+
attr/__pycache__/setters.cpython-312.pyc,,
|
| 15 |
+
attr/__pycache__/validators.cpython-312.pyc,,
|
| 16 |
+
attr/_cmp.py,sha256=3umHiBtgsEYtvNP_8XrQwTCdFoZIX4DEur76N-2a3X8,4123
|
| 17 |
+
attr/_cmp.pyi,sha256=U-_RU_UZOyPUEQzXE6RMYQQcjkZRY25wTH99sN0s7MM,368
|
| 18 |
+
attr/_compat.py,sha256=4hlXbWhdDjQCDK6FKF1EgnZ3POiHgtpp54qE0nxaGHg,2704
|
| 19 |
+
attr/_config.py,sha256=dGq3xR6fgZEF6UBt_L0T-eUHIB4i43kRmH0P28sJVw8,843
|
| 20 |
+
attr/_funcs.py,sha256=5-tUKJtp3h5El55EcDl6GWXFp68fT8D8U7uCRN6497I,15854
|
| 21 |
+
attr/_make.py,sha256=XS_pYn_-KNo69Tb8-_y3YUcB3Xus00MwAShh2WulkjQ,94157
|
| 22 |
+
attr/_next_gen.py,sha256=7FRkbtl_N017SuBhf_Vw3mw2c2pGZhtCGOzadgz7tp4,24395
|
| 23 |
+
attr/_typing_compat.pyi,sha256=XDP54TUn-ZKhD62TOQebmzrwFyomhUCoGRpclb6alRA,469
|
| 24 |
+
attr/_version_info.py,sha256=exSqb3b5E-fMSsgZAlEw9XcLpEgobPORCZpcaEglAM4,2121
|
| 25 |
+
attr/_version_info.pyi,sha256=x_M3L3WuB7r_ULXAWjx959udKQ4HLB8l-hsc1FDGNvk,209
|
| 26 |
+
attr/converters.py,sha256=GlDeOzPeTFgeBBLbj9G57Ez5lAk68uhSALRYJ_exe84,3861
|
| 27 |
+
attr/converters.pyi,sha256=orU2bff-VjQa2kMDyvnMQV73oJT2WRyQuw4ZR1ym1bE,643
|
| 28 |
+
attr/exceptions.py,sha256=HRFq4iybmv7-DcZwyjl6M1euM2YeJVK_hFxuaBGAngI,1977
|
| 29 |
+
attr/exceptions.pyi,sha256=zZq8bCUnKAy9mDtBEw42ZhPhAUIHoTKedDQInJD883M,539
|
| 30 |
+
attr/filters.py,sha256=ZBiKWLp3R0LfCZsq7X11pn9WX8NslS2wXM4jsnLOGc8,1795
|
| 31 |
+
attr/filters.pyi,sha256=3J5BG-dTxltBk1_-RuNRUHrv2qu1v8v4aDNAQ7_mifA,208
|
| 32 |
+
attr/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 33 |
+
attr/setters.py,sha256=5-dcT63GQK35ONEzSgfXCkbB7pPkaR-qv15mm4PVSzQ,1617
|
| 34 |
+
attr/setters.pyi,sha256=NnVkaFU1BB4JB8E4JuXyrzTUgvtMpj8p3wBdJY7uix4,584
|
| 35 |
+
attr/validators.py,sha256=WaB1HLAHHqRHWsrv_K9H-sJ7ESil3H3Cmv2d8TtVZx4,20046
|
| 36 |
+
attr/validators.pyi,sha256=s2WhKPqskxbsckJfKk8zOuuB088GfgpyxcCYSNFLqNU,2603
|
| 37 |
+
attrs-25.1.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
| 38 |
+
attrs-25.1.0.dist-info/METADATA,sha256=bZidcSPgoF4BvFNQYyqph4NeHVg9r55WXiwAEtbvRnc,10999
|
| 39 |
+
attrs-25.1.0.dist-info/RECORD,,
|
| 40 |
+
attrs-25.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
| 41 |
+
attrs-25.1.0.dist-info/licenses/LICENSE,sha256=iCEVyV38KvHutnFPjsbVy8q_Znyv-HKfQkINpj9xTp8,1109
|
| 42 |
+
attrs/__init__.py,sha256=qeQJZ4O08yczSn840v9bYOaZyRE81WsVi-QCrY3krCU,1107
|
| 43 |
+
attrs/__init__.pyi,sha256=nZmInocjM7tHV4AQw0vxO_fo6oJjL_PonlV9zKKW8DY,7931
|
| 44 |
+
attrs/__pycache__/__init__.cpython-312.pyc,,
|
| 45 |
+
attrs/__pycache__/converters.cpython-312.pyc,,
|
| 46 |
+
attrs/__pycache__/exceptions.cpython-312.pyc,,
|
| 47 |
+
attrs/__pycache__/filters.cpython-312.pyc,,
|
| 48 |
+
attrs/__pycache__/setters.cpython-312.pyc,,
|
| 49 |
+
attrs/__pycache__/validators.cpython-312.pyc,,
|
| 50 |
+
attrs/converters.py,sha256=8kQljrVwfSTRu8INwEk8SI0eGrzmWftsT7rM0EqyohM,76
|
| 51 |
+
attrs/exceptions.py,sha256=ACCCmg19-vDFaDPY9vFl199SPXCQMN_bENs4DALjzms,76
|
| 52 |
+
attrs/filters.py,sha256=VOUMZug9uEU6dUuA0dF1jInUK0PL3fLgP0VBS5d-CDE,73
|
| 53 |
+
attrs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 54 |
+
attrs/setters.py,sha256=eL1YidYQV3T2h9_SYIZSZR1FAcHGb1TuCTy0E0Lv2SU,73
|
| 55 |
+
attrs/validators.py,sha256=xcy6wD5TtTkdCG1f4XWbocPSO0faBjk5IfVJfP6SUj0,76
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs-25.1.0.dist-info/WHEEL
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Wheel-Version: 1.0
|
| 2 |
+
Generator: hatchling 1.27.0
|
| 3 |
+
Root-Is-Purelib: true
|
| 4 |
+
Tag: py3-none-any
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/colorama/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
|
| 2 |
+
from .initialise import init, deinit, reinit, colorama_text, just_fix_windows_console
|
| 3 |
+
from .ansi import Fore, Back, Style, Cursor
|
| 4 |
+
from .ansitowin32 import AnsiToWin32
|
| 5 |
+
|
| 6 |
+
__version__ = '0.4.6'
|
| 7 |
+
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/colorama/ansi.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
|
| 2 |
+
'''
|
| 3 |
+
This module generates ANSI character codes to printing colors to terminals.
|
| 4 |
+
See: http://en.wikipedia.org/wiki/ANSI_escape_code
|
| 5 |
+
'''
|
| 6 |
+
|
| 7 |
+
CSI = '\033['
|
| 8 |
+
OSC = '\033]'
|
| 9 |
+
BEL = '\a'
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def code_to_chars(code):
|
| 13 |
+
return CSI + str(code) + 'm'
|
| 14 |
+
|
| 15 |
+
def set_title(title):
|
| 16 |
+
return OSC + '2;' + title + BEL
|
| 17 |
+
|
| 18 |
+
def clear_screen(mode=2):
|
| 19 |
+
return CSI + str(mode) + 'J'
|
| 20 |
+
|
| 21 |
+
def clear_line(mode=2):
|
| 22 |
+
return CSI + str(mode) + 'K'
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class AnsiCodes(object):
|
| 26 |
+
def __init__(self):
|
| 27 |
+
# the subclasses declare class attributes which are numbers.
|
| 28 |
+
# Upon instantiation we define instance attributes, which are the same
|
| 29 |
+
# as the class attributes but wrapped with the ANSI escape sequence
|
| 30 |
+
for name in dir(self):
|
| 31 |
+
if not name.startswith('_'):
|
| 32 |
+
value = getattr(self, name)
|
| 33 |
+
setattr(self, name, code_to_chars(value))
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class AnsiCursor(object):
|
| 37 |
+
def UP(self, n=1):
|
| 38 |
+
return CSI + str(n) + 'A'
|
| 39 |
+
def DOWN(self, n=1):
|
| 40 |
+
return CSI + str(n) + 'B'
|
| 41 |
+
def FORWARD(self, n=1):
|
| 42 |
+
return CSI + str(n) + 'C'
|
| 43 |
+
def BACK(self, n=1):
|
| 44 |
+
return CSI + str(n) + 'D'
|
| 45 |
+
def POS(self, x=1, y=1):
|
| 46 |
+
return CSI + str(y) + ';' + str(x) + 'H'
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class AnsiFore(AnsiCodes):
|
| 50 |
+
BLACK = 30
|
| 51 |
+
RED = 31
|
| 52 |
+
GREEN = 32
|
| 53 |
+
YELLOW = 33
|
| 54 |
+
BLUE = 34
|
| 55 |
+
MAGENTA = 35
|
| 56 |
+
CYAN = 36
|
| 57 |
+
WHITE = 37
|
| 58 |
+
RESET = 39
|
| 59 |
+
|
| 60 |
+
# These are fairly well supported, but not part of the standard.
|
| 61 |
+
LIGHTBLACK_EX = 90
|
| 62 |
+
LIGHTRED_EX = 91
|
| 63 |
+
LIGHTGREEN_EX = 92
|
| 64 |
+
LIGHTYELLOW_EX = 93
|
| 65 |
+
LIGHTBLUE_EX = 94
|
| 66 |
+
LIGHTMAGENTA_EX = 95
|
| 67 |
+
LIGHTCYAN_EX = 96
|
| 68 |
+
LIGHTWHITE_EX = 97
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
class AnsiBack(AnsiCodes):
|
| 72 |
+
BLACK = 40
|
| 73 |
+
RED = 41
|
| 74 |
+
GREEN = 42
|
| 75 |
+
YELLOW = 43
|
| 76 |
+
BLUE = 44
|
| 77 |
+
MAGENTA = 45
|
| 78 |
+
CYAN = 46
|
| 79 |
+
WHITE = 47
|
| 80 |
+
RESET = 49
|
| 81 |
+
|
| 82 |
+
# These are fairly well supported, but not part of the standard.
|
| 83 |
+
LIGHTBLACK_EX = 100
|
| 84 |
+
LIGHTRED_EX = 101
|
| 85 |
+
LIGHTGREEN_EX = 102
|
| 86 |
+
LIGHTYELLOW_EX = 103
|
| 87 |
+
LIGHTBLUE_EX = 104
|
| 88 |
+
LIGHTMAGENTA_EX = 105
|
| 89 |
+
LIGHTCYAN_EX = 106
|
| 90 |
+
LIGHTWHITE_EX = 107
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
class AnsiStyle(AnsiCodes):
|
| 94 |
+
BRIGHT = 1
|
| 95 |
+
DIM = 2
|
| 96 |
+
NORMAL = 22
|
| 97 |
+
RESET_ALL = 0
|
| 98 |
+
|
| 99 |
+
Fore = AnsiFore()
|
| 100 |
+
Back = AnsiBack()
|
| 101 |
+
Style = AnsiStyle()
|
| 102 |
+
Cursor = AnsiCursor()
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/colorama/ansitowin32.py
ADDED
|
@@ -0,0 +1,277 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
|
| 2 |
+
import re
|
| 3 |
+
import sys
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
from .ansi import AnsiFore, AnsiBack, AnsiStyle, Style, BEL
|
| 7 |
+
from .winterm import enable_vt_processing, WinTerm, WinColor, WinStyle
|
| 8 |
+
from .win32 import windll, winapi_test
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
winterm = None
|
| 12 |
+
if windll is not None:
|
| 13 |
+
winterm = WinTerm()
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class StreamWrapper(object):
|
| 17 |
+
'''
|
| 18 |
+
Wraps a stream (such as stdout), acting as a transparent proxy for all
|
| 19 |
+
attribute access apart from method 'write()', which is delegated to our
|
| 20 |
+
Converter instance.
|
| 21 |
+
'''
|
| 22 |
+
def __init__(self, wrapped, converter):
|
| 23 |
+
# double-underscore everything to prevent clashes with names of
|
| 24 |
+
# attributes on the wrapped stream object.
|
| 25 |
+
self.__wrapped = wrapped
|
| 26 |
+
self.__convertor = converter
|
| 27 |
+
|
| 28 |
+
def __getattr__(self, name):
|
| 29 |
+
return getattr(self.__wrapped, name)
|
| 30 |
+
|
| 31 |
+
def __enter__(self, *args, **kwargs):
|
| 32 |
+
# special method lookup bypasses __getattr__/__getattribute__, see
|
| 33 |
+
# https://stackoverflow.com/questions/12632894/why-doesnt-getattr-work-with-exit
|
| 34 |
+
# thus, contextlib magic methods are not proxied via __getattr__
|
| 35 |
+
return self.__wrapped.__enter__(*args, **kwargs)
|
| 36 |
+
|
| 37 |
+
def __exit__(self, *args, **kwargs):
|
| 38 |
+
return self.__wrapped.__exit__(*args, **kwargs)
|
| 39 |
+
|
| 40 |
+
def __setstate__(self, state):
|
| 41 |
+
self.__dict__ = state
|
| 42 |
+
|
| 43 |
+
def __getstate__(self):
|
| 44 |
+
return self.__dict__
|
| 45 |
+
|
| 46 |
+
def write(self, text):
|
| 47 |
+
self.__convertor.write(text)
|
| 48 |
+
|
| 49 |
+
def isatty(self):
|
| 50 |
+
stream = self.__wrapped
|
| 51 |
+
if 'PYCHARM_HOSTED' in os.environ:
|
| 52 |
+
if stream is not None and (stream is sys.__stdout__ or stream is sys.__stderr__):
|
| 53 |
+
return True
|
| 54 |
+
try:
|
| 55 |
+
stream_isatty = stream.isatty
|
| 56 |
+
except AttributeError:
|
| 57 |
+
return False
|
| 58 |
+
else:
|
| 59 |
+
return stream_isatty()
|
| 60 |
+
|
| 61 |
+
@property
|
| 62 |
+
def closed(self):
|
| 63 |
+
stream = self.__wrapped
|
| 64 |
+
try:
|
| 65 |
+
return stream.closed
|
| 66 |
+
# AttributeError in the case that the stream doesn't support being closed
|
| 67 |
+
# ValueError for the case that the stream has already been detached when atexit runs
|
| 68 |
+
except (AttributeError, ValueError):
|
| 69 |
+
return True
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
class AnsiToWin32(object):
|
| 73 |
+
'''
|
| 74 |
+
Implements a 'write()' method which, on Windows, will strip ANSI character
|
| 75 |
+
sequences from the text, and if outputting to a tty, will convert them into
|
| 76 |
+
win32 function calls.
|
| 77 |
+
'''
|
| 78 |
+
ANSI_CSI_RE = re.compile('\001?\033\\[((?:\\d|;)*)([a-zA-Z])\002?') # Control Sequence Introducer
|
| 79 |
+
ANSI_OSC_RE = re.compile('\001?\033\\]([^\a]*)(\a)\002?') # Operating System Command
|
| 80 |
+
|
| 81 |
+
def __init__(self, wrapped, convert=None, strip=None, autoreset=False):
|
| 82 |
+
# The wrapped stream (normally sys.stdout or sys.stderr)
|
| 83 |
+
self.wrapped = wrapped
|
| 84 |
+
|
| 85 |
+
# should we reset colors to defaults after every .write()
|
| 86 |
+
self.autoreset = autoreset
|
| 87 |
+
|
| 88 |
+
# create the proxy wrapping our output stream
|
| 89 |
+
self.stream = StreamWrapper(wrapped, self)
|
| 90 |
+
|
| 91 |
+
on_windows = os.name == 'nt'
|
| 92 |
+
# We test if the WinAPI works, because even if we are on Windows
|
| 93 |
+
# we may be using a terminal that doesn't support the WinAPI
|
| 94 |
+
# (e.g. Cygwin Terminal). In this case it's up to the terminal
|
| 95 |
+
# to support the ANSI codes.
|
| 96 |
+
conversion_supported = on_windows and winapi_test()
|
| 97 |
+
try:
|
| 98 |
+
fd = wrapped.fileno()
|
| 99 |
+
except Exception:
|
| 100 |
+
fd = -1
|
| 101 |
+
system_has_native_ansi = not on_windows or enable_vt_processing(fd)
|
| 102 |
+
have_tty = not self.stream.closed and self.stream.isatty()
|
| 103 |
+
need_conversion = conversion_supported and not system_has_native_ansi
|
| 104 |
+
|
| 105 |
+
# should we strip ANSI sequences from our output?
|
| 106 |
+
if strip is None:
|
| 107 |
+
strip = need_conversion or not have_tty
|
| 108 |
+
self.strip = strip
|
| 109 |
+
|
| 110 |
+
# should we should convert ANSI sequences into win32 calls?
|
| 111 |
+
if convert is None:
|
| 112 |
+
convert = need_conversion and have_tty
|
| 113 |
+
self.convert = convert
|
| 114 |
+
|
| 115 |
+
# dict of ansi codes to win32 functions and parameters
|
| 116 |
+
self.win32_calls = self.get_win32_calls()
|
| 117 |
+
|
| 118 |
+
# are we wrapping stderr?
|
| 119 |
+
self.on_stderr = self.wrapped is sys.stderr
|
| 120 |
+
|
| 121 |
+
def should_wrap(self):
|
| 122 |
+
'''
|
| 123 |
+
True if this class is actually needed. If false, then the output
|
| 124 |
+
stream will not be affected, nor will win32 calls be issued, so
|
| 125 |
+
wrapping stdout is not actually required. This will generally be
|
| 126 |
+
False on non-Windows platforms, unless optional functionality like
|
| 127 |
+
autoreset has been requested using kwargs to init()
|
| 128 |
+
'''
|
| 129 |
+
return self.convert or self.strip or self.autoreset
|
| 130 |
+
|
| 131 |
+
def get_win32_calls(self):
|
| 132 |
+
if self.convert and winterm:
|
| 133 |
+
return {
|
| 134 |
+
AnsiStyle.RESET_ALL: (winterm.reset_all, ),
|
| 135 |
+
AnsiStyle.BRIGHT: (winterm.style, WinStyle.BRIGHT),
|
| 136 |
+
AnsiStyle.DIM: (winterm.style, WinStyle.NORMAL),
|
| 137 |
+
AnsiStyle.NORMAL: (winterm.style, WinStyle.NORMAL),
|
| 138 |
+
AnsiFore.BLACK: (winterm.fore, WinColor.BLACK),
|
| 139 |
+
AnsiFore.RED: (winterm.fore, WinColor.RED),
|
| 140 |
+
AnsiFore.GREEN: (winterm.fore, WinColor.GREEN),
|
| 141 |
+
AnsiFore.YELLOW: (winterm.fore, WinColor.YELLOW),
|
| 142 |
+
AnsiFore.BLUE: (winterm.fore, WinColor.BLUE),
|
| 143 |
+
AnsiFore.MAGENTA: (winterm.fore, WinColor.MAGENTA),
|
| 144 |
+
AnsiFore.CYAN: (winterm.fore, WinColor.CYAN),
|
| 145 |
+
AnsiFore.WHITE: (winterm.fore, WinColor.GREY),
|
| 146 |
+
AnsiFore.RESET: (winterm.fore, ),
|
| 147 |
+
AnsiFore.LIGHTBLACK_EX: (winterm.fore, WinColor.BLACK, True),
|
| 148 |
+
AnsiFore.LIGHTRED_EX: (winterm.fore, WinColor.RED, True),
|
| 149 |
+
AnsiFore.LIGHTGREEN_EX: (winterm.fore, WinColor.GREEN, True),
|
| 150 |
+
AnsiFore.LIGHTYELLOW_EX: (winterm.fore, WinColor.YELLOW, True),
|
| 151 |
+
AnsiFore.LIGHTBLUE_EX: (winterm.fore, WinColor.BLUE, True),
|
| 152 |
+
AnsiFore.LIGHTMAGENTA_EX: (winterm.fore, WinColor.MAGENTA, True),
|
| 153 |
+
AnsiFore.LIGHTCYAN_EX: (winterm.fore, WinColor.CYAN, True),
|
| 154 |
+
AnsiFore.LIGHTWHITE_EX: (winterm.fore, WinColor.GREY, True),
|
| 155 |
+
AnsiBack.BLACK: (winterm.back, WinColor.BLACK),
|
| 156 |
+
AnsiBack.RED: (winterm.back, WinColor.RED),
|
| 157 |
+
AnsiBack.GREEN: (winterm.back, WinColor.GREEN),
|
| 158 |
+
AnsiBack.YELLOW: (winterm.back, WinColor.YELLOW),
|
| 159 |
+
AnsiBack.BLUE: (winterm.back, WinColor.BLUE),
|
| 160 |
+
AnsiBack.MAGENTA: (winterm.back, WinColor.MAGENTA),
|
| 161 |
+
AnsiBack.CYAN: (winterm.back, WinColor.CYAN),
|
| 162 |
+
AnsiBack.WHITE: (winterm.back, WinColor.GREY),
|
| 163 |
+
AnsiBack.RESET: (winterm.back, ),
|
| 164 |
+
AnsiBack.LIGHTBLACK_EX: (winterm.back, WinColor.BLACK, True),
|
| 165 |
+
AnsiBack.LIGHTRED_EX: (winterm.back, WinColor.RED, True),
|
| 166 |
+
AnsiBack.LIGHTGREEN_EX: (winterm.back, WinColor.GREEN, True),
|
| 167 |
+
AnsiBack.LIGHTYELLOW_EX: (winterm.back, WinColor.YELLOW, True),
|
| 168 |
+
AnsiBack.LIGHTBLUE_EX: (winterm.back, WinColor.BLUE, True),
|
| 169 |
+
AnsiBack.LIGHTMAGENTA_EX: (winterm.back, WinColor.MAGENTA, True),
|
| 170 |
+
AnsiBack.LIGHTCYAN_EX: (winterm.back, WinColor.CYAN, True),
|
| 171 |
+
AnsiBack.LIGHTWHITE_EX: (winterm.back, WinColor.GREY, True),
|
| 172 |
+
}
|
| 173 |
+
return dict()
|
| 174 |
+
|
| 175 |
+
def write(self, text):
|
| 176 |
+
if self.strip or self.convert:
|
| 177 |
+
self.write_and_convert(text)
|
| 178 |
+
else:
|
| 179 |
+
self.wrapped.write(text)
|
| 180 |
+
self.wrapped.flush()
|
| 181 |
+
if self.autoreset:
|
| 182 |
+
self.reset_all()
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def reset_all(self):
|
| 186 |
+
if self.convert:
|
| 187 |
+
self.call_win32('m', (0,))
|
| 188 |
+
elif not self.strip and not self.stream.closed:
|
| 189 |
+
self.wrapped.write(Style.RESET_ALL)
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def write_and_convert(self, text):
|
| 193 |
+
'''
|
| 194 |
+
Write the given text to our wrapped stream, stripping any ANSI
|
| 195 |
+
sequences from the text, and optionally converting them into win32
|
| 196 |
+
calls.
|
| 197 |
+
'''
|
| 198 |
+
cursor = 0
|
| 199 |
+
text = self.convert_osc(text)
|
| 200 |
+
for match in self.ANSI_CSI_RE.finditer(text):
|
| 201 |
+
start, end = match.span()
|
| 202 |
+
self.write_plain_text(text, cursor, start)
|
| 203 |
+
self.convert_ansi(*match.groups())
|
| 204 |
+
cursor = end
|
| 205 |
+
self.write_plain_text(text, cursor, len(text))
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
def write_plain_text(self, text, start, end):
|
| 209 |
+
if start < end:
|
| 210 |
+
self.wrapped.write(text[start:end])
|
| 211 |
+
self.wrapped.flush()
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
def convert_ansi(self, paramstring, command):
|
| 215 |
+
if self.convert:
|
| 216 |
+
params = self.extract_params(command, paramstring)
|
| 217 |
+
self.call_win32(command, params)
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
def extract_params(self, command, paramstring):
|
| 221 |
+
if command in 'Hf':
|
| 222 |
+
params = tuple(int(p) if len(p) != 0 else 1 for p in paramstring.split(';'))
|
| 223 |
+
while len(params) < 2:
|
| 224 |
+
# defaults:
|
| 225 |
+
params = params + (1,)
|
| 226 |
+
else:
|
| 227 |
+
params = tuple(int(p) for p in paramstring.split(';') if len(p) != 0)
|
| 228 |
+
if len(params) == 0:
|
| 229 |
+
# defaults:
|
| 230 |
+
if command in 'JKm':
|
| 231 |
+
params = (0,)
|
| 232 |
+
elif command in 'ABCD':
|
| 233 |
+
params = (1,)
|
| 234 |
+
|
| 235 |
+
return params
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
def call_win32(self, command, params):
|
| 239 |
+
if command == 'm':
|
| 240 |
+
for param in params:
|
| 241 |
+
if param in self.win32_calls:
|
| 242 |
+
func_args = self.win32_calls[param]
|
| 243 |
+
func = func_args[0]
|
| 244 |
+
args = func_args[1:]
|
| 245 |
+
kwargs = dict(on_stderr=self.on_stderr)
|
| 246 |
+
func(*args, **kwargs)
|
| 247 |
+
elif command in 'J':
|
| 248 |
+
winterm.erase_screen(params[0], on_stderr=self.on_stderr)
|
| 249 |
+
elif command in 'K':
|
| 250 |
+
winterm.erase_line(params[0], on_stderr=self.on_stderr)
|
| 251 |
+
elif command in 'Hf': # cursor position - absolute
|
| 252 |
+
winterm.set_cursor_position(params, on_stderr=self.on_stderr)
|
| 253 |
+
elif command in 'ABCD': # cursor position - relative
|
| 254 |
+
n = params[0]
|
| 255 |
+
# A - up, B - down, C - forward, D - back
|
| 256 |
+
x, y = {'A': (0, -n), 'B': (0, n), 'C': (n, 0), 'D': (-n, 0)}[command]
|
| 257 |
+
winterm.cursor_adjust(x, y, on_stderr=self.on_stderr)
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
def convert_osc(self, text):
|
| 261 |
+
for match in self.ANSI_OSC_RE.finditer(text):
|
| 262 |
+
start, end = match.span()
|
| 263 |
+
text = text[:start] + text[end:]
|
| 264 |
+
paramstring, command = match.groups()
|
| 265 |
+
if command == BEL:
|
| 266 |
+
if paramstring.count(";") == 1:
|
| 267 |
+
params = paramstring.split(";")
|
| 268 |
+
# 0 - change title and icon (we will only change title)
|
| 269 |
+
# 1 - change icon (we don't support this)
|
| 270 |
+
# 2 - change title
|
| 271 |
+
if params[0] in '02':
|
| 272 |
+
winterm.set_title(params[1])
|
| 273 |
+
return text
|
| 274 |
+
|
| 275 |
+
|
| 276 |
+
def flush(self):
|
| 277 |
+
self.wrapped.flush()
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/colorama/initialise.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
|
| 2 |
+
import atexit
|
| 3 |
+
import contextlib
|
| 4 |
+
import sys
|
| 5 |
+
|
| 6 |
+
from .ansitowin32 import AnsiToWin32
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def _wipe_internal_state_for_tests():
|
| 10 |
+
global orig_stdout, orig_stderr
|
| 11 |
+
orig_stdout = None
|
| 12 |
+
orig_stderr = None
|
| 13 |
+
|
| 14 |
+
global wrapped_stdout, wrapped_stderr
|
| 15 |
+
wrapped_stdout = None
|
| 16 |
+
wrapped_stderr = None
|
| 17 |
+
|
| 18 |
+
global atexit_done
|
| 19 |
+
atexit_done = False
|
| 20 |
+
|
| 21 |
+
global fixed_windows_console
|
| 22 |
+
fixed_windows_console = False
|
| 23 |
+
|
| 24 |
+
try:
|
| 25 |
+
# no-op if it wasn't registered
|
| 26 |
+
atexit.unregister(reset_all)
|
| 27 |
+
except AttributeError:
|
| 28 |
+
# python 2: no atexit.unregister. Oh well, we did our best.
|
| 29 |
+
pass
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def reset_all():
|
| 33 |
+
if AnsiToWin32 is not None: # Issue #74: objects might become None at exit
|
| 34 |
+
AnsiToWin32(orig_stdout).reset_all()
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def init(autoreset=False, convert=None, strip=None, wrap=True):
|
| 38 |
+
|
| 39 |
+
if not wrap and any([autoreset, convert, strip]):
|
| 40 |
+
raise ValueError('wrap=False conflicts with any other arg=True')
|
| 41 |
+
|
| 42 |
+
global wrapped_stdout, wrapped_stderr
|
| 43 |
+
global orig_stdout, orig_stderr
|
| 44 |
+
|
| 45 |
+
orig_stdout = sys.stdout
|
| 46 |
+
orig_stderr = sys.stderr
|
| 47 |
+
|
| 48 |
+
if sys.stdout is None:
|
| 49 |
+
wrapped_stdout = None
|
| 50 |
+
else:
|
| 51 |
+
sys.stdout = wrapped_stdout = \
|
| 52 |
+
wrap_stream(orig_stdout, convert, strip, autoreset, wrap)
|
| 53 |
+
if sys.stderr is None:
|
| 54 |
+
wrapped_stderr = None
|
| 55 |
+
else:
|
| 56 |
+
sys.stderr = wrapped_stderr = \
|
| 57 |
+
wrap_stream(orig_stderr, convert, strip, autoreset, wrap)
|
| 58 |
+
|
| 59 |
+
global atexit_done
|
| 60 |
+
if not atexit_done:
|
| 61 |
+
atexit.register(reset_all)
|
| 62 |
+
atexit_done = True
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def deinit():
|
| 66 |
+
if orig_stdout is not None:
|
| 67 |
+
sys.stdout = orig_stdout
|
| 68 |
+
if orig_stderr is not None:
|
| 69 |
+
sys.stderr = orig_stderr
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def just_fix_windows_console():
|
| 73 |
+
global fixed_windows_console
|
| 74 |
+
|
| 75 |
+
if sys.platform != "win32":
|
| 76 |
+
return
|
| 77 |
+
if fixed_windows_console:
|
| 78 |
+
return
|
| 79 |
+
if wrapped_stdout is not None or wrapped_stderr is not None:
|
| 80 |
+
# Someone already ran init() and it did stuff, so we won't second-guess them
|
| 81 |
+
return
|
| 82 |
+
|
| 83 |
+
# On newer versions of Windows, AnsiToWin32.__init__ will implicitly enable the
|
| 84 |
+
# native ANSI support in the console as a side-effect. We only need to actually
|
| 85 |
+
# replace sys.stdout/stderr if we're in the old-style conversion mode.
|
| 86 |
+
new_stdout = AnsiToWin32(sys.stdout, convert=None, strip=None, autoreset=False)
|
| 87 |
+
if new_stdout.convert:
|
| 88 |
+
sys.stdout = new_stdout
|
| 89 |
+
new_stderr = AnsiToWin32(sys.stderr, convert=None, strip=None, autoreset=False)
|
| 90 |
+
if new_stderr.convert:
|
| 91 |
+
sys.stderr = new_stderr
|
| 92 |
+
|
| 93 |
+
fixed_windows_console = True
|
| 94 |
+
|
| 95 |
+
@contextlib.contextmanager
|
| 96 |
+
def colorama_text(*args, **kwargs):
|
| 97 |
+
init(*args, **kwargs)
|
| 98 |
+
try:
|
| 99 |
+
yield
|
| 100 |
+
finally:
|
| 101 |
+
deinit()
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def reinit():
|
| 105 |
+
if wrapped_stdout is not None:
|
| 106 |
+
sys.stdout = wrapped_stdout
|
| 107 |
+
if wrapped_stderr is not None:
|
| 108 |
+
sys.stderr = wrapped_stderr
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def wrap_stream(stream, convert, strip, autoreset, wrap):
|
| 112 |
+
if wrap:
|
| 113 |
+
wrapper = AnsiToWin32(stream,
|
| 114 |
+
convert=convert, strip=strip, autoreset=autoreset)
|
| 115 |
+
if wrapper.should_wrap():
|
| 116 |
+
stream = wrapper.stream
|
| 117 |
+
return stream
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
# Use this for initial setup as well, to reduce code duplication
|
| 121 |
+
_wipe_internal_state_for_tests()
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/colorama/win32.py
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
|
| 2 |
+
|
| 3 |
+
# from winbase.h
|
| 4 |
+
STDOUT = -11
|
| 5 |
+
STDERR = -12
|
| 6 |
+
|
| 7 |
+
ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
|
| 8 |
+
|
| 9 |
+
try:
|
| 10 |
+
import ctypes
|
| 11 |
+
from ctypes import LibraryLoader
|
| 12 |
+
windll = LibraryLoader(ctypes.WinDLL)
|
| 13 |
+
from ctypes import wintypes
|
| 14 |
+
except (AttributeError, ImportError):
|
| 15 |
+
windll = None
|
| 16 |
+
SetConsoleTextAttribute = lambda *_: None
|
| 17 |
+
winapi_test = lambda *_: None
|
| 18 |
+
else:
|
| 19 |
+
from ctypes import byref, Structure, c_char, POINTER
|
| 20 |
+
|
| 21 |
+
COORD = wintypes._COORD
|
| 22 |
+
|
| 23 |
+
class CONSOLE_SCREEN_BUFFER_INFO(Structure):
|
| 24 |
+
"""struct in wincon.h."""
|
| 25 |
+
_fields_ = [
|
| 26 |
+
("dwSize", COORD),
|
| 27 |
+
("dwCursorPosition", COORD),
|
| 28 |
+
("wAttributes", wintypes.WORD),
|
| 29 |
+
("srWindow", wintypes.SMALL_RECT),
|
| 30 |
+
("dwMaximumWindowSize", COORD),
|
| 31 |
+
]
|
| 32 |
+
def __str__(self):
|
| 33 |
+
return '(%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d)' % (
|
| 34 |
+
self.dwSize.Y, self.dwSize.X
|
| 35 |
+
, self.dwCursorPosition.Y, self.dwCursorPosition.X
|
| 36 |
+
, self.wAttributes
|
| 37 |
+
, self.srWindow.Top, self.srWindow.Left, self.srWindow.Bottom, self.srWindow.Right
|
| 38 |
+
, self.dwMaximumWindowSize.Y, self.dwMaximumWindowSize.X
|
| 39 |
+
)
|
| 40 |
+
|
| 41 |
+
_GetStdHandle = windll.kernel32.GetStdHandle
|
| 42 |
+
_GetStdHandle.argtypes = [
|
| 43 |
+
wintypes.DWORD,
|
| 44 |
+
]
|
| 45 |
+
_GetStdHandle.restype = wintypes.HANDLE
|
| 46 |
+
|
| 47 |
+
_GetConsoleScreenBufferInfo = windll.kernel32.GetConsoleScreenBufferInfo
|
| 48 |
+
_GetConsoleScreenBufferInfo.argtypes = [
|
| 49 |
+
wintypes.HANDLE,
|
| 50 |
+
POINTER(CONSOLE_SCREEN_BUFFER_INFO),
|
| 51 |
+
]
|
| 52 |
+
_GetConsoleScreenBufferInfo.restype = wintypes.BOOL
|
| 53 |
+
|
| 54 |
+
_SetConsoleTextAttribute = windll.kernel32.SetConsoleTextAttribute
|
| 55 |
+
_SetConsoleTextAttribute.argtypes = [
|
| 56 |
+
wintypes.HANDLE,
|
| 57 |
+
wintypes.WORD,
|
| 58 |
+
]
|
| 59 |
+
_SetConsoleTextAttribute.restype = wintypes.BOOL
|
| 60 |
+
|
| 61 |
+
_SetConsoleCursorPosition = windll.kernel32.SetConsoleCursorPosition
|
| 62 |
+
_SetConsoleCursorPosition.argtypes = [
|
| 63 |
+
wintypes.HANDLE,
|
| 64 |
+
COORD,
|
| 65 |
+
]
|
| 66 |
+
_SetConsoleCursorPosition.restype = wintypes.BOOL
|
| 67 |
+
|
| 68 |
+
_FillConsoleOutputCharacterA = windll.kernel32.FillConsoleOutputCharacterA
|
| 69 |
+
_FillConsoleOutputCharacterA.argtypes = [
|
| 70 |
+
wintypes.HANDLE,
|
| 71 |
+
c_char,
|
| 72 |
+
wintypes.DWORD,
|
| 73 |
+
COORD,
|
| 74 |
+
POINTER(wintypes.DWORD),
|
| 75 |
+
]
|
| 76 |
+
_FillConsoleOutputCharacterA.restype = wintypes.BOOL
|
| 77 |
+
|
| 78 |
+
_FillConsoleOutputAttribute = windll.kernel32.FillConsoleOutputAttribute
|
| 79 |
+
_FillConsoleOutputAttribute.argtypes = [
|
| 80 |
+
wintypes.HANDLE,
|
| 81 |
+
wintypes.WORD,
|
| 82 |
+
wintypes.DWORD,
|
| 83 |
+
COORD,
|
| 84 |
+
POINTER(wintypes.DWORD),
|
| 85 |
+
]
|
| 86 |
+
_FillConsoleOutputAttribute.restype = wintypes.BOOL
|
| 87 |
+
|
| 88 |
+
_SetConsoleTitleW = windll.kernel32.SetConsoleTitleW
|
| 89 |
+
_SetConsoleTitleW.argtypes = [
|
| 90 |
+
wintypes.LPCWSTR
|
| 91 |
+
]
|
| 92 |
+
_SetConsoleTitleW.restype = wintypes.BOOL
|
| 93 |
+
|
| 94 |
+
_GetConsoleMode = windll.kernel32.GetConsoleMode
|
| 95 |
+
_GetConsoleMode.argtypes = [
|
| 96 |
+
wintypes.HANDLE,
|
| 97 |
+
POINTER(wintypes.DWORD)
|
| 98 |
+
]
|
| 99 |
+
_GetConsoleMode.restype = wintypes.BOOL
|
| 100 |
+
|
| 101 |
+
_SetConsoleMode = windll.kernel32.SetConsoleMode
|
| 102 |
+
_SetConsoleMode.argtypes = [
|
| 103 |
+
wintypes.HANDLE,
|
| 104 |
+
wintypes.DWORD
|
| 105 |
+
]
|
| 106 |
+
_SetConsoleMode.restype = wintypes.BOOL
|
| 107 |
+
|
| 108 |
+
def _winapi_test(handle):
|
| 109 |
+
csbi = CONSOLE_SCREEN_BUFFER_INFO()
|
| 110 |
+
success = _GetConsoleScreenBufferInfo(
|
| 111 |
+
handle, byref(csbi))
|
| 112 |
+
return bool(success)
|
| 113 |
+
|
| 114 |
+
def winapi_test():
|
| 115 |
+
return any(_winapi_test(h) for h in
|
| 116 |
+
(_GetStdHandle(STDOUT), _GetStdHandle(STDERR)))
|
| 117 |
+
|
| 118 |
+
def GetConsoleScreenBufferInfo(stream_id=STDOUT):
|
| 119 |
+
handle = _GetStdHandle(stream_id)
|
| 120 |
+
csbi = CONSOLE_SCREEN_BUFFER_INFO()
|
| 121 |
+
success = _GetConsoleScreenBufferInfo(
|
| 122 |
+
handle, byref(csbi))
|
| 123 |
+
return csbi
|
| 124 |
+
|
| 125 |
+
def SetConsoleTextAttribute(stream_id, attrs):
|
| 126 |
+
handle = _GetStdHandle(stream_id)
|
| 127 |
+
return _SetConsoleTextAttribute(handle, attrs)
|
| 128 |
+
|
| 129 |
+
def SetConsoleCursorPosition(stream_id, position, adjust=True):
|
| 130 |
+
position = COORD(*position)
|
| 131 |
+
# If the position is out of range, do nothing.
|
| 132 |
+
if position.Y <= 0 or position.X <= 0:
|
| 133 |
+
return
|
| 134 |
+
# Adjust for Windows' SetConsoleCursorPosition:
|
| 135 |
+
# 1. being 0-based, while ANSI is 1-based.
|
| 136 |
+
# 2. expecting (x,y), while ANSI uses (y,x).
|
| 137 |
+
adjusted_position = COORD(position.Y - 1, position.X - 1)
|
| 138 |
+
if adjust:
|
| 139 |
+
# Adjust for viewport's scroll position
|
| 140 |
+
sr = GetConsoleScreenBufferInfo(STDOUT).srWindow
|
| 141 |
+
adjusted_position.Y += sr.Top
|
| 142 |
+
adjusted_position.X += sr.Left
|
| 143 |
+
# Resume normal processing
|
| 144 |
+
handle = _GetStdHandle(stream_id)
|
| 145 |
+
return _SetConsoleCursorPosition(handle, adjusted_position)
|
| 146 |
+
|
| 147 |
+
def FillConsoleOutputCharacter(stream_id, char, length, start):
|
| 148 |
+
handle = _GetStdHandle(stream_id)
|
| 149 |
+
char = c_char(char.encode())
|
| 150 |
+
length = wintypes.DWORD(length)
|
| 151 |
+
num_written = wintypes.DWORD(0)
|
| 152 |
+
# Note that this is hard-coded for ANSI (vs wide) bytes.
|
| 153 |
+
success = _FillConsoleOutputCharacterA(
|
| 154 |
+
handle, char, length, start, byref(num_written))
|
| 155 |
+
return num_written.value
|
| 156 |
+
|
| 157 |
+
def FillConsoleOutputAttribute(stream_id, attr, length, start):
|
| 158 |
+
''' FillConsoleOutputAttribute( hConsole, csbi.wAttributes, dwConSize, coordScreen, &cCharsWritten )'''
|
| 159 |
+
handle = _GetStdHandle(stream_id)
|
| 160 |
+
attribute = wintypes.WORD(attr)
|
| 161 |
+
length = wintypes.DWORD(length)
|
| 162 |
+
num_written = wintypes.DWORD(0)
|
| 163 |
+
# Note that this is hard-coded for ANSI (vs wide) bytes.
|
| 164 |
+
return _FillConsoleOutputAttribute(
|
| 165 |
+
handle, attribute, length, start, byref(num_written))
|
| 166 |
+
|
| 167 |
+
def SetConsoleTitle(title):
|
| 168 |
+
return _SetConsoleTitleW(title)
|
| 169 |
+
|
| 170 |
+
def GetConsoleMode(handle):
|
| 171 |
+
mode = wintypes.DWORD()
|
| 172 |
+
success = _GetConsoleMode(handle, byref(mode))
|
| 173 |
+
if not success:
|
| 174 |
+
raise ctypes.WinError()
|
| 175 |
+
return mode.value
|
| 176 |
+
|
| 177 |
+
def SetConsoleMode(handle, mode):
|
| 178 |
+
success = _SetConsoleMode(handle, mode)
|
| 179 |
+
if not success:
|
| 180 |
+
raise ctypes.WinError()
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/colorama/winterm.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
try:
    from msvcrt import get_osfhandle
except ImportError:
    # Non-Windows fallback: keep the name importable so the module loads
    # everywhere, but fail loudly if anything actually calls it.
    def get_osfhandle(_):
        raise OSError("This isn't windows!")
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
from . import win32
|
| 10 |
+
|
| 11 |
+
# from wincon.h
class WinColor(object):
    # Windows console colour indices (the low three bits of a character
    # attribute word are blue/green/red component bits, so this ordering
    # differs from the ANSI colour numbering).
    BLACK = 0
    BLUE = 1
    GREEN = 2
    CYAN = 3
    RED = 4
    MAGENTA = 5
    YELLOW = 6
    GREY = 7
|
| 21 |
+
|
| 22 |
+
# from wincon.h
class WinStyle(object):
    # Intensity bits of the console character-attribute word.
    NORMAL = 0x00 # dim text, dim background
    BRIGHT = 0x08 # bright text, dim background
    BRIGHT_BACKGROUND = 0x80 # dim text, bright background
|
| 27 |
+
|
| 28 |
+
class WinTerm(object):
    """Tracks the current console text attributes (foreground, background,
    style/intensity) and applies them — along with cursor movement, erase
    and title operations — through the sibling ``win32`` wrapper module."""

    def __init__(self):
        # Snapshot the attribute word the console started with so that
        # reset_all() can restore it later.
        self._default = win32.GetConsoleScreenBufferInfo(win32.STDOUT).wAttributes
        self.set_attrs(self._default)
        self._default_fore = self._fore
        self._default_back = self._back
        self._default_style = self._style
        # In order to emulate LIGHT_EX in windows, we borrow the BRIGHT style.
        # So that LIGHT_EX colors and BRIGHT style do not clobber each other,
        # we track them separately, since LIGHT_EX is overwritten by Fore/Back
        # and BRIGHT is overwritten by Style codes.
        self._light = 0

    def get_attrs(self):
        # Pack fore (bits 0-2), back (bits 4-6) and the style/light bits back
        # into a single Windows character-attribute word.
        return self._fore + self._back * 16 + (self._style | self._light)

    def set_attrs(self, value):
        # Unpack a Windows attribute word into the separately tracked parts.
        self._fore = value & 7
        self._back = (value >> 4) & 7
        self._style = value & (WinStyle.BRIGHT | WinStyle.BRIGHT_BACKGROUND)

    def reset_all(self, on_stderr=None):
        # Restore the attributes captured at construction time and clear the
        # LIGHT_EX emulation bits.
        self.set_attrs(self._default)
        self.set_console(attrs=self._default)
        self._light = 0

    def fore(self, fore=None, light=False, on_stderr=False):
        # fore=None means "reset foreground to the console default".
        if fore is None:
            fore = self._default_fore
        self._fore = fore
        # Emulate LIGHT_EX with BRIGHT Style
        if light:
            self._light |= WinStyle.BRIGHT
        else:
            self._light &= ~WinStyle.BRIGHT
        self.set_console(on_stderr=on_stderr)

    def back(self, back=None, light=False, on_stderr=False):
        # back=None means "reset background to the console default".
        if back is None:
            back = self._default_back
        self._back = back
        # Emulate LIGHT_EX with BRIGHT_BACKGROUND Style
        if light:
            self._light |= WinStyle.BRIGHT_BACKGROUND
        else:
            self._light &= ~WinStyle.BRIGHT_BACKGROUND
        self.set_console(on_stderr=on_stderr)

    def style(self, style=None, on_stderr=False):
        # style=None means "reset to the console's initial style bits".
        if style is None:
            style = self._default_style
        self._style = style
        self.set_console(on_stderr=on_stderr)

    def set_console(self, attrs=None, on_stderr=False):
        # Push the currently tracked (or explicitly supplied) attribute word
        # to the chosen console handle.
        if attrs is None:
            attrs = self.get_attrs()
        handle = win32.STDOUT
        if on_stderr:
            handle = win32.STDERR
        win32.SetConsoleTextAttribute(handle, attrs)

    def get_position(self, handle):
        # Return the cursor position, converted to 1-based coordinates.
        position = win32.GetConsoleScreenBufferInfo(handle).dwCursorPosition
        # Because Windows coordinates are 0-based,
        # and win32.SetConsoleCursorPosition expects 1-based.
        position.X += 1
        position.Y += 1
        return position

    def set_cursor_position(self, position=None, on_stderr=False):
        if position is None:
            # I'm not currently tracking the position, so there is no default.
            # position = self.get_position()
            return
        handle = win32.STDOUT
        if on_stderr:
            handle = win32.STDERR
        win32.SetConsoleCursorPosition(handle, position)

    def cursor_adjust(self, x, y, on_stderr=False):
        # Move the cursor relative to its current location by (x, y).
        handle = win32.STDOUT
        if on_stderr:
            handle = win32.STDERR
        position = self.get_position(handle)
        adjusted_position = (position.Y + y, position.X + x)
        # adjust=False because get_position() already returned
        # viewport-relative (1-based) coordinates.
        win32.SetConsoleCursorPosition(handle, adjusted_position, adjust=False)

    def erase_screen(self, mode=0, on_stderr=False):
        # 0 should clear from the cursor to the end of the screen.
        # 1 should clear from the cursor to the beginning of the screen.
        # 2 should clear the entire screen, and move cursor to (1,1)
        handle = win32.STDOUT
        if on_stderr:
            handle = win32.STDERR
        csbi = win32.GetConsoleScreenBufferInfo(handle)
        # get the number of character cells in the current buffer
        cells_in_screen = csbi.dwSize.X * csbi.dwSize.Y
        # get number of character cells before current cursor position
        cells_before_cursor = csbi.dwSize.X * csbi.dwCursorPosition.Y + csbi.dwCursorPosition.X
        if mode == 0:
            from_coord = csbi.dwCursorPosition
            cells_to_erase = cells_in_screen - cells_before_cursor
        elif mode == 1:
            from_coord = win32.COORD(0, 0)
            cells_to_erase = cells_before_cursor
        elif mode == 2:
            from_coord = win32.COORD(0, 0)
            cells_to_erase = cells_in_screen
        else:
            # invalid mode
            return
        # fill the entire screen with blanks
        win32.FillConsoleOutputCharacter(handle, ' ', cells_to_erase, from_coord)
        # now set the buffer's attributes accordingly
        win32.FillConsoleOutputAttribute(handle, self.get_attrs(), cells_to_erase, from_coord)
        if mode == 2:
            # put the cursor where needed
            win32.SetConsoleCursorPosition(handle, (1, 1))

    def erase_line(self, mode=0, on_stderr=False):
        # 0 should clear from the cursor to the end of the line.
        # 1 should clear from the cursor to the beginning of the line.
        # 2 should clear the entire line.
        handle = win32.STDOUT
        if on_stderr:
            handle = win32.STDERR
        csbi = win32.GetConsoleScreenBufferInfo(handle)
        if mode == 0:
            from_coord = csbi.dwCursorPosition
            cells_to_erase = csbi.dwSize.X - csbi.dwCursorPosition.X
        elif mode == 1:
            from_coord = win32.COORD(0, csbi.dwCursorPosition.Y)
            cells_to_erase = csbi.dwCursorPosition.X
        elif mode == 2:
            from_coord = win32.COORD(0, csbi.dwCursorPosition.Y)
            cells_to_erase = csbi.dwSize.X
        else:
            # invalid mode
            return
        # fill the entire screen with blanks
        win32.FillConsoleOutputCharacter(handle, ' ', cells_to_erase, from_coord)
        # now set the buffer's attributes accordingly
        win32.FillConsoleOutputAttribute(handle, self.get_attrs(), cells_to_erase, from_coord)

    def set_title(self, title):
        # Delegate straight to the Win32 wrapper.
        win32.SetConsoleTitle(title)
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def enable_vt_processing(fd):
    """Try to switch the console attached to file descriptor `fd` into
    virtual-terminal (ANSI escape) mode.

    Returns True only when ENABLE_VIRTUAL_TERMINAL_PROCESSING is verified
    to be set after the attempt; False otherwise (non-Windows, invalid
    handle, or the console rejected the mode).
    """
    if win32.windll is None or not win32.winapi_test():
        return False

    try:
        handle = get_osfhandle(fd)
        mode = win32.GetConsoleMode(handle)
        win32.SetConsoleMode(
            handle,
            mode | win32.ENABLE_VIRTUAL_TERMINAL_PROCESSING,
        )

        # Re-read the mode to confirm the flag actually stuck.
        mode = win32.GetConsoleMode(handle)
        if mode & win32.ENABLE_VIRTUAL_TERMINAL_PROCESSING:
            return True
    # Can get TypeError in testsuite where 'fd' is a Mock()
    except (OSError, TypeError):
        return False
    # Previously fell off the end returning None when the flag did not
    # stick; make the falsy result explicit and consistent.
    return False
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/cuda/ccuda.pxd
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

# Legacy shim: re-export everything from the new canonical location so
# old `cuda.ccuda` cimports keep compiling.
from cuda.bindings.cydriver cimport *

# Emit a compile-time deprecation notice; MSVC has no #warning, hence the
# #pragma message branch.
cdef extern from *:
    """
    #ifdef _MSC_VER
    #pragma message ( "The cuda.ccuda module is deprecated and will be removed in a future release, " \
    "please switch to use the cuda.bindings.cydriver module instead." )
    #else
    #warning The cuda.ccuda module is deprecated and will be removed in a future release, \
    please switch to use the cuda.bindings.cydriver module instead.
    #endif
    """
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/cuda/ccudart.cpython-312-x86_64-linux-gnu.so
ADDED
|
Binary file (23.4 kB). View file
|
|
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/cuda/cnvrtc.pxd
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

# Legacy shim: re-export everything from the relocated module so old
# `cuda.cnvrtc` cimports keep compiling.
from cuda.bindings.cynvrtc cimport *

# Compile-time deprecation notice; MSVC has no #warning, hence the
# #pragma message branch.
cdef extern from *:
    """
    #ifdef _MSC_VER
    #pragma message ( "The cuda.cnvrtc module is deprecated and will be removed in a future release, " \
    "please switch to use the cuda.bindings.cynvrtc module instead." )
    #else
    #warning The cuda.cnvrtc module is deprecated and will be removed in a future release, \
    please switch to use the cuda.bindings.cynvrtc module instead.
    #endif
    """
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/cuda/cnvrtc.pyx
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

# Legacy shim module: pull in the relocated cynvrtc declarations and mirror
# its C-API capsule table, then drop the module reference so it does not
# linger in this namespace.
from cuda.bindings.cynvrtc cimport *
from cuda.bindings import cynvrtc
__pyx_capi__ = cynvrtc.__pyx_capi__
del cynvrtc
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/cuda/cuda.cpp
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/cuda/cuda.cpython-312-x86_64-linux-gnu.so
ADDED
|
Binary file (23.5 kB). View file
|
|
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/cuda/cudart.pyx
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

import warnings as _warnings

# Legacy shim: re-export the runtime API from its new canonical location.
from cuda.bindings.runtime import *


# Compile-time deprecation notice for anyone cythonizing against this
# module; MSVC lacks #warning, hence the #pragma message branch.
cdef extern from *:
    """
    #ifdef _MSC_VER
    #pragma message ( "The cuda.cudart module is deprecated and will be removed in a future release, " \
    "please switch to use the cuda.bindings.runtime module instead." )
    #else
    #warning The cuda.cudart module is deprecated and will be removed in a future release, \
    please switch to use the cuda.bindings.runtime module instead.
    #endif
    """


# Runtime warning for Python importers (the extern block above only fires
# at Cython build time).
_warnings.warn("The cuda.cudart module is deprecated and will be removed in a future release, "
               "please switch to use the cuda.bindings.runtime module instead.", FutureWarning, stacklevel=2)
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/cuda/nvrtc.pyx
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

import warnings as _warnings

# Legacy shim: re-export the NVRTC API from its new canonical location.
from cuda.bindings.nvrtc import *


# Compile-time deprecation notice for anyone cythonizing against this
# module; MSVC lacks #warning, hence the #pragma message branch.
cdef extern from *:
    """
    #ifdef _MSC_VER
    #pragma message ( "The cuda.nvrtc module is deprecated and will be removed in a future release, " \
    "please switch to use the cuda.bindings.nvrtc module instead." )
    #else
    #warning The cuda.nvrtc module is deprecated and will be removed in a future release, \
    please switch to use the cuda.bindings.nvrtc module instead.
    #endif
    """


# Runtime warning for Python importers (the extern block above only fires
# at Cython build time).
_warnings.warn("The cuda.nvrtc module is deprecated and will be removed in a future release, "
               "please switch to use the cuda.bindings.nvrtc module instead.", FutureWarning, stacklevel=2)
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/arrow_dataset.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/arrow_reader.py
ADDED
|
@@ -0,0 +1,663 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
# Lint as: python3
|
| 16 |
+
"""Arrow ArrowReader."""
|
| 17 |
+
|
| 18 |
+
import copy
|
| 19 |
+
import math
|
| 20 |
+
import os
|
| 21 |
+
import re
|
| 22 |
+
import shutil
|
| 23 |
+
from dataclasses import dataclass
|
| 24 |
+
from functools import partial
|
| 25 |
+
from pathlib import Path
|
| 26 |
+
from typing import TYPE_CHECKING, List, Optional, Union
|
| 27 |
+
|
| 28 |
+
import pyarrow as pa
|
| 29 |
+
import pyarrow.parquet as pq
|
| 30 |
+
from tqdm.contrib.concurrent import thread_map
|
| 31 |
+
|
| 32 |
+
from .download.download_config import DownloadConfig
|
| 33 |
+
from .naming import _split_re, filenames_for_dataset_split
|
| 34 |
+
from .table import InMemoryTable, MemoryMappedTable, Table, concat_tables
|
| 35 |
+
from .utils import logging
|
| 36 |
+
from .utils import tqdm as hf_tqdm
|
| 37 |
+
from .utils.deprecation_utils import deprecated
|
| 38 |
+
from .utils.file_utils import cached_path
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
if TYPE_CHECKING:
|
| 42 |
+
from .info import DatasetInfo # noqa: F401
|
| 43 |
+
from .splits import Split, SplitInfo # noqa: F401
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
logger = logging.get_logger(__name__)

# Base URL of the (legacy) Hugging Face GCS cache of pre-processed datasets.
HF_GCP_BASE_URL = "https://storage.googleapis.com/huggingface-nlp/cache/datasets"

# Parses a single split sub-spec such as "train[10:20]" or
# "test[:50%](pct1_dropremainder)": a split name, an optional [from:to]
# slice (absolute counts or percentages), and an optional (rounding) scheme.
_SUB_SPEC_RE = re.compile(
    rf"""
    ^
     (?P<split>{_split_re[1:-1]})
     (\[
        ((?P<from>-?\d+)
         (?P<from_pct>%)?)?
        :
        ((?P<to>-?\d+)
         (?P<to_pct>%)?)?
     \])?(\((?P<rounding>[^\)]*)\))?
    $
    """,  # remove ^ and $
    re.X,
)

# Splits compound specs on "+", e.g. "train+test[:10]".
_ADDITION_SEP_RE = re.compile(r"\s*\+\s*")
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
class DatasetNotOnHfGcsError(ConnectionError):
    """Raised when the dataset cannot be fetched from the Hugging Face Google Cloud Storage cache."""

    pass
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
class MissingFilesOnHfGcsError(ConnectionError):
    """Raised when some dataset files are missing on the Hugging Face Google Cloud Storage cache."""

    pass
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
@dataclass(frozen=True)
class FileInstructions:
    """The file instructions associated with a split ReadInstruction.

    Attributes:
        num_examples: `int`, The total number of examples
        file_instructions: List[dict(filename, skip, take)], the files information.
            The filenames contains the relative path, not absolute.
            skip/take indicates which example read in the file: `ds.slice(skip, take)`
    """

    # Frozen so instances can be shared/compared safely after construction.
    num_examples: int
    file_instructions: List[dict]
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def make_file_instructions(
    name: str,
    split_infos: List["SplitInfo"],
    instruction: Union[str, "ReadInstruction"],
    filetype_suffix: Optional[str] = None,
    prefix_path: Optional[str] = None,
) -> FileInstructions:
    """Returns instructions of the split dict.

    Args:
        name (`str`): Name of the dataset.
        split_infos (`list` of `[SplitInfo]`): Dataset splits information.
        instruction ([`ReadInstruction`] or `str`): Reading instruction for a dataset.
        filetype_suffix (`str`, *optional*): Suffix of dataset files, e.g. 'arrow' or 'parquet'.
        prefix_path (`str`, *optional*): Prefix of dataset files, e.g. directory name.

    Returns:
        [`FileInstructions`]
    """
    if not isinstance(name, str):
        raise TypeError(f"Expected str 'name', but got: {type(name).__name__}")
    elif not name:
        raise ValueError("Expected non-empty str 'name'")
    # Per-split lookup tables derived from the SplitInfo objects.
    name2len = {info.name: info.num_examples for info in split_infos}
    name2shard_lengths = {info.name: info.shard_lengths for info in split_infos}
    name2filenames = {
        info.name: filenames_for_dataset_split(
            path=prefix_path,
            dataset_name=name,
            split=info.name,
            filetype_suffix=filetype_suffix,
            shard_lengths=name2shard_lengths[info.name],
        )
        for info in split_infos
    }
    if not isinstance(instruction, ReadInstruction):
        instruction = ReadInstruction.from_spec(instruction)
    # Create the absolute instruction (per split)
    absolute_instructions = instruction.to_absolute(name2len)

    # For each split, return the files instruction (skip/take)
    file_instructions = []
    num_examples = 0
    for abs_instr in absolute_instructions:
        split_length = name2len[abs_instr.splitname]
        filenames = name2filenames[abs_instr.splitname]
        shard_lengths = name2shard_lengths[abs_instr.splitname]
        # None bounds mean "from the start" / "to the end" of the split.
        from_ = 0 if abs_instr.from_ is None else abs_instr.from_
        to = split_length if abs_instr.to is None else abs_instr.to
        if shard_lengths is None:  # not sharded
            for filename in filenames:
                take = to - from_
                if take == 0:
                    continue
                num_examples += take
                file_instructions.append({"filename": filename, "skip": from_, "take": take})
        else:  # sharded
            # Slide a [index_start, index_end) window over the shards and
            # intersect it with the requested [from_, to) range.
            index_start = 0  # Beginning (included) of moving window.
            index_end = 0  # End (excluded) of moving window.
            for filename, shard_length in zip(filenames, shard_lengths):
                index_end += shard_length
                if from_ < index_end and to > index_start:  # There is something to take.
                    skip = from_ - index_start if from_ > index_start else 0
                    # take == -1 is a sentinel meaning "to the end of this shard".
                    take = to - index_start - skip if to < index_end else -1
                    if take == 0:
                        continue
                    file_instructions.append({"filename": filename, "skip": skip, "take": take})
                    num_examples += shard_length - skip if take == -1 else take
                index_start += shard_length
    return FileInstructions(
        num_examples=num_examples,
        file_instructions=file_instructions,
    )
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
class BaseReader:
    """
    Build a Dataset object out of Instruction instance(s).
    """

    def __init__(self, path: str, info: Optional["DatasetInfo"]):
        """Initializes ArrowReader.

        Args:
            path (str): path where tfrecords are stored.
            info (DatasetInfo): info about the dataset.
        """
        self._path: str = path
        self._info: Optional["DatasetInfo"] = info
        # Set by concrete subclasses (e.g. 'arrow' or 'parquet').
        self._filetype_suffix: Optional[str] = None

    def _get_table_from_filename(self, filename_skip_take, in_memory=False) -> Table:
        """Returns a Dataset instance from given (filename, skip, take)."""
        raise NotImplementedError

    def _read_files(self, files, in_memory=False) -> Table:
        """Returns Dataset for given file instructions.

        Args:
            files: List[dict(filename, skip, take)], the files information.
                The filenames contain the absolute path, not relative.
                skip/take indicates which example read in the file: `ds.slice(skip, take)`
            in_memory (bool, default False): Whether to copy the data in-memory.
        """
        if len(files) == 0 or not all(isinstance(f, dict) for f in files):
            raise ValueError("please provide valid file informations")
        # Deep-copy before mutating: callers may reuse the instruction dicts.
        files = copy.deepcopy(files)
        for f in files:
            f["filename"] = os.path.join(self._path, f["filename"])

        # Read shards concurrently; I/O-bound, so a thread pool is enough.
        pa_tables = thread_map(
            partial(self._get_table_from_filename, in_memory=in_memory),
            files,
            tqdm_class=hf_tqdm,
            desc="Loading dataset shards",
            # set `disable=None` rather than `disable=False` by default to disable progress bar when no TTY attached
            disable=len(files) <= 16 or None,
        )
        pa_tables = [t for t in pa_tables if len(t) > 0]
        if not pa_tables and (self._info is None or self._info.features is None):
            raise ValueError(
                "Tried to read an empty table. Please specify at least info.features to create an empty table with the right type."
            )
        # All shards empty: fall back to an empty table with the declared schema.
        pa_tables = pa_tables or [InMemoryTable.from_batches([], schema=pa.schema(self._info.features.type))]
        pa_table = concat_tables(pa_tables) if len(pa_tables) != 1 else pa_tables[0]
        return pa_table

    def get_file_instructions(self, name, instruction, split_infos):
        """Return list of dict {'filename': str, 'skip': int, 'take': int}"""
        file_instructions = make_file_instructions(
            name, split_infos, instruction, filetype_suffix=self._filetype_suffix, prefix_path=self._path
        )
        files = file_instructions.file_instructions
        return files

    def read(
        self,
        name,
        instructions,
        split_infos,
        in_memory=False,
    ):
        """Returns Dataset instance(s).

        Args:
            name (str): name of the dataset.
            instructions (ReadInstruction): instructions to read.
                Instruction can be string and will then be passed to the Instruction
                constructor as it.
            split_infos (list of SplitInfo proto): the available splits for dataset.
            in_memory (bool, default False): Whether to copy the data in-memory.

        Returns:
            kwargs to build a single Dataset instance.
        """

        files = self.get_file_instructions(name, instructions, split_infos)
        if not files:
            msg = f'Instruction "{instructions}" corresponds to no data!'
            raise ValueError(msg)
        return self.read_files(files=files, original_instructions=instructions, in_memory=in_memory)

    def read_files(
        self,
        files: List[dict],
        original_instructions: Union[None, "ReadInstruction", "Split"] = None,
        in_memory=False,
    ):
        """Returns single Dataset instance for the set of file instructions.

        Args:
            files: List[dict(filename, skip, take)], the files information.
                The filenames contains the relative path, not absolute.
                skip/take indicates which example read in the file: `ds.skip().take()`
            original_instructions: store the original instructions used to build the dataset split in the dataset.
            in_memory (bool, default False): Whether to copy the data in-memory.

        Returns:
            kwargs to build a Dataset instance.
        """
        # Prepend path to filename
        pa_table = self._read_files(files, in_memory=in_memory)
        # If original_instructions is not None, convert it to a human-readable NamedSplit
        if original_instructions is not None:
            from .splits import Split  # noqa

            split = Split(str(original_instructions))
        else:
            split = None
        dataset_kwargs = {"arrow_table": pa_table, "info": self._info, "split": split}
        return dataset_kwargs

    @deprecated()
    def download_from_hf_gcs(self, download_config: DownloadConfig, relative_data_dir):
        """
        Download the dataset files from the Hf GCS

        Args:
            dl_cache_dir: `str`, the local cache directory used to download files
            relative_data_dir: `str`, the relative directory of the remote files from
                the `datasets` directory on GCS.

        """
        # GCS paths always use forward slashes regardless of the local os.sep.
        remote_cache_dir = HF_GCP_BASE_URL + "/" + relative_data_dir.replace(os.sep, "/")
        try:
            remote_dataset_info = os.path.join(remote_cache_dir, "dataset_info.json")
            downloaded_dataset_info = cached_path(
                remote_dataset_info.replace(os.sep, "/"), download_config=download_config
            )
            shutil.move(downloaded_dataset_info, os.path.join(self._path, "dataset_info.json"))
            if self._info is not None:
                self._info.update(self._info.from_directory(self._path))
        except FileNotFoundError as err:
            raise DatasetNotOnHfGcsError(err) from None
        try:
            for split in self._info.splits:
                file_instructions = self.get_file_instructions(
                    name=self._info.builder_name,
                    instruction=split,
                    split_infos=self._info.splits.values(),
                )
                for file_instruction in file_instructions:
                    # Map the local target path back to a remote GCS path.
                    file_to_download = str(Path(file_instruction["filename"]).relative_to(self._path))
                    remote_prepared_filename = os.path.join(remote_cache_dir, file_to_download)
                    downloaded_prepared_filename = cached_path(
                        remote_prepared_filename.replace(os.sep, "/"), download_config=download_config
                    )
                    shutil.move(downloaded_prepared_filename, file_instruction["filename"])
        except FileNotFoundError as err:
            raise MissingFilesOnHfGcsError(err) from None
|
| 326 |
+
|
| 327 |
+
|
| 328 |
+
class ArrowReader(BaseReader):
    """
    Build a Dataset object out of Instruction instance(s).
    This Reader uses either memory mapping or file descriptors (in-memory) on arrow files.
    """

    def __init__(self, path: str, info: Optional["DatasetInfo"]):
        """Initializes ArrowReader.

        Args:
            path (str): path where Arrow files are stored.
            info (DatasetInfo): info about the dataset.
        """
        super().__init__(path, info)
        self._filetype_suffix = "arrow"

    def _get_table_from_filename(self, filename_skip_take, in_memory=False) -> Table:
        """Returns a Dataset instance from given (filename, skip, take)."""
        filename = filename_skip_take["filename"]
        skip = filename_skip_take.get("skip")
        take = filename_skip_take.get("take")
        table = ArrowReader.read_table(filename, in_memory=in_memory)
        if take == -1:
            # take == -1 means "everything from `skip` to the end of the file"
            take = len(table) - skip
        # Skip slicing when the full table is requested: slicing an empty table may segfault.
        if skip is not None and take is not None and not (skip == 0 and take == len(table)):
            table = table.slice(skip, take)
        return table

    @staticmethod
    def read_table(filename, in_memory=False) -> Table:
        """
        Read table from file.

        Args:
            filename (str): File name of the table.
            in_memory (bool, default=False): Whether to copy the data in-memory.

        Returns:
            pyarrow.Table
        """
        if in_memory:
            return InMemoryTable.from_file(filename)
        return MemoryMappedTable.from_file(filename)
|
| 373 |
+
|
| 374 |
+
|
| 375 |
+
class ParquetReader(BaseReader):
    """
    Build a Dataset object out of Instruction instance(s).
    This Reader uses memory mapping on parquet files.
    """

    def __init__(self, path: str, info: Optional["DatasetInfo"]):
        """Initializes ParquetReader.

        Args:
            path (str): path where parquet files are stored.
            info (DatasetInfo): info about the dataset.
        """
        super().__init__(path, info)
        self._filetype_suffix = "parquet"

    def _get_table_from_filename(self, filename_skip_take, **kwargs):
        """Returns a Dataset instance from given (filename, skip, take)."""
        filename = filename_skip_take["filename"]
        skip = filename_skip_take.get("skip")
        take = filename_skip_take.get("take")
        # Parquet read_table always loads data in memory, independently of memory_map
        pa_table = pq.read_table(filename, memory_map=True)
        # Skip slicing when the full table is requested: slicing an empty table may segfault.
        if skip is not None and take is not None and not (skip == 0 and take == len(pa_table)):
            pa_table = pa_table.slice(skip, take)
        return pa_table
|
| 404 |
+
|
| 405 |
+
|
| 406 |
+
@dataclass(frozen=True)
class _AbsoluteInstruction:
    """A machine friendly slice: defined absolute positive boundaries."""

    splitname: str  # name of the split this slice reads from
    from_: int  # uint (starting index, inclusive).
    to: int  # uint (ending index, exclusive).
|
| 413 |
+
|
| 414 |
+
|
| 415 |
+
@dataclass(frozen=True)
class _RelativeInstruction:
    """Represents a single parsed slicing instruction, can use % and negatives."""

    splitname: str
    from_: Optional[int] = None  # int (starting index) or None if no lower boundary.
    to: Optional[int] = None  # int (ending index) or None if no upper boundary.
    unit: Optional[str] = None  # '%' or 'abs', when boundaries are given
    rounding: Optional[str] = None  # 'closest' or 'pct1_dropremainder', '%' unit only

    def __post_init__(self):
        # Validate the field combination; raise ValueError on any inconsistency.
        if self.unit not in (None, "%", "abs"):
            raise ValueError("unit must be either % or abs")
        if self.rounding not in (None, "closest", "pct1_dropremainder"):
            raise ValueError("rounding must be either closest or pct1_dropremainder")
        if self.rounding is not None and self.unit != "%":
            raise ValueError("It is forbidden to specify rounding if not using percent slicing.")
        if self.unit == "%" and self.from_ is not None and abs(self.from_) > 100:
            raise ValueError("Percent slice boundaries must be > -100 and < 100.")
        if self.unit == "%" and self.to is not None and abs(self.to) > 100:
            raise ValueError("Percent slice boundaries must be > -100 and < 100.")
        # Default rounding to 'closest' for percent slicing.
        # Update via __dict__ because the instance is frozen.
        if self.unit == "%" and self.rounding is None:
            self.__dict__["rounding"] = "closest"
|
| 438 |
+
|
| 439 |
+
|
| 440 |
+
def _str_to_read_instruction(spec):
    """Returns ReadInstruction for given string."""
    match = _SUB_SPEC_RE.match(spec)
    if match is None:
        raise ValueError(f"Unrecognized instruction format: {spec}")
    # Percent signs may appear on either boundary; one is enough to select '%' unit.
    uses_pct = match.group("from_pct") or match.group("to_pct")
    return ReadInstruction(
        split_name=match.group("split"),
        rounding=match.group("rounding"),
        from_=int(match.group("from")) if match.group("from") else None,
        to=int(match.group("to")) if match.group("to") else None,
        unit="%" if uses_pct else "abs",
    )
|
| 453 |
+
|
| 454 |
+
|
| 455 |
+
def _pct_to_abs_pct1(boundary, num_examples):
    """Convert a percent boundary into an absolute index, treating percents as multiples of 1%.

    Uses math.trunc so that -99.5% gives -99%, not -100%.
    """
    if num_examples < 100:
        raise ValueError(
            'Using "pct1_dropremainder" rounding on a split with less than 100 '
            "elements is forbidden: it always results in an empty dataset."
        )
    return boundary * math.trunc(num_examples / 100.0)
|
| 464 |
+
|
| 465 |
+
|
| 466 |
+
def _pct_to_abs_closest(boundary, num_examples):
    """Convert a percent boundary into the closest absolute index."""
    scaled = num_examples * boundary / 100.0
    return int(round(scaled))
|
| 468 |
+
|
| 469 |
+
|
| 470 |
+
def _rel_to_abs_instr(rel_instr, name2len):
    """Returns _AbsoluteInstruction instance for given RelativeInstruction.

    Args:
        rel_instr: RelativeInstruction instance.
        name2len: dict {split_name: num_examples}.
    """
    split = rel_instr.splitname
    if split not in name2len:
        raise ValueError(f'Unknown split "{split}". Should be one of {list(name2len)}.')
    num_examples = name2len[split]
    from_ = rel_instr.from_
    to = rel_instr.to
    if rel_instr.unit == "%":
        # Percent boundaries: convert using the configured rounding strategy.
        pct_to_abs = _pct_to_abs_closest if rel_instr.rounding == "closest" else _pct_to_abs_pct1
        from_ = 0 if from_ is None else pct_to_abs(from_, num_examples)
        to = num_examples if to is None else pct_to_abs(to, num_examples)
    else:
        # Absolute boundaries: missing bounds default to the full split.
        from_ = 0 if from_ is None else from_
        to = num_examples if to is None else to
    # Negative indices count from the end of the split, clamped at 0.
    if from_ < 0:
        from_ = max(num_examples + from_, 0)
    if to < 0:
        to = max(num_examples + to, 0)
    # Clamp both boundaries to the split size.
    return _AbsoluteInstruction(split, min(from_, num_examples), min(to, num_examples))
|
| 497 |
+
|
| 498 |
+
|
| 499 |
+
class ReadInstruction:
    """Reading instruction for a dataset.

    Examples::

      # The following lines are equivalent:
      ds = datasets.load_dataset('mnist', split='test[:33%]')
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec('test[:33%]'))
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction('test', to=33, unit='%'))
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction(
          'test', from_=0, to=33, unit='%'))

      # The following lines are equivalent:
      ds = datasets.load_dataset('mnist', split='test[:33%]+train[1:-1]')
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec(
          'test[:33%]+train[1:-1]'))
      ds = datasets.load_dataset('mnist', split=(
          datasets.ReadInstruction('test', to=33, unit='%') +
          datasets.ReadInstruction('train', from_=1, to=-1, unit='abs')))

      # The following lines are equivalent:
      ds = datasets.load_dataset('mnist', split='test[:33%](pct1_dropremainder)')
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec(
          'test[:33%](pct1_dropremainder)'))
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction(
          'test', from_=0, to=33, unit='%', rounding="pct1_dropremainder"))

      # 10-fold validation:
      tests = datasets.load_dataset(
          'mnist',
          [datasets.ReadInstruction('train', from_=k, to=k+10, unit='%')
           for k in range(0, 100, 10)])
      trains = datasets.load_dataset(
          'mnist',
          [datasets.ReadInstruction('train', to=k, unit='%') + datasets.ReadInstruction('train', from_=k+10, unit='%')
           for k in range(0, 100, 10)])

    """

    def _init(self, relative_instructions):
        # Private initializer: shared by __init__ and the __new__-based factory below.
        self._relative_instructions = relative_instructions

    @classmethod
    def _read_instruction_from_relative_instructions(cls, relative_instructions):
        """Returns ReadInstruction obj initialized with relative_instructions."""
        # Use __new__ to bypass __init__ used by public API and not convenient here.
        result = cls.__new__(cls)
        result._init(relative_instructions)  # pylint: disable=protected-access
        return result

    def __init__(self, split_name, rounding=None, from_=None, to=None, unit=None):
        """Initialize ReadInstruction.

        Args:
            split_name (str): name of the split to read. Eg: 'train'.
            rounding (str, optional): The rounding behaviour to use when percent slicing is
                used. Ignored when slicing with absolute indices.
                Possible values:
                    - 'closest' (default): The specified percentages are rounded to the
                        closest value. Use this if you want specified percents to be as
                        much exact as possible.
                    - 'pct1_dropremainder': the specified percentages are treated as
                        multiple of 1%. Use this option if you want consistency. Eg:
                            len(5%) == 5 * len(1%).
                        Using this option, one might not be able to use the full set of
                        examples, if the number of those is not a multiple of 100.
            from_ (int): starting slicing boundary; see `to`.
            to (int): alternative way of specifying slicing boundaries. If any of
                {from_, to, unit} argument is used, slicing cannot be specified as
                string.
            unit (str): optional, one of:
                '%': to set the slicing unit as percents of the split size.
                'abs': to set the slicing unit as absolute numbers.
        """
        # This constructor is not always called. See factory method
        # `_read_instruction_from_relative_instructions`. Common init instructions
        # MUST be placed in the _init method.
        self._init([_RelativeInstruction(split_name, from_, to, unit, rounding)])

    @classmethod
    def from_spec(cls, spec):
        """Creates a `ReadInstruction` instance out of a string spec.

        Args:
            spec (`str`):
                Split(s) + optional slice(s) to read + optional rounding
                if percents are used as the slicing unit. A slice can be specified,
                using absolute numbers (`int`) or percentages (`int`).

        Examples:

            ```
            test: test split.
            test + validation: test split + validation split.
            test[10:]: test split, minus its first 10 records.
            test[:10%]: first 10% records of test split.
            test[:20%](pct1_dropremainder): first 20% records, rounded with the pct1_dropremainder rounding.
            test[:-5%]+train[40%:60%]: first 95% of test + middle 20% of train.
            ```

        Returns:
            ReadInstruction instance.
        """
        spec = str(spec)  # Need to convert to str in case of NamedSplit instance.
        subs = _ADDITION_SEP_RE.split(spec)
        if not subs:
            raise ValueError(f"No instructions could be built out of {spec}")
        # Parse each '+'-separated sub-spec and sum the resulting instructions.
        instruction = _str_to_read_instruction(subs[0])
        return sum((_str_to_read_instruction(sub) for sub in subs[1:]), instruction)

    def to_spec(self):
        """Return the string spec equivalent to this instruction (inverse of `from_spec`)."""
        rel_instr_specs = []
        for rel_instr in self._relative_instructions:
            rel_instr_spec = rel_instr.splitname
            if rel_instr.from_ is not None or rel_instr.to is not None:
                from_ = rel_instr.from_
                to = rel_instr.to
                unit = rel_instr.unit
                rounding = rel_instr.rounding
                # Only percent units appear in the spec; 'abs' boundaries are bare ints.
                unit = unit if unit == "%" else ""
                from_ = str(from_) + unit if from_ is not None else ""
                to = str(to) + unit if to is not None else ""
                slice_str = f"[{from_}:{to}]"
                # 'closest' is the default rounding, so it is omitted from the spec.
                rounding_str = (
                    f"({rounding})" if unit == "%" and rounding is not None and rounding != "closest" else ""
                )
                rel_instr_spec += slice_str + rounding_str
            rel_instr_specs.append(rel_instr_spec)
        return "+".join(rel_instr_specs)

    def __add__(self, other):
        """Returns a new ReadInstruction obj, result of appending other to self."""
        if not isinstance(other, ReadInstruction):
            msg = "ReadInstruction can only be added to another ReadInstruction obj."
            raise TypeError(msg)
        self_ris = self._relative_instructions
        other_ris = other._relative_instructions  # pylint: disable=protected-access
        # Mixed roundings are only forbidden when both sides actually use percent slicing.
        if (
            self_ris[0].unit != "abs"
            and other_ris[0].unit != "abs"
            and self._relative_instructions[0].rounding != other_ris[0].rounding
        ):
            raise ValueError("It is forbidden to sum ReadInstruction instances with different rounding values.")
        return self._read_instruction_from_relative_instructions(self_ris + other_ris)

    def __str__(self):
        return self.to_spec()

    def __repr__(self):
        return f"ReadInstruction({self._relative_instructions})"

    def to_absolute(self, name2len):
        """Translate instruction into a list of absolute instructions.

        Those absolute instructions are then to be added together.

        Args:
            name2len (`dict`):
                Associating split names to number of examples.

        Returns:
            list of _AbsoluteInstruction instances (corresponds to the + in spec).
        """
        return [_rel_to_abs_instr(rel_instr, name2len) for rel_instr in self._relative_instructions]
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/builder.bak.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/builder.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/config.py
ADDED
|
@@ -0,0 +1,272 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""`datasets` configuration: remote endpoints, environment flags, dependency probing and cache locations."""

import importlib
import importlib.metadata
import logging
import os
import platform
from pathlib import Path
from typing import Optional

from packaging import version


logger = logging.getLogger(__name__.split(".", 1)[0])  # to avoid circular import from .utils.logging

# Datasets
S3_DATASETS_BUCKET_PREFIX = "https://s3.amazonaws.com/datasets.huggingface.co/datasets/datasets"
CLOUDFRONT_DATASETS_DISTRIB_PREFIX = "https://cdn-datasets.huggingface.co/datasets/datasets"
REPO_DATASETS_URL = "https://raw.githubusercontent.com/huggingface/datasets/{revision}/datasets/{path}/{name}"

# Metrics
S3_METRICS_BUCKET_PREFIX = "https://s3.amazonaws.com/datasets.huggingface.co/datasets/metrics"
CLOUDFRONT_METRICS_DISTRIB_PREFIX = "https://cdn-datasets.huggingface.co/datasets/metric"
REPO_METRICS_URL = "https://raw.githubusercontent.com/huggingface/datasets/{revision}/metrics/{path}/{name}"

# Hub
HF_ENDPOINT = os.environ.get("HF_ENDPOINT", "https://huggingface.co")
HUB_DATASETS_URL = HF_ENDPOINT + "/datasets/{repo_id}/resolve/{revision}/{path}"
HUB_DATASETS_HFFS_URL = "hf://datasets/{repo_id}@{revision}/{path}"
HUB_DEFAULT_VERSION = "main"

PY_VERSION = version.parse(platform.python_version())

# General environment variables accepted values for booleans
ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
ENV_VARS_FALSE_VALUES = {"0", "OFF", "NO", "FALSE"}
ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"})
ENV_VARS_FALSE_AND_AUTO_VALUES = ENV_VARS_FALSE_VALUES.union({"AUTO"})


# Imports: versions of installed dependencies
DILL_VERSION = version.parse(importlib.metadata.version("dill"))
FSSPEC_VERSION = version.parse(importlib.metadata.version("fsspec"))
PANDAS_VERSION = version.parse(importlib.metadata.version("pandas"))
PYARROW_VERSION = version.parse(importlib.metadata.version("pyarrow"))
HF_HUB_VERSION = version.parse(importlib.metadata.version("huggingface_hub"))

# USE_TF / USE_TORCH / USE_JAX control which ML frameworks are probed below.
USE_TF = os.environ.get("USE_TF", "AUTO").upper()
USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper()
USE_JAX = os.environ.get("USE_JAX", "AUTO").upper()
|
| 49 |
+
|
| 50 |
+
TORCH_VERSION = "N/A"
TORCH_AVAILABLE = False

# Probe PyTorch unless the user explicitly opted into TensorFlow only.
if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES:
    TORCH_AVAILABLE = importlib.util.find_spec("torch") is not None
    if TORCH_AVAILABLE:
        try:
            TORCH_VERSION = version.parse(importlib.metadata.version("torch"))
            logger.info(f"PyTorch version {TORCH_VERSION} available.")
        except importlib.metadata.PackageNotFoundError:
            # Module importable but no distribution metadata; keep TORCH_VERSION = "N/A".
            pass
else:
    logger.info("Disabling PyTorch because USE_TF is set")

POLARS_VERSION = "N/A"
POLARS_AVAILABLE = importlib.util.find_spec("polars") is not None

if POLARS_AVAILABLE:
    try:
        POLARS_VERSION = version.parse(importlib.metadata.version("polars"))
        logger.info(f"Polars version {POLARS_VERSION} available.")
    except importlib.metadata.PackageNotFoundError:
        pass

TF_VERSION = "N/A"
TF_AVAILABLE = False

# Probe TensorFlow unless the user explicitly opted into PyTorch only.
if USE_TF in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TORCH not in ENV_VARS_TRUE_VALUES:
    TF_AVAILABLE = importlib.util.find_spec("tensorflow") is not None
    if TF_AVAILABLE:
        # For the metadata, we have to look for both tensorflow and tensorflow-cpu
        for package in [
            "tensorflow",
            "tensorflow-cpu",
            "tensorflow-gpu",
            "tf-nightly",
            "tf-nightly-cpu",
            "tf-nightly-gpu",
            "intel-tensorflow",
            "tensorflow-rocm",
            "tensorflow-macos",
        ]:
            try:
                TF_VERSION = version.parse(importlib.metadata.version(package))
            except importlib.metadata.PackageNotFoundError:
                continue
            else:
                break
        else:
            # The "tensorflow" module is importable but no known distribution provides it.
            TF_AVAILABLE = False
        if TF_AVAILABLE:
            if TF_VERSION.major < 2:
                logger.info(f"TensorFlow found but with version {TF_VERSION}. `datasets` requires version 2 minimum.")
                TF_AVAILABLE = False
            else:
                logger.info(f"TensorFlow version {TF_VERSION} available.")
else:
    logger.info("Disabling Tensorflow because USE_TORCH is set")


JAX_VERSION = "N/A"
JAX_AVAILABLE = False

if USE_JAX in ENV_VARS_TRUE_AND_AUTO_VALUES:
    # JAX needs both the jax and jaxlib packages to be usable.
    JAX_AVAILABLE = importlib.util.find_spec("jax") is not None and importlib.util.find_spec("jaxlib") is not None
    if JAX_AVAILABLE:
        try:
            JAX_VERSION = version.parse(importlib.metadata.version("jax"))
            logger.info(f"JAX version {JAX_VERSION} available.")
        except importlib.metadata.PackageNotFoundError:
            pass
else:
    logger.info("Disabling JAX because USE_JAX is set to False")


USE_BEAM = os.environ.get("USE_BEAM", "AUTO").upper()
BEAM_VERSION = "N/A"
BEAM_AVAILABLE = False
if USE_BEAM in ENV_VARS_TRUE_AND_AUTO_VALUES:
    try:
        BEAM_VERSION = version.parse(importlib.metadata.version("apache_beam"))
        BEAM_AVAILABLE = True
        logger.info(f"Apache Beam version {BEAM_VERSION} available.")
    except importlib.metadata.PackageNotFoundError:
        pass
else:
    logger.info("Disabling Apache Beam because USE_BEAM is set to False")
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
# Optional tools for data loading
SQLALCHEMY_AVAILABLE = importlib.util.find_spec("sqlalchemy") is not None

# Optional tools for feature decoding
PIL_AVAILABLE = importlib.util.find_spec("PIL") is not None
# Opus/MP3 decoding require a soundfile backed by a recent enough libsndfile.
IS_OPUS_SUPPORTED = importlib.util.find_spec("soundfile") is not None and version.parse(
    importlib.import_module("soundfile").__libsndfile_version__
) >= version.parse("1.0.31")
IS_MP3_SUPPORTED = importlib.util.find_spec("soundfile") is not None and version.parse(
    importlib.import_module("soundfile").__libsndfile_version__
) >= version.parse("1.1.0")

# Optional compression tools
RARFILE_AVAILABLE = importlib.util.find_spec("rarfile") is not None
ZSTANDARD_AVAILABLE = importlib.util.find_spec("zstandard") is not None
LZ4_AVAILABLE = importlib.util.find_spec("lz4") is not None
PY7ZR_AVAILABLE = importlib.util.find_spec("py7zr") is not None

# Cache location: defaults follow the XDG convention, overridable via HF_* env vars.
DEFAULT_XDG_CACHE_HOME = "~/.cache"
XDG_CACHE_HOME = os.getenv("XDG_CACHE_HOME", DEFAULT_XDG_CACHE_HOME)
DEFAULT_HF_CACHE_HOME = os.path.join(XDG_CACHE_HOME, "huggingface")
HF_CACHE_HOME = os.path.expanduser(os.getenv("HF_HOME", DEFAULT_HF_CACHE_HOME))

DEFAULT_HF_DATASETS_CACHE = os.path.join(HF_CACHE_HOME, "datasets")
HF_DATASETS_CACHE = Path(os.getenv("HF_DATASETS_CACHE", DEFAULT_HF_DATASETS_CACHE))

DEFAULT_HF_METRICS_CACHE = os.path.join(HF_CACHE_HOME, "metrics")
HF_METRICS_CACHE = Path(os.getenv("HF_METRICS_CACHE", DEFAULT_HF_METRICS_CACHE))

DEFAULT_HF_MODULES_CACHE = os.path.join(HF_CACHE_HOME, "modules")
HF_MODULES_CACHE = Path(os.getenv("HF_MODULES_CACHE", DEFAULT_HF_MODULES_CACHE))

DOWNLOADED_DATASETS_DIR = "downloads"
DEFAULT_DOWNLOADED_DATASETS_PATH = os.path.join(HF_DATASETS_CACHE, DOWNLOADED_DATASETS_DIR)
DOWNLOADED_DATASETS_PATH = Path(os.getenv("HF_DATASETS_DOWNLOADED_DATASETS_PATH", DEFAULT_DOWNLOADED_DATASETS_PATH))

EXTRACTED_DATASETS_DIR = "extracted"
DEFAULT_EXTRACTED_DATASETS_PATH = os.path.join(DEFAULT_DOWNLOADED_DATASETS_PATH, EXTRACTED_DATASETS_DIR)
EXTRACTED_DATASETS_PATH = Path(os.getenv("HF_DATASETS_EXTRACTED_DATASETS_PATH", DEFAULT_EXTRACTED_DATASETS_PATH))

# Download count for the website
HF_UPDATE_DOWNLOAD_COUNTS = (
    os.environ.get("HF_UPDATE_DOWNLOAD_COUNTS", "AUTO").upper() in ENV_VARS_TRUE_AND_AUTO_VALUES
)

# For downloads and to check remote files metadata
HF_DATASETS_MULTITHREADING_MAX_WORKERS = 16

# Remote dataset scripts support.
# Tri-state: True (trust), False (refuse), None (default "ask" behaviour).
__HF_DATASETS_TRUST_REMOTE_CODE = os.environ.get("HF_DATASETS_TRUST_REMOTE_CODE", "ask")
HF_DATASETS_TRUST_REMOTE_CODE: Optional[bool] = (
    True
    if __HF_DATASETS_TRUST_REMOTE_CODE.upper() in ENV_VARS_TRUE_VALUES
    else False
    if __HF_DATASETS_TRUST_REMOTE_CODE.upper() in ENV_VARS_FALSE_VALUES
    else None
)
TIME_OUT_REMOTE_CODE = 15

# Dataset viewer API
USE_PARQUET_EXPORT = True

# Batch size constants. For more info, see:
# https://github.com/apache/arrow/blob/master/docs/source/cpp/arrays.rst#size-limitations-and-recommendations)
DEFAULT_MAX_BATCH_SIZE = 1000

# Size of the preloaded record batch in `Dataset.__iter__`
ARROW_READER_BATCH_SIZE_IN_DATASET_ITER = 10

# Max shard size in bytes (e.g. to shard parquet datasets in push_to_hub or download_and_prepare)
MAX_SHARD_SIZE = "500MB"

# Parquet configuration
PARQUET_ROW_GROUP_SIZE_FOR_AUDIO_DATASETS = 100
PARQUET_ROW_GROUP_SIZE_FOR_IMAGE_DATASETS = 100
PARQUET_ROW_GROUP_SIZE_FOR_BINARY_DATASETS = 100

# Offline mode
HF_DATASETS_OFFLINE = os.environ.get("HF_DATASETS_OFFLINE", "AUTO").upper() in ENV_VARS_TRUE_VALUES

# Here, `True` will disable progress bars globally without possibility of enabling it
# programmatically. `False` will enable them without possibility of disabling them.
# If environment variable is not set (None), then the user is free to enable/disable
# them programmatically.
# TL;DR: env variable has priority over code
__HF_DATASETS_DISABLE_PROGRESS_BARS = os.environ.get("HF_DATASETS_DISABLE_PROGRESS_BARS")
HF_DATASETS_DISABLE_PROGRESS_BARS: Optional[bool] = (
    __HF_DATASETS_DISABLE_PROGRESS_BARS.upper() in ENV_VARS_TRUE_VALUES
    if __HF_DATASETS_DISABLE_PROGRESS_BARS is not None
    else None
)

# In-memory
DEFAULT_IN_MEMORY_MAX_SIZE = 0  # Disabled
IN_MEMORY_MAX_SIZE = float(os.environ.get("HF_DATASETS_IN_MEMORY_MAX_SIZE", DEFAULT_IN_MEMORY_MAX_SIZE))

# File names
DATASET_ARROW_FILENAME = "dataset.arrow"
DATASET_INDICES_FILENAME = "indices.arrow"
DATASET_STATE_JSON_FILENAME = "state.json"
DATASET_INFO_FILENAME = "dataset_info.json"
DATASETDICT_INFOS_FILENAME = "dataset_infos.json"
LICENSE_FILENAME = "LICENSE"
METRIC_INFO_FILENAME = "metric_info.json"
DATASETDICT_JSON_FILENAME = "dataset_dict.json"
METADATA_CONFIGS_FIELD = "configs"
REPOCARD_FILENAME = "README.md"
REPOYAML_FILENAME = ".huggingface.yaml"

MODULE_NAME_FOR_DYNAMIC_MODULES = "datasets_modules"

MAX_DATASET_CONFIG_ID_READABLE_LENGTH = 255

# Temporary cache directory prefix
TEMP_CACHE_DIR_PREFIX = "hf_datasets-"

# Streaming
STREAMING_READ_MAX_RETRIES = 20
STREAMING_READ_RETRY_INTERVAL = 5

# Datasets without script
DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE = 200
GLOBBED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE = 10
ARCHIVED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE = 200

# Progress bars
PBAR_REFRESH_TIME_INTERVAL = 0.05  # 20 progress updates per sec

# Maximum number of uploaded files per commit
UPLOADS_MAX_NUMBER_PER_COMMIT = 50

# Backward compatibility
MAX_TABLE_NBYTES_FOR_PICKLING = 4 << 30
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/dataset_dict.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/distributed.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import TypeVar
|
| 2 |
+
|
| 3 |
+
from .arrow_dataset import Dataset, _split_by_node_map_style_dataset
|
| 4 |
+
from .iterable_dataset import IterableDataset, _split_by_node_iterable_dataset
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
DatasetType = TypeVar("DatasetType", Dataset, IterableDataset)


def split_dataset_by_node(dataset: DatasetType, rank: int, world_size: int) -> DatasetType:
    """
    Split a dataset for the node at rank `rank` in a pool of nodes of size `world_size`.

    For map-style datasets ([`Dataset`]), each node is assigned a chunk of data,
    e.g. rank 0 is given the first chunk of the dataset. To maximize data loading
    throughput, chunks are made of contiguous data on disk if possible.

    For iterable datasets ([`IterableDataset`]), if the dataset has a number of
    shards that is a factor of `world_size` (i.e. if `dataset.n_shards % world_size == 0`),
    then the shards are evenly assigned across the nodes, which is the most optimized.
    Otherwise, each node keeps 1 example out of `world_size`, skipping the other examples.

    Args:
        dataset ([`Dataset`] or [`IterableDataset`]):
            The dataset to split by node.
        rank (`int`):
            Rank of the current node.
        world_size (`int`):
            Total number of nodes.

    Returns:
        [`Dataset`] or [`IterableDataset`]: The dataset to be used on the node at rank `rank`.
    """
    # Pick the splitting helper that matches the dataset flavor; both helpers
    # share the same (dataset, rank, world_size) signature.
    split_fn = (
        _split_by_node_map_style_dataset if isinstance(dataset, Dataset) else _split_by_node_iterable_dataset
    )
    return split_fn(dataset, rank=rank, world_size=world_size)
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/exceptions.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
# Copyright 2023 The HuggingFace Authors.
|
| 3 |
+
from typing import Any, Dict, List, Optional, Union
|
| 4 |
+
|
| 5 |
+
from huggingface_hub import HfFileSystem
|
| 6 |
+
|
| 7 |
+
from . import config
|
| 8 |
+
from .table import CastError
|
| 9 |
+
from .utils.deprecation_utils import deprecated
|
| 10 |
+
from .utils.track import TrackedIterable, tracked_list, tracked_str
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class DatasetsError(Exception):
    """Base class for exceptions in this library."""


class DefunctDatasetError(DatasetsError):
    """The dataset has been defunct."""


class FileNotFoundDatasetsError(DatasetsError, FileNotFoundError):
    """FileNotFoundError raised by this library."""


class DataFilesNotFoundError(FileNotFoundDatasetsError):
    """No (supported) data files found."""


class DatasetNotFoundError(FileNotFoundDatasetsError):
    """Dataset not found.

    Raised when trying to access:
    - a missing dataset, or
    - a private/gated dataset and the user is not authenticated.
    """


class DatasetBuildError(DatasetsError):
    """Base class for errors raised while building a dataset."""


class ManualDownloadError(DatasetBuildError):
    """Build error related to a manual download requirement."""


class FileFormatError(DatasetBuildError):
    """Build error related to the format of the data files."""


class DatasetGenerationError(DatasetBuildError):
    """Build error raised while generating the dataset."""
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class DatasetGenerationCastError(DatasetGenerationError):
    """Raised when dataset generation fails because data files don't have matching columns."""

    @classmethod
    def from_cast_error(
        cls,
        cast_error: CastError,
        builder_name: str,
        gen_kwargs: Dict[str, Any],
        token: Optional[Union[bool, str]],
    ) -> "DatasetGenerationCastError":
        """Build a `DatasetGenerationCastError` with a user-friendly message from a `CastError`.

        Args:
            cast_error: The underlying cast failure; `cast_error.details()` is quoted in the message.
            builder_name: Name of the dataset builder that was generating data when the error occurred.
            gen_kwargs: The generator kwargs; tracked values (`tracked_str`, `tracked_list`,
                `TrackedIterable`) are used to point at the data file being processed.
            token: Token forwarded to `HfFileSystem` to resolve `hf://` paths.

        Returns:
            The constructed error (this method does not raise it).
        """
        explanation_message = (
            f"\n\nAll the data files must have the same columns, but at some point {cast_error.details()}"
        )
        formatted_tracked_gen_kwargs: List[str] = []
        for gen_kwarg in gen_kwargs.values():
            # Only tracked values carry provenance information; skip everything else.
            if not isinstance(gen_kwarg, (tracked_str, tracked_list, TrackedIterable)):
                continue
            # Drill down to the innermost last-accessed item — presumably the one
            # being processed when the cast failed (TODO confirm tracked semantics).
            while isinstance(gen_kwarg, (tracked_list, TrackedIterable)) and gen_kwarg.last_item is not None:
                gen_kwarg = gen_kwarg.last_item
            if isinstance(gen_kwarg, tracked_str):
                gen_kwarg = gen_kwarg.get_origin()
            if isinstance(gen_kwarg, str) and gen_kwarg.startswith("hf://"):
                # Make hf:// URLs readable: canonical repo path with the revision
                # moved from the "@revision" infix to a trailing note.
                resolved_path = HfFileSystem(endpoint=config.HF_ENDPOINT, token=token).resolve_path(gen_kwarg)
                gen_kwarg = "hf://" + resolved_path.unresolve()
                if "@" + resolved_path.revision in gen_kwarg:
                    gen_kwarg = (
                        gen_kwarg.replace("@" + resolved_path.revision, "", 1)
                        + f" (at revision {resolved_path.revision})"
                    )
            formatted_tracked_gen_kwargs.append(str(gen_kwarg))
        if formatted_tracked_gen_kwargs:
            explanation_message += f"\n\nThis happened while the {builder_name} dataset builder was generating data using\n\n{', '.join(formatted_tracked_gen_kwargs)}"
        help_message = "\n\nPlease either edit the data files to have matching columns, or separate them into different configurations (see docs at https://hf.co/docs/hub/datasets-manual-configuration#multiple-configurations)"
        return cls("An error occurred while generating the dataset" + explanation_message + help_message)
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
# Pattern used below: each legacy exception name is kept as a @deprecated class,
# and the new *Error class inherits BOTH the legacy class and DatasetsError, so
# existing `except LegacyName:` handlers keep working while new code catches the
# library-rooted hierarchy.
@deprecated("Use 'ChecksumVerificationError' instead.")
class ChecksumVerificationException(Exception):
    """Exceptions during checksums verifications of downloaded files.

    <Deprecated version="2.20.0">

    Use `ChecksumVerificationError` instead.

    </Deprecated>
    """


class ChecksumVerificationError(DatasetsError, ChecksumVerificationException):
    """Error raised during checksums verifications of downloaded files."""

    # NOTE(review): calls DatasetsError.__init__ directly instead of super() —
    # presumably to skip whatever the @deprecated wrapper adds to the legacy
    # parent's initializer (e.g. a deprecation warning); confirm.
    def __init__(self, *args, **kwargs):
        DatasetsError.__init__(self, *args, **kwargs)


@deprecated("Use 'UnexpectedDownloadedFileError' instead.")
class UnexpectedDownloadedFile(ChecksumVerificationException):
    """Some downloaded files were not expected.

    <Deprecated version="2.20.0">

    Use `UnexpectedDownloadedFileError` instead.

    </Deprecated>
    """


class UnexpectedDownloadedFileError(ChecksumVerificationError, UnexpectedDownloadedFile):
    """Some downloaded files were not expected."""


@deprecated("Use 'ExpectedMoreDownloadedFilesError' instead.")
class ExpectedMoreDownloadedFiles(ChecksumVerificationException):
    """Some files were supposed to be downloaded but were not.

    <Deprecated version="2.20.0">

    Use `ExpectedMoreDownloadedFilesError` instead.

    </Deprecated>
    """


class ExpectedMoreDownloadedFilesError(ChecksumVerificationError, ExpectedMoreDownloadedFiles):
    """Some files were supposed to be downloaded but were not."""


class NonMatchingChecksumError(ChecksumVerificationError):
    """The downloaded file checksum don't match the expected checksum."""


@deprecated("Use 'SplitsVerificationError' instead.")
class SplitsVerificationException(Exception):
    """Exceptions during splits verifications.

    <Deprecated version="2.20.0">

    Use `SplitsVerificationError` instead.

    </Deprecated>
    """


class SplitsVerificationError(DatasetsError, SplitsVerificationException):
    """Error raised during splits verifications."""

    # Same rationale as ChecksumVerificationError.__init__ above.
    def __init__(self, *args, **kwargs):
        DatasetsError.__init__(self, *args, **kwargs)


@deprecated("Use 'UnexpectedSplitsError' instead.")
class UnexpectedSplits(SplitsVerificationException):
    """The expected splits of the downloaded file is missing.

    <Deprecated version="2.20.0">

    Use `UnexpectedSplitsError` instead.

    </Deprecated>
    """


class UnexpectedSplitsError(SplitsVerificationError, UnexpectedSplits):
    """The expected splits of the downloaded file is missing."""


@deprecated("Use 'ExpectedMoreSplitsError' instead.")
class ExpectedMoreSplits(SplitsVerificationException):
    """Some recorded splits are missing.

    <Deprecated version="2.20.0">

    Use `ExpectedMoreSplitsError` instead.

    </Deprecated>
    """


class ExpectedMoreSplitsError(SplitsVerificationError, ExpectedMoreSplits):
    """Some recorded splits are missing."""


class NonMatchingSplitsSizesError(SplitsVerificationError):
    """The splits sizes don't match the expected splits sizes."""
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/inspect.py
ADDED
|
@@ -0,0 +1,582 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2020 The HuggingFace Datasets Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
# Lint as: python3
|
| 16 |
+
"""List and inspect datasets."""
|
| 17 |
+
|
| 18 |
+
import inspect
|
| 19 |
+
import os
|
| 20 |
+
import shutil
|
| 21 |
+
import warnings
|
| 22 |
+
from pathlib import Path, PurePath
|
| 23 |
+
from typing import Dict, List, Mapping, Optional, Sequence, Union
|
| 24 |
+
|
| 25 |
+
import huggingface_hub
|
| 26 |
+
|
| 27 |
+
from . import config
|
| 28 |
+
from .download.download_config import DownloadConfig
|
| 29 |
+
from .download.download_manager import DownloadMode
|
| 30 |
+
from .download.streaming_download_manager import StreamingDownloadManager
|
| 31 |
+
from .info import DatasetInfo
|
| 32 |
+
from .load import (
|
| 33 |
+
dataset_module_factory,
|
| 34 |
+
get_dataset_builder_class,
|
| 35 |
+
import_main_class,
|
| 36 |
+
load_dataset_builder,
|
| 37 |
+
metric_module_factory,
|
| 38 |
+
)
|
| 39 |
+
from .utils.deprecation_utils import deprecated
|
| 40 |
+
from .utils.file_utils import relative_to_absolute_path
|
| 41 |
+
from .utils.logging import get_logger
|
| 42 |
+
from .utils.version import Version
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# Module-level logger, named after this module.
logger = get_logger(__name__)


class SplitsNotFoundError(ValueError):
    """ValueError raised when the splits of a dataset cannot be found."""

    pass
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
@deprecated("Use 'huggingface_hub.list_datasets' instead.")
def list_datasets(with_community_datasets=True, with_details=False):
    """List all the datasets scripts available on the Hugging Face Hub.

    Args:
        with_community_datasets (`bool`, *optional*, defaults to `True`):
            Include the community provided datasets.
        with_details (`bool`, *optional*, defaults to `False`):
            Return the full details on the datasets instead of only the short name.

    Returns:
        `list`: Dataset ids (or full dataset info objects when `with_details=True`).

    Example:

    ```py
    >>> from datasets import list_datasets
    >>> list_datasets()
    ['acronym_identification',
     'ade_corpus_v2',
     'adversarial_qa',
     'aeslc',
     'afrikaans_ner_corpus',
     'ag_news',
     ...
    ]
    ```
    """
    hub_datasets = huggingface_hub.list_datasets(full=with_details)
    if not with_community_datasets:
        # Canonical (non-community) datasets have no namespace, i.e. no "/" in the id.
        hub_datasets = [ds for ds in hub_datasets if "/" not in ds.id]
    if not with_details:
        hub_datasets = [ds.id for ds in hub_datasets]
    return list(hub_datasets)
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
@deprecated(
    "Use 'evaluate.list_evaluation_modules' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate"
)
def list_metrics(with_community_metrics=True, with_details=False):
    """List all the metrics script available on the Hugging Face Hub.

    <Deprecated version="2.5.0">

    Use `evaluate.list_evaluation_modules` instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate

    </Deprecated>

    Args:
        with_community_metrics (`bool`, *optional*, defaults to `True`):
            Include the community provided metrics.
        with_details (`bool`, *optional*, defaults to `False`):
            Return the full details on the metrics instead of only the short name.

    Example:

    ```py
    >>> from datasets import list_metrics
    >>> list_metrics()
    ['accuracy',
     'bertscore',
     'bleu',
     'bleurt',
     'cer',
     'chrf',
     ...
    ]
    ```
    """
    hub_metrics = huggingface_hub.list_metrics()
    if not with_community_metrics:
        # Canonical (non-community) metrics have no namespace, i.e. no "/" in the id.
        hub_metrics = [metric for metric in hub_metrics if "/" not in metric.id]
    if not with_details:
        hub_metrics = [metric.id for metric in hub_metrics]
    return hub_metrics
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
@deprecated("Clone the dataset repository from the Hugging Face Hub instead.")
def inspect_dataset(path: str, local_path: str, download_config: Optional[DownloadConfig] = None, **download_kwargs):
    """
    Allow inspection/modification of a dataset script by copying it on local drive at `local_path`.

    Args:
        path (`str`): Path to the dataset processing script with the dataset builder. Can be either:

            - a local path to a processing script or the directory containing it
              (if the script has the same name as the directory),
              e.g. `'./dataset/squad'` or `'./dataset/squad/squad.py'`.
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`list_datasets`])
              e.g. `'squad'`, `'glue'` or `'openai/webtext'`.
        local_path (`str`):
            Path to the local folder to copy the dataset script to.
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        **download_kwargs (additional keyword arguments):
            Optional arguments for [`DownloadConfig`] which will override
            the attributes of `download_config` if supplied.
    """
    download_config = DownloadConfig(**download_kwargs) if download_config is None else download_config
    # A script file means the dataset directory is its parent; the (reassigned)
    # path is also what gets reported in the message below.
    if os.path.isfile(path):
        path = str(Path(path).parent)
    if os.path.isdir(path):
        # Local dataset: plain recursive copy.
        shutil.copytree(path, local_path, dirs_exist_ok=True)
    else:
        # Hub dataset: download a full snapshot of the repository.
        huggingface_hub.HfApi(endpoint=config.HF_ENDPOINT, token=download_config.token).snapshot_download(
            repo_id=path, repo_type="dataset", local_dir=local_path, force_download=download_config.force_download
        )
    print(
        f"The dataset {path} can be inspected at {local_path}. "
        f'You can modify this loading script if it has one and use it with `datasets.load_dataset("{PurePath(local_path).as_posix()}")`.'
    )
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
@deprecated(
    "Use 'evaluate.inspect_evaluation_module' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate"
)
def inspect_metric(path: str, local_path: str, download_config: Optional[DownloadConfig] = None, **download_kwargs):
    r"""
    Allow inspection/modification of a metric script by copying it on local drive at local_path.

    <Deprecated version="2.5.0">

    Use `evaluate.inspect_evaluation_module` instead, from the new library 🤗 Evaluate instead: https://huggingface.co/docs/evaluate

    </Deprecated>

    Args:
        path (``str``): path to the dataset processing script with the dataset builder. Can be either:

            - a local path to processing script or the directory containing the script (if the script has the same name as the directory),
                e.g. ``'./dataset/squad'`` or ``'./dataset/squad/squad.py'``
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with ``datasets.list_datasets()``)
                e.g. ``'squad'``, ``'glue'`` or ``'openai/webtext'``
        local_path (``str``): path to the local folder to copy the dataset script to.
        download_config (Optional ``datasets.DownloadConfig``): specific download configuration parameters.
        **download_kwargs (additional keyword arguments): optional attributes for DownloadConfig() which will override the attributes in download_config if supplied.
    """
    # Resolve the metric module and locate the directory of its main class on disk.
    metric_module = metric_module_factory(path, download_config=download_config, **download_kwargs)
    metric_cls = import_main_class(metric_module.module_path, dataset=False)
    module_source_path = inspect.getsourcefile(metric_cls)
    module_source_dirpath = os.path.dirname(module_source_path)
    # Mirror the module's source tree into local_path, preserving metadata.
    for dirpath, dirnames, filenames in os.walk(module_source_dirpath):
        dst_dirpath = os.path.join(local_path, os.path.relpath(dirpath, module_source_dirpath))
        os.makedirs(dst_dirpath, exist_ok=True)
        # skipping hidden directories; prune the search
        # (in-place mutation of dirnames steers os.walk away from "."/"__" dirs)
        dirnames[:] = [dirname for dirname in dirnames if not dirname.startswith((".", "__"))]
        for filename in filenames:
            shutil.copy2(os.path.join(dirpath, filename), os.path.join(dst_dirpath, filename))
        # Copy the directory's own timestamps/permissions after its files.
        shutil.copystat(dirpath, dst_dirpath)
    local_path = relative_to_absolute_path(local_path)
    print(
        f"The processing scripts for metric {path} can be inspected at {local_path}. "
        f"The main class is in {module_source_dirpath}. "
        f'You can modify this processing scripts and use it with `datasets.load_metric("{PurePath(local_path).as_posix()}")`.'
    )
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def get_dataset_infos(
    path: str,
    data_files: Optional[Union[Dict, List, str]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    revision: Optional[Union[str, Version]] = None,
    token: Optional[Union[bool, str]] = None,
    use_auth_token="deprecated",
    **config_kwargs,
):
    """Get the meta information about a dataset, returned as a dict mapping config name to DatasetInfoDict.

    Args:
        path (`str`): Path to the dataset processing script with the dataset builder. Can be either:

            - a local path to a processing script or the directory containing it
              (if the script has the same name as the directory),
              e.g. `'./dataset/squad'` or `'./dataset/squad/squad.py'`
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with
              [`datasets.list_datasets`]), e.g. `'squad'`, `'glue'` or `'openai/webtext'`
        revision (`Union[str, datasets.Version]`, *optional*):
            If specified, the dataset module will be loaded from the datasets repository at this version.
            By default:
            - it is set to the local version of the lib.
            - it will also try to load it from the main branch if it's not available at the local version of the lib.
            Specifying a version that is different from your local version of the lib might cause compatibility issues.
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        data_files (`Union[Dict, List, str]`, *optional*):
            Defining the data_files of the dataset configuration.
        token (`str` or `bool`, *optional*):
            Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
            If `True`, or not specified, will get token from `"~/.huggingface"`.
        use_auth_token (`str` or `bool`, *optional*):
            Deprecated alias for `token`.

            <Deprecated version="2.14.0">

            `use_auth_token` was deprecated in favor of `token` in version 2.14.0 and will be removed in 3.0.0.

            </Deprecated>

        **config_kwargs (additional keyword arguments):
            Optional attributes for builder class which will override the attributes if supplied.

    Example:

    ```py
    >>> from datasets import get_dataset_infos
    >>> get_dataset_infos('rotten_tomatoes')
    {'default': DatasetInfo(description="Movie Review Dataset. ...", ...), ...}
    ```
    """
    # Legacy parameter: forward it to `token` with a FutureWarning.
    if use_auth_token != "deprecated":
        warnings.warn(
            "'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n"
            "You can remove this warning by passing 'token=<use_auth_token>' instead.",
            FutureWarning,
        )
        token = use_auth_token

    config_names = get_dataset_config_names(
        path=path,
        revision=revision,
        download_config=download_config,
        download_mode=download_mode,
        data_files=data_files,
        token=token,
    )
    # One DatasetInfo per configuration.
    infos = {}
    for config_name in config_names:
        infos[config_name] = get_dataset_config_info(
            path=path,
            config_name=config_name,
            data_files=data_files,
            download_config=download_config,
            download_mode=download_mode,
            revision=revision,
            token=token,
            **config_kwargs,
        )
    return infos
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
def get_dataset_config_names(
    path: str,
    revision: Optional[Union[str, Version]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    dynamic_modules_path: Optional[str] = None,
    data_files: Optional[Union[Dict, List, str]] = None,
    **download_kwargs,
):
    """Get the list of available config names for a particular dataset.

    Args:
        path (`str`): Path to the dataset processing script with the dataset builder. Can be either:

            - a local path to a processing script or the directory containing it
              (if the script has the same name as the directory),
              e.g. `'./dataset/squad'` or `'./dataset/squad/squad.py'`
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with
              [`datasets.list_datasets`]), e.g. `'squad'`, `'glue'` or `'openai/webtext'`
        revision (`Union[str, datasets.Version]`, *optional*):
            If specified, the dataset module will be loaded from the datasets repository at this version.
            By default:
            - it is set to the local version of the lib.
            - it will also try to load it from the main branch if it's not available at the local version of the lib.
            Specifying a version that is different from your local version of the lib might cause compatibility issues.
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        dynamic_modules_path (`str`, defaults to `~/.cache/huggingface/modules/datasets_modules`):
            Optional path to the directory in which the dynamic modules are saved.
            It must have been initialized with `init_dynamic_modules`.
            By default the datasets and metrics are stored inside the `datasets_modules` module.
        data_files (`Union[Dict, List, str]`, *optional*):
            Defining the data_files of the dataset configuration.
        **download_kwargs (additional keyword arguments):
            Optional attributes for [`DownloadConfig`] which will override the attributes
            in `download_config` if supplied, for example `token`.

    Example:

    ```py
    >>> from datasets import get_dataset_config_names
    >>> get_dataset_config_names("glue")
    ['cola', 'sst2', 'mrpc', 'qqp', 'stsb', 'mnli', 'mnli_mismatched', 'mnli_matched', 'qnli', 'rte', 'wnli', 'ax']
    ```
    """
    dataset_module = dataset_module_factory(
        path,
        revision=revision,
        download_config=download_config,
        download_mode=download_mode,
        dynamic_modules_path=dynamic_modules_path,
        data_files=data_files,
        **download_kwargs,
    )
    builder_cls = get_dataset_builder_class(dataset_module, dataset_name=os.path.basename(path))
    available_config_names = list(builder_cls.builder_configs.keys())
    if available_config_names:
        return available_config_names
    # No named configs: fall back to the single default config name.
    fallback_name = dataset_module.builder_kwargs.get("config_name", builder_cls.DEFAULT_CONFIG_NAME or "default")
    return [fallback_name]
|
| 360 |
+
|
| 361 |
+
|
| 362 |
+
def get_dataset_default_config_name(
    path: str,
    revision: Optional[Union[str, Version]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    dynamic_modules_path: Optional[str] = None,
    data_files: Optional[Union[Dict, List, str]] = None,
    **download_kwargs,
) -> Optional[str]:
    """Get the default config name for a particular dataset.

    Can return None only if the dataset has multiple configurations and no default configuration.

    Args:
        path (`str`): path to the dataset processing script with the dataset builder. Can be either:

            - a local path to processing script or the directory containing the script (if the script has the same name as the directory),
                e.g. `'./dataset/squad'` or `'./dataset/squad/squad.py'`
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`datasets.list_datasets`])
                e.g. `'squad'`, `'glue'` or `'openai/webtext'`
        revision (`Union[str, datasets.Version]`, *optional*):
            If specified, the dataset module will be loaded from the datasets repository at this version.
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        dynamic_modules_path (`str`, defaults to `~/.cache/huggingface/modules/datasets_modules`):
            Optional path to the directory in which the dynamic modules are saved. It must have been initialized with `init_dynamic_modules`.
        data_files (`Union[Dict, List, str]`, *optional*):
            Defining the data_files of the dataset configuration.
        **download_kwargs (additional keyword arguments):
            Optional attributes for [`DownloadConfig`] which will override the attributes in `download_config` if supplied,
            for example `token`.

    Returns:
        Optional[str]: the default config name if there is one

    Example:

    ```py
    >>> from datasets import get_dataset_default_config_name
    >>> get_dataset_default_config_name("openbookqa")
    'main'
    ```
    """
    module = dataset_module_factory(
        path,
        revision=revision,
        download_config=download_config,
        download_mode=download_mode,
        dynamic_modules_path=dynamic_modules_path,
        data_files=data_files,
        **download_kwargs,
    )
    builder_cls = get_dataset_builder_class(module, dataset_name=os.path.basename(path))
    config_names = list(builder_cls.builder_configs.keys())
    # Fallback used only when the builder class does not declare an explicit default:
    # no declared configs -> "default"; exactly one config -> that config; several -> None.
    if not config_names:
        fallback = "default"
    elif len(config_names) == 1:
        fallback = config_names[0]
    else:
        fallback = None
    return builder_cls.DEFAULT_CONFIG_NAME or fallback
|
| 427 |
+
|
| 428 |
+
|
| 429 |
+
def get_dataset_config_info(
    path: str,
    config_name: Optional[str] = None,
    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    revision: Optional[Union[str, Version]] = None,
    token: Optional[Union[bool, str]] = None,
    use_auth_token="deprecated",
    **config_kwargs,
) -> DatasetInfo:
    """Get the meta information (DatasetInfo) about a dataset for a particular config

    Args:
        path (``str``): path to the dataset processing script with the dataset builder. Can be either:

            - a local path to processing script or the directory containing the script (if the script has the same name as the directory),
                e.g. ``'./dataset/squad'`` or ``'./dataset/squad/squad.py'``
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with ``datasets.list_datasets()``)
                e.g. ``'squad'``, ``'glue'`` or ``'openai/webtext'``
        config_name (:obj:`str`, optional): Defining the name of the dataset configuration.
        data_files (:obj:`str` or :obj:`Sequence` or :obj:`Mapping`, optional): Path(s) to source data file(s).
        download_config (:class:`~download.DownloadConfig`, optional): Specific download configuration parameters.
        download_mode (:class:`DownloadMode` or :obj:`str`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode.
        revision (:class:`~utils.Version` or :obj:`str`, optional): Version of the dataset script to load.
            As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch.
            You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository.
        token (``str`` or :obj:`bool`, optional): Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
            If True, or not specified, will get token from `"~/.huggingface"`.
        use_auth_token (``str`` or :obj:`bool`, optional): Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
            If True, or not specified, will get token from `"~/.huggingface"`.

            <Deprecated version="2.14.0">

            `use_auth_token` was deprecated in favor of `token` in version 2.14.0 and will be removed in 3.0.0.

            </Deprecated>

        **config_kwargs (additional keyword arguments): optional attributes for builder class which will override the attributes if supplied.

    Raises:
        SplitsNotFoundError: if the split names cannot be inferred from the dataset config.
    """
    # Resolve the deprecated `use_auth_token` alias before doing any work.
    # "deprecated" is the sentinel meaning "caller did not pass the argument".
    if use_auth_token != "deprecated":
        warnings.warn(
            "'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n"
            "You can remove this warning by passing 'token=<use_auth_token>' instead.",
            FutureWarning,
        )
        token = use_auth_token

    builder = load_dataset_builder(
        path,
        name=config_name,
        data_files=data_files,
        download_config=download_config,
        download_mode=download_mode,
        revision=revision,
        token=token,
        **config_kwargs,
    )
    info = builder.info
    if info.splits is None:
        # Split info is missing from the builder metadata: infer it by running the
        # builder's split generators in streaming mode (no full download).
        # Copy the download config so the caller's object is not mutated.
        download_config = download_config.copy() if download_config else DownloadConfig()
        if token is not None:
            download_config.token = token
        # Fail early (with a clear error) if the dataset requires a manual download.
        builder._check_manual_download(
            StreamingDownloadManager(base_path=builder.base_path, download_config=download_config)
        )
        try:
            info.splits = {
                split_generator.name: {"name": split_generator.name, "dataset_name": path}
                for split_generator in builder._split_generators(
                    StreamingDownloadManager(base_path=builder.base_path, download_config=download_config)
                )
            }
        except Exception as err:
            # Wrap any generator failure so callers get a single, specific exception type.
            raise SplitsNotFoundError("The split names could not be parsed from the dataset config.") from err
    return info
|
| 506 |
+
|
| 507 |
+
|
| 508 |
+
def get_dataset_split_names(
    path: str,
    config_name: Optional[str] = None,
    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    revision: Optional[Union[str, Version]] = None,
    token: Optional[Union[bool, str]] = None,
    use_auth_token="deprecated",
    **config_kwargs,
):
    """Get the list of available splits for a particular config and dataset.

    Args:
        path (`str`): path to the dataset processing script with the dataset builder. Can be either:

            - a local path to processing script or the directory containing the script (if the script has the same name as the directory),
                e.g. `'./dataset/squad'` or `'./dataset/squad/squad.py'`
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`datasets.list_datasets`])
                e.g. `'squad'`, `'glue'` or `'openai/webtext'`
        config_name (`str`, *optional*):
            Defining the name of the dataset configuration.
        data_files (`str` or `Sequence` or `Mapping`, *optional*):
            Path(s) to source data file(s).
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        revision ([`Version`] or `str`, *optional*):
            Version of the dataset script to load.
        token (`str` or `bool`, *optional*):
            Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
            If `True`, or not specified, will get token from `"~/.huggingface"`.
        use_auth_token (`str` or `bool`, *optional*):
            Deprecated alias for `token` (removed in 3.0.0).
        **config_kwargs (additional keyword arguments):
            Optional attributes for builder class which will override the attributes if supplied.

    Example:

    ```py
    >>> from datasets import get_dataset_split_names
    >>> get_dataset_split_names('rotten_tomatoes')
    ['train', 'validation', 'test']
    ```
    """
    # Honor the deprecated alias first; "deprecated" is the not-passed sentinel.
    if use_auth_token != "deprecated":
        deprecation_message = (
            "'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n"
            "You can remove this warning by passing 'token=<use_auth_token>' instead."
        )
        warnings.warn(deprecation_message, FutureWarning)
        token = use_auth_token

    # Delegate all the heavy lifting (builder loading, split inference) and
    # just expose the split names of the resulting DatasetInfo.
    config_info = get_dataset_config_info(
        path,
        config_name=config_name,
        data_files=data_files,
        download_config=download_config,
        download_mode=download_mode,
        revision=revision,
        token=token,
        **config_kwargs,
    )
    return [split_name for split_name in config_info.splits]
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/keyhash.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
# Lint as: python3
|
| 16 |
+
|
| 17 |
+
"""
|
| 18 |
+
Hashing function for dataset keys using `hashlib.md5`
|
| 19 |
+
|
| 20 |
+
Requirements for the hash function:
|
| 21 |
+
|
| 22 |
+
- Provides a uniformly distributed hash from random space
|
| 23 |
+
- Adequately fast speed
|
| 24 |
+
- Working with multiple input types (in this case, `str`, `int` or `bytes`)
|
| 25 |
+
- Should be platform independent (generates same hash on different OS and systems)
|
| 26 |
+
|
| 27 |
+
The hashing function provides a unique 128-bit integer hash of the key provided.
|
| 28 |
+
|
| 29 |
+
The split name is being used here as the hash salt to avoid having same hashes
|
| 30 |
+
in different splits due to same keys
|
| 31 |
+
"""
|
| 32 |
+
|
| 33 |
+
from typing import Union
|
| 34 |
+
|
| 35 |
+
from huggingface_hub.utils import insecure_hashlib
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def _as_bytes(hash_data: Union[str, int, bytes]) -> bytes:
|
| 39 |
+
"""
|
| 40 |
+
Returns the input hash_data in its bytes form
|
| 41 |
+
|
| 42 |
+
Args:
|
| 43 |
+
hash_data: the hash salt/key to be converted to bytes
|
| 44 |
+
"""
|
| 45 |
+
if isinstance(hash_data, bytes):
|
| 46 |
+
# Data already in bytes, returns as it as
|
| 47 |
+
return hash_data
|
| 48 |
+
elif isinstance(hash_data, str):
|
| 49 |
+
# We keep the data as it as for it ot be later encoded to UTF-8
|
| 50 |
+
# However replace `\\` with `/` for Windows compatibility
|
| 51 |
+
hash_data = hash_data.replace("\\", "/")
|
| 52 |
+
elif isinstance(hash_data, int):
|
| 53 |
+
hash_data = str(hash_data)
|
| 54 |
+
else:
|
| 55 |
+
# If data is not of the required type, raise error
|
| 56 |
+
raise InvalidKeyError(hash_data)
|
| 57 |
+
|
| 58 |
+
return hash_data.encode("utf-8")
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
class InvalidKeyError(Exception):
    """Raised when a dataset key has an unsupported datatype."""

    def __init__(self, hash_data):
        # The message pieces are kept as attributes for backward compatibility.
        self.prefix = "\nFAILURE TO GENERATE DATASET: Invalid key type detected"
        self.err_msg = f"\nFound Key {hash_data} of type {type(hash_data)}"
        self.suffix = "\nKeys should be either str, int or bytes type"
        super().__init__("".join((self.prefix, self.err_msg, self.suffix)))
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
class DuplicatedKeysError(Exception):
    """Raised when several generated examples share the same key."""

    def __init__(self, key, duplicate_key_indices, fix_msg=""):
        self.key = key
        self.duplicate_key_indices = duplicate_key_indices
        self.fix_msg = fix_msg
        self.prefix = "Found multiple examples generated with the same key"
        n_duplicates = len(duplicate_key_indices)
        if n_duplicates > 20:
            # Keep the message readable: list only the first 20 offending indices.
            shown = ", ".join(duplicate_key_indices[:20])
            self.err_msg = f"\nThe examples at index {shown}... ({n_duplicates - 20} more) have the key {key}"
        else:
            self.err_msg = f"\nThe examples at index {', '.join(duplicate_key_indices)} have the key {key}"
        # Optional hint on how to fix the dataset script.
        self.suffix = "\n" + fix_msg if fix_msg else ""
        super().__init__(f"{self.prefix}{self.err_msg}{self.suffix}")
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
class KeyHasher:
    """Provides deterministic 128-bit hashes of dataset keys using md5."""

    def __init__(self, hash_salt: str):
        # Hash the salt once up front; each key hash then starts from a
        # copy of this partially-updated md5 state.
        self._split_md5 = insecure_hashlib.md5(_as_bytes(hash_salt))

    def hash(self, key: Union[str, int, bytes]) -> int:
        """Return the 128-bit integer hash of the input key.

        Args:
            key: the input key to be hashed (should be str, int or bytes)

        Returns: 128-bit int hash key"""
        hasher = self._split_md5.copy()
        hasher.update(_as_bytes(key))
        # The hex digest is interpreted as a single 128-bit integer.
        return int(hasher.hexdigest(), 16)
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/load.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/naming.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
# Lint as: python3
|
| 16 |
+
"""Utilities for file names."""
|
| 17 |
+
|
| 18 |
+
import itertools
|
| 19 |
+
import os
|
| 20 |
+
import re
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
_uppercase_uppercase_re = re.compile(r"([A-Z]+)([A-Z][a-z])")
|
| 24 |
+
_lowercase_uppercase_re = re.compile(r"([a-z\d])([A-Z])")
|
| 25 |
+
|
| 26 |
+
_single_underscore_re = re.compile(r"(?<!_)_(?!_)")
|
| 27 |
+
_multiple_underscores_re = re.compile(r"(_{2,})")
|
| 28 |
+
|
| 29 |
+
_split_re = r"^\w+(\.\w+)*$"
|
| 30 |
+
|
| 31 |
+
INVALID_WINDOWS_CHARACTERS_IN_PATH = r"<>:/\|?*"
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def camelcase_to_snakecase(name):
    """Convert a camel-case string to snake-case."""
    # Two passes: split ACRONYMWord boundaries first, then lowerUpper boundaries.
    for boundary_re in (_uppercase_uppercase_re, _lowercase_uppercase_re):
        name = boundary_re.sub(r"\1_\2", name)
    return name.lower()
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def snakecase_to_camelcase(name):
    """Convert a snake-case string to a camel-case string."""
    # Split on single underscores, then further split each chunk on runs of
    # two or more underscores (which are preserved by the capturing regex).
    fragments = []
    for chunk in _single_underscore_re.split(name):
        fragments.extend(_multiple_underscores_re.split(chunk))
    return "".join(fragment.capitalize() for fragment in fragments if fragment != "")
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def filename_prefix_for_name(name):
    """Return the snake-case filename prefix for a bare dataset name.

    Raises:
        ValueError: if `name` contains path separators.
    """
    if name != os.path.basename(name):
        raise ValueError(f"Should be a dataset name, not a path: {name}")
    return camelcase_to_snakecase(name)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def filename_prefix_for_split(name, split):
    """Return the filename prefix `<snake_case_name>-<split>` for a dataset split.

    Args:
        name: bare dataset name (must not be a path).
        split: split name; must match the `_split_re` pattern.

    Raises:
        ValueError: if `name` contains path separators or `split` is not a valid split name.
    """
    if os.path.basename(name) != name:
        raise ValueError(f"Should be a dataset name, not a path: {name}")
    if not re.match(_split_re, split):
        # Bug fix: the original message had a stray doubled quote after the pattern
        # ("'{_split_re}''"), producing e.g. "should match '^\\w+...$'' but got".
        raise ValueError(f"Split name should match '{_split_re}' but got '{split}'.")
    return f"{filename_prefix_for_name(name)}-{split}"
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def filepattern_for_dataset_split(dataset_name, split, data_dir, filetype_suffix=None):
    """Return a glob pattern matching every shard file of a dataset split."""
    shard_prefix = filename_prefix_for_split(dataset_name, split)
    if filetype_suffix:
        shard_prefix = f"{shard_prefix}.{filetype_suffix}"
    # Trailing "*" matches the "-NNNNN-of-NNNNN" shard suffix (or nothing).
    return f"{os.path.join(data_dir, shard_prefix)}*"
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def filenames_for_dataset_split(path, dataset_name, split, filetype_suffix=None, shard_lengths=None):
    """Return the list of filenames for the shards of a dataset split.

    With `shard_lengths`, one name per shard ("<prefix>-00000-of-00003[.suffix]");
    otherwise a single unsharded filename.
    """
    base = os.path.join(path, filename_prefix_for_split(dataset_name, split))
    suffix = f".{filetype_suffix}" if filetype_suffix else ""
    if not shard_lengths:
        # Unsharded dataset: a single file.
        return [base + suffix]
    num_shards = len(shard_lengths)
    return [f"{base}-{shard_id:05d}-of-{num_shards:05d}{suffix}" for shard_id in range(num_shards)]
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/streaming.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import importlib
|
| 2 |
+
import inspect
|
| 3 |
+
from functools import wraps
|
| 4 |
+
from typing import TYPE_CHECKING, Optional
|
| 5 |
+
|
| 6 |
+
from .download.download_config import DownloadConfig
|
| 7 |
+
from .utils.file_utils import (
|
| 8 |
+
xbasename,
|
| 9 |
+
xdirname,
|
| 10 |
+
xet_parse,
|
| 11 |
+
xexists,
|
| 12 |
+
xgetsize,
|
| 13 |
+
xglob,
|
| 14 |
+
xgzip_open,
|
| 15 |
+
xisdir,
|
| 16 |
+
xisfile,
|
| 17 |
+
xjoin,
|
| 18 |
+
xlistdir,
|
| 19 |
+
xnumpy_load,
|
| 20 |
+
xopen,
|
| 21 |
+
xpandas_read_csv,
|
| 22 |
+
xpandas_read_excel,
|
| 23 |
+
xPath,
|
| 24 |
+
xpyarrow_parquet_read_table,
|
| 25 |
+
xrelpath,
|
| 26 |
+
xsio_loadmat,
|
| 27 |
+
xsplit,
|
| 28 |
+
xsplitext,
|
| 29 |
+
xwalk,
|
| 30 |
+
xxml_dom_minidom_parse,
|
| 31 |
+
)
|
| 32 |
+
from .utils.logging import get_logger
|
| 33 |
+
from .utils.patching import patch_submodule
|
| 34 |
+
from .utils.py_utils import get_imports, lock_importable_file
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
logger = get_logger(__name__)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
if TYPE_CHECKING:
|
| 41 |
+
from .builder import DatasetBuilder
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def extend_module_for_streaming(module_path, download_config: Optional[DownloadConfig] = None):
    """Extend the module to support streaming.

    We patch some functions in the module to use `fsspec` to support data streaming:
    - We use `fsspec.open` to open and read remote files. We patch the module function:
      - `open`
    - We use the "::" hop separator to join paths and navigate remote compressed/archive files. We patch the module
      functions:
      - `os.path.join`
      - `pathlib.Path.joinpath` and `pathlib.Path.__truediv__` (called when using the "/" operator)

    The patched functions are replaced with custom functions defined to work with the
    :class:`~download.streaming_download_manager.StreamingDownloadManager`.

    Args:
        module_path: Path to the module to be extended.
        download_config : mainly use use_auth_token or storage_options to support different platforms and auth types.
    """

    module = importlib.import_module(module_path)

    # TODO(QL): always update the module to add subsequent new authentication without removing old ones
    # Module was already patched: only refresh the auth info on the stored config, don't re-patch.
    if hasattr(module, "_patched_for_streaming") and module._patched_for_streaming:
        if isinstance(module._patched_for_streaming, DownloadConfig):
            # NOTE(review): assumes `download_config` is not None on this path — confirm callers always pass one
            module._patched_for_streaming.token = download_config.token
            module._patched_for_streaming.storage_options = download_config.storage_options
        return

    # Closure over `download_config`: the wrapped x* function receives it on every call.
    def wrap_auth(function):
        @wraps(function)
        def wrapper(*args, **kwargs):
            return function(*args, download_config=download_config, **kwargs)

        wrapper._decorator_name_ = "wrap_auth"
        return wrapper

    # open files in a streaming fashion
    patch_submodule(module, "open", wrap_auth(xopen)).start()
    patch_submodule(module, "os.listdir", wrap_auth(xlistdir)).start()
    patch_submodule(module, "os.walk", wrap_auth(xwalk)).start()
    patch_submodule(module, "glob.glob", wrap_auth(xglob)).start()
    # allow to navigate in remote zip files
    patch_submodule(module, "os.path.join", xjoin).start()
    patch_submodule(module, "os.path.dirname", xdirname).start()
    patch_submodule(module, "os.path.basename", xbasename).start()
    patch_submodule(module, "os.path.relpath", xrelpath).start()
    patch_submodule(module, "os.path.split", xsplit).start()
    patch_submodule(module, "os.path.splitext", xsplitext).start()
    # allow checks on paths
    patch_submodule(module, "os.path.exists", wrap_auth(xexists)).start()
    patch_submodule(module, "os.path.isdir", wrap_auth(xisdir)).start()
    patch_submodule(module, "os.path.isfile", wrap_auth(xisfile)).start()
    patch_submodule(module, "os.path.getsize", wrap_auth(xgetsize)).start()
    patch_submodule(module, "pathlib.Path", xPath).start()
    # file readers
    patch_submodule(module, "gzip.open", wrap_auth(xgzip_open)).start()
    patch_submodule(module, "numpy.load", wrap_auth(xnumpy_load)).start()
    patch_submodule(module, "pandas.read_csv", wrap_auth(xpandas_read_csv), attrs=["__version__"]).start()
    patch_submodule(module, "pandas.read_excel", wrap_auth(xpandas_read_excel), attrs=["__version__"]).start()
    patch_submodule(module, "scipy.io.loadmat", wrap_auth(xsio_loadmat), attrs=["__version__"]).start()
    patch_submodule(module, "xml.etree.ElementTree.parse", wrap_auth(xet_parse)).start()
    patch_submodule(module, "xml.dom.minidom.parse", wrap_auth(xxml_dom_minidom_parse)).start()
    # pyarrow: do not patch pyarrow attribute in packaged modules
    if not module.__name__.startswith("datasets.packaged_modules."):
        patch_submodule(module, "pyarrow.parquet.read_table", wrap_auth(xpyarrow_parquet_read_table)).start()
    # Store the config as the "already patched" marker so the guard above can refresh auth later.
    module._patched_for_streaming = download_config
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def extend_dataset_builder_for_streaming(builder: "DatasetBuilder"):
    """Extend the dataset builder module and the modules imported by it to support streaming.

    Args:
        builder (:class:`DatasetBuilder`): Dataset builder instance.
    """
    # this extends the open and os.path.join functions for data streaming
    download_config = DownloadConfig(storage_options=builder.storage_options, token=builder.token)
    extend_module_for_streaming(builder.__module__, download_config=download_config)
    # if needed, we also have to extend additional internal imports (like wmt14 -> wmt_utils)
    if not builder.__module__.startswith("datasets."):  # check that it's not a packaged builder like csv
        importable_file = inspect.getfile(builder.__class__)
        # Lock while reading imports so concurrent loads of the same script don't race.
        with lock_importable_file(importable_file):
            for imports in get_imports(importable_file):
                # `imports` entries look like (kind, name, ...); only "internal"
                # (same-directory) imports need streaming patches too.
                if imports[0] == "internal":
                    internal_import_name = imports[1]
                    # Sibling module: replace the last component of the builder's module path.
                    internal_module_name = ".".join(builder.__module__.split(".")[:-1] + [internal_import_name])
                    extend_module_for_streaming(internal_module_name, download_config=download_config)

    # builders can inherit from other builders that might use streaming functionality
    # (for example, ImageFolder and AudioFolder inherit from FolderBuilder which implements examples generation)
    # but these parents builders are not patched automatically as they are not instantiated, so we patch them here
    from .builder import DatasetBuilder

    parent_builder_modules = [
        cls.__module__
        for cls in type(builder).__mro__[1:]  # make sure it's not the same module we've already patched
        if issubclass(cls, DatasetBuilder) and cls.__module__ != DatasetBuilder.__module__
    ]  # check it's not a standard builder from datasets.builder
    for module in parent_builder_modules:
        extend_module_for_streaming(module, download_config=download_config)
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore-1.0.7.dist-info/INSTALLER
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
pip
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore-1.0.7.dist-info/METADATA
ADDED
|
@@ -0,0 +1,616 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.3
|
| 2 |
+
Name: httpcore
|
| 3 |
+
Version: 1.0.7
|
| 4 |
+
Summary: A minimal low-level HTTP client.
|
| 5 |
+
Project-URL: Documentation, https://www.encode.io/httpcore
|
| 6 |
+
Project-URL: Homepage, https://www.encode.io/httpcore/
|
| 7 |
+
Project-URL: Source, https://github.com/encode/httpcore
|
| 8 |
+
Author-email: Tom Christie <tom@tomchristie.com>
|
| 9 |
+
License: BSD-3-Clause
|
| 10 |
+
Classifier: Development Status :: 3 - Alpha
|
| 11 |
+
Classifier: Environment :: Web Environment
|
| 12 |
+
Classifier: Framework :: AsyncIO
|
| 13 |
+
Classifier: Framework :: Trio
|
| 14 |
+
Classifier: Intended Audience :: Developers
|
| 15 |
+
Classifier: License :: OSI Approved :: BSD License
|
| 16 |
+
Classifier: Operating System :: OS Independent
|
| 17 |
+
Classifier: Programming Language :: Python :: 3
|
| 18 |
+
Classifier: Programming Language :: Python :: 3 :: Only
|
| 19 |
+
Classifier: Programming Language :: Python :: 3.8
|
| 20 |
+
Classifier: Programming Language :: Python :: 3.9
|
| 21 |
+
Classifier: Programming Language :: Python :: 3.10
|
| 22 |
+
Classifier: Programming Language :: Python :: 3.11
|
| 23 |
+
Classifier: Programming Language :: Python :: 3.12
|
| 24 |
+
Classifier: Topic :: Internet :: WWW/HTTP
|
| 25 |
+
Requires-Python: >=3.8
|
| 26 |
+
Requires-Dist: certifi
|
| 27 |
+
Requires-Dist: h11<0.15,>=0.13
|
| 28 |
+
Provides-Extra: asyncio
|
| 29 |
+
Requires-Dist: anyio<5.0,>=4.0; extra == 'asyncio'
|
| 30 |
+
Provides-Extra: http2
|
| 31 |
+
Requires-Dist: h2<5,>=3; extra == 'http2'
|
| 32 |
+
Provides-Extra: socks
|
| 33 |
+
Requires-Dist: socksio==1.*; extra == 'socks'
|
| 34 |
+
Provides-Extra: trio
|
| 35 |
+
Requires-Dist: trio<1.0,>=0.22.0; extra == 'trio'
|
| 36 |
+
Description-Content-Type: text/markdown
|
| 37 |
+
|
| 38 |
+
# HTTP Core
|
| 39 |
+
|
| 40 |
+
[](https://github.com/encode/httpcore/actions)
|
| 41 |
+
[](https://pypi.org/project/httpcore/)
|
| 42 |
+
|
| 43 |
+
> *Do one thing, and do it well.*
|
| 44 |
+
|
| 45 |
+
The HTTP Core package provides a minimal low-level HTTP client, which does
|
| 46 |
+
one thing only. Sending HTTP requests.
|
| 47 |
+
|
| 48 |
+
It does not provide any high level model abstractions over the API,
|
| 49 |
+
does not handle redirects, multipart uploads, building authentication headers,
|
| 50 |
+
transparent HTTP caching, URL parsing, session cookie handling,
|
| 51 |
+
content or charset decoding, handling JSON, environment based configuration
|
| 52 |
+
defaults, or any of that Jazz.
|
| 53 |
+
|
| 54 |
+
Some things HTTP Core does do:
|
| 55 |
+
|
| 56 |
+
* Sending HTTP requests.
|
| 57 |
+
* Thread-safe / task-safe connection pooling.
|
| 58 |
+
* HTTP(S) proxy & SOCKS proxy support.
|
| 59 |
+
* Supports HTTP/1.1 and HTTP/2.
|
| 60 |
+
* Provides both sync and async interfaces.
|
| 61 |
+
* Async backend support for `asyncio` and `trio`.
|
| 62 |
+
|
| 63 |
+
## Requirements
|
| 64 |
+
|
| 65 |
+
Python 3.8+
|
| 66 |
+
|
| 67 |
+
## Installation
|
| 68 |
+
|
| 69 |
+
For HTTP/1.1 only support, install with:
|
| 70 |
+
|
| 71 |
+
```shell
|
| 72 |
+
$ pip install httpcore
|
| 73 |
+
```
|
| 74 |
+
|
| 75 |
+
There are also a number of optional extras available...
|
| 76 |
+
|
| 77 |
+
```shell
|
| 78 |
+
$ pip install httpcore['asyncio,trio,http2,socks']
|
| 79 |
+
```
|
| 80 |
+
|
| 81 |
+
## Sending requests
|
| 82 |
+
|
| 83 |
+
Send an HTTP request:
|
| 84 |
+
|
| 85 |
+
```python
|
| 86 |
+
import httpcore
|
| 87 |
+
|
| 88 |
+
response = httpcore.request("GET", "https://www.example.com/")
|
| 89 |
+
|
| 90 |
+
print(response)
|
| 91 |
+
# <Response [200]>
|
| 92 |
+
print(response.status)
|
| 93 |
+
# 200
|
| 94 |
+
print(response.headers)
|
| 95 |
+
# [(b'Accept-Ranges', b'bytes'), (b'Age', b'557328'), (b'Cache-Control', b'max-age=604800'), ...]
|
| 96 |
+
print(response.content)
|
| 97 |
+
# b'<!doctype html>\n<html>\n<head>\n<title>Example Domain</title>\n\n<meta charset="utf-8"/>\n ...'
|
| 98 |
+
```
|
| 99 |
+
|
| 100 |
+
The top-level `httpcore.request()` function is provided for convenience. In practice whenever you're working with `httpcore` you'll want to use the connection pooling functionality that it provides.
|
| 101 |
+
|
| 102 |
+
```python
|
| 103 |
+
import httpcore
|
| 104 |
+
|
| 105 |
+
http = httpcore.ConnectionPool()
|
| 106 |
+
response = http.request("GET", "https://www.example.com/")
|
| 107 |
+
```
|
| 108 |
+
|
| 109 |
+
Once you're ready to get going, [head over to the documentation](https://www.encode.io/httpcore/).
|
| 110 |
+
|
| 111 |
+
## Motivation
|
| 112 |
+
|
| 113 |
+
You *probably* don't want to be using HTTP Core directly. It might make sense if
|
| 114 |
+
you're writing something like a proxy service in Python, and you just want
|
| 115 |
+
something at the lowest possible level, but more typically you'll want to use
|
| 116 |
+
a higher level client library, such as `httpx`.
|
| 117 |
+
|
| 118 |
+
The motivation for `httpcore` is:
|
| 119 |
+
|
| 120 |
+
* To provide a reusable low-level client library, that other packages can then build on top of.
|
| 121 |
+
* To provide a *really clear interface split* between the networking code and client logic,
|
| 122 |
+
so that each is easier to understand and reason about in isolation.
|
| 123 |
+
|
| 124 |
+
## Dependencies
|
| 125 |
+
|
| 126 |
+
The `httpcore` package has the following dependencies...
|
| 127 |
+
|
| 128 |
+
* `h11`
|
| 129 |
+
* `certifi`
|
| 130 |
+
|
| 131 |
+
And the following optional extras...
|
| 132 |
+
|
| 133 |
+
* `anyio` - Required by `pip install httpcore['asyncio']`.
|
| 134 |
+
* `trio` - Required by `pip install httpcore['trio']`.
|
| 135 |
+
* `h2` - Required by `pip install httpcore['http2']`.
|
| 136 |
+
* `socksio` - Required by `pip install httpcore['socks']`.
|
| 137 |
+
|
| 138 |
+
## Versioning
|
| 139 |
+
|
| 140 |
+
We use [SEMVER for our versioning policy](https://semver.org/).
|
| 141 |
+
|
| 142 |
+
For changes between package versions please see our [project changelog](CHANGELOG.md).
|
| 143 |
+
|
| 144 |
+
We recommend pinning your requirements either the most current major version, or a more specific version range:
|
| 145 |
+
|
| 146 |
+
```python
|
| 147 |
+
pip install 'httpcore==1.*'
|
| 148 |
+
```
|
| 149 |
+
# Changelog
|
| 150 |
+
|
| 151 |
+
All notable changes to this project will be documented in this file.
|
| 152 |
+
|
| 153 |
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
| 154 |
+
|
| 155 |
+
## Version 1.0.7 (November 15th, 2024)
|
| 156 |
+
|
| 157 |
+
- Support `proxy=…` configuration on `ConnectionPool()`. (#974)
|
| 158 |
+
|
| 159 |
+
## Version 1.0.6 (October 1st, 2024)
|
| 160 |
+
|
| 161 |
+
- Relax `trio` dependency pinning. (#956)
|
| 162 |
+
- Handle `trio` raising `NotImplementedError` on unsupported platforms. (#955)
|
| 163 |
+
- Handle mapping `ssl.SSLError` to `httpcore.ConnectError`. (#918)
|
| 164 |
+
|
| 165 |
+
## 1.0.5 (March 27th, 2024)
|
| 166 |
+
|
| 167 |
+
- Handle `EndOfStream` exception for anyio backend. (#899)
|
| 168 |
+
- Allow trio `0.25.*` series in package dependencies. (#903)
|
| 169 |
+
|
| 170 |
+
## 1.0.4 (February 21st, 2024)
|
| 171 |
+
|
| 172 |
+
- Add `target` request extension. (#888)
|
| 173 |
+
- Fix support for connection `Upgrade` and `CONNECT` when some data in the stream has been read. (#882)
|
| 174 |
+
|
| 175 |
+
## 1.0.3 (February 13th, 2024)
|
| 176 |
+
|
| 177 |
+
- Fix support for async cancellations. (#880)
|
| 178 |
+
- Fix trace extension when used with socks proxy. (#849)
|
| 179 |
+
- Fix SSL context for connections using the "wss" scheme (#869)
|
| 180 |
+
|
| 181 |
+
## 1.0.2 (November 10th, 2023)
|
| 182 |
+
|
| 183 |
+
- Fix `float("inf")` timeouts in `Event.wait` function. (#846)
|
| 184 |
+
|
| 185 |
+
## 1.0.1 (November 3rd, 2023)
|
| 186 |
+
|
| 187 |
+
- Fix pool timeout to account for the total time spent retrying. (#823)
|
| 188 |
+
- Raise a neater RuntimeError when the correct async deps are not installed. (#826)
|
| 189 |
+
- Add support for synchronous TLS-in-TLS streams. (#840)
|
| 190 |
+
|
| 191 |
+
## 1.0.0 (October 6th, 2023)
|
| 192 |
+
|
| 193 |
+
From version 1.0 our async support is now optional, as the package has minimal dependencies by default.
|
| 194 |
+
|
| 195 |
+
For async support use either `pip install 'httpcore[asyncio]'` or `pip install 'httpcore[trio]'`.
|
| 196 |
+
|
| 197 |
+
The project versioning policy is now explicitly governed by SEMVER. See https://semver.org/.
|
| 198 |
+
|
| 199 |
+
- Async support becomes fully optional. (#809)
|
| 200 |
+
- Add support for Python 3.12. (#807)
|
| 201 |
+
|
| 202 |
+
## 0.18.0 (September 8th, 2023)
|
| 203 |
+
|
| 204 |
+
- Add support for HTTPS proxies. (#745, #786)
|
| 205 |
+
- Drop Python 3.7 support. (#727)
|
| 206 |
+
- Handle `sni_hostname` extension with SOCKS proxy. (#774)
|
| 207 |
+
- Handle HTTP/1.1 half-closed connections gracefully. (#641)
|
| 208 |
+
- Change the type of `Extensions` from `Mapping[Str, Any]` to `MutableMapping[Str, Any]`. (#762)
|
| 209 |
+
|
| 210 |
+
## 0.17.3 (July 5th, 2023)
|
| 211 |
+
|
| 212 |
+
- Support async cancellations, ensuring that the connection pool is left in a clean state when cancellations occur. (#726)
|
| 213 |
+
- The networking backend interface has [been added to the public API](https://www.encode.io/httpcore/network-backends). Some classes which were previously private implementation detail are now part of the top-level public API. (#699)
|
| 214 |
+
- Graceful handling of HTTP/2 GoAway frames, with requests being transparently retried on a new connection. (#730)
|
| 215 |
+
- Add exceptions when a synchronous `trace callback` is passed to an asynchronous request or an asynchronous `trace callback` is passed to a synchronous request. (#717)
|
| 216 |
+
- Drop Python 3.7 support. (#727)
|
| 217 |
+
|
| 218 |
+
## 0.17.2 (May 23rd, 2023)
|
| 219 |
+
|
| 220 |
+
- Add `socket_options` argument to `ConnectionPool` and `HTTProxy` classes. (#668)
|
| 221 |
+
- Improve logging with per-module logger names. (#690)
|
| 222 |
+
- Add `sni_hostname` request extension. (#696)
|
| 223 |
+
- Resolve race condition during import of `anyio` package. (#692)
|
| 224 |
+
- Enable TCP_NODELAY for all synchronous sockets. (#651)
|
| 225 |
+
|
| 226 |
+
## 0.17.1 (May 17th, 2023)
|
| 227 |
+
|
| 228 |
+
- If 'retries' is set, then allow retries if an SSL handshake error occurs. (#669)
|
| 229 |
+
- Improve correctness of tracebacks on network exceptions, by raising properly chained exceptions. (#678)
|
| 230 |
+
- Prevent connection-hanging behaviour when HTTP/2 connections are closed by a server-sent 'GoAway' frame. (#679)
|
| 231 |
+
- Fix edge-case exception when removing requests from the connection pool. (#680)
|
| 232 |
+
- Fix pool timeout edge-case. (#688)
|
| 233 |
+
|
| 234 |
+
## 0.17.0 (March 16th, 2023)
|
| 235 |
+
|
| 236 |
+
- Add DEBUG level logging. (#648)
|
| 237 |
+
- Respect HTTP/2 max concurrent streams when settings updates are sent by server. (#652)
|
| 238 |
+
- Increase the allowable HTTP header size to 100kB. (#647)
|
| 239 |
+
- Add `retries` option to SOCKS proxy classes. (#643)
|
| 240 |
+
|
| 241 |
+
## 0.16.3 (December 20th, 2022)
|
| 242 |
+
|
| 243 |
+
- Allow `ws` and `wss` schemes. Allows us to properly support websocket upgrade connections. (#625)
|
| 244 |
+
- Forwarding HTTP proxies use a connection-per-remote-host. Required by some proxy implementations. (#637)
|
| 245 |
+
- Don't raise `RuntimeError` when closing a connection pool with active connections. Removes some error cases when cancellations are used. (#631)
|
| 246 |
+
- Lazy import `anyio`, so that it's no longer a hard dependency, and isn't imported if unused. (#639)
|
| 247 |
+
|
| 248 |
+
## 0.16.2 (November 25th, 2022)
|
| 249 |
+
|
| 250 |
+
- Revert 'Fix async cancellation behaviour', which introduced race conditions. (#627)
|
| 251 |
+
- Raise `RuntimeError` if attempting to us UNIX domain sockets on Windows. (#619)
|
| 252 |
+
|
| 253 |
+
## 0.16.1 (November 17th, 2022)
|
| 254 |
+
|
| 255 |
+
- Fix HTTP/1.1 interim informational responses, such as "100 Continue". (#605)
|
| 256 |
+
|
| 257 |
+
## 0.16.0 (October 11th, 2022)
|
| 258 |
+
|
| 259 |
+
- Support HTTP/1.1 informational responses. (#581)
|
| 260 |
+
- Fix async cancellation behaviour. (#580)
|
| 261 |
+
- Support `h11` 0.14. (#579)
|
| 262 |
+
|
| 263 |
+
## 0.15.0 (May 17th, 2022)
|
| 264 |
+
|
| 265 |
+
- Drop Python 3.6 support (#535)
|
| 266 |
+
- Ensure HTTP proxy CONNECT requests include `timeout` configuration. (#506)
|
| 267 |
+
- Switch to explicit `typing.Optional` for type hints. (#513)
|
| 268 |
+
- For `trio` map OSError exceptions to `ConnectError`. (#543)
|
| 269 |
+
|
| 270 |
+
## 0.14.7 (February 4th, 2022)
|
| 271 |
+
|
| 272 |
+
- Requests which raise a PoolTimeout need to be removed from the pool queue. (#502)
|
| 273 |
+
- Fix AttributeError that happened when Socks5Connection were terminated. (#501)
|
| 274 |
+
|
| 275 |
+
## 0.14.6 (February 1st, 2022)
|
| 276 |
+
|
| 277 |
+
- Fix SOCKS support for `http://` URLs. (#492)
|
| 278 |
+
- Resolve race condition around exceptions during streaming a response. (#491)
|
| 279 |
+
|
| 280 |
+
## 0.14.5 (January 18th, 2022)
|
| 281 |
+
|
| 282 |
+
- SOCKS proxy support. (#478)
|
| 283 |
+
- Add proxy_auth argument to HTTPProxy. (#481)
|
| 284 |
+
- Improve error message on 'RemoteProtocolError' exception when server disconnects without sending a response. (#479)
|
| 285 |
+
|
| 286 |
+
## 0.14.4 (January 5th, 2022)
|
| 287 |
+
|
| 288 |
+
- Support HTTP/2 on HTTPS tunnelling proxies. (#468)
|
| 289 |
+
- Fix proxy headers missing on HTTP forwarding. (#456)
|
| 290 |
+
- Only instantiate SSL context if required. (#457)
|
| 291 |
+
- More robust HTTP/2 handling. (#253, #439, #440, #441)
|
| 292 |
+
|
| 293 |
+
## 0.14.3 (November 17th, 2021)
|
| 294 |
+
|
| 295 |
+
- Fix race condition when removing closed connections from the pool. (#437)
|
| 296 |
+
|
| 297 |
+
## 0.14.2 (November 16th, 2021)
|
| 298 |
+
|
| 299 |
+
- Failed connections no longer remain in the pool. (Pull #433)
|
| 300 |
+
|
| 301 |
+
## 0.14.1 (November 12th, 2021)
|
| 302 |
+
|
| 303 |
+
- `max_connections` becomes optional. (Pull #429)
|
| 304 |
+
- `certifi` is now included in the install dependencies. (Pull #428)
|
| 305 |
+
- `h2` is now strictly optional. (Pull #428)
|
| 306 |
+
|
| 307 |
+
## 0.14.0 (November 11th, 2021)
|
| 308 |
+
|
| 309 |
+
The 0.14 release is a complete reworking of `httpcore`, comprehensively addressing some underlying issues in the connection pooling, as well as substantially redesigning the API to be more user friendly.
|
| 310 |
+
|
| 311 |
+
Some of the lower-level API design also makes the components more easily testable in isolation, and the package now has 100% test coverage.
|
| 312 |
+
|
| 313 |
+
See [discussion #419](https://github.com/encode/httpcore/discussions/419) for a little more background.
|
| 314 |
+
|
| 315 |
+
There's some other neat bits in there too, such as the "trace" extension, which gives a hook into inspecting the internal events that occur during the request/response cycle. This extension is needed for the HTTPX cli, in order to...
|
| 316 |
+
|
| 317 |
+
* Log the point at which the connection is established, and the IP/port on which it is made.
|
| 318 |
+
* Determine if the outgoing request should log as HTTP/1.1 or HTTP/2, rather than having to assume it's HTTP/2 if the --http2 flag was passed. (Which may not actually be true.)
|
| 319 |
+
* Log SSL version info / certificate info.
|
| 320 |
+
|
| 321 |
+
Note that `curio` support is not currently available in 0.14.0. If you're using `httpcore` with `curio` please get in touch, so we can assess if we ought to prioritize it as a feature or not.
|
| 322 |
+
|
| 323 |
+
## 0.13.7 (September 13th, 2021)
|
| 324 |
+
|
| 325 |
+
- Fix broken error messaging when URL scheme is missing, or a non HTTP(S) scheme is used. (Pull #403)
|
| 326 |
+
|
| 327 |
+
## 0.13.6 (June 15th, 2021)
|
| 328 |
+
|
| 329 |
+
### Fixed
|
| 330 |
+
|
| 331 |
+
- Close sockets when read or write timeouts occur. (Pull #365)
|
| 332 |
+
|
| 333 |
+
## 0.13.5 (June 14th, 2021)
|
| 334 |
+
|
| 335 |
+
### Fixed
|
| 336 |
+
|
| 337 |
+
- Resolved niggles with AnyIO EOF behaviours. (Pull #358, #362)
|
| 338 |
+
|
| 339 |
+
## 0.13.4 (June 9th, 2021)
|
| 340 |
+
|
| 341 |
+
### Added
|
| 342 |
+
|
| 343 |
+
- Improved error messaging when URL scheme is missing, or a non HTTP(S) scheme is used. (Pull #354)
|
| 344 |
+
|
| 345 |
+
### Fixed
|
| 346 |
+
|
| 347 |
+
- Switched to `anyio` as the default backend implementation when running with `asyncio`. Resolves some awkward [TLS timeout issues](https://github.com/encode/httpx/discussions/1511).
|
| 348 |
+
|
| 349 |
+
## 0.13.3 (May 6th, 2021)
|
| 350 |
+
|
| 351 |
+
### Added
|
| 352 |
+
|
| 353 |
+
- Support HTTP/2 prior knowledge, using `httpcore.SyncConnectionPool(http1=False)`. (Pull #333)
|
| 354 |
+
|
| 355 |
+
### Fixed
|
| 356 |
+
|
| 357 |
+
- Handle cases where environment does not provide `select.poll` support. (Pull #331)
|
| 358 |
+
|
| 359 |
+
## 0.13.2 (April 29th, 2021)
|
| 360 |
+
|
| 361 |
+
### Added
|
| 362 |
+
|
| 363 |
+
- Improve error message for specific case of `RemoteProtocolError` where server disconnects without sending a response. (Pull #313)
|
| 364 |
+
|
| 365 |
+
## 0.13.1 (April 28th, 2021)
|
| 366 |
+
|
| 367 |
+
### Fixed
|
| 368 |
+
|
| 369 |
+
- More resilient testing for closed connections. (Pull #311)
|
| 370 |
+
- Don't raise exceptions on ungraceful connection closes. (Pull #310)
|
| 371 |
+
|
| 372 |
+
## 0.13.0 (April 21st, 2021)
|
| 373 |
+
|
| 374 |
+
The 0.13 release updates the core API in order to match the HTTPX Transport API,
|
| 375 |
+
introduced in HTTPX 0.18 onwards.
|
| 376 |
+
|
| 377 |
+
An example of making requests with the new interface is:
|
| 378 |
+
|
| 379 |
+
```python
|
| 380 |
+
with httpcore.SyncConnectionPool() as http:
|
| 381 |
+
status_code, headers, stream, extensions = http.handle_request(
|
| 382 |
+
method=b'GET',
|
| 383 |
+
url=(b'https', b'example.org', 443, b'/'),
|
| 384 |
+
headers=[(b'host', b'example.org'), (b'user-agent', b'httpcore')]
|
| 385 |
+
stream=httpcore.ByteStream(b''),
|
| 386 |
+
extensions={}
|
| 387 |
+
)
|
| 388 |
+
body = stream.read()
|
| 389 |
+
print(status_code, body)
|
| 390 |
+
```
|
| 391 |
+
|
| 392 |
+
### Changed
|
| 393 |
+
|
| 394 |
+
- The `.request()` method is now `handle_request()`. (Pull #296)
|
| 395 |
+
- The `.arequest()` method is now `.handle_async_request()`. (Pull #296)
|
| 396 |
+
- The `headers` argument is no longer optional. (Pull #296)
|
| 397 |
+
- The `stream` argument is no longer optional. (Pull #296)
|
| 398 |
+
- The `ext` argument is now named `extensions`, and is no longer optional. (Pull #296)
|
| 399 |
+
- The `"reason"` extension keyword is now named `"reason_phrase"`. (Pull #296)
|
| 400 |
+
- The `"reason_phrase"` and `"http_version"` extensions now use byte strings for their values. (Pull #296)
|
| 401 |
+
- The `httpcore.PlainByteStream()` class becomes `httpcore.ByteStream()`. (Pull #296)
|
| 402 |
+
|
| 403 |
+
### Added
|
| 404 |
+
|
| 405 |
+
- Streams now support a `.read()` interface. (Pull #296)
|
| 406 |
+
|
| 407 |
+
### Fixed
|
| 408 |
+
|
| 409 |
+
- Task cancellation no longer leaks connections from the connection pool. (Pull #305)
|
| 410 |
+
|
| 411 |
+
## 0.12.3 (December 7th, 2020)
|
| 412 |
+
|
| 413 |
+
### Fixed
|
| 414 |
+
|
| 415 |
+
- Abort SSL connections on close rather than waiting for remote EOF when using `asyncio`. (Pull #167)
|
| 416 |
+
- Fix exception raised in case of connect timeouts when using the `anyio` backend. (Pull #236)
|
| 417 |
+
- Fix `Host` header precedence for `:authority` in HTTP/2. (Pull #241, #243)
|
| 418 |
+
- Handle extra edge case when detecting for socket readability when using `asyncio`. (Pull #242, #244)
|
| 419 |
+
- Fix `asyncio` SSL warning when using proxy tunneling. (Pull #249)
|
| 420 |
+
|
| 421 |
+
## 0.12.2 (November 20th, 2020)
|
| 422 |
+
|
| 423 |
+
### Fixed
|
| 424 |
+
|
| 425 |
+
- Properly wrap connect errors on the asyncio backend. (Pull #235)
|
| 426 |
+
- Fix `ImportError` occurring on Python 3.9 when using the HTTP/1.1 sync client in a multithreaded context. (Pull #237)
|
| 427 |
+
|
| 428 |
+
## 0.12.1 (November 7th, 2020)
|
| 429 |
+
|
| 430 |
+
### Added
|
| 431 |
+
|
| 432 |
+
- Add connect retries. (Pull #221)
|
| 433 |
+
|
| 434 |
+
### Fixed
|
| 435 |
+
|
| 436 |
+
- Tweak detection of dropped connections, resolving an issue with open files limits on Linux. (Pull #185)
|
| 437 |
+
- Avoid leaking connections when establishing an HTTP tunnel to a proxy has failed. (Pull #223)
|
| 438 |
+
- Properly wrap OS errors when using `trio`. (Pull #225)
|
| 439 |
+
|
| 440 |
+
## 0.12.0 (October 6th, 2020)
|
| 441 |
+
|
| 442 |
+
### Changed
|
| 443 |
+
|
| 444 |
+
- HTTP header casing is now preserved, rather than always sent in lowercase. (#216 and python-hyper/h11#104)
|
| 445 |
+
|
| 446 |
+
### Added
|
| 447 |
+
|
| 448 |
+
- Add Python 3.9 to officially supported versions.
|
| 449 |
+
|
| 450 |
+
### Fixed
|
| 451 |
+
|
| 452 |
+
- Gracefully handle a stdlib asyncio bug when a connection is closed while it is in a paused-for-reading state. (#201)
|
| 453 |
+
|
| 454 |
+
## 0.11.1 (September 28th, 2020)
|
| 455 |
+
|
| 456 |
+
### Fixed
|
| 457 |
+
|
| 458 |
+
- Add await to async semaphore release() coroutine (#197)
|
| 459 |
+
- Drop incorrect curio classifier (#192)
|
| 460 |
+
|
| 461 |
+
## 0.11.0 (September 22nd, 2020)
|
| 462 |
+
|
| 463 |
+
The Transport API with 0.11.0 has a couple of significant changes.
|
| 464 |
+
|
| 465 |
+
Firstly we've changed the request interface in order to allow extensions, which will later enable us to support features
|
| 466 |
+
such as trailing headers, HTTP/2 server push, and CONNECT/Upgrade connections.
|
| 467 |
+
|
| 468 |
+
The interface changes from:
|
| 469 |
+
|
| 470 |
+
```python
|
| 471 |
+
def request(method, url, headers, stream, timeout):
|
| 472 |
+
return (http_version, status_code, reason, headers, stream)
|
| 473 |
+
```
|
| 474 |
+
|
| 475 |
+
To instead include an optional dictionary of extensions on the request and response:
|
| 476 |
+
|
| 477 |
+
```python
|
| 478 |
+
def request(method, url, headers, stream, ext):
|
| 479 |
+
return (status_code, headers, stream, ext)
|
| 480 |
+
```
|
| 481 |
+
|
| 482 |
+
Having an open-ended extensions point will allow us to add later support for various optional features, that wouldn't otherwise be supported without these API changes.
|
| 483 |
+
|
| 484 |
+
In particular:
|
| 485 |
+
|
| 486 |
+
* Trailing headers support.
|
| 487 |
+
* HTTP/2 Server Push
|
| 488 |
+
* sendfile.
|
| 489 |
+
* Exposing raw connection on CONNECT, Upgrade, HTTP/2 bi-di streaming.
|
| 490 |
+
* Exposing debug information out of the API, including template name, template context.
|
| 491 |
+
|
| 492 |
+
Currently extensions are limited to:
|
| 493 |
+
|
| 494 |
+
* request: `timeout` - Optional. Timeout dictionary.
|
| 495 |
+
* response: `http_version` - Optional. Include the HTTP version used on the response.
|
| 496 |
+
* response: `reason` - Optional. Include the reason phrase used on the response. Only valid with HTTP/1.*.
|
| 497 |
+
|
| 498 |
+
See https://github.com/encode/httpx/issues/1274#issuecomment-694884553 for the history behind this.
|
| 499 |
+
|
| 500 |
+
Secondly, the async version of `request` is now namespaced as `arequest`.
|
| 501 |
+
|
| 502 |
+
This allows concrete transports to support both sync and async implementations on the same class.
|
| 503 |
+
|
| 504 |
+
### Added
|
| 505 |
+
|
| 506 |
+
- Add curio support. (Pull #168)
|
| 507 |
+
- Add anyio support, with `backend="anyio"`. (Pull #169)
|
| 508 |
+
|
| 509 |
+
### Changed
|
| 510 |
+
|
| 511 |
+
- Update the Transport API to use 'ext' for optional extensions. (Pull #190)
|
| 512 |
+
- Update the Transport API to use `.request` and `.arequest` so implementations can support both sync and async. (Pull #189)
|
| 513 |
+
|
| 514 |
+
## 0.10.2 (August 20th, 2020)
|
| 515 |
+
|
| 516 |
+
### Added
|
| 517 |
+
|
| 518 |
+
- Added Unix Domain Socket support. (Pull #139)
|
| 519 |
+
|
| 520 |
+
### Fixed
|
| 521 |
+
|
| 522 |
+
- Always include the port on proxy CONNECT requests. (Pull #154)
|
| 523 |
+
- Fix `max_keepalive_connections` configuration. (Pull #153)
|
| 524 |
+
- Fixes behaviour in HTTP/1.1 where server disconnects can be used to signal the end of the response body. (Pull #164)
|
| 525 |
+
|
| 526 |
+
## 0.10.1 (August 7th, 2020)
|
| 527 |
+
|
| 528 |
+
- Include `max_keepalive_connections` on `AsyncHTTPProxy`/`SyncHTTPProxy` classes.
|
| 529 |
+
|
| 530 |
+
## 0.10.0 (August 7th, 2020)
|
| 531 |
+
|
| 532 |
+
The most notable change in the 0.10.0 release is that HTTP/2 support is now fully optional.
|
| 533 |
+
|
| 534 |
+
Use either `pip install httpcore` for HTTP/1.1 support only, or `pip install httpcore[http2]` for HTTP/1.1 and HTTP/2 support.
|
| 535 |
+
|
| 536 |
+
### Added
|
| 537 |
+
|
| 538 |
+
- HTTP/2 support becomes optional. (Pull #121, #130)
|
| 539 |
+
- Add `local_address=...` support. (Pull #100, #134)
|
| 540 |
+
- Add `PlainByteStream`, `IteratorByteStream`, `AsyncIteratorByteStream`. The `AsyncByteSteam` and `SyncByteStream` classes are now pure interface classes. (#133)
|
| 541 |
+
- Add `LocalProtocolError`, `RemoteProtocolError` exceptions. (Pull #129)
|
| 542 |
+
- Add `UnsupportedProtocol` exception. (Pull #128)
|
| 543 |
+
- Add `.get_connection_info()` method. (Pull #102, #137)
|
| 544 |
+
- Add better TRACE logs. (Pull #101)
|
| 545 |
+
|
| 546 |
+
### Changed
|
| 547 |
+
|
| 548 |
+
- `max_keepalive` is deprecated in favour of `max_keepalive_connections`. (Pull #140)
|
| 549 |
+
|
| 550 |
+
### Fixed
|
| 551 |
+
|
| 552 |
+
- Improve handling of server disconnects. (Pull #112)
|
| 553 |
+
|
| 554 |
+
## 0.9.1 (May 27th, 2020)
|
| 555 |
+
|
| 556 |
+
### Fixed
|
| 557 |
+
|
| 558 |
+
- Proper host resolution for sync case, including IPv6 support. (Pull #97)
|
| 559 |
+
- Close outstanding connections when connection pool is closed. (Pull #98)
|
| 560 |
+
|
| 561 |
+
## 0.9.0 (May 21st, 2020)
|
| 562 |
+
|
| 563 |
+
### Changed
|
| 564 |
+
|
| 565 |
+
- URL port becomes an `Optional[int]` instead of `int`. (Pull #92)
|
| 566 |
+
|
| 567 |
+
### Fixed
|
| 568 |
+
|
| 569 |
+
- Honor HTTP/2 max concurrent streams settings. (Pull #89, #90)
|
| 570 |
+
- Remove incorrect debug log. (Pull #83)
|
| 571 |
+
|
| 572 |
+
## 0.8.4 (May 11th, 2020)
|
| 573 |
+
|
| 574 |
+
### Added
|
| 575 |
+
|
| 576 |
+
- Logging via HTTPCORE_LOG_LEVEL and HTTPX_LOG_LEVEL environment variables
|
| 577 |
+
and TRACE level logging. (Pull #79)
|
| 578 |
+
|
| 579 |
+
### Fixed
|
| 580 |
+
|
| 581 |
+
- Reuse of connections on HTTP/2 in close concurrency situations. (Pull #81)
|
| 582 |
+
|
| 583 |
+
## 0.8.3 (May 6th, 2020)
|
| 584 |
+
|
| 585 |
+
### Fixed
|
| 586 |
+
|
| 587 |
+
- Include `Host` and `Accept` headers on proxy "CONNECT" requests.
|
| 588 |
+
- De-duplicate any headers also contained in proxy_headers.
|
| 589 |
+
- HTTP/2 flag not being passed down to proxy connections.
|
| 590 |
+
|
| 591 |
+
## 0.8.2 (May 3rd, 2020)
|
| 592 |
+
|
| 593 |
+
### Fixed
|
| 594 |
+
|
| 595 |
+
- Fix connections using proxy forwarding requests not being added to the
|
| 596 |
+
connection pool properly. (Pull #70)
|
| 597 |
+
|
| 598 |
+
## 0.8.1 (April 30th, 2020)
|
| 599 |
+
|
| 600 |
+
### Changed
|
| 601 |
+
|
| 602 |
+
- Allow inheritance of both `httpcore.AsyncByteStream`, `httpcore.SyncByteStream` without type conflicts.
|
| 603 |
+
|
| 604 |
+
## 0.8.0 (April 30th, 2020)
|
| 605 |
+
|
| 606 |
+
### Fixed
|
| 607 |
+
|
| 608 |
+
- Fixed tunnel proxy support.
|
| 609 |
+
|
| 610 |
+
### Added
|
| 611 |
+
|
| 612 |
+
- New `TimeoutException` base class.
|
| 613 |
+
|
| 614 |
+
## 0.7.0 (March 5th, 2020)
|
| 615 |
+
|
| 616 |
+
- First integration with HTTPX.
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore-1.0.7.dist-info/RECORD
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
httpcore-1.0.7.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
| 2 |
+
httpcore-1.0.7.dist-info/METADATA,sha256=ATe1rdfnyvJCveGq1xl8q7B27Suta1I2xVcfYU-my4M,21265
|
| 3 |
+
httpcore-1.0.7.dist-info/RECORD,,
|
| 4 |
+
httpcore-1.0.7.dist-info/WHEEL,sha256=C2FUgwZgiLbznR-k0b_5k3Ai_1aASOXDss3lzCUsUug,87
|
| 5 |
+
httpcore-1.0.7.dist-info/licenses/LICENSE.md,sha256=_ctZFUx0y6uhahEkL3dAvqnyPW_rVUeRfYxflKgDkqU,1518
|
| 6 |
+
httpcore/__init__.py,sha256=LrhuDP3kqwQW-464qRK_Q7B72Zp0LklpkEqbqUHAh2E,3357
|
| 7 |
+
httpcore/__pycache__/__init__.cpython-312.pyc,,
|
| 8 |
+
httpcore/__pycache__/_api.cpython-312.pyc,,
|
| 9 |
+
httpcore/__pycache__/_exceptions.cpython-312.pyc,,
|
| 10 |
+
httpcore/__pycache__/_models.cpython-312.pyc,,
|
| 11 |
+
httpcore/__pycache__/_ssl.cpython-312.pyc,,
|
| 12 |
+
httpcore/__pycache__/_synchronization.cpython-312.pyc,,
|
| 13 |
+
httpcore/__pycache__/_trace.cpython-312.pyc,,
|
| 14 |
+
httpcore/__pycache__/_utils.cpython-312.pyc,,
|
| 15 |
+
httpcore/_api.py,sha256=unZmeDschBWCGCPCwkS3Wot9euK6bg_kKxLtGTxw214,3146
|
| 16 |
+
httpcore/_async/__init__.py,sha256=EWdl2v4thnAHzJpqjU4h2a8DUiGAvNiWrkii9pfhTf0,1221
|
| 17 |
+
httpcore/_async/__pycache__/__init__.cpython-312.pyc,,
|
| 18 |
+
httpcore/_async/__pycache__/connection.cpython-312.pyc,,
|
| 19 |
+
httpcore/_async/__pycache__/connection_pool.cpython-312.pyc,,
|
| 20 |
+
httpcore/_async/__pycache__/http11.cpython-312.pyc,,
|
| 21 |
+
httpcore/_async/__pycache__/http2.cpython-312.pyc,,
|
| 22 |
+
httpcore/_async/__pycache__/http_proxy.cpython-312.pyc,,
|
| 23 |
+
httpcore/_async/__pycache__/interfaces.cpython-312.pyc,,
|
| 24 |
+
httpcore/_async/__pycache__/socks_proxy.cpython-312.pyc,,
|
| 25 |
+
httpcore/_async/connection.py,sha256=6OcPXqMEfc0BU38_-iHUNDd1vKSTc2UVT09XqNb_BOk,8449
|
| 26 |
+
httpcore/_async/connection_pool.py,sha256=DOIQ2s2ZCf9qfwxhzMprTPLqCL8OxGXiKF6qRHxvVyY,17307
|
| 27 |
+
httpcore/_async/http11.py,sha256=-qM9bV7PjSQF5vxs37-eUXOIFwbIjPcZbNliuX9TtBw,13880
|
| 28 |
+
httpcore/_async/http2.py,sha256=2mPEUDu8jwx99MVDhDKBu1e8ajCVEkBOu1jUQLk0KR8,23648
|
| 29 |
+
httpcore/_async/http_proxy.py,sha256=2zVkrlv-Ds-rWGaqaXlrhEJiAQFPo23BT3Gq_sWoBXU,14701
|
| 30 |
+
httpcore/_async/interfaces.py,sha256=jTiaWL83pgpGC9ziv90ZfwaKNMmHwmOalzaKiuTxATo,4455
|
| 31 |
+
httpcore/_async/socks_proxy.py,sha256=lLKgLlggPfhFlqi0ODeBkOWvt9CghBBUyqsnsU1tx6Q,13841
|
| 32 |
+
httpcore/_backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 33 |
+
httpcore/_backends/__pycache__/__init__.cpython-312.pyc,,
|
| 34 |
+
httpcore/_backends/__pycache__/anyio.cpython-312.pyc,,
|
| 35 |
+
httpcore/_backends/__pycache__/auto.cpython-312.pyc,,
|
| 36 |
+
httpcore/_backends/__pycache__/base.cpython-312.pyc,,
|
| 37 |
+
httpcore/_backends/__pycache__/mock.cpython-312.pyc,,
|
| 38 |
+
httpcore/_backends/__pycache__/sync.cpython-312.pyc,,
|
| 39 |
+
httpcore/_backends/__pycache__/trio.cpython-312.pyc,,
|
| 40 |
+
httpcore/_backends/anyio.py,sha256=x8PgEhXRC8bVqsdzk_YJx8Y6d9Tub06CuUSwnbmtqoY,5252
|
| 41 |
+
httpcore/_backends/auto.py,sha256=zO136PKZmsaTDK-HRk84eA-MUg8_2wJf4NvmK432Aio,1662
|
| 42 |
+
httpcore/_backends/base.py,sha256=aShgRdZnMmRhFWHetjumlM73f8Kz1YOAyCUP_4kHslA,3042
|
| 43 |
+
httpcore/_backends/mock.py,sha256=er9T436uSe7NLrfiLa4x6Nuqg5ivQ693CxWYCWsgbH4,4077
|
| 44 |
+
httpcore/_backends/sync.py,sha256=bhE4d9iK9Umxdsdsgm2EfKnXaBms2WggGYU-7jmUujU,7977
|
| 45 |
+
httpcore/_backends/trio.py,sha256=LHu4_Mr5MswQmmT3yE4oLgf9b_JJfeVS4BjDxeJc7Ro,5996
|
| 46 |
+
httpcore/_exceptions.py,sha256=looCKga3_YVYu3s-d3L9RMPRJyhsY7fiuuGxvkOD0c0,1184
|
| 47 |
+
httpcore/_models.py,sha256=IO2CcXcdpovRcLTdGFGB6RyBZdEm2h_TOmoCc4rEKho,17623
|
| 48 |
+
httpcore/_ssl.py,sha256=srqmSNU4iOUvWF-SrJvb8G_YEbHFELOXQOwdDIBTS9c,187
|
| 49 |
+
httpcore/_sync/__init__.py,sha256=JBDIgXt5la1LCJ1sLQeKhjKFpLnpNr8Svs6z2ni3fgg,1141
|
| 50 |
+
httpcore/_sync/__pycache__/__init__.cpython-312.pyc,,
|
| 51 |
+
httpcore/_sync/__pycache__/connection.cpython-312.pyc,,
|
| 52 |
+
httpcore/_sync/__pycache__/connection_pool.cpython-312.pyc,,
|
| 53 |
+
httpcore/_sync/__pycache__/http11.cpython-312.pyc,,
|
| 54 |
+
httpcore/_sync/__pycache__/http2.cpython-312.pyc,,
|
| 55 |
+
httpcore/_sync/__pycache__/http_proxy.cpython-312.pyc,,
|
| 56 |
+
httpcore/_sync/__pycache__/interfaces.cpython-312.pyc,,
|
| 57 |
+
httpcore/_sync/__pycache__/socks_proxy.cpython-312.pyc,,
|
| 58 |
+
httpcore/_sync/connection.py,sha256=9exGOb3PB-Mp2T1-sckSeL2t-tJ_9-NXomV8ihmWCgU,8238
|
| 59 |
+
httpcore/_sync/connection_pool.py,sha256=a-T8LTsUxc7r0Ww1atfHSDoWPjQ0fA8Ul7S3-F0Mj70,16955
|
| 60 |
+
httpcore/_sync/http11.py,sha256=IFobD1Md5JFlJGKWnh1_Q3epikUryI8qo09v8MiJIEA,13476
|
| 61 |
+
httpcore/_sync/http2.py,sha256=IZOBL1nNpOKJYwTSHYWtscD3zjSg8f85-63-o5RedVc,23112
|
| 62 |
+
httpcore/_sync/http_proxy.py,sha256=_al_6crKuEZu2wyvu493RZImJdBJnj5oGKNjLOJL2Zo,14463
|
| 63 |
+
httpcore/_sync/interfaces.py,sha256=snXON42vUDHO5JBJvo8D4VWk2Wat44z2OXXHDrjbl94,4344
|
| 64 |
+
httpcore/_sync/socks_proxy.py,sha256=zegZW9Snqj2_992DFJa8_CppOVBkVL4AgwduRkStakQ,13614
|
| 65 |
+
httpcore/_synchronization.py,sha256=zSi13mAColBnknjZBknUC6hKNDQT4C6ijnezZ-r0T2s,9434
|
| 66 |
+
httpcore/_trace.py,sha256=ck6ZoIzYTkdNAIfq5MGeKqBXDtqjOX-qfYwmZFbrGco,3952
|
| 67 |
+
httpcore/_utils.py,sha256=_RLgXYOAYC350ikALV59GZ68IJrdocRZxPs9PjmzdFY,1537
|
| 68 |
+
httpcore/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|