import json
import os
from dataclasses import asdict, dataclass, is_dataclass
from itertools import chain
from typing import Dict, FrozenSet, List, Optional, Set, Tuple, Union


@dataclass(eq=True, frozen=True)
class Evidence:
    """
    (docid, start_token, end_token) form the only official Evidence; sentence-level annotations are for convenience.

    Args:
        text: Some representation of the evidence text
        docid: Some identifier for the document
        start_token: The canonical start token, inclusive
        end_token: The canonical end token, exclusive
        start_sentence: Best-guess start sentence, inclusive
        end_sentence: Best-guess end sentence, exclusive
    """

    text: Union[str, Tuple[int, ...], Tuple[str, ...]]
    docid: str
    start_token: int = -1
    end_token: int = -1
    start_sentence: int = -1
    end_sentence: int = -1


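# A minimal usage sketch (illustrative values only; the docid and offsets are
# hypothetical). Token offsets index into the whitespace-tokenized document,
# with the end offset exclusive:
#
#   ev = Evidence(
#       text="not a masterpiece",
#       docid="negR_000.txt",
#       start_token=10,
#       end_token=13,
#   )

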
@dataclass(eq=True, frozen=True)
class Annotation:
    """
    Args:
        annotation_id: unique ID for this annotation element
        query: some representation of a query string
        evidences: a set of "evidence groups".
            Each evidence group is:
                * sufficient to respond to the query (or justify an answer)
                * composed of one or more Evidences
                * may have multiple documents in it (depending on the dataset)
                    - e-snli has multiple documents
                    - other datasets do not
        classification: str
        query_type: Optional str, additional information about the query
        docids: a set of docids in which one may find evidence.
    """

    annotation_id: str
    query: Union[str, Tuple[int, ...]]
    evidences: Union[Set[Tuple[Evidence]], FrozenSet[Tuple[Evidence]]]
    classification: str
    query_type: Optional[str] = None
    docids: Optional[Set[str]] = None

    def all_evidences(self) -> Tuple[Evidence, ...]:
        """Flattens all evidence groups into a single tuple of Evidences."""
        return tuple(chain.from_iterable(self.evidences))


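# Sketch of the flattening behavior (ev1..ev3 are hypothetical Evidence
# instances): for an Annotation whose evidences are frozenset({(ev1, ev2),
# (ev3,)}), all_evidences() returns (ev1, ev2, ev3); group order is not
# guaranteed, since the groups live in a set.

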
def annotations_to_jsonl(annotations, output_file):
    with open(output_file, "w") as of:
        for ann in sorted(annotations, key=lambda x: x.annotation_id):
            as_json = _annotation_to_dict(ann)
            as_str = json.dumps(as_json, sort_keys=True)
            of.write(as_str)
            of.write("\n")


def _annotation_to_dict(dc):
    # Recursively converts an Annotation (or any value nested inside one)
    # into JSON-serializable primitives.
    if is_dataclass(dc):
        d = asdict(dc)
        ret = dict()
        for k, v in d.items():
            ret[k] = _annotation_to_dict(v)
        return ret
    elif isinstance(dc, dict):
        ret = dict()
        for k, v in dc.items():
            k = _annotation_to_dict(k)
            v = _annotation_to_dict(v)
            ret[k] = v
        return ret
    elif isinstance(dc, str):
        return dc
    elif isinstance(dc, (set, frozenset, list, tuple)):
        # Sets have no JSON equivalent; every container becomes a tuple,
        # which json.dumps serializes as an array.
        return tuple(_annotation_to_dict(x) for x in dc)
    else:
        return dc


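# For reference, one serialized line produced this way looks roughly like the
# following (all field values hypothetical, inner fields elided):
#
#   {"annotation_id": "negR_000.txt", "classification": "NEG",
#    "docids": null, "evidences": [[{"docid": "negR_000.txt", ...}]],
#    "query": "what is the sentiment?", "query_type": null}

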
def load_jsonl(fp: str) -> List[dict]:
    ret = []
    with open(fp, "r") as inf:
        for line in inf:
            content = json.loads(line)
            ret.append(content)
    return ret


def write_jsonl(jsonl, output_file):
    with open(output_file, "w") as of:
        for js in jsonl:
            as_str = json.dumps(js, sort_keys=True)
            of.write(as_str)
            of.write("\n")


def annotations_from_jsonl(fp: str) -> List[Annotation]:
    ret = []
    with open(fp, "r") as inf:
        for line in inf:
            content = json.loads(line)
            # Evidence groups are stored as lists of lists in JSON; rebuild
            # them as a frozenset of tuples so the Annotation stays hashable.
            ev_groups = []
            for ev_group in content["evidences"]:
                ev_group = tuple(Evidence(**ev) for ev in ev_group)
                ev_groups.append(ev_group)
            content["evidences"] = frozenset(ev_groups)
            ret.append(Annotation(**content))
    return ret


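# Round-trip sketch (path hypothetical). For annotations with string-valued
# text/query fields and docids=None, writing then reading recovers equal
# Annotation objects, in annotation_id order:
#
#   annotations_to_jsonl(anns, "/tmp/anns.jsonl")
#   assert annotations_from_jsonl("/tmp/anns.jsonl") == sorted(
#       anns, key=lambda a: a.annotation_id
#   )

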
def load_datasets(
    data_dir: str,
) -> Tuple[List[Annotation], List[Annotation], List[Annotation]]:
    """Loads a training, validation, and test dataset.

    Each dataset is assumed to have been serialized by annotations_to_jsonl,
    i.e. each file is a list of JSON-serialized Annotation instances.
    """
    train_data = annotations_from_jsonl(os.path.join(data_dir, "train.jsonl"))
    val_data = annotations_from_jsonl(os.path.join(data_dir, "val.jsonl"))
    test_data = annotations_from_jsonl(os.path.join(data_dir, "test.jsonl"))
    return train_data, val_data, test_data


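# Usage sketch (directory name hypothetical); expects train.jsonl, val.jsonl,
# and test.jsonl under the given directory:
#
#   train, val, test = load_datasets("data/movies")

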
def load_documents(
    data_dir: str, docids: Set[str] = None
) -> Dict[str, List[List[str]]]:
    """Loads a subset of available documents from disk.

    Each document is assumed to be serialized as newline ('\n') separated sentences.
    Each sentence is assumed to be space (' ') joined tokens.
    """
    if os.path.exists(os.path.join(data_dir, "docs.jsonl")):
        assert not os.path.exists(os.path.join(data_dir, "docs"))
        return load_documents_from_file(data_dir, docids)

    docs_dir = os.path.join(data_dir, "docs")
    res = dict()
    if docids is None:
        docids = sorted(os.listdir(docs_dir))
    else:
        docids = sorted(set(str(d) for d in docids))
    for d in docids:
        with open(os.path.join(docs_dir, d), "r") as inf:
            # Split into sentences on newlines and into tokens on whitespace,
            # dropping empty lines, to match the declared return type.
            lines = [line.strip() for line in inf]
            res[d] = [line.split() for line in lines if line]
    return res


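# Shape sketch: for a two-sentence document whose file contains "a b c\nd e",
# load_documents returns {docid: [["a", "b", "c"], ["d", "e"]]}
# (docid hypothetical).

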
def load_flattened_documents(data_dir: str, docids: Set[str]) -> Dict[str, List[str]]:
    """Loads a subset of available documents from disk.

    Returns a tokenized version of each document with sentence boundaries
    flattened away, i.e. a single list of tokens per document.
    """
    unflattened_docs = load_documents(data_dir, docids)
    flattened_docs = dict()
    for doc, unflattened in unflattened_docs.items():
        flattened_docs[doc] = list(chain.from_iterable(unflattened))
    return flattened_docs


def intern_documents(
    documents: Dict[str, List[List[str]]], word_interner: Dict[str, int], unk_token: str
) -> Dict[str, List[List[int]]]:
    """
    Replaces every word with its index in an embeddings file.

    If a word is not found, uses the unk_token instead.
    """
    ret = dict()
    unk = word_interner[unk_token]
    for docid, sentences in documents.items():
        ret[docid] = [[word_interner.get(w, unk) for w in s] for s in sentences]
    return ret


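# A minimal sketch, assuming a toy vocabulary (all values hypothetical):
#
#   interner = {"<unk>": 0, "good": 1, "movie": 2}
#   docs = {"doc1": [["good", "movie"], ["terrible", "movie"]]}
#   intern_documents(docs, interner, "<unk>")
#   # -> {"doc1": [[1, 2], [0, 2]]}

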
def intern_annotations(
    annotations: List[Annotation], word_interner: Dict[str, int], unk_token: str
) -> List[Annotation]:
    """Interns the query and evidence text of every annotation.

    Note that the docids field is not carried over to the new Annotations.
    """
    unk = word_interner[unk_token]
    ret = []
    for ann in annotations:
        ev_groups = []
        for ev_group in ann.evidences:
            evs = []
            for ev in ev_group:
                evs.append(
                    Evidence(
                        text=tuple(
                            word_interner.get(t, unk) for t in ev.text.split()
                        ),
                        docid=ev.docid,
                        start_token=ev.start_token,
                        end_token=ev.end_token,
                        start_sentence=ev.start_sentence,
                        end_sentence=ev.end_sentence,
                    )
                )
            ev_groups.append(tuple(evs))
        ret.append(
            Annotation(
                annotation_id=ann.annotation_id,
                query=tuple(
                    word_interner.get(t, unk) for t in ann.query.split()
                ),
                evidences=frozenset(ev_groups),
                classification=ann.classification,
                query_type=ann.query_type,
            )
        )
    return ret


def load_documents_from_file(
    data_dir: str, docids: Set[str] = None
) -> Dict[str, List[List[str]]]:
    """Loads a subset of available documents from the 'docs.jsonl' file on disk.

    Each document is assumed to be serialized as newline ('\n') separated sentences.
    Each sentence is assumed to be space (' ') joined tokens.
    """
    docs_file = os.path.join(data_dir, "docs.jsonl")
    documents = load_jsonl(docs_file)
    documents = {doc["docid"]: doc["document"] for doc in documents}
    # Restrict to the requested docids (if any), then split each document
    # string into sentences and tokens to match the declared return type.
    if docids is None:
        docids = sorted(documents.keys())
    else:
        docids = sorted(set(str(d) for d in docids))
    res = dict()
    for d in docids:
        lines = [line.strip() for line in documents[d].split("\n")]
        res[d] = [line.split() for line in lines if line]
    return res