import os
import warnings

import requests
from dotenv import load_dotenv
from huggingface_hub import configure_http_backend

# Disable TLS certificate verification for all HTTP traffic (e.g. behind a
# corporate proxy that re-signs certificates). Remove this in trusted setups.
os.environ["CURL_CA_BUNDLE"] = ""

def backend_factory() -> requests.Session:
    session = requests.Session()
    session.verify = False
    return session

# Route all huggingface_hub requests through the non-verifying session.
configure_http_backend(backend_factory=backend_factory)

warnings.filterwarnings("ignore")
load_dotenv()

import bm25s
from bm25s.hf import BM25HF
from datasets import load_dataset

unique_specs = set()

# Load the 3GPP specification text and metadata datasets from the Hub,
# then materialize the train splits as plain lists of dicts.
dataset_text = load_dataset("OrganizedProgrammers/3GPPSpecContent")
dataset_metadata = load_dataset("OrganizedProgrammers/3GPPSpecMetadata")

dataset_text = dataset_text["train"].to_list()
dataset_metadata = dataset_metadata["train"].to_list()

corpus_json = []

def get_document(spec_id: str, spec_title: str) -> list[str]:
    """Collect the title line plus every section belonging to one specification."""
    text = [f"{spec_id} - {spec_title}\n"]
    for section in dataset_text:
        if spec_id == section["doc_id"]:
            text.append(f"{section['section']}\n\n{section['content']}")
    return text

# First index: one BM25 entry per specification section.
for specification in dataset_metadata:
    if specification['id'] in unique_specs:
        continue
    for section in dataset_text:
        if specification['id'] == section['doc_id']:
            corpus_json.append({
                "text": f"{section['section']}\n{section['content']}",
                "metadata": {
                    "id": specification['id'],
                    "title": specification['title'],
                    "section_title": section['section'],
                    "version": specification['version'],
                    "type": specification['type'],
                    "working_group": specification['working_group'],
                    "url": specification['url'],
                    "scope": specification['scope'],
                },
            })
    unique_specs.add(specification['id'])

# Tokenize the section-level corpus, build the BM25 index, and push it to the Hub.
corpus_text = [doc["text"] for doc in corpus_json]
corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")

retriever = BM25HF(corpus=corpus_json)
retriever.index(corpus_tokens)

retriever.save_to_hub("OrganizedProgrammers/3GPPBM25IndexSections", token=os.environ.get("HF_TOKEN"))

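# Optional sanity check: reload the section-level index from the Hub and run a
# test query. This is a minimal sketch that assumes the push above succeeded
# and that HF_TOKEN can read the repo; the query string is only an example.
reloaded = BM25HF.load_from_hub(
    "OrganizedProgrammers/3GPPBM25IndexSections",
    load_corpus=True,
    token=os.environ.get("HF_TOKEN"),
)
query_tokens = bm25s.tokenize("registration procedure for 5G NAS", stopwords="en")
results, scores = reloaded.retrieve(query_tokens, k=5)
print(results[0, 0]["metadata"]["id"], results[0, 0]["metadata"]["section_title"], scores[0, 0])
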
# Second index: one BM25 entry per whole specification document.
unique_specs = set()
corpus_json = []

for specification in dataset_metadata:
    if specification['id'] in unique_specs:
        continue
    text_list = get_document(specification['id'], specification['title'])
    # Skip specifications with no section text (only the title line was returned).
    if len(text_list) == 1:
        continue
    corpus_json.append({"text": "\n".join(text_list), "metadata": specification})
    unique_specs.add(specification['id'])

corpus_text = [doc["text"] for doc in corpus_json]
corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")

retriever = BM25HF(corpus=corpus_json)
retriever.index(corpus_tokens)

retriever.save_to_hub("OrganizedProgrammers/3GPPBM25IndexSingle", token=os.environ.get("HF_TOKEN"))
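
# The whole-document index can be queried the same way; here each corpus entry
# carries the full specification text and its metadata record. A sketch,
# assuming the "3GPPBM25IndexSingle" push above succeeded.
doc_retriever = BM25HF.load_from_hub(
    "OrganizedProgrammers/3GPPBM25IndexSingle",
    load_corpus=True,
    token=os.environ.get("HF_TOKEN"),
)
doc_query = bm25s.tokenize("security architecture", stopwords="en")
doc_results, doc_scores = doc_retriever.retrieve(doc_query, k=3)
for rank in range(doc_results.shape[1]):
    hit = doc_results[0, rank]
    print(hit["metadata"]["id"], hit["metadata"]["title"], doc_scores[0, rank])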