| | |
| |
|
| | import re |
| |
|
def merge_documents(main_dict, additional_json, limit=1000):
    """
    Adds a subset of documents from an additional JSON file to the main dictionary.

    Documents are visited in order. A document whose ``wikipedia_id`` is already
    a key of ``main_dict`` is skipped and does not count towards ``limit``.
    ``main_dict`` is modified in place and a summary line is printed at the end.

    Args:
        main_dict (dict): The main dictionary where processed documents are stored.
        additional_json (list): The additional JSON data containing documents. Each
            document is read through its "wikipedia_id" and "text" keys.
        limit (int): The maximum number of documents to add to the main dictionary.

    Returns:
        dict: The updated main dictionary with additional documents added
        (the same object that was passed in).
    """
    count = 0

    for doc in additional_json:
        if count >= limit:
            break

        wikipedia_id = doc.get("wikipedia_id")
        if wikipedia_id in main_dict:
            # Existing entries are never overwritten.
            continue

        text = doc.get("text")
        if text is None:
            # Missing or null text is treated as an empty document.
            text = []
        elif isinstance(text, str):
            # Joining a bare string would insert a space between every
            # character, so wrap it as a single paragraph instead.
            text = [text]

        main_dict[wikipedia_id] = sanitize_text(" ".join(text))
        count += 1

    print(f"{count} documents added to the main dictionary.")
    return main_dict
| |
|
def sanitize_text(text):
    """
    Reduces text to ASCII letters, digits and single spaces.

    Every character that is not an ASCII letter, a digit or whitespace is
    removed; runs of whitespace are then collapsed to one space and the
    result is stripped at both ends.

    Args:
        text (str): Text to sanitize.
    Returns:
        str: Sanitized text. Input that is not a string is returned unchanged.
    """
    if not isinstance(text, str):
        return text

    # Drop punctuation and any other non-alphanumeric, non-whitespace characters.
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Collapse whitespace runs (including tabs/newlines) and trim the ends.
    return re.sub(r'\s+', ' ', alnum_only).strip()
| |
|
| |
|
def process_json_data(json_data):
    """
    Builds a dictionary of sanitized document text keyed by Wikipedia ID.

    Each document's "text" entries are joined with single spaces and passed
    through ``sanitize_text``. If several documents share a "wikipedia_id",
    the last one wins.

    Args:
        json_data (list): Documents, each read through its "wikipedia_id"
            and "text" keys.

    Returns:
        dict: A dictionary with wikipedia_id as the key and the sanitized
        text as the value.
    """
    result_dict = {}

    for doc in json_data:
        text = doc.get("text")
        if text is None:
            # Missing or null text is treated as an empty document.
            text = []
        elif isinstance(text, str):
            # Joining a bare string would insert a space between every
            # character, so wrap it as a single paragraph instead.
            text = [text]

        result_dict[doc.get("wikipedia_id")] = sanitize_text(" ".join(text))

    return result_dict
| |
|
def process_queries(json_data):
    """
    Maps each query ID to its query text.

    Args:
        json_data (dict): The input JSON data, keyed by query ID. Each value
            is read through its "input" key.

    Returns:
        dict: A dictionary with query_id as the key and query text as the
        value; entries without an "input" key map to an empty string.
    """
    return {
        query_id: query_info.get("input", "")
        for query_id, query_info in json_data.items()
    }
| |
|
| | |
| | |
| | |
| |
|
| |
|