import os
from datetime import datetime, timezone

import pytz
from huggingface_hub import HfApi
|
|
| GENERATED_BELOW_MARKER = "--- Generated Part of README Below ---" |
|
|
|
|
def update_dataset_readme(dataset_name: str, subreddit: str, new_rows: int) -> None:
    """
    Update the README file of a specified dataset repository with new information.

    Downloads the current README from the Hub, appends/refreshes the generated
    section via append_to_readme, and uploads the result back to the repo.

    Args:
        dataset_name (str): Name of the dataset repository.
        subreddit (str): Name of the subreddit being used for dataset creation.
        new_rows (int): Number of new rows added in the latest update.
    """
    api = HfApi()

    # Fetch the current README so manual content above the marker is preserved.
    readme_path = api.hf_hub_download(repo_id=dataset_name, repo_type="dataset", filename="README.md")

    # Explicit UTF-8: the platform-default encoding could mangle non-ASCII
    # README content; the upload below uses .encode() (UTF-8), so reading
    # as UTF-8 keeps the round-trip lossless.
    with open(readme_path, "r", encoding="utf-8") as file:
        old_readme = file.read()

    new_readme = append_to_readme(subreddit=subreddit, new_rows=new_rows, old_readme=old_readme)

    # Push the regenerated README back to the dataset repo in a single commit.
    api.upload_file(
        path_or_fileobj=new_readme.encode(),
        path_in_repo="README.md",
        repo_id=dataset_name,
        repo_type="dataset",
        commit_message=f'Pushing {new_rows} new rows'
    )
|
|
|
|
def append_to_readme(subreddit: str, new_rows: int, old_readme: str) -> str:
    """
    Append new information to the existing README content.

    Everything after GENERATED_BELOW_MARKER is regenerated on each call;
    any manual content above the marker is preserved untouched.

    Args:
        subreddit (str): Name of the subreddit.
        new_rows (int): Number of new rows added.
        old_readme (str): Existing README content.

    Returns:
        str: Updated README content.
    """
    # Stdlib timezone.utc replaces pytz.utc: strftime output is identical
    # ("UTC" for %Z, "+0000" for %z) without the third-party dependency.
    latest_hour = datetime.now(timezone.utc).replace(minute=0, second=0, microsecond=0)
    latest_hour_str = latest_hour.strftime('%Y-%m-%d %H:00:00 %Z%z')

    readme_text = f"""
## Dataset Overview
This dataset is based on [derek-thomas/dataset-creator-reddit-{subreddit}](https://huggingface.co/datasets/derek-thomas/dataset-creator-reddit-{subreddit})
and will add [nomic-ai/nomic-embed-text-v1](https://huggingface.co/nomic-ai/nomic-embed-text-v1) embeddings based on the
`content` field.

The goal is to be able to have an automatic and free semantic/neural tool for any subreddit.

The last run was on {latest_hour_str} and updated {new_rows} new rows.

## Creation Details
This is done by triggering [derek-thomas/processing-bestofredditorupdates](https://huggingface.co/spaces/derek-thomas/processing-bestofredditorupdates)
based on a repository update [webhook](https://huggingface.co/docs/hub/en/webhooks) to calculate the embeddings and update the [nomic atlas](https://docs.nomic.ai)
visualization. This is done by this [processing space](https://huggingface.co/spaces/derek-thomas/processing-bestofredditorupdates).

## Update Frequency
The dataset is updated based on a [webhook](https://huggingface.co/docs/hub/en/webhooks) trigger, so each time [derek-thomas/dataset-creator-reddit-{subreddit}](https://huggingface.co/datasets/derek-thomas/dataset-creator-reddit-{subreddit})
is updated, this dataset will be updated.

## Opt-out
To opt-out of this dataset please make a request in the community tab
"""

    if GENERATED_BELOW_MARKER in old_readme:
        # Marker already present: drop everything after it and regenerate.
        index = old_readme.index(GENERATED_BELOW_MARKER) + len(GENERATED_BELOW_MARKER)
        new_readme = old_readme[:index] + "\n\n" + readme_text
    else:
        # First run: append the marker, then the generated section.
        new_readme = old_readme + "\n\n" + GENERATED_BELOW_MARKER + "\n\n" + readme_text + "\n"

    return new_readme
|
|