| | import json |
| | import math |
| | import os |
| | import uuid |
| | from functools import partial |
| |
|
| | import jsonlines |
| | import streamlit as st |
| | import streamlit.components.v1 as components |
| | from huggingface_hub import HfApi |
| |
|
| | BAD_EXAMPLES_PATH = "bad_examples" |
| | DATA_PATH = "data" |
| |
|
| |
|
def report_result_dataset(dataset, docid, text, metadata, reason, annotator):
    """Upload one feedback record to the shared feedback dataset repository.

    The record is serialized as a single JSON line and pushed to the
    ``HuggingFaceGECLM/data_feedback`` dataset repo under a unique
    ``report-<uuid4>.jsonl`` name. The access token is read from the
    ``geclm_token`` environment variable.

    Args:
        dataset: Name of the dataset the example comes from.
        docid: Identifier of the example being reported.
        text: Text of the example being reported.
        metadata: Extra information about the example, stored as given.
        reason: Description of what is wrong with the example.
        annotator: Name of the person reporting the example.
    """
    record = {
        "dataset": dataset,
        "docid": docid,
        "text": text,
        "metadata": metadata,
        "reason": reason,
        "annotator": annotator,
    }

    # Serialize in memory instead of going through a fixed "report.jsonl" in
    # the working directory: that file was shared by every session, so two
    # reports submitted at the same time could overwrite each other before
    # the upload happened. The settings (UTF-8, non-ASCII characters kept
    # verbatim, trailing newline) are meant to mirror the jsonlines writer
    # defaults used previously.
    payload = (json.dumps(record, ensure_ascii=False) + "\n").encode("utf-8")

    api = HfApi()
    api.upload_file(
        path_or_fileobj=payload,
        path_in_repo="report-{}.jsonl".format(uuid.uuid4()),
        repo_id="HuggingFaceGECLM/data_feedback",
        repo_type="dataset",
        token=os.environ.get("geclm_token"),
    )
| |
|
| |
|
def load_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    Each non-blank line is decoded with ``json.loads``. Blank or
    whitespace-only lines (for example a trailing empty line) are skipped
    instead of aborting the load with a ``JSONDecodeError``.

    Args:
        file_path: Path of the JSON Lines file to read.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's locale default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]
| |
|
| |
|
# Seed the example cursor the first time it is needed; an existing value is
# left alone so the position survives across script runs.
if "idx" not in st.session_state:
    st.session_state["idx"] = 0
| |
|
| |
|
def get_next_item():
    """Advance the session's example cursor by one position."""
    current = st.session_state.idx
    st.session_state.idx = current + 1
| |
|
| |
|
def save_flag_and_get_next_item(sample, issue):
    """Record a sample as bad, report it, and advance to the next example.

    The sample, with an added ``"issue"`` field, is appended to the local
    bad-examples file of the currently selected ``dataset`` (a module-level
    name read at call time). A report is then sent through
    ``report_result_dataset`` and the cursor is moved forward with
    ``get_next_item``.

    Args:
        sample: Example dict; must contain a ``"text"`` entry.
        issue: Description of the problem. ``None`` or an empty string is
            stored as the literal string ``"None"``.

    Raises:
        KeyError: If ``sample`` has no ``"text"`` entry.
    """
    if issue is None or issue == "":
        issue = "None"

    # Look the text up first so a malformed sample fails before anything is
    # written to disk.
    text = sample["text"]

    # Build copies instead of adding/removing keys on the caller's dict, so
    # the in-memory example keeps its "text" after being flagged.
    flagged = {**sample, "issue": issue}
    with open(f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "a") as f:
        f.write(json.dumps(flagged) + "\n")

    # Everything except the text and the issue is reported as metadata.
    metadata = {
        key: value for key, value in sample.items() if key not in ("text", "issue")
    }

    # Prefer an explicit id, fall back to the title, otherwise report no id.
    if "id" in sample:
        sample_id = sample["id"]
    elif "title" in sample:
        sample_id = sample["title"]
    else:
        sample_id = ""

    report_result_dataset(dataset, sample_id, text, str(metadata), issue, "")

    get_next_item()
| |
|
| |
|
# Datasets offered in the sidebar selector; each one is expected to have a
# matching "<name>_examples_with_stats.json" file under DATA_PATH.
datasets = [
    "gutenberg_raw",
    "stackexchange2",
    "bigcode_python_code",
    "bigcode_python_github_issues",
    "bigcode_python_jupyter_scripts_dedup_filtered",
    "books3",
    "c4",
    "s2orc_raw",
    "reddit_threaded",
    "cc_filtered_text",
]
dataset = st.sidebar.selectbox(label="Dataset", options=datasets)

# Load the examples of whichever dataset is currently selected.
data = load_jsonl("{}/{}_examples_with_stats.json".format(DATA_PATH, dataset))
| |
|
| | |
# Local file that accumulates the flagged examples of the selected dataset.
_bad_examples_file = f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl"

# Create the output directory on demand; without it the open() calls below
# fail with FileNotFoundError when the directory has not been created yet.
os.makedirs(BAD_EXAMPLES_PATH, exist_ok=True)

# Touch the file so the download button always has something to read.
with open(_bad_examples_file, "a"):
    pass

# Dropping the key resets the cursor: the "idx" initialisation elsewhere in
# this script recreates it at 0 on the next run.
st.sidebar.button("Reset Index", on_click=lambda: st.session_state.__delitem__("idx"))

# The handle is only read from, so plain read mode is sufficient.
with open(_bad_examples_file, "r") as f:
    st.sidebar.download_button(
        "Download bad example JSON file", f, file_name=f"{dataset}_bad_examples.jsonl"
    )


def _clear_bad_examples(path=_bad_examples_file):
    """Truncate the bad-examples file at ``path`` to zero length."""
    with open(path, "w"):
        pass


st.sidebar.button("Clear bad examples file", on_click=_clear_bad_examples)
| |
|
# Guard the lookup: the cursor can run past the end after the last example
# has been reviewed, or point beyond a shorter dataset after switching.
if st.session_state.idx >= len(data):
    st.info("No more examples to review in this dataset. Use 'Reset Index' to start over.")
else:
    with st.form(key="bad_form", clear_on_submit=True):
        sample = data[st.session_state.idx]
        text = sample["text"]
        st.text_area(f"text id: {st.session_state.idx}", text, height=500)

        # The widget is keyed so the submit callback can read the value the
        # user actually submitted from session state.
        issue = st.text_input(
            "What's wrong with this example? (leave blank if example is fine)",
            key="issue_text",
        )

        def _flag_current_sample(flagged_sample=sample):
            """Flag the displayed sample with the issue text just submitted."""
            # Values passed through ``args`` are bound when the button is
            # rendered, i.e. before the user typed anything, so the issue
            # text has to be looked up at callback time instead.
            save_flag_and_get_next_item(
                flagged_sample, st.session_state.get("issue_text", "")
            )

        good = st.form_submit_button(
            "GOOD",
            on_click=get_next_item,
        )
        bad = st.form_submit_button(
            "BAD",
            on_click=_flag_current_sample,
        )
| |
|