"""Genericify C++ evaluation: compile and execute model-generated C++ code
(base, SFINAE and concepts variants) against reference test suites and report
pass@k."""
import collections
import concurrent.futures
import itertools
import os
from typing import Literal

import datasets
import evaluate
import numpy as np
from tqdm import tqdm

from .execute import check_correctness
|
|
_CITATION = """\
@InProceedings{huggingface:module,
title = {A great new module},
authors={huggingface, Inc.},
year={2020}
}
"""
|
|
_DESCRIPTION = """\
Compiles and executes model-generated C++ solutions (base, SFINAE and concepts
variants) against reference test suites using clang++ and reports pass@k for
compilation, execution and constraint checks.
"""
|
|
|
|
_KWARGS_DESCRIPTION = """
Computes pass@k for model-generated C++ code that is compiled and executed
against reference test suites.
Args:
    predictions: list of candidate lists, one list per problem. Each candidate
        is a string containing C++ source code.
    references: list of references, one per problem. Each reference is a dict
        with a "tests" string (the test suite the generated code must pass) and
        an "invalids" string used for the constraint (negative) checks.
    cpp_type: which variant to evaluate, one of "base", "sfinae" or "concepts".
    k: list of k values at which pass@k is estimated (default [1, 10, 100]).
Returns:
    A dict mapping "<outcome>@<k>" (for example "base_run_passed@1") to the
    mean pass@k over all problems, together with the raw per-task results.
"""
|
|
| _WARNING = """ |
| ################################################################################ |
| !!!WARNING!!! |
| ################################################################################ |
| The "code_eval" metric executes untrusted model-generated code in Python. |
| Although it is highly unlikely that model-generated code will do something |
| overtly malicious in response to this test suite, model-generated code may act |
| destructively due to a lack of model capability or alignment. |
| Users are strongly encouraged to sandbox this evaluation suite so that it |
| does not perform destructive actions on their host or network. For more |
| information on how OpenAI sandboxes its code, see the paper "Evaluating Large |
| Language Models Trained on Code" (https://arxiv.org/abs/2107.03374). |
| Once you have read this disclaimer and taken appropriate precautions, |
set the environment variable HF_ALLOW_CODE_EVAL="1". Within Python you can do this
| with: |
| >>> import os |
| >>> os.environ["HF_ALLOW_CODE_EVAL"] = "1" |
| ################################################################################\ |
| """ |
|
|
| _CLANG_WARNING = """ |
Please set the environment variable 'GENERICIFY_CLANG' to the path of the
clang++ compiler. Version 15+ is required. Within Python you can do this
with:
| >>> import os |
| >>> os.environ["GENERICIFY_CLANG"] = "/path/to/clang++" |
| """ |
|
|
|
|
|
|
| @evaluate.utils.file_utils.add_start_docstrings( |
| _DESCRIPTION, _KWARGS_DESCRIPTION |
| ) |
| class EvaluateGenericifyCpp(evaluate.Metric): |
| """TODO: Short description of my evaluation module.""" |
|
|
| def _info(self): |
| return evaluate.MetricInfo( |
| description=_DESCRIPTION, |
| citation=_CITATION, |
| inputs_description=_KWARGS_DESCRIPTION, |
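            # One list of candidate programs per problem, plus a {"tests", "invalids"} reference.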
| features=datasets.Features( |
| { |
| "predictions": datasets.Sequence(datasets.Value("string")), |
| "references": datasets.Features( |
| { |
| "tests": datasets.Value("string"), |
| "invalids": datasets.Value("string"), |
| } |
| ), |
| } |
| ), |
| homepage="http://module.homepage", |
| codebase_urls=["http://github.com/path/to/codebase/of/new_module"], |
| reference_urls=["http://path.to.reference.url/new_module"], |
| ) |
|
|
| def _compute( |
| self, |
| *, |
| predictions, |
| references, |
| cpp_type: Literal["base", "sfinae", "concepts"], |
| k=[1, 10, 100], |
| ): |
| """Returns the scores""" |
        # Use one worker per CPU, falling back to 8 if the count is unavailable.
        num_workers = os.cpu_count() or 8
|
|
| if os.getenv("HF_ALLOW_CODE_EVAL", default=0) != "1": |
| raise ValueError(_WARNING) |
|
|
| if os.getenv("GENERICIFY_CLANG", default=0) == 0: |
| raise ValueError(_CLANG_WARNING) |
|
|
| if os.name == "nt": |
| raise NotImplementedError( |
| "This metric is currently not supported on Windows." |
| ) |
|
|
| total_predictions = sum(map(len, predictions)) |
|
|
| with concurrent.futures.ThreadPoolExecutor( |
| max_workers=num_workers |
| ) as executor: |
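            # Submit one check_correctness job per candidate program; completion_id
            # numbers the candidates within each task so results can be matched up.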
| futures = [] |
| completion_id = collections.Counter() |
| results = collections.defaultdict(list) |
|
|
| for task_id, (candidates, reference) in enumerate( |
| zip(predictions, references) |
| ): |
| for candidate in candidates: |
| args = ( |
| candidate, |
| reference, |
| cpp_type, |
| task_id, |
| completion_id[task_id], |
| ) |
| future = executor.submit(check_correctness, *args) |
| futures.append(future) |
| completion_id[task_id] += 1 |
|
|
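            # Collect results as the workers finish, grouped by task_id.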
| for future in tqdm( |
| concurrent.futures.as_completed(futures), |
| desc="Evaluating", |
| total=total_predictions, |
| ): |
| result = future.result() |
| results[result["task_id"]].append( |
| (result["completion_id"], result) |
| ) |
|
|
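        # Tally, per problem, how many candidates were evaluated and how many
        # passed for each outcome key of the selected cpp_type.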
| totals = collections.defaultdict(list) |
| corrects = collections.defaultdict(list) |
|
|
| keys = { |
| "base": [ |
| "base_run_passed", |
| "base_run_compiled", |
| ], |
| "sfinae": [ |
| "sfinae_run_passed", |
| "sfinae_run_compiled", |
| "sfinae_constrain_passed", |
| ], |
| "concepts": [ |
| "concepts_run_passed", |
| "concepts_run_compiled", |
| "concepts_constrain_passed", |
| ], |
| }[cpp_type] |
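        # Sort each task's results by completion_id, then count passes per outcome.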
| for result in results.values(): |
| result.sort() |
| for pt in keys: |
| passed = [r[1][pt] for r in result] |
| totals[pt].append(len(passed)) |
| corrects[pt].append(sum(passed)) |
|
|
| totals = {k: np.array(v) for k, v in totals.items()} |
| corrects = {k: np.array(v) for k, v in corrects.items()} |
|
|
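        # Unbiased pass@k estimate (see the paper linked in _WARNING), averaged
        # over all problems; a value of k is only reported when every problem
        # has at least k candidates.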
| ks = k |
| pass_at_k = { |
| f"{key}@{k}": estimate_pass_at_k( |
| totals[key], |
| corrects[key], |
| k, |
| ).mean() |
| for key in totals.keys() |
| for k in ks |
| if (totals[key] >= k).all() |
| } |
|
|
| return pass_at_k, results |
|
|
|
|
def estimate_pass_at_k(num_samples, num_correct, k) -> np.ndarray:
| """Estimates pass@k of each problem and returns them in an array.""" |
|
|
| def estimator(n: int, c: int) -> float: |
| """Calculates 1 - comb(n - c, k) / comb(n, k).""" |
| if n - c < k: |
| return 1.0 |
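        # Product form of 1 - C(n - c, k) / C(n, k); avoids computing large
        # binomial coefficients directly.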
| return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) |
|
|
| if isinstance(num_samples, int): |
| num_samples_it = itertools.repeat(num_samples, len(num_correct)) |
| else: |
| assert len(num_samples) == len(num_correct) |
| num_samples_it = iter(num_samples) |
|
|
| return np.array( |
| [estimator(int(n), int(c)) for n, c in zip(num_samples_it, num_correct)] |
| ) |
|
|
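if __name__ == "__main__":
    # Minimal usage sketch, not part of the metric itself. The module path,
    # compiler path, candidate program and reference payloads below are
    # illustrative placeholders, not real data.
    os.environ["HF_ALLOW_CODE_EVAL"] = "1"
    os.environ["GENERICIFY_CLANG"] = "/path/to/clang++"  # clang++ 15 or newer

    metric = evaluate.load("path/to/this/module")
    pass_at_k, raw_results = metric.compute(
        predictions=[["int add(int a, int b) { return a + b; }"]],
        references=[{"tests": "<test suite source>", "invalids": "<negative cases>"}],
        cpp_type="base",
        k=[1],
    )
    print(pass_at_k)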