import os
import json
import numpy as np
from dotenv import load_dotenv
from typing import Any, Tuple

from evoagentx.benchmark import MATH
from evoagentx.core.logging import logger
from evoagentx.models import OpenAILLM, OpenAILLMConfig
from evoagentx.optimizers import MiproOptimizer
from evoagentx.core.callbacks import suppress_logger_info
from evoagentx.utils.mipro_utils.register_utils import MiproRegistry


load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# =====================
# prepare the benchmark data 
# =====================

class MathSplits(MATH):

    def _load_data(self):
        # load the original test data 
        super()._load_data()
        # split the data into train and test sets using a fixed random seed for reproducibility
        np.random.seed(42)
        permutation = np.random.permutation(len(self._test_data))
        full_test_data = self._test_data
        # randomly select 100 samples for training and 100 samples for testing
        self._train_data = [full_test_data[idx] for idx in permutation[:100]]
        self._test_data = [full_test_data[idx] for idx in permutation[100:200]]

    # define the input keys. 
    # If defined, only the corresponding keys and values of an input example are passed to the program's 
    # __call__ method, i.e., program.__call__(**{k: v for k, v in example.items() if k in self.get_input_keys()})
    # If not defined, the program is called with the entire input example, i.e., program.__call__(**example)
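    # for illustration (hypothetical example values): given an example 
    # {"problem": "What is 2 + 2?", "solution": "4"}, the program is invoked as program(problem="What is 2 + 2?")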
    def get_input_keys(self):
        return ["problem"]
    
    # the benchmark must have an `evaluate` method that receives the program's `prediction` (the output of the 
    # program's __call__ method) and the `label` (obtained via the `self.get_label` method) and returns a dictionary of metrics. 
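    # for illustration (the exact metric key depends on the MATH benchmark implementation): the returned 
    # dictionary might look like {"solution_accuracy": 1.0}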
    def evaluate(self, prediction: Any, label: Any) -> dict:
        return super().evaluate(prediction, label)


# =====================
# prepare the program
# =====================

# here we use a simple single-prompt program to answer math problems.
class CustomProgram: 

    def __init__(self, model: OpenAILLM):
        self.model = model 
        self.prompt = "Let's think step by step to answer the math question: {problem}"
    
    # the program must have `save` and `load` methods to save and load the program
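    # for illustration: `save` writes a JSON object such as
    # {"prompt": "Let's think step by step to answer the math question: {problem}"}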
    def save(self, path: str):
        params = {"prompt": self.prompt}
        with open(path, "w") as f:
            json.dump(params, f)

    def load(self, path: str):
        with open(path, "r") as f:
            params = json.load(f)
            self.prompt = params["prompt"]
    
    # the program must have a `__call__` method to execute the program.
    # It receives the key-value pairs (specified by `get_input_keys` in the benchmark) of an input example 
    # and returns a tuple of (prediction, execution_data), 
    # where `prediction` is the program's output and `execution_data` is a dictionary that contains 
    # the inputs and outputs of all tracked parameters. 
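    # for illustration (hypothetical values): program(problem="What is 2 + 2?") might return 
    # ("4", {"problem": "What is 2 + 2?", "solution": "4"})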
    def __call__(self, problem: str) -> Tuple[str, dict]:
        
        prompt = self.prompt.format(problem=problem)
        response = self.model.generate(prompt=prompt)
        solution = response.content
        return solution, {"problem": problem, "solution": solution}
    

def main():

    openai_config = OpenAILLMConfig(model="gpt-4o-mini", openai_key=OPENAI_API_KEY, stream=True, output_response=False)
    executor_llm = OpenAILLM(config=openai_config)
    optimizer_config = OpenAILLMConfig(model="gpt-4o", openai_key=OPENAI_API_KEY, stream=True, output_response=False)
    optimizer_llm = OpenAILLM(config=optimizer_config)

    benchmark = MathSplits()
    program = CustomProgram(model=executor_llm)

    # register the parameters to optimize 
    registry = MiproRegistry()
    # MiproRegistry requires specifying the input_names and output_names for each tracked parameter. 
    # The input_names and output_names must appear in the execution_data returned by the program's __call__ method. 
    registry.track(program, "prompt", input_names=["problem"], output_names=["solution"])
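    # roughly speaking (a simplified sketch of MIPRO, not the exact algorithm): the optimizer proposes 
    # candidate values for the tracked "prompt" attribute, scores them on the train split via the 
    # benchmark's `evaluate` method, and keeps the best-performing candidate.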

    # optimize the program 
    # `evaluator` is optional. If not provided, the optimizer will construct an evaluator based on the `evaluate` method of the benchmark. 
    optimizer = MiproOptimizer(
        registry=registry, 
        program=program, 
        optimizer_llm=optimizer_llm,
        max_bootstrapped_demos=4, 
        max_labeled_demos=4,
        num_threads=20,  
        eval_rounds=1, 
        auto="medium",
        save_path="examples/output/mipro/math_plug_and_play" 
    )

    logger.info("Optimizing program...")
    optimizer.optimize(dataset=benchmark)
    optimizer.restore_best_program()

    logger.info("Evaluating program on test set...")
    with suppress_logger_info():
        results = optimizer.evaluate(dataset=benchmark, eval_mode="test")
    logger.info(f"Evaluation metrics (after optimization): {results}")


if __name__ == "__main__":
    main()