import os
import json
import numpy as np
from dotenv import load_dotenv
from typing import Any, Tuple

from evoagentx.benchmark import MATH
from evoagentx.core.logging import logger
from evoagentx.models import OpenAILLM, OpenAILLMConfig
from evoagentx.optimizers import MiproOptimizer
from evoagentx.core.callbacks import suppress_logger_info
from evoagentx.utils.mipro_utils.register_utils import MiproRegistry


load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# =====================
# prepare the benchmark data 
# =====================

class MathSplits(MATH):

    def _load_data(self):
        # load the original test data 
        super()._load_data()
        # split the data into train and test sets using a fixed random seed for reproducibility
        np.random.seed(42)
        permutation = np.random.permutation(len(self._test_data))
        full_test_data = self._test_data
        # randomly select 100 samples for training and 100 samples for testing
        self._train_data = [full_test_data[idx] for idx in permutation[:100]]
        self._test_data = [full_test_data[idx] for idx in permutation[100:200]]

    # define the input keys. 
    # If defined, only the corresponding keys and values of an input example are passed to the program's 
    # __call__ method, i.e., program.__call__(**{k: v for k, v in example.items() if k in self.get_input_keys()})
    # If not defined, the program is called with the entire input example, i.e., program.__call__(**example)
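    # for illustration (hypothetical example values): given an example 
    # {"problem": "What is 2 + 2?", "solution": "4"}, the program is invoked as program(problem="What is 2 + 2?")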
    def get_input_keys(self):
        return ["problem"]
    
    # the benchmark must have an `evaluate` method that receives the program's `prediction` (the output of the 
    # program's __call__ method) and the `label` (obtained via the `self.get_label` method) and returns a dictionary of metrics. 
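    # for illustration (the exact metric key depends on the MATH benchmark implementation): the returned 
    # dictionary might look like {"solution_accuracy": 1.0}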
    def evaluate(self, prediction: Any, label: Any) -> dict:
        return super().evaluate(prediction, label)


# =====================
# prepare the program
# =====================

# here we use a simple single-prompt program to answer math problems.
class CustomProgram: 

    def __init__(self, model: OpenAILLM):
        self.model = model 
        self.prompt = "Let's think step by step to answer the math question: {problem}"
    
    # the program must have `save` and `load` methods to save and load the program
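    # for illustration: `save` writes a JSON object such as
    # {"prompt": "Let's think step by step to answer the math question: {problem}"}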
    def save(self, path: str):
        params = {"prompt": self.prompt}
        with open(path, "w") as f:
            json.dump(params, f)

    def load(self, path: str):
        with open(path, "r") as f:
            params = json.load(f)
            self.prompt = params["prompt"]
    
    # the program must have a `__call__` method to execute the program.
    # It receives the key-value pairs (specified by `get_input_keys` in the benchmark) of an input example 
    # and returns a tuple of (prediction, execution_data), 
    # where `prediction` is the program's output and `execution_data` is a dictionary that contains 
    # the inputs and outputs of all tracked parameters. 
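    # for illustration (hypothetical values): program(problem="What is 2 + 2?") might return 
    # ("4", {"problem": "What is 2 + 2?", "solution": "4"})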
    def __call__(self, problem: str) -> Tuple[str, dict]:
        
        prompt = self.prompt.format(problem=problem)
        response = self.model.generate(prompt=prompt)
        solution = response.content
        return solution, {"problem": problem, "solution": solution}
    

def main():

    openai_config = OpenAILLMConfig(model="gpt-4o-mini", openai_key=OPENAI_API_KEY, stream=True, output_response=False)
    executor_llm = OpenAILLM(config=openai_config)
    optimizer_config = OpenAILLMConfig(model="gpt-4o", openai_key=OPENAI_API_KEY, stream=True, output_response=False)
    optimizer_llm = OpenAILLM(config=optimizer_config)

    benchmark = MathSplits()
    program = CustomProgram(model=executor_llm)

    # register the parameters to optimize 
    registry = MiproRegistry()
    # MiproRegistry requires specifying the input_names and output_names for each tracked parameter. 
    # The input_names and output_names must appear in the execution_data returned by the program's __call__ method. 
    registry.track(program, "prompt", input_names=["problem"], output_names=["solution"])
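    # roughly speaking (a simplified sketch of MIPRO, not the exact algorithm): the optimizer proposes 
    # candidate values for the tracked "prompt" attribute, scores them on the train split via the 
    # benchmark's `evaluate` method, and keeps the best-performing candidate.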

    # optimize the program 
    # `evaluator` is optional. If not provided, the optimizer will construct an evaluator based on the `evaluate` method of the benchmark. 
    optimizer = MiproOptimizer(
        registry=registry, 
        program=program, 
        optimizer_llm=optimizer_llm,
        max_bootstrapped_demos=4, 
        max_labeled_demos=4,
        num_threads=20,  
        eval_rounds=1, 
        auto="medium",
        save_path="examples/output/mipro/math_plug_and_play" 
    )

    logger.info("Optimizing program...")
    optimizer.optimize(dataset=benchmark)
    optimizer.restore_best_program()

    logger.info("Evaluating program on test set...")
    with suppress_logger_info():
        results = optimizer.evaluate(dataset=benchmark, eval_mode="test")
    logger.info(f"Evaluation metrics (after optimization): {results}")


if __name__ == "__main__":
    main()