File size: 4,724 Bytes
ed1b365
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/env python3
"""Codette LoRA Adapter Inference Test via llama.cpp

Uses GGUF base model + GGUF LoRA adapters for low-memory inference.
Base: Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf (~4.6 GB)
LoRA: newton-lora-f16.gguf, davinci-lora-f16.gguf (~27 MB each)
"""

import os, sys, time

os.environ["PATH"] = r"J:\Lib\site-packages\Library\bin" + os.pathsep + os.environ.get("PATH", "")
# Fix Windows console encoding for Unicode characters (π, etc.)
sys.stdout.reconfigure(encoding='utf-8', errors='replace')

from llama_cpp import Llama

# Absolute locations of the model files; all of them live under one lab root.
_LAB_ROOT = r"J:\codette-training-lab"
_ADAPTER_DIR = _LAB_ROOT + r"\adapters"

# Quantized base model.
BASE_GGUF = (
    _LAB_ROOT
    + r"\bartowski\Meta-Llama-3.1-8B-Instruct-GGUF"
    + r"\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf"
)
# LoRA adapter files applied on top of the base model.
NEWTON_LORA = _ADAPTER_DIR + r"\newton-lora-f16.gguf"
DAVINCI_LORA = _ADAPTER_DIR + r"\davinci-lora-f16.gguf"

# One entry per test question. Every entry carries the same system
# instruction; only the user question and its short tag differ.
TEST_PROMPTS = [
    {
        "system": "You are a helpful assistant. Answer concisely in 2-3 sentences.",
        "user": question,
        "tag": tag,
    }
    for tag, question in (
        ("physics", "Explain why objects fall to the ground."),
        ("philosophy", "What is the relationship between consciousness and the physical world?"),
        ("systems", "How would you design a system that learns from its own mistakes?"),
    )
]

# Keyword arguments forwarded unchanged to every chat-completion call, so all
# model variants are sampled with the same settings.
GEN_KWARGS = {
    "max_tokens": 200,
    "temperature": 0.7,
    "top_p": 0.9,
    "stop": ["<|eot_id|>", "<|end_of_text|>"],
}


def run_test(model_label, llm, prompts):
    """Run every prompt through ``llm``, print each reply, and collect the results.

    Args:
        model_label: Heading printed above this model's output.
        llm: Object exposing ``create_chat_completion(messages=..., **kwargs)``
            and returning a completion dict with ``"choices"`` and ``"usage"``
            entries (in this script, a ``llama_cpp.Llama`` instance).
        prompts: Iterable of dicts with ``"system"``, ``"user"`` and ``"tag"``
            keys.

    Returns:
        A list with one dict per prompt, in input order, holding ``"tag"``,
        ``"response"`` (the stripped reply text), ``"tokens"`` (the reported
        completion token count) and ``"time"`` (elapsed seconds for the call).
    """
    print(f"\n{'=' * 60}")
    print(f"  {model_label}")
    print(f"{'=' * 60}")

    responses = []
    for p in prompts:
        print(f"\n  [{p['tag']}] {p['user']}")
        # perf_counter is monotonic, so the measured interval cannot be skewed
        # (or go negative) if the system clock is adjusted mid-generation.
        start = time.perf_counter()
        result = llm.create_chat_completion(
            messages=[
                {"role": "system", "content": p["system"]},
                {"role": "user", "content": p["user"]},
            ],
            **GEN_KWARGS,
        )
        elapsed = time.perf_counter() - start
        # Treat a missing/None content field as an empty reply instead of
        # crashing on .strip().
        text = (result["choices"][0]["message"]["content"] or "").strip()
        tokens = result["usage"]["completion_tokens"]
        # Avoid dividing by zero if the call returned within timer resolution.
        tps = tokens / elapsed if elapsed > 0 else 0
        print(f"  Response ({elapsed:.1f}s, {tokens} tok, {tps:.1f} tok/s):")
        print(f"  > {text}")
        responses.append({"tag": p["tag"], "response": text, "tokens": tokens, "time": elapsed})

    return responses


def _run_variant(banner, label, lora_path=None):
    """Load the base model, optionally with one LoRA adapter, and run the prompts.

    Args:
        banner: Line printed before loading starts.
        label: Heading passed to ``run_test`` for this variant.
        lora_path: Path of the LoRA adapter to apply, or ``None`` to load the
            base model on its own.

    Returns:
        The list of per-prompt result dicts produced by ``run_test``.
    """
    print(banner)
    load_kwargs = dict(
        model_path=BASE_GGUF,
        n_ctx=2048,
        n_gpu_layers=0,  # CPU only to save VRAM
        verbose=False,
    )
    if lora_path is not None:
        load_kwargs["lora_path"] = lora_path

    start = time.perf_counter()
    llm = Llama(**load_kwargs)
    print(f"  Loaded in {time.perf_counter() - start:.1f}s")

    results = run_test(label, llm, TEST_PROMPTS)
    # Drop the only reference to this model before the next variant is loaded.
    del llm
    return results


def main():
    """Run the test prompts against the base model and each adapter, then summarize.

    Each variant is loaded, queried, and released in turn; afterwards the
    replies are printed grouped by prompt tag, truncated to 120 characters.
    """
    print("=" * 60)
    print("Codette LoRA Adapter Inference Test")
    print("=" * 60)
    print(f"Base model: {os.path.basename(BASE_GGUF)}")
    print(f"Newton LoRA: {os.path.basename(NEWTON_LORA)}")
    print(f"DaVinci LoRA: {os.path.basename(DAVINCI_LORA)}")

    # (result key, loading banner, run_test heading, adapter path or None)
    variants = [
        ("base", "\nLoading BASE model (no adapter)...", "BASE MODEL (no adapter)", None),
        ("newton", "\n\nLoading BASE + NEWTON adapter...", "NEWTON ADAPTER", NEWTON_LORA),
        ("davinci", "\n\nLoading BASE + DAVINCI adapter...", "DAVINCI ADAPTER", DAVINCI_LORA),
    ]

    all_results = {}
    for key, banner, label, lora_path in variants:
        all_results[key] = _run_variant(banner, label, lora_path)

    # --- Summary ---
    print(f"\n{'=' * 60}")
    print("COMPARISON SUMMARY")
    print(f"{'=' * 60}")
    # Derive the tags from the prompts (first-seen order, de-duplicated) so the
    # summary cannot drift out of sync with TEST_PROMPTS.
    tags = list(dict.fromkeys(p["tag"] for p in TEST_PROMPTS))
    for tag in tags:
        print(f"\n--- {tag.upper()} ---")
        for model_name, responses in all_results.items():
            for r in responses:
                if r["tag"] != tag:
                    continue
                text = r["response"]
                short = text[:120] + "..." if len(text) > 120 else text
                print(f"  {model_name:8s}: {short}")

    print(f"\n{'=' * 60}")
    print("TEST COMPLETE")
    print(f"{'=' * 60}")


# Run the comparison only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()