Keep only one benchmark.py and delete benchmark_b.py
Browse files
- backend/benchmark.py +13 -46
- backend/benchmark_b.py +0 -118
backend/benchmark.py
CHANGED
|
@@ -21,7 +21,7 @@ samples = [
|
|
| 21 |
("frustration", "This word problem about two trains leaving different cities is making my head spin. I hate word problems!"),
|
| 22 |
("frustration", "I don't understand how to convert grams to moles. I keep getting the wrong conversion factor and it's so frustrating!"),
|
| 23 |
("frustration", "I've tried balancing this chemical equation five times and the numbers never match up!"),
|
| 24 |
-
("frustration", "
|
| 25 |
("frustration", "This physics problem about friction has too many variables and I don't even know where to start!"),
|
| 26 |
# Boredom
|
| 27 |
("boredom", "Ugh, why do we have to learn about sedimentary rocks? They just sit there. Who cares?"),
|
|
@@ -74,16 +74,10 @@ for idx, (cat, q) in enumerate(samples, 1):
|
|
| 74 |
results.append({
|
| 75 |
"category": cat,
|
| 76 |
"query": q,
|
| 77 |
-
"latency_a": res_data["latency_a"],
|
| 78 |
-
"tokens_a": res_data["tokens_a"],
|
| 79 |
-
"latency_b": res_data["latency_b"],
|
| 80 |
-
"tokens_b": res_data["tokens_b"],
|
| 81 |
-
"latency_c": res_data["latency_c"],
|
| 82 |
-
"tokens_c": res_data["tokens_c"],
|
| 83 |
-
"latency_d": res_data["latency_d"],
|
| 84 |
-
"tokens_d": res_data["tokens_d"]
|
| 85 |
})
|
| 86 |
-
print(f" Done:
|
| 87 |
# Add a small delay between requests
|
| 88 |
time.sleep(1.5)
|
| 89 |
except Exception as e:
|
|
@@ -95,19 +89,13 @@ with open(csv_file, mode="w", newline="", encoding="utf-8") as f:
|
|
| 95 |
writer = csv.writer(f)
|
| 96 |
writer.writerow([
|
| 97 |
"Category", "Query",
|
| 98 |
-
"Latency A (s)", "Tokens A",
|
| 99 |
-
"Latency B (s)", "Tokens B",
|
| 100 |
-
"Latency C (s)", "Tokens C",
|
| 101 |
-
"Latency D (s)", "Tokens D"
|
| 102 |
])
|
| 103 |
|
| 104 |
for r in results:
|
| 105 |
writer.writerow([
|
| 106 |
r["category"], r["query"],
|
| 107 |
-
r["latency_a"], r["tokens_a"],
|
| 108 |
-
r["latency_b"], r["tokens_b"],
|
| 109 |
-
r["latency_c"], r["tokens_c"],
|
| 110 |
-
r["latency_d"], r["tokens_d"]
|
| 111 |
])
|
| 112 |
|
| 113 |
print(f"\nResults successfully saved to: {csv_file}")
|
|
@@ -115,37 +103,16 @@ print(f"\nResults successfully saved to: {csv_file}")
|
|
| 115 |
# Calculate averages
|
| 116 |
num_queries = len(results)
|
| 117 |
if num_queries > 0:
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
valid_latencies_b = [r["latency_b"] for r in results if r["latency_b"] is not None]
|
| 121 |
-
valid_tokens_b = [r["tokens_b"] for r in results if r["tokens_b"] is not None]
|
| 122 |
-
valid_latencies_c = [r["latency_c"] for r in results if r["latency_c"] is not None]
|
| 123 |
-
valid_tokens_c = [r["tokens_c"] for r in results if r["tokens_c"] is not None]
|
| 124 |
-
valid_latencies_d = [r["latency_d"] for r in results if r["latency_d"] is not None]
|
| 125 |
-
valid_tokens_d = [r["tokens_d"] for r in results if r["tokens_d"] is not None]
|
| 126 |
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
avg_latency_b = sum(valid_latencies_b) / len(valid_latencies_b) if valid_latencies_b else 0
|
| 130 |
-
avg_tokens_b = sum(valid_tokens_b) / len(valid_tokens_b) if valid_tokens_b else 0
|
| 131 |
-
avg_latency_c = sum(valid_latencies_c) / len(valid_latencies_c) if valid_latencies_c else 0
|
| 132 |
-
avg_tokens_c = sum(valid_tokens_c) / len(valid_tokens_c) if valid_tokens_c else 0
|
| 133 |
-
avg_latency_d = sum(valid_latencies_d) / len(valid_latencies_d) if valid_latencies_d else 0
|
| 134 |
-
avg_tokens_d = sum(valid_tokens_d) / len(valid_tokens_d) if valid_tokens_d else 0
|
| 135 |
|
| 136 |
print("\n" + "="*50)
|
| 137 |
print("BENCHMARK SUMMARY AVERAGES:")
|
| 138 |
print("="*50)
|
| 139 |
-
print(f"
|
| 140 |
-
print(f" - Avg Latency: {avg_latency_a:.3f}s")
|
| 141 |
-
print(f" - Avg Tokens: {avg_tokens_a:.1f}")
|
| 142 |
-
print(f"Option B (Gemini Single-Pass):")
|
| 143 |
-
print(f" - Avg Latency: {avg_latency_b:.3f}s")
|
| 144 |
-
print(f" - Avg Tokens: {avg_tokens_b:.1f}")
|
| 145 |
-
print(f"Option C (Raw Distribution + Gemini):")
|
| 146 |
-
print(f" - Avg Latency: {avg_latency_c:.3f}s")
|
| 147 |
-
print(f" - Avg Tokens: {avg_tokens_c:.1f}")
|
| 148 |
-
print(f"Option D (Local-Classifier + Gemini):")
|
| 149 |
-
print(f" - Avg Latency: {avg_latency_d:.3f}s")
|
| 150 |
-
print(f" - Avg Tokens: {avg_tokens_d:.1f}")
|
| 151 |
print("="*50)
|
|
|
|
| 21 |
("frustration", "This word problem about two trains leaving different cities is making my head spin. I hate word problems!"),
|
| 22 |
("frustration", "I don't understand how to convert grams to moles. I keep getting the wrong conversion factor and it's so frustrating!"),
|
| 23 |
("frustration", "I've tried balancing this chemical equation five times and the numbers never match up!"),
|
| 24 |
+
("frustration", "Im trying to draw this ray diagram for a concave lens and the lines are crossing in the wrong place. I give up!"),
|
| 25 |
("frustration", "This physics problem about friction has too many variables and I don't even know where to start!"),
|
| 26 |
# Boredom
|
| 27 |
("boredom", "Ugh, why do we have to learn about sedimentary rocks? They just sit there. Who cares?"),
|
|
|
|
| 74 |
results.append({
|
| 75 |
"category": cat,
|
| 76 |
"query": q,
|
| 77 |
+
"latency": res_data["latency"],
|
| 78 |
+
"tokens": res_data["tokens"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
})
|
| 80 |
+
print(f" Done: {res_data['latency']}s, {res_data['tokens']}t")
|
| 81 |
# Add a small delay between requests
|
| 82 |
time.sleep(1.5)
|
| 83 |
except Exception as e:
|
|
|
|
| 89 |
writer = csv.writer(f)
|
| 90 |
writer.writerow([
|
| 91 |
"Category", "Query",
|
| 92 |
+
"Latency (s)", "Tokens"
|
|
|
|
|
|
|
|
|
|
| 93 |
])
|
| 94 |
|
| 95 |
for r in results:
|
| 96 |
writer.writerow([
|
| 97 |
r["category"], r["query"],
|
| 98 |
+
r["latency"], r["tokens"]
|
|
|
|
|
|
|
|
|
|
| 99 |
])
|
| 100 |
|
| 101 |
print(f"\nResults successfully saved to: {csv_file}")
|
|
|
|
| 103 |
# Calculate averages
|
| 104 |
num_queries = len(results)
|
| 105 |
if num_queries > 0:
|
| 106 |
+
valid_latencies = [r["latency"] for r in results if r["latency"] is not None]
|
| 107 |
+
valid_tokens = [r["tokens"] for r in results if r["tokens"] is not None]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
|
| 109 |
+
avg_latency = sum(valid_latencies) / len(valid_latencies) if valid_latencies else 0
|
| 110 |
+
avg_tokens = sum(valid_tokens) / len(valid_tokens) if valid_tokens else 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
|
| 112 |
print("\n" + "="*50)
|
| 113 |
print("BENCHMARK SUMMARY AVERAGES:")
|
| 114 |
print("="*50)
|
| 115 |
+
print(f"Socratic Tutor (Single-Pass):")
|
| 116 |
+
print(f" - Avg Latency: {avg_latency:.3f}s")
|
| 117 |
+
print(f" - Avg Tokens: {avg_tokens:.1f}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
print("="*50)
|
backend/benchmark_b.py
DELETED
|
@@ -1,118 +0,0 @@
|
|
| 1 |
-
import json
|
| 2 |
-
import urllib.request
|
| 3 |
-
import csv
|
| 4 |
-
import time
|
| 5 |
-
import os
|
| 6 |
-
|
| 7 |
-
samples = [
|
| 8 |
-
# Confusion
|
| 9 |
-
("confusion", "Wait, why does a negative times a negative make a positive? I don't get it."),
|
| 10 |
-
("confusion", "I'm looking at this cell diagram and I can't tell the difference between the cell wall and the cell membrane."),
|
| 11 |
-
("confusion", "Our teacher said the Earth is tilted, but how does that make summer and winter? It doesn't make sense."),
|
| 12 |
-
("confusion", "Is a virus alive or is it not? My textbook says both and I'm really mixed up."),
|
| 13 |
-
("confusion", "What is the difference between a variable and a constant in algebra? I'm lost."),
|
| 14 |
-
("confusion", "Why does dividing by a fraction mean multiplying by its reciprocal? It seems arbitrary."),
|
| 15 |
-
("confusion", "What is the difference between speed and velocity? They sound like the same thing."),
|
| 16 |
-
("confusion", "Why is the mitochondria called the powerhouse of the cell? What does it actually do?"),
|
| 17 |
-
# Frustration
|
| 18 |
-
("frustration", "I've tried to solve this quadratic equation three times using the formula, but I keep getting a negative under the square root!"),
|
| 19 |
-
("frustration", "My science experiment failed again! The volcano didn't bubble at all and I did everything exactly right!"),
|
| 20 |
-
("frustration", "This long division with decimals is taking forever and I keep getting the wrong remainder! I hate this!"),
|
| 21 |
-
("frustration", "This word problem about two trains leaving different cities is making my head spin. I hate word problems!"),
|
| 22 |
-
("frustration", "I don't understand how to convert grams to moles. I keep getting the wrong conversion factor and it's so frustrating!"),
|
| 23 |
-
("frustration", "I've tried balancing this chemical equation five times and the numbers never match up!"),
|
| 24 |
-
("frustration", "Im trying to draw this ray diagram for a concave lens and the lines are crossing in the wrong place. I give up!"),
|
| 25 |
-
("frustration", "This physics problem about friction has too many variables and I don't even know where to start!"),
|
| 26 |
-
# Boredom
|
| 27 |
-
("boredom", "Ugh, why do we have to learn about sedimentary rocks? They just sit there. Who cares?"),
|
| 28 |
-
("boredom", "This math worksheet is just 50 of the same exact addition problems. This is so boring."),
|
| 29 |
-
("boredom", "We are just copying definitions of different math properties from the board. This is so boring."),
|
| 30 |
-
("boredom", "Another lecture on the phases of mitosis... we've covered this three years in a row now."),
|
| 31 |
-
("boredom", "I finished all my science reading early. There's nothing else to do except stare at the wall."),
|
| 32 |
-
("boredom", "We have to measure the temperature of this water every two minutes for an hour. This is so tedious."),
|
| 33 |
-
("boredom", "Calculating the area of twenty slightly different rectangles is putting me to sleep."),
|
| 34 |
-
("boredom", "This lecture on cell organelles is just slides of definitions. I'm falling asleep."),
|
| 35 |
-
# Confidence
|
| 36 |
-
("confidence", "I totally mastered multiplying fractions! Give me a hard practice problem to try!"),
|
| 37 |
-
("confidence", "I just derived the formula for the volume of a sphere all by myself!"),
|
| 38 |
-
("confidence", "I know exactly how to balance any redox reaction now. Try me!"),
|
| 39 |
-
("confidence", "I got a perfect score on the calculus midterm today! I really understand derivatives now!"),
|
| 40 |
-
("confidence", "I can explain the entire water cycle in my sleep! Evaporation, condensation, precipitation, easy!"),
|
| 41 |
-
("confidence", "I just solved the hardest logic puzzle in the workbook on my very first try!"),
|
| 42 |
-
("confidence", "I can calculate the trajectory of a projectile in my head now, it's so easy!"),
|
| 43 |
-
("confidence", "I fully understand how DNA replication works and could draw every step from memory!"),
|
| 44 |
-
# Neutral
|
| 45 |
-
("neutral", "How do I calculate the hypotenuse of a right triangle when the sides are 3 and 4?"),
|
| 46 |
-
("neutral", "What are the three main types of rocks found in the Earth's crust?"),
|
| 47 |
-
("neutral", "Can you explain how photosynthesis converts sunlight into chemical energy?"),
|
| 48 |
-
("neutral", "What is the chemical formula for photosynthesis and cellular respiration?"),
|
| 49 |
-
("neutral", "How do you find the slope of a line from two points on a graph?"),
|
| 50 |
-
("neutral", "What is the difference between an isotope and an ion in chemistry?"),
|
| 51 |
-
("neutral", "Could you list the steps of the scientific method in order?"),
|
| 52 |
-
("neutral", "What is the value of the constant pi, and how is it calculated?")
|
| 53 |
-
]
|
| 54 |
-
|
| 55 |
-
results = []
|
| 56 |
-
|
| 57 |
-
url = "http://127.0.0.1:8000/api/chat"
|
| 58 |
-
|
| 59 |
-
print(f"Starting benchmark targeting Option B only for {len(samples)} sample queries...")
|
| 60 |
-
for idx, (cat, q) in enumerate(samples, 1):
|
| 61 |
-
print(f"[{idx}/{len(samples)}] Query ({cat}): \"{q}\"")
|
| 62 |
-
req_data = json.dumps({"message": q, "selected_option": "B"}).encode('utf-8')
|
| 63 |
-
req = urllib.request.Request(
|
| 64 |
-
url,
|
| 65 |
-
data=req_data,
|
| 66 |
-
headers={'Content-Type': 'application/json'}
|
| 67 |
-
)
|
| 68 |
-
|
| 69 |
-
try:
|
| 70 |
-
with urllib.request.urlopen(req) as response:
|
| 71 |
-
res_data = json.loads(response.read().decode('utf-8'))
|
| 72 |
-
|
| 73 |
-
# Record result
|
| 74 |
-
results.append({
|
| 75 |
-
"category": cat,
|
| 76 |
-
"query": q,
|
| 77 |
-
"latency_b": res_data["latency"],
|
| 78 |
-
"tokens_b": res_data["tokens"]
|
| 79 |
-
})
|
| 80 |
-
print(f" Done: B ({res_data['latency']}s, {res_data['tokens']}t)")
|
| 81 |
-
# Add a small delay between requests
|
| 82 |
-
time.sleep(1.5)
|
| 83 |
-
except Exception as e:
|
| 84 |
-
print(f" Error processing query: {e}")
|
| 85 |
-
|
| 86 |
-
# Save to CSV
|
| 87 |
-
csv_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "benchmark_results_b.csv")
|
| 88 |
-
with open(csv_file, mode="w", newline="", encoding="utf-8") as f:
|
| 89 |
-
writer = csv.writer(f)
|
| 90 |
-
writer.writerow([
|
| 91 |
-
"Category", "Query",
|
| 92 |
-
"Latency B (s)", "Tokens B"
|
| 93 |
-
])
|
| 94 |
-
|
| 95 |
-
for r in results:
|
| 96 |
-
writer.writerow([
|
| 97 |
-
r["category"], r["query"],
|
| 98 |
-
r["latency_b"], r["tokens_b"]
|
| 99 |
-
])
|
| 100 |
-
|
| 101 |
-
print(f"\nResults successfully saved to: {csv_file}")
|
| 102 |
-
|
| 103 |
-
# Calculate averages
|
| 104 |
-
num_queries = len(results)
|
| 105 |
-
if num_queries > 0:
|
| 106 |
-
valid_latencies_b = [r["latency_b"] for r in results if r["latency_b"] is not None]
|
| 107 |
-
valid_tokens_b = [r["tokens_b"] for r in results if r["tokens_b"] is not None]
|
| 108 |
-
|
| 109 |
-
avg_latency_b = sum(valid_latencies_b) / len(valid_latencies_b) if valid_latencies_b else 0
|
| 110 |
-
avg_tokens_b = sum(valid_tokens_b) / len(valid_tokens_b) if valid_tokens_b else 0
|
| 111 |
-
|
| 112 |
-
print("\n" + "="*50)
|
| 113 |
-
print("BENCHMARK SUMMARY AVERAGES (OPTION B ONLY):")
|
| 114 |
-
print("="*50)
|
| 115 |
-
print(f"Option B (Gemini Single-Pass):")
|
| 116 |
-
print(f" - Avg Latency: {avg_latency_b:.3f}s")
|
| 117 |
-
print(f" - Avg Tokens: {avg_tokens_b:.1f}")
|
| 118 |
-
print("="*50)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|