3v324v23 committed on
Commit
177eb47
·
1 Parent(s): 776cb9b

Keep only one benchmark.py and delete benchmark_b.py

Browse files
Files changed (2) hide show
  1. backend/benchmark.py +13 -46
  2. backend/benchmark_b.py +0 -118
backend/benchmark.py CHANGED
@@ -21,7 +21,7 @@ samples = [
21
  ("frustration", "This word problem about two trains leaving different cities is making my head spin. I hate word problems!"),
22
  ("frustration", "I don't understand how to convert grams to moles. I keep getting the wrong conversion factor and it's so frustrating!"),
23
  ("frustration", "I've tried balancing this chemical equation five times and the numbers never match up!"),
24
- ("frustration", "I'm trying to draw this ray diagram for a concave lens and the lines are crossing in the wrong place. I give up!"),
25
  ("frustration", "This physics problem about friction has too many variables and I don't even know where to start!"),
26
  # Boredom
27
  ("boredom", "Ugh, why do we have to learn about sedimentary rocks? They just sit there. Who cares?"),
@@ -74,16 +74,10 @@ for idx, (cat, q) in enumerate(samples, 1):
74
  results.append({
75
  "category": cat,
76
  "query": q,
77
- "latency_a": res_data["latency_a"],
78
- "tokens_a": res_data["tokens_a"],
79
- "latency_b": res_data["latency_b"],
80
- "tokens_b": res_data["tokens_b"],
81
- "latency_c": res_data["latency_c"],
82
- "tokens_c": res_data["tokens_c"],
83
- "latency_d": res_data["latency_d"],
84
- "tokens_d": res_data["tokens_d"]
85
  })
86
- print(f" Done: A ({res_data['latency_a']}s, {res_data['tokens_a']}t) | B ({res_data['latency_b']}s, {res_data['tokens_b']}t) | C ({res_data['latency_c']}s, {res_data['tokens_c']}t) | D ({res_data['latency_d']}s, {res_data['tokens_d']}t)")
87
  # Add a small delay between requests
88
  time.sleep(1.5)
89
  except Exception as e:
@@ -95,19 +89,13 @@ with open(csv_file, mode="w", newline="", encoding="utf-8") as f:
95
  writer = csv.writer(f)
96
  writer.writerow([
97
  "Category", "Query",
98
- "Latency A (s)", "Tokens A",
99
- "Latency B (s)", "Tokens B",
100
- "Latency C (s)", "Tokens C",
101
- "Latency D (s)", "Tokens D"
102
  ])
103
 
104
  for r in results:
105
  writer.writerow([
106
  r["category"], r["query"],
107
- r["latency_a"], r["tokens_a"],
108
- r["latency_b"], r["tokens_b"],
109
- r["latency_c"], r["tokens_c"],
110
- r["latency_d"], r["tokens_d"]
111
  ])
112
 
113
  print(f"\nResults successfully saved to: {csv_file}")
@@ -115,37 +103,16 @@ print(f"\nResults successfully saved to: {csv_file}")
115
  # Calculate averages
116
  num_queries = len(results)
117
  if num_queries > 0:
118
- valid_latencies_a = [r["latency_a"] for r in results if r["latency_a"] is not None]
119
- valid_tokens_a = [r["tokens_a"] for r in results if r["tokens_a"] is not None]
120
- valid_latencies_b = [r["latency_b"] for r in results if r["latency_b"] is not None]
121
- valid_tokens_b = [r["tokens_b"] for r in results if r["tokens_b"] is not None]
122
- valid_latencies_c = [r["latency_c"] for r in results if r["latency_c"] is not None]
123
- valid_tokens_c = [r["tokens_c"] for r in results if r["tokens_c"] is not None]
124
- valid_latencies_d = [r["latency_d"] for r in results if r["latency_d"] is not None]
125
- valid_tokens_d = [r["tokens_d"] for r in results if r["tokens_d"] is not None]
126
 
127
- avg_latency_a = sum(valid_latencies_a) / len(valid_latencies_a) if valid_latencies_a else 0
128
- avg_tokens_a = sum(valid_tokens_a) / len(valid_tokens_a) if valid_tokens_a else 0
129
- avg_latency_b = sum(valid_latencies_b) / len(valid_latencies_b) if valid_latencies_b else 0
130
- avg_tokens_b = sum(valid_tokens_b) / len(valid_tokens_b) if valid_tokens_b else 0
131
- avg_latency_c = sum(valid_latencies_c) / len(valid_latencies_c) if valid_latencies_c else 0
132
- avg_tokens_c = sum(valid_tokens_c) / len(valid_tokens_c) if valid_tokens_c else 0
133
- avg_latency_d = sum(valid_latencies_d) / len(valid_latencies_d) if valid_latencies_d else 0
134
- avg_tokens_d = sum(valid_tokens_d) / len(valid_tokens_d) if valid_tokens_d else 0
135
 
136
  print("\n" + "="*50)
137
  print("BENCHMARK SUMMARY AVERAGES:")
138
  print("="*50)
139
- print(f"Option A (Gemini LLM-Classifier):")
140
- print(f" - Avg Latency: {avg_latency_a:.3f}s")
141
- print(f" - Avg Tokens: {avg_tokens_a:.1f}")
142
- print(f"Option B (Gemini Single-Pass):")
143
- print(f" - Avg Latency: {avg_latency_b:.3f}s")
144
- print(f" - Avg Tokens: {avg_tokens_b:.1f}")
145
- print(f"Option C (Raw Distribution + Gemini):")
146
- print(f" - Avg Latency: {avg_latency_c:.3f}s")
147
- print(f" - Avg Tokens: {avg_tokens_c:.1f}")
148
- print(f"Option D (Local-Classifier + Gemini):")
149
- print(f" - Avg Latency: {avg_latency_d:.3f}s")
150
- print(f" - Avg Tokens: {avg_tokens_d:.1f}")
151
  print("="*50)
 
21
  ("frustration", "This word problem about two trains leaving different cities is making my head spin. I hate word problems!"),
22
  ("frustration", "I don't understand how to convert grams to moles. I keep getting the wrong conversion factor and it's so frustrating!"),
23
  ("frustration", "I've tried balancing this chemical equation five times and the numbers never match up!"),
24
+ ("frustration", "Im trying to draw this ray diagram for a concave lens and the lines are crossing in the wrong place. I give up!"),
25
  ("frustration", "This physics problem about friction has too many variables and I don't even know where to start!"),
26
  # Boredom
27
  ("boredom", "Ugh, why do we have to learn about sedimentary rocks? They just sit there. Who cares?"),
 
74
  results.append({
75
  "category": cat,
76
  "query": q,
77
+ "latency": res_data["latency"],
78
+ "tokens": res_data["tokens"]
 
 
 
 
 
 
79
  })
80
+ print(f" Done: {res_data['latency']}s, {res_data['tokens']}t")
81
  # Add a small delay between requests
82
  time.sleep(1.5)
83
  except Exception as e:
 
89
  writer = csv.writer(f)
90
  writer.writerow([
91
  "Category", "Query",
92
+ "Latency (s)", "Tokens"
 
 
 
93
  ])
94
 
95
  for r in results:
96
  writer.writerow([
97
  r["category"], r["query"],
98
+ r["latency"], r["tokens"]
 
 
 
99
  ])
100
 
101
  print(f"\nResults successfully saved to: {csv_file}")
 
103
  # Calculate averages
104
  num_queries = len(results)
105
  if num_queries > 0:
106
+ valid_latencies = [r["latency"] for r in results if r["latency"] is not None]
107
+ valid_tokens = [r["tokens"] for r in results if r["tokens"] is not None]
 
 
 
 
 
 
108
 
109
+ avg_latency = sum(valid_latencies) / len(valid_latencies) if valid_latencies else 0
110
+ avg_tokens = sum(valid_tokens) / len(valid_tokens) if valid_tokens else 0
 
 
 
 
 
 
111
 
112
  print("\n" + "="*50)
113
  print("BENCHMARK SUMMARY AVERAGES:")
114
  print("="*50)
115
+ print(f"Socratic Tutor (Single-Pass):")
116
+ print(f" - Avg Latency: {avg_latency:.3f}s")
117
+ print(f" - Avg Tokens: {avg_tokens:.1f}")
 
 
 
 
 
 
 
 
 
118
  print("="*50)
backend/benchmark_b.py DELETED
@@ -1,118 +0,0 @@
1
- import json
2
- import urllib.request
3
- import csv
4
- import time
5
- import os
6
-
7
- samples = [
8
- # Confusion
9
- ("confusion", "Wait, why does a negative times a negative make a positive? I don't get it."),
10
- ("confusion", "I'm looking at this cell diagram and I can't tell the difference between the cell wall and the cell membrane."),
11
- ("confusion", "Our teacher said the Earth is tilted, but how does that make summer and winter? It doesn't make sense."),
12
- ("confusion", "Is a virus alive or is it not? My textbook says both and I'm really mixed up."),
13
- ("confusion", "What is the difference between a variable and a constant in algebra? I'm lost."),
14
- ("confusion", "Why does dividing by a fraction mean multiplying by its reciprocal? It seems arbitrary."),
15
- ("confusion", "What is the difference between speed and velocity? They sound like the same thing."),
16
- ("confusion", "Why is the mitochondria called the powerhouse of the cell? What does it actually do?"),
17
- # Frustration
18
- ("frustration", "I've tried to solve this quadratic equation three times using the formula, but I keep getting a negative under the square root!"),
19
- ("frustration", "My science experiment failed again! The volcano didn't bubble at all and I did everything exactly right!"),
20
- ("frustration", "This long division with decimals is taking forever and I keep getting the wrong remainder! I hate this!"),
21
- ("frustration", "This word problem about two trains leaving different cities is making my head spin. I hate word problems!"),
22
- ("frustration", "I don't understand how to convert grams to moles. I keep getting the wrong conversion factor and it's so frustrating!"),
23
- ("frustration", "I've tried balancing this chemical equation five times and the numbers never match up!"),
24
- ("frustration", "Im trying to draw this ray diagram for a concave lens and the lines are crossing in the wrong place. I give up!"),
25
- ("frustration", "This physics problem about friction has too many variables and I don't even know where to start!"),
26
- # Boredom
27
- ("boredom", "Ugh, why do we have to learn about sedimentary rocks? They just sit there. Who cares?"),
28
- ("boredom", "This math worksheet is just 50 of the same exact addition problems. This is so boring."),
29
- ("boredom", "We are just copying definitions of different math properties from the board. This is so boring."),
30
- ("boredom", "Another lecture on the phases of mitosis... we've covered this three years in a row now."),
31
- ("boredom", "I finished all my science reading early. There's nothing else to do except stare at the wall."),
32
- ("boredom", "We have to measure the temperature of this water every two minutes for an hour. This is so tedious."),
33
- ("boredom", "Calculating the area of twenty slightly different rectangles is putting me to sleep."),
34
- ("boredom", "This lecture on cell organelles is just slides of definitions. I'm falling asleep."),
35
- # Confidence
36
- ("confidence", "I totally mastered multiplying fractions! Give me a hard practice problem to try!"),
37
- ("confidence", "I just derived the formula for the volume of a sphere all by myself!"),
38
- ("confidence", "I know exactly how to balance any redox reaction now. Try me!"),
39
- ("confidence", "I got a perfect score on the calculus midterm today! I really understand derivatives now!"),
40
- ("confidence", "I can explain the entire water cycle in my sleep! Evaporation, condensation, precipitation, easy!"),
41
- ("confidence", "I just solved the hardest logic puzzle in the workbook on my very first try!"),
42
- ("confidence", "I can calculate the trajectory of a projectile in my head now, it's so easy!"),
43
- ("confidence", "I fully understand how DNA replication works and could draw every step from memory!"),
44
- # Neutral
45
- ("neutral", "How do I calculate the hypotenuse of a right triangle when the sides are 3 and 4?"),
46
- ("neutral", "What are the three main types of rocks found in the Earth's crust?"),
47
- ("neutral", "Can you explain how photosynthesis converts sunlight into chemical energy?"),
48
- ("neutral", "What is the chemical formula for photosynthesis and cellular respiration?"),
49
- ("neutral", "How do you find the slope of a line from two points on a graph?"),
50
- ("neutral", "What is the difference between an isotope and an ion in chemistry?"),
51
- ("neutral", "Could you list the steps of the scientific method in order?"),
52
- ("neutral", "What is the value of the constant pi, and how is it calculated?")
53
- ]
54
-
55
- results = []
56
-
57
- url = "http://127.0.0.1:8000/api/chat"
58
-
59
- print(f"Starting benchmark targeting Option B only for {len(samples)} sample queries...")
60
- for idx, (cat, q) in enumerate(samples, 1):
61
- print(f"[{idx}/{len(samples)}] Query ({cat}): \"{q}\"")
62
- req_data = json.dumps({"message": q, "selected_option": "B"}).encode('utf-8')
63
- req = urllib.request.Request(
64
- url,
65
- data=req_data,
66
- headers={'Content-Type': 'application/json'}
67
- )
68
-
69
- try:
70
- with urllib.request.urlopen(req) as response:
71
- res_data = json.loads(response.read().decode('utf-8'))
72
-
73
- # Record result
74
- results.append({
75
- "category": cat,
76
- "query": q,
77
- "latency_b": res_data["latency"],
78
- "tokens_b": res_data["tokens"]
79
- })
80
- print(f" Done: B ({res_data['latency']}s, {res_data['tokens']}t)")
81
- # Add a small delay between requests
82
- time.sleep(1.5)
83
- except Exception as e:
84
- print(f" Error processing query: {e}")
85
-
86
- # Save to CSV
87
- csv_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "benchmark_results_b.csv")
88
- with open(csv_file, mode="w", newline="", encoding="utf-8") as f:
89
- writer = csv.writer(f)
90
- writer.writerow([
91
- "Category", "Query",
92
- "Latency B (s)", "Tokens B"
93
- ])
94
-
95
- for r in results:
96
- writer.writerow([
97
- r["category"], r["query"],
98
- r["latency_b"], r["tokens_b"]
99
- ])
100
-
101
- print(f"\nResults successfully saved to: {csv_file}")
102
-
103
- # Calculate averages
104
- num_queries = len(results)
105
- if num_queries > 0:
106
- valid_latencies_b = [r["latency_b"] for r in results if r["latency_b"] is not None]
107
- valid_tokens_b = [r["tokens_b"] for r in results if r["tokens_b"] is not None]
108
-
109
- avg_latency_b = sum(valid_latencies_b) / len(valid_latencies_b) if valid_latencies_b else 0
110
- avg_tokens_b = sum(valid_tokens_b) / len(valid_tokens_b) if valid_tokens_b else 0
111
-
112
- print("\n" + "="*50)
113
- print("BENCHMARK SUMMARY AVERAGES (OPTION B ONLY):")
114
- print("="*50)
115
- print(f"Option B (Gemini Single-Pass):")
116
- print(f" - Avg Latency: {avg_latency_b:.3f}s")
117
- print(f" - Avg Tokens: {avg_tokens_b:.1f}")
118
- print("="*50)