Quincy Hsieh committed on
Commit
929f2ac
·
1 Parent(s): b250e76

Add CO2 emission and token counts

Browse files
Files changed (3) hide show
  1. app.py +31 -45
  2. llm.py +133 -0
  3. requirements.txt +1 -0
app.py CHANGED
@@ -36,6 +36,8 @@ from chromadb.config import Settings
36
  from langchain_text_splitters import RecursiveCharacterTextSplitter
37
  from pypdf import PdfReader
38
 
 
 
39
  logging.basicConfig(level=logging.INFO)
40
  logger = logging.getLogger(__name__)
41
 
@@ -257,44 +259,6 @@ def retrieve_relevant_context(query: str, top_k: int = TOP_K_RESULTS) -> list[di
257
  return contexts
258
 
259
 
260
- def call_llm(prompt: str) -> dict:
261
- """
262
- Make a call to the LLM via Azure Foundry (GPT-5).
263
-
264
- Calls the Azure OpenAI-compatible chat/completions endpoint.
265
- Endpoint URL is loaded from config.json; API key from AZURE_API_KEY env var.
266
-
267
- Returns a dict with 'content' (str) and 'total_tokens' (int).
268
- """
269
- headers = {
270
- "api-key": AZURE_API_KEY,
271
- "Content-Type": "application/json",
272
- }
273
- payload = {
274
- "model": LLM_MODEL_NAME,
275
- "messages": [{"role": "user", "content": prompt}],
276
- "max_completion_tokens": LLM_MAX_TOKENS,
277
- "temperature": LLM_TEMPERATURE,
278
- "top_p": LLM_TOP_P,
279
- }
280
- try:
281
- resp = http_requests.post(LLM_ENDPOINT_URL, headers=headers, json=payload, timeout=None)
282
- resp.raise_for_status()
283
- data = resp.json()
284
- content = data["choices"][0]["message"]["content"].strip()
285
- total_tokens = data.get("usage", {}).get("total_tokens", 0)
286
- return {"content": content, "total_tokens": total_tokens}
287
- except http_requests.exceptions.HTTPError as e:
288
- logger.error(f"LLM API call failed: {e} — {resp.text}")
289
- raise HTTPException(status_code=503, detail=f"LLM service unavailable: {str(e)}")
290
- except (http_requests.exceptions.JSONDecodeError, ValueError) as e:
291
- logger.error(f"LLM API returned non-JSON response (status {resp.status_code}): {repr(resp.text)}")
292
- raise HTTPException(status_code=502, detail="LLM service returned an invalid response")
293
- except (KeyError, IndexError) as e:
294
- logger.error(f"Unexpected LLM response format: {e} — body: {resp.text}")
295
- raise HTTPException(status_code=502, detail="Unexpected response from LLM service")
296
-
297
-
298
  def build_rag_prompt(query: str, contexts: list[dict]) -> str:
299
  """
300
  Construct the RAG prompt by combining retrieved context with the user question.
@@ -337,16 +301,30 @@ def rag_query(query: str, top_k: int = TOP_K_RESULTS) -> dict:
337
  "sources": [],
338
  "explanation": "No documents found in the vector store to retrieve context from.",
339
  "total_token": 0,
 
 
 
 
 
340
  "run_time_in_ms": elapsed_ms,
341
  }
342
 
343
  # Step 2: Build the augmented prompt
344
  prompt = build_rag_prompt(query, contexts)
345
 
346
- # Step 3: Generate answer from LLM
347
- llm_result = call_llm(prompt)
 
 
 
 
 
 
 
 
348
  raw_content = llm_result["content"]
349
- total_token = llm_result["total_tokens"]
 
350
 
351
  # Parse structured JSON response from LLM (handle markdown code fences)
352
  json_str = raw_content.strip()
@@ -370,6 +348,11 @@ def rag_query(query: str, top_k: int = TOP_K_RESULTS) -> dict:
370
  "sources": [{"source": ctx["source"], "score": ctx["similarity_score"], "ref_text": ctx["text"]} for ctx in contexts],
371
  "explanation": explanation,
372
  "total_token": total_token,
 
 
 
 
 
373
  "run_time_in_ms": elapsed_ms,
374
  }
375
 
@@ -491,10 +474,10 @@ async def health_check():
491
  # Step 6: Gradio UI for Interactive Demo
492
  # ---------------------------------------------------------------------------
493
 
494
- def gradio_query(question: str) -> tuple[str, str, str, str]:
495
  """Handle queries from the Gradio chat interface."""
496
  if not question.strip():
497
- return "Please enter a question.", "", "", ""
498
  result = rag_query(question)
499
  sources_text = "\n".join(
500
  f" - {s['source']} (relevance: {s['score']:.2f})" for s in result["sources"]
@@ -502,8 +485,10 @@ def gradio_query(question: str) -> tuple[str, str, str, str]:
502
  answer = f"{result['answer']}\n\n📚 Sources:\n{sources_text}" if result["sources"] else result["answer"]
503
  explanation = result.get("explanation", "")
504
  token_info = str(result.get("total_token", 0))
 
 
505
  run_time = f"{result.get('run_time_in_ms', 0)} ms"
506
- return answer, explanation, token_info, run_time
507
 
508
 
509
  def gradio_ingest(text: str, source_name: str) -> str:
@@ -539,11 +524,12 @@ with gr.Blocks(title="RAG Chat API - Gustave Eiffel Hackathon") as demo:
539
  query_explanation = gr.Textbox(label="Explanation", lines=3, interactive=False)
540
  with gr.Row():
541
  query_tokens = gr.Textbox(label="Total Tokens", interactive=False)
 
542
  query_runtime = gr.Textbox(label="Run Time", interactive=False)
543
  query_button.click(
544
  fn=gradio_query,
545
  inputs=query_input,
546
- outputs=[query_output, query_explanation, query_tokens, query_runtime],
547
  )
548
 
549
  with gr.Tab("📄 Ingest Documents"):
 
36
  from langchain_text_splitters import RecursiveCharacterTextSplitter
37
  from pypdf import PdfReader
38
 
39
+ from llm import call_llm as call_llm_with_metrics
40
+
41
  logging.basicConfig(level=logging.INFO)
42
  logger = logging.getLogger(__name__)
43
 
 
259
  return contexts
260
 
261
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
  def build_rag_prompt(query: str, contexts: list[dict]) -> str:
263
  """
264
  Construct the RAG prompt by combining retrieved context with the user question.
 
301
  "sources": [],
302
  "explanation": "No documents found in the vector store to retrieve context from.",
303
  "total_token": 0,
304
+ "prompt_tokens": 0,
305
+ "completion_tokens": 0,
306
+ "cached_tokens": 0,
307
+ "co2_grams": None,
308
+ "energy_kwh": None,
309
  "run_time_in_ms": elapsed_ms,
310
  }
311
 
312
  # Step 2: Build the augmented prompt
313
  prompt = build_rag_prompt(query, contexts)
314
 
315
+ # Step 3: Generate answer from LLM (with token + CO2 metrics)
316
+ llm_result = call_llm_with_metrics(
317
+ prompt,
318
+ endpoint_url=LLM_ENDPOINT_URL,
319
+ api_key=AZURE_API_KEY,
320
+ model=LLM_MODEL_NAME,
321
+ max_completion_tokens=LLM_MAX_TOKENS,
322
+ temperature=LLM_TEMPERATURE,
323
+ top_p=LLM_TOP_P,
324
+ )
325
  raw_content = llm_result["content"]
326
+ tokens = llm_result["tokens"]
327
+ total_token = tokens["total"]
328
 
329
  # Parse structured JSON response from LLM (handle markdown code fences)
330
  json_str = raw_content.strip()
 
348
  "sources": [{"source": ctx["source"], "score": ctx["similarity_score"], "ref_text": ctx["text"]} for ctx in contexts],
349
  "explanation": explanation,
350
  "total_token": total_token,
351
+ "prompt_tokens": tokens["prompt"],
352
+ "completion_tokens": tokens["completion"],
353
+ "cached_tokens": tokens["cached"],
354
+ "co2_grams": llm_result["co2_grams"],
355
+ "energy_kwh": llm_result["energy_kwh"],
356
  "run_time_in_ms": elapsed_ms,
357
  }
358
 
 
474
  # Step 6: Gradio UI for Interactive Demo
475
  # ---------------------------------------------------------------------------
476
 
477
+ def gradio_query(question: str) -> tuple[str, str, str, str, str]:
478
  """Handle queries from the Gradio chat interface."""
479
  if not question.strip():
480
+ return "Please enter a question.", "", "", "", ""
481
  result = rag_query(question)
482
  sources_text = "\n".join(
483
  f" - {s['source']} (relevance: {s['score']:.2f})" for s in result["sources"]
 
485
  answer = f"{result['answer']}\n\n📚 Sources:\n{sources_text}" if result["sources"] else result["answer"]
486
  explanation = result.get("explanation", "")
487
  token_info = str(result.get("total_token", 0))
488
+ co2_value = result.get("co2_grams")
489
+ co2_info = f"{co2_value:.4f} g" if isinstance(co2_value, (int, float)) else "N/A"
490
  run_time = f"{result.get('run_time_in_ms', 0)} ms"
491
+ return answer, explanation, token_info, co2_info, run_time
492
 
493
 
494
  def gradio_ingest(text: str, source_name: str) -> str:
 
524
  query_explanation = gr.Textbox(label="Explanation", lines=3, interactive=False)
525
  with gr.Row():
526
  query_tokens = gr.Textbox(label="Total Tokens", interactive=False)
527
+ query_co2 = gr.Textbox(label="CO2 Emission", interactive=False)
528
  query_runtime = gr.Textbox(label="Run Time", interactive=False)
529
  query_button.click(
530
  fn=gradio_query,
531
  inputs=query_input,
532
+ outputs=[query_output, query_explanation, query_tokens, query_co2, query_runtime],
533
  )
534
 
535
  with gr.Tab("📄 Ingest Documents"):
llm.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LLM wrapper with token accounting and CO2 emission estimation.
3
+
4
+ Wraps an Azure OpenAI-compatible chat/completions call and returns:
5
+ - content: the generated text
6
+ - tokens: prompt / completion / cached / total
7
+ - energy_kwh, co2_grams: environmental impact for the call
8
+
9
+ CO2 emissions are estimated with `ecologits`, which uses a model registry
10
+ (parameter counts, hardware assumptions) plus output token count and request
11
+ latency to compute global warming potential (kgCO2eq) and energy (kWh).
12
+ We chose `ecologits` over `codecarbon` because the LLM runs on a remote
13
+ endpoint — `codecarbon` measures local process energy and would only count
14
+ the client overhead, not the inference itself.
15
+ """
16
+
17
+ import logging
18
+ import time
19
+ from typing import Optional
20
+
21
+ import requests
22
+ from fastapi import HTTPException
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+ try:
27
+ from ecologits.tracers.utils import llm_impacts
28
+ _ECOLOGITS_AVAILABLE = True
29
+ except ImportError:
30
+ _ECOLOGITS_AVAILABLE = False
31
+ logger.warning("ecologits not installed — CO2 emission will be reported as None.")
32
+
33
+
34
def call_llm(
    prompt: str,
    *,
    endpoint_url: str,
    api_key: str,
    model: str,
    max_completion_tokens: int = 512,
    temperature: float = 0.7,
    top_p: float = 0.95,
    provider: str = "openai",
    timeout: Optional[float] = None,
) -> dict:
    """
    Call an Azure OpenAI-compatible chat/completions endpoint and return the
    response together with token counts and a CO2 emission estimate.

    Args:
        prompt: Text sent as the single ``user`` message.
        endpoint_url: Full URL of the chat/completions endpoint.
        api_key: Value sent in the ``api-key`` request header.
        model: Model name placed in the payload and passed to ecologits.
        max_completion_tokens: Forwarded unchanged in the request payload.
        temperature: Forwarded unchanged in the request payload.
        top_p: Forwarded unchanged in the request payload.
        provider: Provider name passed to ``llm_impacts``.
        timeout: Passed to ``requests.post``; ``None`` means no timeout.

    Returns a dict:
        {
            "content": str,
            "tokens": {
                "prompt": int,
                "completion": int,
                "cached": int,
                "total": int,
            },
            "energy_kwh": float | None,
            "co2_grams": float | None,
            "latency_s": float,
        }

    ``energy_kwh`` and ``co2_grams`` stay ``None`` when ecologits is not
    installed, returns no impacts, or fails.

    Raises:
        HTTPException: 503 when the request cannot be sent or the endpoint
            answers with an HTTP error status; 502 when the body is not JSON
            or does not contain ``choices[0].message.content`` as text.
    """
    headers = {
        "api-key": api_key,
        "Content-Type": "application/json",
    }
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "max_completion_tokens": max_completion_tokens,
        "temperature": temperature,
        "top_p": top_p,
    }

    start = time.perf_counter()

    # Transport failures (connection refused, timeout, malformed URL, ...) are
    # handled separately: no response object exists yet, so the handlers
    # further down must never run for them.
    try:
        resp = requests.post(endpoint_url, headers=headers, json=payload, timeout=timeout)
    except requests.exceptions.RequestException as e:
        logger.error("LLM API request could not be completed: %s", e)
        raise HTTPException(status_code=503, detail=f"LLM service unavailable: {str(e)}") from e

    try:
        resp.raise_for_status()
        data = resp.json()
    except requests.exceptions.HTTPError as e:
        logger.error("LLM API call failed: %s — %s", e, resp.text)
        raise HTTPException(status_code=503, detail=f"LLM service unavailable: {str(e)}") from e
    except (requests.exceptions.JSONDecodeError, ValueError) as e:
        logger.error(
            "LLM API returned non-JSON response (status %s): %r", resp.status_code, resp.text
        )
        raise HTTPException(
            status_code=502, detail="LLM service returned an invalid response"
        ) from e
    latency_s = time.perf_counter() - start

    # TypeError/AttributeError cover a JSON body that is not an object, a null
    # "choices", or a null "content" — all are malformed responses, not crashes.
    try:
        content = data["choices"][0]["message"]["content"].strip()
    except (KeyError, IndexError, TypeError, AttributeError) as e:
        logger.error("Unexpected LLM response format: %s — body: %r", e, data)
        raise HTTPException(
            status_code=502, detail="Unexpected response from LLM service"
        ) from e

    usage = data.get("usage")
    if not isinstance(usage, dict):
        usage = {}
    # "or 0" also normalises counters that are present but null.
    prompt_tokens = usage.get("prompt_tokens") or 0
    completion_tokens = usage.get("completion_tokens") or 0
    total_tokens = usage.get("total_tokens")
    if total_tokens is None:
        total_tokens = prompt_tokens + completion_tokens

    prompt_details = usage.get("prompt_tokens_details")
    if not isinstance(prompt_details, dict):
        prompt_details = {}
    # Prefer the nested counter, fall back to a top-level one, else zero.
    cached_tokens = prompt_details.get("cached_tokens")
    if cached_tokens is None:
        cached_tokens = usage.get("cached_tokens") or 0

    energy_kwh: Optional[float] = None
    co2_grams: Optional[float] = None
    if _ECOLOGITS_AVAILABLE:
        # Best effort by design: an impact-estimation failure must not fail
        # the request, so it is logged and the metrics remain None.
        try:
            impacts = llm_impacts(
                provider=provider,
                model_name=model,
                output_token_count=completion_tokens,
                request_latency=latency_s,
            )
            if impacts is not None:
                energy_kwh = float(impacts.energy.value)
                # ecologits returns gwp in kgCO2eq; convert to grams
                co2_grams = float(impacts.gwp.value) * 1000.0
        except Exception as e:
            logger.warning("ecologits impact calc failed for model=%s: %s", model, e)

    return {
        "content": content,
        "tokens": {
            "prompt": prompt_tokens,
            "completion": completion_tokens,
            "cached": cached_tokens,
            "total": total_tokens,
        },
        "energy_kwh": energy_kwh,
        "co2_grams": co2_grams,
        "latency_s": latency_s,
    }
requirements.txt CHANGED
@@ -12,3 +12,4 @@ pypdf==4.3.0
12
  python-multipart==0.0.9
13
  pydantic==2.9.0
14
  requests>=2.31.0
 
 
12
  python-multipart==0.0.9
13
  pydantic==2.9.0
14
  requests>=2.31.0
15
+ ecologits>=0.5.0