Spaces:

lisapin
/

hackathon

Sleeping

App Files Files Community

Quincy Hsieh committed on Jun 2

Commit

929f2ac

1 Parent(s): b250e76

Add CO2 emission and token counts

Browse files

Files changed (3) hide show

app.py +31 -45
llm.py +133 -0
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -36,6 +36,8 @@ from chromadb.config import Settings
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from pypdf import PdfReader
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -257,44 +259,6 @@ def retrieve_relevant_context(query: str, top_k: int = TOP_K_RESULTS) -> list[di
     return contexts
-def call_llm(prompt: str) -> dict:
-    """
-    Make a call to the LLM via Azure Foundry (GPT-5).
-    Calls the Azure OpenAI-compatible chat/completions endpoint.
-    Endpoint URL is loaded from config.json; API key from AZURE_API_KEY env var.
-    Returns a dict with 'content' (str) and 'total_tokens' (int).
-    """
-    headers = {
-        "api-key": AZURE_API_KEY,
-        "Content-Type": "application/json",
-    }
-    payload = {
-        "model": LLM_MODEL_NAME,
-        "messages": [{"role": "user", "content": prompt}],
-        "max_completion_tokens": LLM_MAX_TOKENS,
-        "temperature": LLM_TEMPERATURE,
-        "top_p": LLM_TOP_P,
-    }
-    try:
-        resp = http_requests.post(LLM_ENDPOINT_URL, headers=headers, json=payload, timeout=None)
-        resp.raise_for_status()
-        data = resp.json()
-        content = data["choices"][0]["message"]["content"].strip()
-        total_tokens = data.get("usage", {}).get("total_tokens", 0)
-        return {"content": content, "total_tokens": total_tokens}
-    except http_requests.exceptions.HTTPError as e:
-        logger.error(f"LLM API call failed: {e} — {resp.text}")
-        raise HTTPException(status_code=503, detail=f"LLM service unavailable: {str(e)}")
-    except (http_requests.exceptions.JSONDecodeError, ValueError) as e:
-        logger.error(f"LLM API returned non-JSON response (status {resp.status_code}): {repr(resp.text)}")
-        raise HTTPException(status_code=502, detail="LLM service returned an invalid response")
-    except (KeyError, IndexError) as e:
-        logger.error(f"Unexpected LLM response format: {e} — body: {resp.text}")
-        raise HTTPException(status_code=502, detail="Unexpected response from LLM service")
 def build_rag_prompt(query: str, contexts: list[dict]) -> str:
     """
     Construct the RAG prompt by combining retrieved context with the user question.
@@ -337,16 +301,30 @@ def rag_query(query: str, top_k: int = TOP_K_RESULTS) -> dict:
             "sources": [],
             "explanation": "No documents found in the vector store to retrieve context from.",
             "total_token": 0,
             "run_time_in_ms": elapsed_ms,
         }
     # Step 2: Build the augmented prompt
     prompt = build_rag_prompt(query, contexts)
-    # Step 3: Generate answer from LLM
-    llm_result = call_llm(prompt)
     raw_content = llm_result["content"]
-    total_token = llm_result["total_tokens"]
     # Parse structured JSON response from LLM (handle markdown code fences)
     json_str = raw_content.strip()
@@ -370,6 +348,11 @@ def rag_query(query: str, top_k: int = TOP_K_RESULTS) -> dict:
         "sources": [{"source": ctx["source"], "score": ctx["similarity_score"], "ref_text": ctx["text"]} for ctx in contexts],
         "explanation": explanation,
         "total_token": total_token,
         "run_time_in_ms": elapsed_ms,
     }
@@ -491,10 +474,10 @@ async def health_check():
 # Step 6: Gradio UI for Interactive Demo
 # ---------------------------------------------------------------------------
-def gradio_query(question: str) -> tuple[str, str, str, str]:
     """Handle queries from the Gradio chat interface."""
     if not question.strip():
-        return "Please enter a question.", "", "", ""
     result = rag_query(question)
     sources_text = "\n".join(
         f"  - {s['source']} (relevance: {s['score']:.2f})" for s in result["sources"]
@@ -502,8 +485,10 @@ def gradio_query(question: str) -> tuple[str, str, str, str]:
     answer = f"{result['answer']}\n\n📚 Sources:\n{sources_text}" if result["sources"] else result["answer"]
     explanation = result.get("explanation", "")
     token_info = str(result.get("total_token", 0))
     run_time = f"{result.get('run_time_in_ms', 0)} ms"
-    return answer, explanation, token_info, run_time
 def gradio_ingest(text: str, source_name: str) -> str:
@@ -539,11 +524,12 @@ with gr.Blocks(title="RAG Chat API - Gustave Eiffel Hackathon") as demo:
         query_explanation = gr.Textbox(label="Explanation", lines=3, interactive=False)
         with gr.Row():
             query_tokens = gr.Textbox(label="Total Tokens", interactive=False)
             query_runtime = gr.Textbox(label="Run Time", interactive=False)
         query_button.click(
             fn=gradio_query,
             inputs=query_input,
-            outputs=[query_output, query_explanation, query_tokens, query_runtime],
         )
     with gr.Tab("📄 Ingest Documents"):

 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from pypdf import PdfReader
+from llm import call_llm as call_llm_with_metrics
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
     return contexts
 def build_rag_prompt(query: str, contexts: list[dict]) -> str:
     """
     Construct the RAG prompt by combining retrieved context with the user question.
             "sources": [],
             "explanation": "No documents found in the vector store to retrieve context from.",
             "total_token": 0,
+            "prompt_tokens": 0,
+            "completion_tokens": 0,
+            "cached_tokens": 0,
+            "co2_grams": None,
+            "energy_kwh": None,
             "run_time_in_ms": elapsed_ms,
         }
     # Step 2: Build the augmented prompt
     prompt = build_rag_prompt(query, contexts)
+    # Step 3: Generate answer from LLM (with token + CO2 metrics)
+    llm_result = call_llm_with_metrics(
+        prompt,
+        endpoint_url=LLM_ENDPOINT_URL,
+        api_key=AZURE_API_KEY,
+        model=LLM_MODEL_NAME,
+        max_completion_tokens=LLM_MAX_TOKENS,
+        temperature=LLM_TEMPERATURE,
+        top_p=LLM_TOP_P,
+    )
     raw_content = llm_result["content"]
+    tokens = llm_result["tokens"]
+    total_token = tokens["total"]
     # Parse structured JSON response from LLM (handle markdown code fences)
     json_str = raw_content.strip()
         "sources": [{"source": ctx["source"], "score": ctx["similarity_score"], "ref_text": ctx["text"]} for ctx in contexts],
         "explanation": explanation,
         "total_token": total_token,
+        "prompt_tokens": tokens["prompt"],
+        "completion_tokens": tokens["completion"],
+        "cached_tokens": tokens["cached"],
+        "co2_grams": llm_result["co2_grams"],
+        "energy_kwh": llm_result["energy_kwh"],
         "run_time_in_ms": elapsed_ms,
     }
 # Step 6: Gradio UI for Interactive Demo
 # ---------------------------------------------------------------------------
+def gradio_query(question: str) -> tuple[str, str, str, str, str]:
     """Handle queries from the Gradio chat interface."""
     if not question.strip():
+        return "Please enter a question.", "", "", "", ""
     result = rag_query(question)
     sources_text = "\n".join(
         f"  - {s['source']} (relevance: {s['score']:.2f})" for s in result["sources"]
     answer = f"{result['answer']}\n\n📚 Sources:\n{sources_text}" if result["sources"] else result["answer"]
     explanation = result.get("explanation", "")
     token_info = str(result.get("total_token", 0))
+    co2_value = result.get("co2_grams")
+    co2_info = f"{co2_value:.4f} g" if isinstance(co2_value, (int, float)) else "N/A"
     run_time = f"{result.get('run_time_in_ms', 0)} ms"
+    return answer, explanation, token_info, co2_info, run_time
 def gradio_ingest(text: str, source_name: str) -> str:
         query_explanation = gr.Textbox(label="Explanation", lines=3, interactive=False)
         with gr.Row():
             query_tokens = gr.Textbox(label="Total Tokens", interactive=False)
+            query_co2 = gr.Textbox(label="CO2 Emission", interactive=False)
             query_runtime = gr.Textbox(label="Run Time", interactive=False)
         query_button.click(
             fn=gradio_query,
             inputs=query_input,
+            outputs=[query_output, query_explanation, query_tokens, query_co2, query_runtime],
         )
     with gr.Tab("📄 Ingest Documents"):

llm.py ADDED Viewed

	@@ -0,0 +1,133 @@

+"""
+LLM wrapper with token accounting and CO2 emission estimation.
+Wraps an Azure OpenAI-compatible chat/completions call and returns:
+- content: the generated text
+- tokens: prompt / completion / cached / total
+- energy_kwh, co2_grams: environmental impact for the call
+CO2 emissions are estimated with `ecologits`, which uses a model registry
+(parameter counts, hardware assumptions) plus output token count and request
+latency to compute global warming potential (kgCO2eq) and energy (kWh).
+We chose `ecologits` over `codecarbon` because the LLM runs on a remote
+endpoint — `codecarbon` measures local process energy and would only count
+the client overhead, not the inference itself.
+"""
+import logging
+import time
+from typing import Optional
+import requests
+from fastapi import HTTPException
+logger = logging.getLogger(__name__)
+try:
+    from ecologits.tracers.utils import llm_impacts
+    _ECOLOGITS_AVAILABLE = True
+except ImportError:
+    _ECOLOGITS_AVAILABLE = False
+    logger.warning("ecologits not installed — CO2 emission will be reported as None.")
+def call_llm(
+    prompt: str,
+    *,
+    endpoint_url: str,
+    api_key: str,
+    model: str,
+    max_completion_tokens: int = 512,
+    temperature: float = 0.7,
+    top_p: float = 0.95,
+    provider: str = "openai",
+    timeout: Optional[float] = None,
+) -> dict:
+    """
+    Call an Azure OpenAI-compatible chat/completions endpoint and return the
+    response together with token counts and CO2 emission estimate.
+    Args:
+        prompt: Text sent as the single user message.
+        endpoint_url: Full URL of the chat/completions endpoint.
+        api_key: Value sent in the ``api-key`` request header.
+        model: Model name placed in the payload and passed to ecologits.
+        max_completion_tokens: Forwarded unchanged in the request payload.
+        temperature: Forwarded unchanged in the request payload.
+        top_p: Forwarded unchanged in the request payload.
+        provider: Provider name passed to ecologits' ``llm_impacts``.
+        timeout: Passed to ``requests.post``; ``None`` means wait indefinitely.
+    Returns a dict:
+        {
+            "content": str,
+            "tokens": {
+                "prompt": int,
+                "completion": int,
+                "cached": int,
+                "total": int,
+            },
+            "energy_kwh": float | None,
+            "co2_grams": float | None,
+            "latency_s": float,
+        }
+    ``energy_kwh`` and ``co2_grams`` stay ``None`` when ecologits is not
+    installed, returns no impacts, or raises while computing them.
+    Raises:
+        HTTPException: 503 when the request cannot be completed or the
+            endpoint answers with an HTTP error status; 502 when the body is
+            not JSON or does not have the expected chat/completions shape.
+    """
+    headers = {
+        "api-key": api_key,
+        "Content-Type": "application/json",
+    }
+    payload = {
+        "model": model,
+        "messages": [{"role": "user", "content": prompt}],
+        "max_completion_tokens": max_completion_tokens,
+        "temperature": temperature,
+        "top_p": top_p,
+    }
+    start = time.perf_counter()
+    # The network call and the JSON decoding are guarded separately: several
+    # requests errors raised *before* a response exists (MissingSchema,
+    # InvalidURL, ...) also derive from ValueError, so a shared handler that
+    # reads `resp` would fail with UnboundLocalError instead of reporting them.
+    try:
+        resp = requests.post(endpoint_url, headers=headers, json=payload, timeout=timeout)
+        resp.raise_for_status()
+    except requests.exceptions.HTTPError as e:
+        # raise_for_status() only runs after `resp` is bound, so it is safe here.
+        logger.error("LLM API call failed: %s — %s", e, resp.text)
+        raise HTTPException(status_code=503, detail=f"LLM service unavailable: {str(e)}") from e
+    except requests.exceptions.RequestException as e:
+        # Connection failure, timeout, invalid URL, ...: no response to log.
+        logger.error("LLM API request could not be completed: %s", e)
+        raise HTTPException(status_code=503, detail=f"LLM service unavailable: {str(e)}") from e
+    try:
+        data = resp.json()
+    except (requests.exceptions.JSONDecodeError, ValueError) as e:
+        logger.error(
+            "LLM API returned non-JSON response (status %s): %r", resp.status_code, resp.text
+        )
+        raise HTTPException(
+            status_code=502, detail="LLM service returned an invalid response"
+        ) from e
+    latency_s = time.perf_counter() - start
+    try:
+        content = data["choices"][0]["message"]["content"].strip()
+    except (KeyError, IndexError, TypeError, AttributeError) as e:
+        # TypeError / AttributeError cover a non-dict JSON body and a null
+        # "content" field, which would otherwise escape as an unhandled error.
+        logger.error("Unexpected LLM response format: %s — body: %r", e, data)
+        raise HTTPException(
+            status_code=502, detail="Unexpected response from LLM service"
+        ) from e
+    usage = data.get("usage") or {}
+    # `or 0` also normalises explicit nulls, which dict.get's default does not.
+    prompt_tokens = usage.get("prompt_tokens") or 0
+    completion_tokens = usage.get("completion_tokens") or 0
+    total_tokens = usage.get("total_tokens")
+    if total_tokens is None:
+        total_tokens = prompt_tokens + completion_tokens
+    prompt_details = usage.get("prompt_tokens_details") or {}
+    cached_tokens = prompt_details.get("cached_tokens")
+    if cached_tokens is None:
+        # Fall back to a top-level "cached_tokens" field when no nested one exists.
+        cached_tokens = usage.get("cached_tokens") or 0
+    energy_kwh: Optional[float] = None
+    co2_grams: Optional[float] = None
+    if _ECOLOGITS_AVAILABLE:
+        # Best effort by design: a failed estimate must not fail the LLM call.
+        try:
+            impacts = llm_impacts(
+                provider=provider,
+                model_name=model,
+                output_token_count=completion_tokens,
+                request_latency=latency_s,
+            )
+            if impacts is not None:
+                energy_kwh = float(impacts.energy.value)
+                # ecologits returns gwp in kgCO2eq; convert to grams
+                co2_grams = float(impacts.gwp.value) * 1000.0
+        except Exception as e:
+            logger.warning("ecologits impact calc failed for model=%s: %s", model, e)
+    return {
+        "content": content,
+        "tokens": {
+            "prompt": prompt_tokens,
+            "completion": completion_tokens,
+            "cached": cached_tokens,
+            "total": total_tokens,
+        },
+        "energy_kwh": energy_kwh,
+        "co2_grams": co2_grams,
+        "latency_s": latency_s,
+    }

requirements.txt CHANGED Viewed

@@ -12,3 +12,4 @@ pypdf==4.3.0
 python-multipart==0.0.9
 pydantic==2.9.0
 requests>=2.31.0

 python-multipart==0.0.9
 pydantic==2.9.0
 requests>=2.31.0
+ecologits>=0.5.0