StrainAIAPP / services /parser.py
Victor Gerardo Rivera
Implement deep OCR with Tesseract and RegEx patterns for lab extraction
4bcd249
import base64
import json
import io
import os
from PIL import Image
from typing import Dict, Any, List, Optional
try:
from transformers import pipeline
HAS_TRANSFORMERS = True
except ImportError:
HAS_TRANSFORMERS = False
print("Warning: transformers not found. Using mock extraction.")
from models import LabReportData, Cannabinoid, Terpene
# Simulation of Hugging Face pipeline for document understanding
# In reality, this would use: pipeline("document-question-answering", model="impira/layoutlm-document-qa")
# Or a multimodal model like Donut: model="naver-clova-ix/donut-base-finetuned-docvqa"
class LabReportParser:
def __init__(self, use_remote_api: bool = False, hf_token: Optional[str] = None):
self.use_remote_api = use_remote_api
self.hf_token = hf_token or os.getenv("HF_TOKEN")
async def extract_data(self, file_content: str, file_name: str) -> LabReportData:
# 1. Prepare Images and OCR Text
raw_text = ""
image_to_process_b64 = None
pil_image = None
try:
import pytesseract
image_data = base64.b64decode(file_content)
if file_name.lower().endswith('.pdf'):
from pdf2image import convert_from_bytes
images = convert_from_bytes(image_data)
if images:
pil_image = images[0]
# Also extract text from all pages to be safe
for img in images[:2]: # First 2 pages
raw_text += pytesseract.image_to_string(img)
else:
pil_image = Image.open(io.BytesIO(image_data))
raw_text = pytesseract.image_to_string(pil_image)
if pil_image:
buffered = io.BytesIO()
pil_image.save(buffered, format="JPEG", quality=95)
image_to_process_b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
except Exception as e:
print(f"OCR/Image Pre-processing Error: {e}")
# 2. Extract Using RegEx (Fast & Reliable for Numbers)
import re
thc_val = 0.0
cbd_val = 0.0
# Look for THC/CBD patterns
thc_match = re.search(r'(?:Total\s*THC|THC\s*Total|Potency)[:\s]*([\d\.]+)', raw_text, re.I)
if thc_match:
try: thc_val = float(thc_match.group(1))
except: pass
cbd_match = re.search(r'(?:Total\s*CBD|CBD\s*Total)[:\s]*([\d\.]+)', raw_text, re.I)
if cbd_match:
try: cbd_val = float(cbd_match.group(1))
except: pass
# 3. Use Inference API for Naming (Context is better there)
strain_name = file_name.replace(".pdf", "").replace(".png", "").replace(".jpg", "").title()
if self.hf_token and image_to_process_b64:
try:
import requests
API_URL = "https://api-inference.huggingface.co/models/impira/layoutlm-document-qa"
headers = {"Authorization": f"Bearer {self.hf_token}"}
payload = {"inputs": {"image": image_to_process_b64, "question": "What is the strain name?"}}
resp = requests.post(API_URL, headers=headers, json=payload).json()
if isinstance(resp, list) and len(resp) > 0:
answer = resp[0].get("answer", "").title()
if len(answer) > 2 and "unknown" not in answer.lower():
strain_name = answer
except: pass
# 4. Final Data Assembly
print(f"DEBUG: OCR Extraction - THC: {thc_val}%, Name: {strain_name}")
# If OCR got data, or we use a smart fallback
return LabReportData(
strain_name=strain_name,
strain_type="Hybrid", # default
cannabinoids=[
Cannabinoid(name="Total THC", value=thc_val, unit="%"),
Cannabinoid(name="Total CBD", value=cbd_val, unit="%")
],
terpenes=[],
file_name=file_name,
confidence=0.8 if thc_val > 0 else 0.5,
source_type="ai_hybrid"
)
def _empty_extraction(self, file_name: str) -> LabReportData:
# Not needed anymore as extract_data is now more robust, but kept for compatibility
return LabReportData(
strain_name=file_name.split(".")[0].title(),
cannabinoids=[Cannabinoid(name="Total THC", value=0.0)],
confidence=0.0,
file_name=file_name,
source_type="error"
)
async def normalize(self, data: Dict[str, Any]) -> LabReportData:
return LabReportData(**data)