Spaces:

VicGerardoPR
/

StrainAIAPP

Sleeping

Victor Gerardo Rivera

Implement deep OCR with Tesseract and RegEx patterns for lab extraction

4bcd249 9 days ago

4.68 kB

	import base64
	import json
	import io
	import os
	from PIL import Image
	from typing import Dict, Any, List, Optional
	try:
	from transformers import pipeline
	HAS_TRANSFORMERS = True
	except ImportError:
	HAS_TRANSFORMERS = False
	print("Warning: transformers not found. Using mock extraction.")
	from models import LabReportData, Cannabinoid, Terpene

	# Simulation of Hugging Face pipeline for document understanding
	# In reality, this would use: pipeline("document-question-answering", model="impira/layoutlm-document-qa")
	# Or a multimodal model like Donut: model="naver-clova-ix/donut-base-finetuned-docvqa"

	class LabReportParser:
	    """Extract cannabinoid data and a strain name from an uploaded lab report.

	    The report arrives as a base64 string (PDF or image). Text is obtained
	    with Tesseract OCR and scanned with regular expressions for total THC
	    and CBD percentages. When a Hugging Face token is available, the
	    document-QA Inference API is additionally asked for the strain name;
	    otherwise the name is derived from the file name.
	    """

	    # Labels may be separated by any run of whitespace (OCR output is noisy).
	    # The value is a plain decimal so that a sentence-ending period after the
	    # number ("24.5.") is not swallowed into the capture group.
	    _THC_PATTERN = r"(?:Total\s+THC|THC\s+Total|Potency)[:\s]*(\d+(?:\.\d+)?|\.\d+)"
	    _CBD_PATTERN = r"(?:Total\s+CBD|CBD\s+Total)[:\s]*(\d+(?:\.\d+)?|\.\d+)"

	    # Trailing extensions removed when turning a file name into a strain name.
	    _DOC_EXTENSIONS = (
	        ".pdf", ".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".gif", ".webp",
	    )

	    _MAX_OCR_PAGES = 2  # only the first pages of a PDF are rasterised and OCRed
	    _API_URL = "https://api-inference.huggingface.co/models/impira/layoutlm-document-qa"
	    _API_TIMEOUT_SECONDS = 30  # never let a stalled HTTP call hang the request

	    def __init__(self, use_remote_api: bool = False, hf_token: Optional[str] = None):
	        """Store configuration.

	        Args:
	            use_remote_api: Stored on the instance. No method of this class
	                reads it; the remote lookup is driven by ``hf_token``.
	            hf_token: Hugging Face API token. Falls back to the ``HF_TOKEN``
	                environment variable when omitted or empty.
	        """
	        self.use_remote_api = use_remote_api
	        self.hf_token = hf_token or os.getenv("HF_TOKEN")

	    async def extract_data(self, file_content: str, file_name: str) -> "LabReportData":
	        """Build a ``LabReportData`` from a base64-encoded PDF or image.

	        Every stage is best-effort: a failure is printed and the stage's
	        result falls back to a default (empty text, 0.0 %, file-name-based
	        strain name), so this method itself does not raise for unreadable
	        input.

	        Note: OCR and the HTTP request are synchronous calls and are not
	        awaited, so they run on the caller's event loop thread.

	        Args:
	            file_content: Base64-encoded bytes of the uploaded file.
	            file_name: Original file name; a ``.pdf`` suffix (any case)
	                selects the PDF path, anything else is opened as an image.

	        Returns:
	            A ``LabReportData`` with "Total THC" and "Total CBD" entries.
	            ``confidence`` is 0.8 when a THC value above zero was found and
	            0.5 otherwise.
	        """
	        # 1. Prepare images and OCR text.
	        pages = self._load_pages(file_content, file_name)
	        raw_text = self._ocr_pages(pages)
	        image_b64 = self._encode_jpeg(pages[0]) if pages else None

	        # 2. Extract the numbers with regular expressions.
	        thc_val = self._extract_percent(self._THC_PATTERN, raw_text)
	        cbd_val = self._extract_percent(self._CBD_PATTERN, raw_text)

	        # 3. Ask the Inference API for the name; keep the file-name guess
	        #    whenever the API is unavailable or gives no usable answer.
	        strain_name = self._name_from_file(file_name)
	        if self.hf_token and image_b64:
	            strain_name = self._query_strain_name(image_b64) or strain_name

	        # 4. Final data assembly.
	        print(f"DEBUG: OCR Extraction - THC: {thc_val}%, Name: {strain_name}")

	        return LabReportData(
	            strain_name=strain_name,
	            strain_type="Hybrid",  # default
	            cannabinoids=[
	                Cannabinoid(name="Total THC", value=thc_val, unit="%"),
	                Cannabinoid(name="Total CBD", value=cbd_val, unit="%"),
	            ],
	            terpenes=[],
	            file_name=file_name,
	            confidence=0.8 if thc_val > 0 else 0.5,
	            source_type="ai_hybrid",
	        )

	    def _load_pages(self, file_content: str, file_name: str) -> List[Any]:
	        """Decode the upload into a list of PIL images (empty on failure)."""
	        try:
	            image_data = base64.b64decode(file_content)
	            if file_name.lower().endswith(".pdf"):
	                from pdf2image import convert_from_bytes

	                # Rasterise only the pages that will actually be used.
	                return list(
	                    convert_from_bytes(image_data, last_page=self._MAX_OCR_PAGES)
	                )
	            return [Image.open(io.BytesIO(image_data))]
	        except Exception as e:  # best-effort boundary: report and continue
	            print(f"OCR/Image Pre-processing Error: {e}")
	            return []

	    def _ocr_pages(self, pages: List[Any]) -> str:
	        """Run Tesseract over the given pages and return the concatenated text."""
	        text = ""
	        try:
	            import pytesseract

	            for page in pages[: self._MAX_OCR_PAGES]:
	                # Accumulate page by page so text already recognised survives
	                # a failure on a later page.
	                text += pytesseract.image_to_string(page)
	        except Exception as e:  # best-effort boundary: report and continue
	            print(f"OCR/Image Pre-processing Error: {e}")
	        return text

	    def _encode_jpeg(self, image: Any) -> Optional[str]:
	        """Return the image as a base64 JPEG string, or None if encoding fails."""
	        try:
	            # JPEG cannot store alpha or palette modes (e.g. RGBA/P PNGs);
	            # without this conversion the save below raises for such images.
	            if image.mode not in ("RGB", "L"):
	                image = image.convert("RGB")
	            buffered = io.BytesIO()
	            image.save(buffered, format="JPEG", quality=95)
	            return base64.b64encode(buffered.getvalue()).decode("utf-8")
	        except Exception as e:  # best-effort boundary: report and continue
	            print(f"OCR/Image Pre-processing Error: {e}")
	            return None

	    @staticmethod
	    def _extract_percent(pattern: str, text: str) -> float:
	        """Return the first number captured by ``pattern`` in ``text``, else 0.0."""
	        import re

	        match = re.search(pattern, text or "", re.I)
	        if match is None:
	            return 0.0
	        try:
	            return float(match.group(1))
	        except ValueError:
	            return 0.0

	    @classmethod
	    def _name_from_file(cls, file_name: str) -> str:
	        """Derive a title-cased strain name from a file name.

	        Only a trailing, known document/image extension is removed (case
	        insensitively); dots elsewhere in the name are preserved.
	        """
	        stem, ext = os.path.splitext(file_name)
	        if stem and ext.lower() in cls._DOC_EXTENSIONS:
	            return stem.title()
	        return file_name.title()

	    def _query_strain_name(self, image_b64: str) -> Optional[str]:
	        """Ask the document-QA endpoint for the strain name.

	        Returns the title-cased answer, or None when the request fails or the
	        answer is too short or contains "unknown".
	        """
	        try:
	            import requests

	            headers = {"Authorization": f"Bearer {self.hf_token}"}
	            payload = {
	                "inputs": {
	                    "image": image_b64,
	                    "question": "What is the strain name?",
	                }
	            }
	            resp = requests.post(
	                self._API_URL,
	                headers=headers,
	                json=payload,
	                timeout=self._API_TIMEOUT_SECONDS,
	            ).json()
	        except Exception as e:  # network, HTTP or JSON failure: keep fallback
	            print(f"Strain name lookup failed: {e}")
	            return None

	        # Anything other than a non-empty list of dicts (for example an error
	        # object) carries no answer.
	        if not (isinstance(resp, list) and resp and isinstance(resp[0], dict)):
	            return None
	        answer = str(resp[0].get("answer") or "").strip().title()
	        if len(answer) > 2 and "unknown" not in answer.lower():
	            return answer
	        return None

	    def _empty_extraction(self, file_name: str) -> "LabReportData":
	        """Return a zero-confidence placeholder result for ``file_name``."""
	        # Not needed anymore as extract_data is now more robust, but kept for compatibility
	        return LabReportData(
	            strain_name=self._name_from_file(file_name),
	            cannabinoids=[Cannabinoid(name="Total THC", value=0.0)],
	            confidence=0.0,
	            file_name=file_name,
	            source_type="error",
	        )

	    async def normalize(self, data: Dict[str, Any]) -> "LabReportData":
	        """Construct a ``LabReportData`` by passing ``data`` as keyword arguments."""
	        return LabReportData(**data)