import streamlit as st
from PIL import Image
import torch
import easyocr
import numpy as np
import io
from transformers import CLIPModel, CLIPImageProcessor
# NOTE: set_page_config() must be the very first Streamlit command executed;
# calling any other st.* function first raises StreamlitAPIException.
st.set_page_config(page_title="Multimodal AI Assistant", layout="wide")
# ---- Load CLIP Model (Vision Only) ---- #
@st.cache_resource
def load_clip_model():
    """Load and cache the tiny CLIP vision model and its image processor.

    Returns:
        tuple: (CLIPModel, CLIPImageProcessor) ready for inference.
    """
    model = CLIPModel.from_pretrained(
        "fxmarty/clip-vision-model-tiny",
        # The tiny checkpoint's weight shapes differ from the config defaults;
        # without this flag from_pretrained raises a size-mismatch error.
        ignore_mismatched_sizes=True,
    )
    processor = CLIPImageProcessor.from_pretrained("fxmarty/clip-vision-model-tiny")
    return model, processor


model, processor = load_clip_model()
# ---- Load OCR (EasyOCR) ---- #
@st.cache_resource
def load_ocr():
    """Create and cache an English-language EasyOCR reader."""
    ocr_reader = easyocr.Reader(['en'])
    return ocr_reader


reader = load_ocr()
# ---- Streamlit UI ---- #
# NOTE(review): the original emoji literals were mojibake (mis-encoded UTF-8);
# restored to plausible emoji — confirm against the intended UI text.
st.title("🖼️ Multimodal AI Assistant")
st.write("Upload an image and ask a question about it!")

# ---- Upload Image ---- #
uploaded_file = st.file_uploader("📤 Upload an image", type=["jpg", "png", "jpeg"])
if uploaded_file is not None:
    # Convert the uploaded file into an RGB PIL image.
    image = Image.open(uploaded_file).convert("RGB")

    # use_container_width replaces the deprecated use_column_width argument.
    st.image(image, caption="Uploaded Image", use_container_width=True)

    # EasyOCR accepts a NumPy array (or path/bytes), not a PIL Image object.
    image_np = np.array(image)

    # NOTE(review): spinner/header emoji were mojibake in the original;
    # restored to plausible emoji — confirm against the intended UI text.
    with st.spinner("🔍 Extracting text from image..."):
        # detail=0 returns only the recognized strings (no boxes/confidences).
        extracted_text = reader.readtext(image_np, detail=0)

    st.write("### 📜 Extracted Text:")
    if extracted_text:
        st.success(extracted_text)
    else:
        st.warning("No readable text found in the image.")

    # ---- Process Image with CLIP Vision Model ---- #
    with st.spinner("🔍 Analyzing image with CLIP Model..."):
        inputs = processor(images=image, return_tensors="pt")
        outputs = model.get_image_features(**inputs)

    st.write("### 🤖 AI Response:")
    st.write("CLIP Model has processed the image! (More features coming soon)")