File size: 2,094 Bytes
09c368a
 
66544f4
6828d65
565d3eb
2eaadc6
5e4eb8d
09c368a
565d3eb
2eaadc6
 
5e4eb8d
09c368a
 
6828d65
 
565d3eb
6828d65
5e4eb8d
66544f4
09c368a
66544f4
09c368a
6828d65
09c368a
 
 
 
 
 
6828d65
09c368a
 
 
6828d65
 
09c368a
 
2eaadc6
565d3eb
2eaadc6
 
 
 
565d3eb
 
09c368a
565d3eb
6828d65
565d3eb
2eaadc6
6828d65
 
 
 
 
 
5e4eb8d
 
 
 
09c368a
5e4eb8d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import streamlit as st
from PIL import Image
import torch
import easyocr
import numpy as np
import io
from transformers import CLIPModel, CLIPImageProcessor

# βœ… Fix: set_page_config() must be the first Streamlit command
st.set_page_config(page_title="Multimodal AI Assistant", layout="wide")

# ---- Load CLIP Model (Vision Only) ---- #
@st.cache_resource
def load_clip_model():
    """Load and cache the tiny CLIP vision model and its image processor.

    Returns:
        tuple: (CLIPModel, CLIPImageProcessor) for the tiny vision checkpoint.
    """
    checkpoint = "fxmarty/clip-vision-model-tiny"
    # ignore_mismatched_sizes avoids a hard failure when checkpoint head
    # shapes differ from the model config
    clip_model = CLIPModel.from_pretrained(
        checkpoint,
        ignore_mismatched_sizes=True,
    )
    image_processor = CLIPImageProcessor.from_pretrained(checkpoint)
    return clip_model, image_processor

# Instantiate once at import time; st.cache_resource makes Streamlit reruns cheap
model, processor = load_clip_model()

# ---- Load OCR (EasyOCR) ---- #
@st.cache_resource
def load_ocr():
    """Create and cache an English-language EasyOCR reader."""
    ocr_reader = easyocr.Reader(['en'])
    return ocr_reader

# Shared EasyOCR reader; cached across Streamlit reruns via st.cache_resource
reader = load_ocr()

# ---- Streamlit UI ---- #
st.title("🖼️ Multimodal AI Assistant")
st.write("Upload an image and ask a question about it!")

# ---- Upload Image ---- #
uploaded_file = st.file_uploader("📤 Upload an image", type=["jpg", "png", "jpeg"])

if uploaded_file is not None:
    # Normalize to RGB so both EasyOCR and CLIP get 3-channel input
    image = Image.open(uploaded_file).convert("RGB")

    # ✅ Fix: use `use_container_width` instead of `use_column_width`
    st.image(image, caption="Uploaded Image", use_container_width=True)

    # ✅ Convert PIL image to NumPy array for EasyOCR
    image_np = np.array(image)

    with st.spinner("🔍 Extracting text from image..."):
        # detail=0 -> list of recognized strings only (no boxes/confidence)
        extracted_text = reader.readtext(image_np, detail=0)

    st.write("### 📝 Extracted Text:")
    if extracted_text:
        # Fix: join the list of strings so the user sees readable text
        # instead of a Python list repr
        st.success("\n\n".join(extracted_text))
    else:
        st.warning("No readable text found in the image.")

    # ---- Process Image with CLIP Vision Model ---- #
    with st.spinner("🔍 Analyzing image with CLIP Model..."):
        inputs = processor(images=image, return_tensors="pt")
        # Fix: inference only — disable autograd to avoid building an
        # unnecessary computation graph (saves memory)
        with torch.no_grad():
            outputs = model.get_image_features(**inputs)

    st.write("### 🏆 AI Response:")
    st.write("CLIP Model has processed the image! (More features coming soon)")