File size: 2,995 Bytes
e5b2062
 
 
 
 
 
39d368e
 
 
4acb49a
 
 
 
 
e5b2062
 
 
39d368e
e5b2062
 
 
4acb49a
e5b2062
 
39d368e
4acb49a
 
e5b2062
 
4acb49a
 
 
 
e5b2062
39d368e
4acb49a
e5b2062
39d368e
4acb49a
 
 
e5b2062
 
 
4acb49a
e5b2062
 
4acb49a
 
 
 
 
 
 
 
 
 
 
e5b2062
4acb49a
 
 
 
 
 
 
 
e5b2062
4acb49a
e5b2062
4acb49a
e5b2062
4acb49a
e5b2062
 
 
4acb49a
 
 
e5b2062
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import torch
import gradio as gr
from PIL import Image
import whisper
from transformers import AutoProcessor, AutoModelForImageTextToText


# Load the Janus-Pro multimodal processor/model pair from the Hub.
# NOTE(review): trust_remote_code=True executes code shipped with the
# checkpoint repo — acceptable only because the repo id is pinned.
processor = AutoProcessor.from_pretrained("deepseek-community/Janus-Pro-1B", trust_remote_code=True)
model = AutoModelForImageTextToText.from_pretrained("deepseek-community/Janus-Pro-1B", trust_remote_code=True)


# Prefer GPU when available; the *_to_prompt helpers move their inputs
# to this same device before calling model.generate.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Whisper "base" checkpoint, used only for speech-to-text transcription.
whisper_model = whisper.load_model("base")

def build_instruction(user_text):
    """Wrap *user_text* in the fixed prompt-engineering instruction template."""
    header = (
        "You are a professional AI prompt engineer. "
        "Convert the input into a highly detailed AI generation prompt. "
        "Include: Subject, Environment, Summary."
    )
    return header + " Input: " + user_text + "\nReturn only the final prompt."

def text_to_prompt(user_text):
    """Rewrite *user_text* into a detailed generation prompt with Janus-Pro.

    Args:
        user_text: Free-form idea text supplied by the user.

    Returns:
        Only the newly generated continuation, with the echoed instruction
        sliced off by token position and surrounding whitespace stripped.
    """
    instruction = build_instruction(user_text)
    inputs = processor(text=instruction, return_tensors="pt").to(device)

    # Length of the echoed instruction; everything after it is new text.
    input_len = inputs.input_ids.shape[1]

    # inference_mode: skip autograd bookkeeping — generation is inference only,
    # and tracking gradients here wastes memory for no benefit.
    with torch.inference_mode():
        output = model.generate(**inputs, max_new_tokens=200)
    return processor.decode(output[0][input_len:], skip_special_tokens=True).strip()

def image_text_to_prompt(image_path, user_text):
    """Build a generation prompt from an image plus optional guiding text.

    Args:
        image_path: Filesystem path to the uploaded image (from Gradio).
        user_text: Optional guidance; defaults to a generic describe request.

    Returns:
        The model's generated prompt text, stripped of whitespace.

    Raises:
        ValueError: If no image was uploaded — previously this fell through
            to PIL with an opaque error; the message now surfaces cleanly
            via generate_prompt_ui's error handler.
    """
    if not image_path:
        raise ValueError("No image provided — please upload an image.")
    if not user_text:
        user_text = "Describe this image in detail."

    # Context manager closes the underlying file handle; convert("RGB")
    # returns an independent copy that is safe to use afterwards.
    with Image.open(image_path) as img:
        image = img.convert("RGB")
    instruction = build_instruction(user_text)

    inputs = processor(images=[image], text=instruction, return_tensors="pt").to(device)
    # Length of the echoed instruction; everything after it is new text.
    input_len = inputs.input_ids.shape[1]

    # inference_mode: generation needs no autograd state.
    with torch.inference_mode():
        output = model.generate(**inputs, max_new_tokens=200)

    return processor.decode(output[0][input_len:], skip_special_tokens=True).strip()

def audio_to_prompt(audio_path):
    """Transcribe the audio file with Whisper, then prompt-ify the transcript."""
    transcription = whisper_model.transcribe(audio_path)
    return text_to_prompt(transcription["text"])

def generate_prompt_ui(input_type, text, image, audio):
    """Dispatch the selected input modality to the matching prompt builder.

    Args:
        input_type: One of "Text", "Image + Text", or "Audio" (radio value).
        text: Text input (used by the first two modes).
        image: Image file path, or None.
        audio: Audio file path, or None.

    Returns:
        The generated prompt, or an "Error: ..." string — this runs at the
        UI boundary, so failures are shown in the output box, never raised.
    """
    try:
        if input_type == "Text":
            return text_to_prompt(text)
        elif input_type == "Image + Text":
            return image_text_to_prompt(image, text)
        elif input_type == "Audio":
            return audio_to_prompt(audio)
        # Previously fell through and returned None, leaving the output
        # textbox silently blank for an unrecognized radio value.
        return f"Error: unsupported input type '{input_type}'"
    except Exception as e:  # UI boundary: surface the message, don't crash
        return f"Error: {str(e)}"

# Gradio UI setup: one radio selector swaps which input widgets are visible,
# and a single button routes everything through generate_prompt_ui.
with gr.Blocks() as app:
    gr.Markdown("# 🧠 Janus-Pro Prompt Generator")
    
    input_type = gr.Radio(["Text", "Image + Text", "Audio"], label="Select Input Type", value="Text")
    
    # Text starts visible (matches the radio's default value "Text");
    # image/audio widgets are revealed by toggle() below.
    text_input = gr.Textbox(label="Enter your idea")
    image_input = gr.Image(type="filepath", label="Upload Image", visible=False)
    audio_input = gr.Audio(type="filepath", label="Upload Audio", visible=False)
    
    output = gr.Textbox(label="Generated Prompt")
    btn = gr.Button("Generate 🚀")

    def toggle(choice):
        # Visibility updates, ordered to match [text_input, image_input,
        # audio_input] in the change() call below. Text stays visible for
        # both "Text" and "Image + Text" (it doubles as image guidance).
        return (
            gr.update(visible=(choice != "Audio")),
            gr.update(visible=(choice == "Image + Text")),
            gr.update(visible=(choice == "Audio"))
        )

    input_type.change(toggle, input_type, [text_input, image_input, audio_input])
    btn.click(generate_prompt_ui, [input_type, text_input, image_input, audio_input], output)

# Blocking call: starts the local web server for the UI.
app.launch()