import gradio as gr
import spaces
import torch
from loadimg import load_img
from torchvision import transforms
from transformers import AutoModelForImageSegmentation, pipeline
from diffusers import FluxFillPipeline
from PIL import Image, ImageOps

import numpy as np
from simple_lama_inpainting import SimpleLama
from contextlib import contextmanager


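# Temporarily raise float32 matmul precision to "high" (enables TF32 on supported
# GPUs) for faster BiRefNet inference, then restore PyTorch's default "highest".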
@contextmanager
def float32_high_matmul_precision():
    torch.set_float32_matmul_precision("high")
    try:
        yield
    finally:
        torch.set_float32_matmul_precision("highest")


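# FLUX.1-Fill-dev handles both inpainting and outpainting; load it once in bfloat16 on the GPU.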
pipe = FluxFillPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-Fill-dev", torch_dtype=torch.bfloat16
).to("cuda")

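# BiRefNet predicts a foreground saliency mask, used here for background removal.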
birefnet = AutoModelForImageSegmentation.from_pretrained(
    "ZhengPeng7/BiRefNet", trust_remote_code=True
)
birefnet.to("cuda")

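# BiRefNet preprocessing: resize to 1024x1024 and normalize with ImageNet statistics.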
transform_image = transforms.Compose(
    [
        transforms.Resize((1024, 1024)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]
)


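# Pad the image with a white border and build the matching outpainting mask:
# black over the original pixels (keep), white over the padded area (fill).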
def prepare_image_and_mask(
    image,
    padding_top=0,
    padding_bottom=0,
    padding_left=0,
    padding_right=0,
):
    image = load_img(image).convert("RGB")

    background = ImageOps.expand(
        image,
        border=(padding_left, padding_top, padding_right, padding_bottom),
        fill="white",
    )
    mask = Image.new("RGB", image.size, "black")
    mask = ImageOps.expand(
        mask,
        border=(padding_left, padding_top, padding_right, padding_bottom),
        fill="white",
    )
    return background, mask


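# Outpaint: extend the image into the padded area with FLUX.1-Fill, guided by the prompt.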
def outpaint(
    image,
    padding_top=0,
    padding_bottom=0,
    padding_left=0,
    padding_right=0,
    prompt="",
    num_inference_steps=28,
    guidance_scale=50,
):
    background, mask = prepare_image_and_mask(
        image, padding_top, padding_bottom, padding_left, padding_right
    )

    result = pipe(
        prompt=prompt,
        height=background.height,
        width=background.width,
        image=background,
        mask_image=mask,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
    ).images[0]

    result = result.convert("RGBA")

    return result


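# Inpaint: regenerate the masked region with FLUX.1-Fill, guided by the prompt.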
def inpaint(
    image,
    mask,
    prompt="",
    num_inference_steps=28,
    guidance_scale=50,
):
    background = image.convert("RGB")
    mask = mask.convert("L")

    result = pipe(
        prompt=prompt,
        height=background.height,
        width=background.width,
        image=background,
        mask_image=mask,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
    ).images[0]

    result = result.convert("RGBA")

    return result


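# Remove the background: predict a BiRefNet alpha mask and attach it to the image.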
def rmbg(image=None, url=None):
    if image is None:
        image = url
    image = load_img(image).convert("RGB")
    image_size = image.size
    input_images = transform_image(image).unsqueeze(0).to("cuda")
    with float32_high_matmul_precision():
        with torch.no_grad():
            preds = birefnet(input_images)[-1].sigmoid().cpu()
    pred = preds[0].squeeze()
    pred_pil = transforms.ToPILImage()(pred)
    mask = pred_pil.resize(image_size)
    image.putalpha(mask)
    return image


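# Erase: remove the masked region with LaMa inpainting (simple-lama-inpainting).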
def erase(image=None, mask=None):
    simple_lama = SimpleLama()
    image = load_img(image)
    mask = load_img(mask).convert("L")
    return simple_lama(image, mask)


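# Whisper large-v3 ASR pipeline; 30-second chunking allows long audio files.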
whisper = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-large-v3",
    chunk_length_s=30,
    device="cuda" if torch.cuda.is_available() else "cpu",
)


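# Transcribe (or translate to English) an uploaded audio file with Whisper.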
def transcribe(audio, task="transcribe"):
    if audio is None:
        raise gr.Error("No audio file submitted!")

    text = whisper(
        audio, batch_size=8, generate_kwargs={"task": task}, return_timestamps=True
    )["text"]
    return text


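# Single GPU-decorated entry point: the first argument (api_num) selects which
# utility handles the request, so every tab can share one @spaces.GPU allocation.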
@spaces.GPU(duration=120)
def main(*args):
    api_num = args[0]
    args = args[1:]
    if api_num == 1:
        return rmbg(*args)
    elif api_num == 2:
        return outpaint(*args)
    elif api_num == 3:
        return inpaint(*args)
    elif api_num == 5:
        return erase(*args)
    elif api_num == 6:
        return transcribe(*args)


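# Gradio UI: one Interface per utility; the non-interactive Number input pins the
# api_num that routes the request inside main().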
rmbg_tab = gr.Interface(
    fn=main,
    inputs=[
        gr.Number(1, interactive=False),
        "image",
        gr.Text("", label="url"),
    ],
    outputs=["image"],
    api_name="rmbg",
    examples=[[1, "./assets/Inpainting mask.png", ""]],
    cache_examples=False,
    description="Pass an image or the URL of an image.",
)

outpaint_tab = gr.Interface(
    fn=main,
    inputs=[
        gr.Number(2, interactive=False),
        gr.Image(label="image", type="pil"),
        gr.Number(label="padding top"),
        gr.Number(label="padding bottom"),
        gr.Number(label="padding left"),
        gr.Number(label="padding right"),
        gr.Text(label="prompt"),
        gr.Number(value=50, label="num_inference_steps"),
        gr.Number(value=28, label="guidance_scale"),
    ],
    outputs=["image"],
    api_name="outpainting",
    examples=[[2, "./assets/rocket.png", 100, 0, 0, 0, "", 50, 28]],
    cache_examples=False,
)

inpaint_tab = gr.Interface(
    fn=main,
    inputs=[
        gr.Number(3, interactive=False),
        gr.Image(label="image", type="pil"),
        gr.Image(label="mask", type="pil"),
        gr.Text(label="prompt"),
        gr.Number(value=50, label="num_inference_steps"),
        gr.Number(value=28, label="guidance_scale"),
    ],
    outputs=["image"],
    api_name="inpaint",
    examples=[[3, "./assets/rocket.png", "./assets/Inpainting mask.png", "", 50, 28]],
    cache_examples=False,
    description="It is recommended to use https://github.com/la-voliere/react-mask-editor when creating an image mask in JS, and to invert the mask before sending it to this Space.",
)

erase_tab = gr.Interface(
    fn=main,
    inputs=[
        gr.Number(5, interactive=False),
        gr.Image(type="pil"),
        gr.Image(type="pil"),
    ],
    outputs=gr.Image(),
    examples=[
        [
            5,
            "./assets/rocket.png",
            "./assets/Inpainting mask.png",
        ]
    ],
    api_name="erase",
    cache_examples=False,
)

transcribe_tab = gr.Interface(
    fn=main,
    inputs=[
        gr.Number(6, interactive=False),
        gr.Audio(type="filepath"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
    ],
    outputs="text",
    api_name="transcribe",
    description="Upload an audio file to extract text using Whisper Large V3.",
)

demo = gr.TabbedInterface(
    [
        rmbg_tab,
        outpaint_tab,
        inpaint_tab,
        erase_tab,
        transcribe_tab,
    ],
    [
        "remove background",
        "outpainting",
        "inpainting",
        "erase",
        "transcribe",
    ],
    title="Utilities that require GPU",
)

demo.launch()