#!/usr/bin/env python3
"""VBVR-Wan2.2 image-to-video inference example.

Generate a video from a reference image using the VBVR-Wan2.2 model.

Usage:
    python inference.py --model_path /path/to/VBVR-Wan2.2
"""
import argparse
import os

import torch
from PIL import Image

from diffusers import AutoencoderKLWan, WanImageToVideoPipeline
from diffusers.utils import export_to_video

# --------------- Configuration (only change model_path) ---------------
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--model_path",
    type=str,
    default="VBVR-Wan2.2",
    help="Path to the VBVR-Wan2.2 model directory",
)
args = parser.parse_args()
model_path = args.model_path
# ----------------------------------------------------------------------

# Paths derived from model_path
image_path = os.path.join(model_path, "assets", "first_frame.png")
output_path = "output.mp4"

# Prompt describing the desired shape-rearrangement motion.
prompt = (
    "The scene contains two types of shapes, each type has three shapes of "
    "different sizes arranged randomly. Keep all shapes unchanged in appearance "
    "(type, size, and color). Only rearrange their positions: first group the "
    "shapes by type, then within each group, sort the shapes from smallest to "
    "largest (left to right), and arrange all shapes in a single horizontal "
    "line from left to right."
)
# NOTE(review): the original literal here was mojibake (UTF-8 bytes decoded
# with the wrong codec) and was broken mid-string across lines. The fragments
# that survived ("JPEG压缩残留", "三条腿", "倒着走") identify it as the
# standard Wan-series default Chinese negative prompt, restored below —
# confirm against the upstream Wan2.2 repository.
negative_prompt = (
    "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，"
    "整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，"
    "画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，"
    "静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走"
)

# Generation settings
num_frames = 96
num_inference_steps = 50
guidance_scale = 5.0
seed = 1

# ------------------------ Load Pipeline ------------------------
print(f"Loading model from: {model_path}")
# VAE is loaded in float32 while the rest of the pipeline runs in bfloat16
# (the Wan VAE is kept at full precision; mixed precision applies only to
# the denoising transformer).
vae = AutoencoderKLWan.from_pretrained(
    model_path, subfolder="vae", torch_dtype=torch.float32
)
pipe = WanImageToVideoPipeline.from_pretrained(
    model_path,
    vae=vae,
    torch_dtype=torch.bfloat16,
)
# Offload idle submodules to CPU between forward passes to reduce VRAM use.
pipe.enable_model_cpu_offload()
print(f"Pipeline loaded. boundary_ratio = {pipe.config.boundary_ratio}")

# ------------------------ Load Image ------------------------
print(f"Loading image: {image_path}")
image = Image.open(image_path).convert("RGB")
# Output video dimensions follow the reference image exactly.
width, height = image.size
print(f"Image size: {width}x{height}")

# ------------------------ Generate Video ------------------------
print(f"Generating video: {num_frames} frames @ {width}x{height}, {num_inference_steps} steps")
# CPU generator with a fixed seed makes the run reproducible across GPUs.
generator = torch.Generator(device="cpu").manual_seed(seed)
output = pipe(
    image=image,
    prompt=prompt,
    negative_prompt=negative_prompt,
    height=height,
    width=width,
    num_frames=num_frames,
    num_inference_steps=num_inference_steps,
    guidance_scale=guidance_scale,
    generator=generator,
)
export_to_video(output.frames[0], output_path, fps=16)
print(f"Video saved to: {output_path}")