Instructions to use jbilcke-hf/CogVideoX-Fun-V1.5-5b-for-InferenceEndpoints with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Diffusers
How to use jbilcke-hf/CogVideoX-Fun-V1.5-5b-for-InferenceEndpoints with Diffusers:
pip install -U diffusers transformers accelerate
import torch from diffusers import DiffusionPipeline # switch to "mps" for apple devices pipe = DiffusionPipeline.from_pretrained("jbilcke-hf/CogVideoX-Fun-V1.5-5b-for-InferenceEndpoints", dtype=torch.bfloat16, device_map="cuda") prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k" image = pipe(prompt).images[0] - Notebooks
- Google Colab
- Kaggle
| from typing import Dict, Any, Union, List | |
| import torch | |
| from diffusers import ( | |
| CogVideoXPipeline, | |
| CogVideoXDPMScheduler, | |
| CogVideoXVideoToVideoPipeline, | |
| CogVideoXImageToVideoPipeline | |
| ) | |
| from diffusers.utils import load_video, load_image | |
| from PIL import Image | |
| import base64 | |
| import io | |
| import numpy as np | |
| class EndpointHandler: | |
| def __init__(self, path: str = ""): | |
| """Initialize the CogVideoX pipeline. | |
| Args: | |
| path (str): Path to the model weights | |
| """ | |
| # Initialize pipeline with bfloat16 for optimal performance as recommended in docs | |
| self.pipe = CogVideoXPipeline.from_pretrained( | |
| path or "jbilcke-hf/CogVideoX-Fun-V1.5-5b-for-InferenceEndpoints", | |
| torch_dtype=torch.bfloat16 | |
| ).to("cuda") | |
| # Set up the scheduler with trailing timesteps as shown in example | |
| self.pipe.scheduler = CogVideoXDPMScheduler.from_config( | |
| self.pipe.scheduler.config, | |
| timestep_spacing="trailing" | |
| ) | |
| # those two pipelines - generated by Claude - are interesting, but loading it all at once is too much. | |
| # # Initialize video-to-video pipeline | |
| # self.pipe_video = CogVideoXVideoToVideoPipeline.from_pretrained( | |
| # path or "jbilcke-hf/CogVideoX-Fun-V1.5-5b-for-InferenceEndpoints", | |
| # transformer=self.pipe.transformer, | |
| # vae=self.pipe.vae, | |
| # scheduler=self.pipe.scheduler, | |
| # tokenizer=self.pipe.tokenizer, | |
| # text_encoder=self.pipe.text_encoder, | |
| # torch_dtype=torch.bfloat16 | |
| # ).to("cuda") | |
| # | |
| # # Initialize image-to-video pipeline | |
| # self.pipe_image = CogVideoXImageToVideoPipeline.from_pretrained( | |
| # path or "THUDM/CogVideoX1.5-5B-I2V", | |
| # vae=self.pipe.vae, | |
| # scheduler=self.pipe.scheduler, | |
| # tokenizer=self.pipe.tokenizer, | |
| # text_encoder=self.pipe.text_encoder, | |
| # torch_dtype=torch.bfloat16 | |
| # ).to("cuda") | |
| def _decode_base64_to_image(self, base64_string: str) -> Image.Image: | |
| """Convert base64 string to PIL Image.""" | |
| image_data = base64.b64decode(base64_string) | |
| image = Image.open(io.BytesIO(image_data)) | |
| return image | |
| def _encode_video_to_base64(self, video_frames: List[np.ndarray]) -> str: | |
| """Convert video frames to base64 string.""" | |
| # Convert frames to a video file in memory | |
| import imageio | |
| output_bytes = io.BytesIO() | |
| imageio.mimsave(output_bytes, video_frames, format='mp4', fps=8) | |
| return base64.b64encode(output_bytes.getvalue()).decode('utf-8') | |
| def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: | |
| """Process the input data and generate video using CogVideoX. | |
| Args: | |
| data (Dict[str, Any]): Input data containing: | |
| - prompt (str): Text prompt for generation | |
| - image (str, optional): Base64 encoded image for image-to-video | |
| - video (str, optional): Base64 encoded video for video-to-video | |
| - num_inference_steps (int, optional): Number of inference steps | |
| - guidance_scale (float, optional): Guidance scale for generation | |
| Returns: | |
| Dict[str, Any]: Generated video as base64 string | |
| """ | |
| # Extract parameters from input | |
| prompt = data.get("prompt", "") | |
| num_inference_steps = data.get("num_inference_steps", 50) | |
| guidance_scale = data.get("guidance_scale", 7.0) | |
| # Set up generation parameters | |
| generation_kwargs = { | |
| "prompt": prompt, | |
| "num_inference_steps": num_inference_steps, | |
| "guidance_scale": guidance_scale, | |
| "num_videos_per_prompt": 1, | |
| "use_dynamic_cfg": True, | |
| "output_type": "np", # Get numpy array output | |
| } | |
| # Handle different input types | |
| if "image" in data: | |
| # Image to video generation | |
| input_image = self._decode_base64_to_image(data["image"]) | |
| input_image = input_image.resize((720, 480)) # Resize as per example | |
| image = load_image(input_image) | |
| #raise ValueError("image to video isn't supported yet (takes up too much RAM right now)") | |
| return {"error": "Image to video generation not yet supported"} | |
| #video_frames = self.pipe_image( | |
| # image=image, | |
| # **generation_kwargs | |
| #).frames[0] | |
| elif "video" in data: | |
| # Video to video generation | |
| # TODO: Implement video loading from base64 | |
| # For now, returning error | |
| return {"error": "Video to video generation not yet supported"} | |
| else: | |
| # Text to video generation | |
| generation_kwargs["num_frames"] = 49 # As per example | |
| video_frames = self.pipe(**generation_kwargs).frames[0] | |
| # Convert output to base64 | |
| video_base64 = self._encode_video_to_base64(video_frames) | |
| return { | |
| "video": video_base64 | |
| } | |
| def cleanup(self): | |
| """Cleanup the model and free GPU memory.""" | |
| # Move models to CPU to free GPU memory | |
| self.pipe.to("cpu") | |
| #self.pipe_video.to("cpu") | |
| #self.pipe_image.to("cpu") | |
| # Clear CUDA cache | |
| torch.cuda.empty_cache() |