| { |
| "manifest_version": 1, |
| "updated_at": "2026-04-24T15:44:51Z", |
| "min_app_version": "1.0", |
| "categories": [ |
| { |
| "id": "llm", |
| "name": "Large Language Models", |
| "icon": "bubble.left.and.text.bubble.right", |
| "order": 0 |
| }, |
| { |
| "id": "segmentation", |
| "name": "Segmentation", |
| "icon": "person.and.background.dotted", |
| "order": 1 |
| }, |
| { |
| "id": "enhancement", |
| "name": "Image Enhancement", |
| "icon": "wand.and.stars", |
| "order": 2 |
| }, |
| { |
| "id": "detection", |
| "name": "Object Detection", |
| "icon": "viewfinder", |
| "order": 3 |
| }, |
| { |
| "id": "depth", |
| "name": "Depth & Geometry", |
| "icon": "cube.transparent", |
| "order": 4 |
| }, |
| { |
| "id": "vision_language", |
| "name": "Vision-Language", |
| "icon": "text.viewfinder", |
| "order": 5 |
| }, |
| { |
| "id": "face", |
| "name": "Face Processing", |
| "icon": "face.smiling", |
| "order": 6 |
| }, |
| { |
| "id": "generation", |
| "name": "Image Generation", |
| "icon": "sparkles", |
| "order": 7 |
| }, |
| { |
| "id": "video", |
| "name": "Video Processing", |
| "icon": "film", |
| "order": 8 |
| }, |
| { |
| "id": "audio", |
| "name": "Audio Processing", |
| "icon": "waveform.circle", |
| "order": 9 |
| }, |
| { |
| "id": "speech", |
| "name": "Speech & Music", |
| "icon": "music.note", |
| "order": 10 |
| }, |
| { |
| "id": "inpainting", |
| "name": "Inpainting", |
| "icon": "eraser", |
| "order": 11 |
| }, |
| { |
| "id": "restoration", |
| "name": "Face Restoration", |
| "icon": "face.smiling.inverse", |
| "order": 12 |
| } |
| ], |
| "models": [ |
| { |
| "id": "gemma4_e2b", |
| "name": "Gemma 4 E2B", |
| "subtitle": "Google DeepMind, 2025", |
| "category_id": "llm", |
| "description_md": "Google's on-device multimodal LLM (Gemma 3n E2B, 2.3B effective params with Per-Layer Embeddings). **Text + image + audio + video** input, streaming text output. On Apple Neural Engine: ~31 tok/s (4-chunk default) / ~34 tok/s (3-chunk, `LLM_3CHUNK=1`) on iPhone 17 Pro. Native 384x384 vision encoder (64 tokens/frame) handles video; 12-layer Conformer encoder handles audio. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.4.", |
| "thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/gemma4.jpg", |
| "demo": { |
| "template": "chat", |
| "config": { |
| "max_tokens": 1024, |
| "multimodal": true |
| } |
| }, |
| "files": [ |
| { |
| "name": "gemma4-e2b-coreml.zip", |
| "url": "https://huggingface.co/mlboydaisuke/gemma-4-E2B-coreml/resolve/main/gemma4-e2b-coreml.zip", |
| "archive": "zip", |
| "size_bytes": 3100000000, |
| "sha256": "TODO", |
| "compute_units": "cpuAndNeuralEngine", |
| "kind": "model" |
| } |
| ], |
| "requirements": { |
| "min_ios": "18.0", |
| "min_ram_mb": 1500, |
| "device_capabilities": [ |
| "arm64" |
| ] |
| }, |
| "license": { |
| "name": "Gemma", |
| "url": "https://ai.google.dev/gemma/terms" |
| }, |
| "upstream": { |
| "name": "google/gemma-3n-E2B-it", |
| "url": "https://huggingface.co/google/gemma-3n-E2B-it", |
| "year": 2025 |
| } |
| }, |
| { |
| "id": "gemma4_e4b", |
| "name": "Gemma 4 E4B", |
| "subtitle": "Google DeepMind, 2025", |
| "category_id": "llm", |
| "description_md": "Larger Gemma 4 variant: 42-layer text decoder, ~4B effective params, 100% ANE-resident. Text-only (no vision/audio). ~14 tok/s decode on iPhone 17 Pro at 2048-token context. Use when you want maximum text quality and have the storage budget. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.4.", |
| "thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/gemma4.jpg", |
| "demo": { |
| "template": "chat", |
| "config": { |
| "max_tokens": 1024, |
| "multimodal": false |
| } |
| }, |
| "files": [ |
| { |
| "name": "gemma4-e4b-coreml.zip", |
| "url": "https://huggingface.co/mlboydaisuke/gemma-4-E4B-coreml/resolve/main/gemma4-e4b-coreml.zip", |
| "archive": "zip", |
| "size_bytes": 5500000000, |
| "sha256": "TODO", |
| "compute_units": "cpuAndNeuralEngine", |
| "kind": "model" |
| } |
| ], |
| "requirements": { |
| "min_ios": "18.0", |
| "min_ram_mb": 3000, |
| "device_capabilities": [ |
| "arm64" |
| ] |
| }, |
| "license": { |
| "name": "Gemma", |
| "url": "https://ai.google.dev/gemma/terms" |
| }, |
| "upstream": { |
| "name": "google/gemma-3n-E4B-it", |
| "url": "https://huggingface.co/google/gemma-3n-E4B-it", |
| "year": 2025 |
| } |
| }, |
| { |
| "id": "qwen3.5-2b", |
| "name": "Qwen3.5 2B", |
| "subtitle": "Alibaba Qwen, 2025", |
| "category_id": "llm", |
| "description_md": "Hybrid Gated-DeltaNet SSM + attention architecture, shipped as 4 INT8 chunks to fit the iPhone ANE single-mlprogram budget. ~17 tok/s decode on iPhone 17 Pro with ~200 MB RSS — exceptional memory efficiency for a 2B-param model. Text-only. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.4.", |
| "thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/qwen3_5.jpg", |
| "demo": { |
| "template": "chat", |
| "config": { |
| "max_tokens": 1024, |
| "multimodal": false |
| } |
| }, |
| "files": [ |
| { |
| "name": "qwen3.5-2B-CoreML.zip", |
| "url": "https://huggingface.co/mlboydaisuke/qwen3.5-2B-CoreML/resolve/main/qwen3.5-2B-CoreML.zip", |
| "archive": "zip", |
| "size_bytes": 2400000000, |
| "sha256": "TODO", |
| "compute_units": "cpuAndNeuralEngine", |
| "kind": "model" |
| } |
| ], |
| "requirements": { |
| "min_ios": "18.0", |
| "min_ram_mb": 1200, |
| "device_capabilities": [ |
| "arm64" |
| ] |
| }, |
| "license": { |
| "name": "Apache-2.0", |
| "url": "https://www.apache.org/licenses/LICENSE-2.0" |
| }, |
| "upstream": { |
| "name": "Qwen/Qwen3.5-2B", |
| "url": "https://huggingface.co/Qwen/Qwen3.5-2B", |
| "year": 2025 |
| } |
| }, |
| { |
| "id": "qwen3.5-0.8b", |
| "name": "Qwen3.5 0.8B", |
| "subtitle": "Alibaba Qwen, 2025", |
| "category_id": "llm", |
| "description_md": "Compact hybrid SSM+attention model, INT8 palettized (same semantic precision as fp16 — 100% top-3 parity vs fp32 oracle). ~20 tok/s decode on iPhone 17 Pro. Smallest and fastest option in the CoreML-LLM lineup. Text-only. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.4.", |
| "thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/qwen3_5.jpg", |
| "demo": { |
| "template": "chat", |
| "config": { |
| "max_tokens": 1024, |
| "multimodal": false |
| } |
| }, |
| "files": [ |
| { |
| "name": "qwen3.5-0.8B-CoreML.zip", |
| "url": "https://huggingface.co/mlboydaisuke/qwen3.5-0.8B-CoreML/resolve/main/qwen3.5-0.8B-CoreML.zip", |
| "archive": "zip", |
| "size_bytes": 754000000, |
| "sha256": "TODO", |
| "compute_units": "cpuAndNeuralEngine", |
| "kind": "model" |
| } |
| ], |
| "requirements": { |
| "min_ios": "18.0", |
| "min_ram_mb": 600, |
| "device_capabilities": [ |
| "arm64" |
| ] |
| }, |
| "license": { |
| "name": "Apache-2.0", |
| "url": "https://www.apache.org/licenses/LICENSE-2.0" |
| }, |
| "upstream": { |
| "name": "Qwen/Qwen3.5-0.8B", |
| "url": "https://huggingface.co/Qwen/Qwen3.5-0.8B", |
| "year": 2025 |
| } |
| }, |
| { |
| "id": "qwen3-vl-2b", |
| "name": "Qwen3-VL 2B", |
| "subtitle": "Alibaba Qwen, 2025", |
| "category_id": "llm", |
| "description_md": "Qwen3-VL multimodal model — **text + image** input (DeepStack injection at L0/1/2, interleaved mRoPE for image tokens, 196 image tokens). 28-layer GQA text backbone, 6 INT8 body chunks + fp16 embed sidecar. ~7.5 tok/s decode on iPhone 17 Pro. Powered by [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) v1.4.", |
| "thumbnail_url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/thumbnails/qwen3_vl.jpg", |
| "demo": { |
| "template": "chat", |
| "config": { |
| "max_tokens": 1024, |
| "multimodal": true |
| } |
| }, |
| "files": [ |
| { |
| "name": "qwen3-vl-2b-coreml.zip", |
| "url": "https://huggingface.co/mlboydaisuke/qwen3-vl-2b-coreml/resolve/main/qwen3-vl-2b-coreml.zip", |
| "archive": "zip", |
| "size_bytes": 4700000000, |
| "sha256": "TODO", |
| "compute_units": "cpuAndNeuralEngine", |
| "kind": "model" |
| } |
| ], |
| "requirements": { |
| "min_ios": "18.0", |
| "min_ram_mb": 2500, |
| "device_capabilities": [ |
| "arm64" |
| ] |
| }, |
| "license": { |
| "name": "Apache-2.0", |
| "url": "https://www.apache.org/licenses/LICENSE-2.0" |
| }, |
| "upstream": { |
| "name": "Qwen/Qwen3-VL-2B-Instruct", |
| "url": "https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct", |
| "year": 2025 |
| } |
| }, |
| { |
| "id": "rmbg_1_4", |
| "name": "RMBG-1.4", |
| "subtitle": "BRIA AI, 2023", |
| "category_id": "segmentation", |
| "description_md": "High-quality background removal. Outputs foreground with alpha mask. 1024×1024 input.", |
| "demo": { |
| "template": "image_in_out", |
| "config": { |
| "input_size": 1024, |
| "output_type": "mask" |
| } |
| }, |
| "files": [ |
| { |
| "name": "RMBG_1_4.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/rmbg/RMBG_1_4.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 38771210, |
| "sha256": "a80dbb5f04c922a8fa698c38592e4e52af4e62471d70bc7c59c28a3355a1da95", |
| "compute_units": "cpuOnly", |
| "kind": "model" |
| } |
| ], |
| "requirements": { |
| "min_ios": "17.0", |
| "min_ram_mb": 300 |
| }, |
| "license": { |
| "name": "Apache-2.0", |
| "url": "https://huggingface.co/briaai/RMBG-1.4" |
| }, |
| "upstream": { |
| "name": "briaai/RMBG-1.4", |
| "url": "https://huggingface.co/briaai/RMBG-1.4", |
| "year": 2023 |
| }, |
| "conversion_script_url": "https://github.com/john-rocky/CoreML-Models/blob/master/conversion_scripts/convert_rmbg.py" |
| }, |
| { |
| "id": "ddcolor", |
| "name": "DDColor Tiny", |
| "subtitle": "Image Colorization, 2023", |
| "category_id": "enhancement", |
| "description_md": "Automatic grayscale image colorization via dual decoders. 512×512 input.", |
| "demo": { |
| "template": "image_in_out", |
| "config": { |
| "input_size": 512, |
| "output_type": "lab_ab" |
| } |
| }, |
| "files": [ |
| { |
| "name": "DDColor_Tiny.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/ddcolor/DDColor_Tiny.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 212344570, |
| "sha256": "bfecea37d66005f602efe13978360b8e4707923234c3d1d00beeb4e36cb1b02c", |
| "compute_units": "all", |
| "kind": "model" |
| } |
| ], |
| "requirements": { |
| "min_ios": "17.0", |
| "min_ram_mb": 400 |
| }, |
| "license": { |
| "name": "Apache-2.0", |
| "url": "https://github.com/piddnad/DDColor" |
| }, |
| "upstream": { |
| "name": "piddnad/DDColor", |
| "url": "https://github.com/piddnad/DDColor", |
| "year": 2023 |
| }, |
| "conversion_script_url": "https://github.com/john-rocky/CoreML-Models/blob/master/conversion_scripts/convert_ddcolor.py" |
| }, |
| { |
| "id": "sinsr", |
| "name": "SinSR", |
| "subtitle": "Single-Step Super-Resolution, 2024", |
| "category_id": "enhancement", |
| "description_md": "4× super-resolution via single-step diffusion. 256→1024. Swin Transformer denoiser (FP32).", |
| "demo": { |
| "template": "image_in_out", |
| "config": { |
| "input_size": 256, |
| "output_type": "sinsr" |
| } |
| }, |
| "files": [ |
| { |
| "name": "SinSR_Encoder.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/sinsr/SinSR_Encoder.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 41246338, |
| "sha256": "fdec09d17561ec1bb5a2e829683d48c2b45e76b876285619a6e29a3523b8b7e2", |
| "compute_units": "cpuAndGPU", |
| "kind": "model" |
| }, |
| { |
| "name": "SinSR_Denoiser.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/sinsr/SinSR_Denoiser.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 440014511, |
| "sha256": "b31374c2d539b2cdd81499d6062c801ca00e405f5a67507cd609d14e2d6d4beb", |
| "compute_units": "cpuOnly", |
| "kind": "model" |
| }, |
| { |
| "name": "SinSR_Decoder.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/sinsr/SinSR_Decoder.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 60880285, |
| "sha256": "b8b9a7b52d6b240cf9fb3352b286ea83eb984fd73f5dd81c9f034f0016a5cb8c", |
| "compute_units": "cpuAndGPU", |
| "kind": "model" |
| } |
| ], |
| "requirements": { |
| "min_ios": "17.0", |
| "min_ram_mb": 600 |
| }, |
| "license": { |
| "name": "Apache-2.0", |
| "url": "https://github.com/wyf0912/SinSR" |
| }, |
| "upstream": { |
| "name": "wyf0912/SinSR", |
| "url": "https://github.com/wyf0912/SinSR", |
| "year": 2024 |
| }, |
| "conversion_script_url": "https://github.com/john-rocky/CoreML-Models/blob/master/conversion_scripts/convert_sinsr.py" |
| }, |
| { |
| "id": "yolo26s", |
| "name": "YOLO26s", |
| "subtitle": "NMS-Free Detection, 2026", |
| "category_id": "detection", |
| "description_md": "NMS-free object detection. 640×640 input, 80 COCO classes.", |
| "demo": { |
| "template": "image_detection", |
| "config": { |
| "input_size": 640, |
| "confidence_threshold": 0.25 |
| } |
| }, |
| "files": [ |
| { |
| "name": "yolo26s.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/yolo26/yolo26s.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 17697581, |
| "sha256": "0ec02fb0cf2dbd6e09601cbbc00a9734156ea4c2a52b0da23a984337074c6fd4", |
| "compute_units": "all", |
| "kind": "model" |
| } |
| ], |
| "requirements": { |
| "min_ios": "17.0", |
| "min_ram_mb": 300 |
| }, |
| "license": { |
| "name": "AGPL-3.0", |
| "url": "https://github.com/ultralytics/ultralytics" |
| }, |
| "upstream": { |
| "name": "ultralytics/ultralytics", |
| "url": "https://github.com/ultralytics/ultralytics", |
| "year": 2026 |
| } |
| }, |
| { |
| "id": "yolo11s", |
| "name": "YOLO11s", |
| "subtitle": "Object Detection, 2024", |
| "category_id": "detection", |
| "description_md": "YOLO11 small detection with Vision framework NMS. 640×640 input.", |
| "demo": { |
| "template": "image_detection", |
| "config": { |
| "input_size": 640, |
| "confidence_threshold": 0.25 |
| } |
| }, |
| "files": [ |
| { |
| "name": "yolo11s.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/yolov9/yolo11s.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 17580204, |
| "sha256": "79e82aacc3ad20fc1eb990df6979fae9b927d4b06f33bd20ec0e1c0dcb7d1f6b", |
| "compute_units": "all", |
| "kind": "model" |
| } |
| ], |
| "requirements": { |
| "min_ios": "17.0", |
| "min_ram_mb": 300 |
| }, |
| "license": { |
| "name": "AGPL-3.0", |
| "url": "https://github.com/ultralytics/ultralytics" |
| }, |
| "upstream": { |
| "name": "ultralytics/ultralytics", |
| "url": "https://github.com/ultralytics/ultralytics", |
| "year": 2024 |
| } |
| }, |
| { |
| "id": "yolov10n", |
| "name": "YOLOv10n", |
| "subtitle": "Object Detection, 2024", |
| "category_id": "detection", |
| "description_md": "YOLOv10 nano. 640×640 input. Dual-assignment strategy.", |
| "demo": { |
| "template": "image_detection", |
| "config": { |
| "input_size": 640, |
| "confidence_threshold": 0.25 |
| } |
| }, |
| "files": [ |
| { |
| "name": "YOLOv10N.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/yolov10/YOLOv10N.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 4309168, |
| "sha256": "9a687144a6b0b764f508c8f544fe46b6674629b8f09a1e99d8ca69b0be899891", |
| "compute_units": "all", |
| "kind": "model" |
| } |
| ], |
| "requirements": { |
| "min_ios": "17.0", |
| "min_ram_mb": 300 |
| }, |
| "license": { |
| "name": "AGPL-3.0", |
| "url": "https://github.com/THU-MIG/yolov10" |
| }, |
| "upstream": { |
| "name": "THU-MIG/yolov10", |
| "url": "https://github.com/THU-MIG/yolov10", |
| "year": 2024 |
| } |
| }, |
| { |
| "id": "yoloworld", |
| "name": "YOLO-World", |
| "subtitle": "Open-Vocabulary Detection, 2024", |
| "category_id": "detection", |
| "description_md": "Open-vocabulary detection. Type any text query. YOLO-World V2-S + CLIP ViT-B/32.", |
| "demo": { |
| "template": "open_vocab_detection", |
| "config": { |
| "input_size": 640 |
| } |
| }, |
| "files": [ |
| { |
| "name": "yoloworld_detector.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/yoloworld/yoloworld_detector.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 23710620, |
| "sha256": "611d299ae74c83f90a5cc9f4585709859d5db735baa8ade721e0c2d99cd5af92", |
| "compute_units": "all", |
| "kind": "model" |
| }, |
| { |
| "name": "clip_text_encoder.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/yoloworld/clip_text_encoder.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 116681932, |
| "sha256": "45770a743297e8c2a57cc330d4f5c80f47734263680895b33b593b50dd2c382b", |
| "compute_units": "cpuOnly", |
| "kind": "model" |
| } |
| ], |
| "requirements": { |
| "min_ios": "17.0", |
| "min_ram_mb": 600 |
| }, |
| "license": { |
| "name": "GPL-3.0", |
| "url": "https://github.com/AILab-CVC/YOLO-World" |
| }, |
| "upstream": { |
| "name": "AILab-CVC/YOLO-World", |
| "url": "https://github.com/AILab-CVC/YOLO-World", |
| "year": 2024 |
| } |
| }, |
| { |
| "id": "moge2_vitb_normal_504", |
| "name": "MoGe-2 ViT-B (504×504)", |
| "subtitle": "Microsoft, CVPR 2025", |
| "category_id": "depth", |
| "description_md": "Monocular geometry from a single image. Metric depth, surface normals, confidence mask. DINOv2 ViT-B/14 backbone.", |
| "demo": { |
| "template": "depth_visualization", |
| "config": { |
| "input_size": 504, |
| "output_keys": [ |
| "depth", |
| "normal", |
| "mask", |
| "metric_scale" |
| ], |
| "depth_unit": "meters" |
| } |
| }, |
| "files": [ |
| { |
| "name": "MoGe2_ViTB_Normal_504.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/moge2/MoGe2_ViTB_Normal_504.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 193312088, |
| "sha256": "f60cfb4804707a489d99e24453188cd31ddcabb299bbf6da4507edc9cecbf9e7", |
| "compute_units": "all", |
| "kind": "model" |
| } |
| ], |
| "requirements": { |
| "min_ios": "17.0", |
| "min_ram_mb": 600 |
| }, |
| "license": { |
| "name": "MIT", |
| "url": "https://github.com/microsoft/MoGe/blob/main/LICENSE" |
| }, |
| "upstream": { |
| "name": "microsoft/MoGe", |
| "url": "https://github.com/microsoft/MoGe", |
| "year": 2025 |
| }, |
| "conversion_script_url": "https://github.com/john-rocky/CoreML-Models/blob/master/conversion_scripts/convert_moge2.py" |
| }, |
| { |
| "id": "depth_anything_v3_small_504", |
| "name": "Depth Anything 3 Small (504×504)", |
| "subtitle": "ByteDance-Seed, ICLR 2026 oral", |
| "category_id": "depth", |
| "description_md": "Relative monocular depth from a single image. DA3 Main Series, Small (0.08B params, DINOv2 ViT-S/14 + DualDPT head). First public Core ML conversion of Depth Anything 3.", |
| "demo": { |
| "template": "depth_visualization", |
| "config": { |
| "input_size": 504, |
| "output_keys": [ |
| "depth", |
| "confidence" |
| ], |
| "depth_unit": "relative" |
| } |
| }, |
| "files": [ |
| { |
| "name": "DepthAnythingV3_small_504.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/depth_anything_v3/DepthAnythingV3_small_504.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 46274487, |
| "sha256": "c10f8afa01fdc1d22682014824d8e40df67921f96c328f69118ecb725641f78d", |
| "compute_units": "all", |
| "kind": "model" |
| } |
| ], |
| "requirements": { |
| "min_ios": "17.0", |
| "min_ram_mb": 300 |
| }, |
| "license": { |
| "name": "Apache-2.0", |
| "url": "https://github.com/ByteDance-Seed/Depth-Anything-3/blob/main/LICENSE" |
| }, |
| "upstream": { |
| "name": "ByteDance-Seed/Depth-Anything-3", |
| "url": "https://github.com/ByteDance-Seed/Depth-Anything-3", |
| "year": 2025 |
| }, |
| "conversion_script_url": "https://github.com/john-rocky/CoreML-Models/blob/master/conversion_scripts/convert_depth_anything_v3.py" |
| }, |
| { |
| "id": "depth_anything_v3_base_504", |
| "name": "Depth Anything 3 Base (504×504)", |
| "subtitle": "ByteDance-Seed, ICLR 2026 oral", |
| "category_id": "depth", |
| "description_md": "Relative monocular depth from a single image. DA3 Main Series, Base (0.12B params, DINOv2 ViT-B/14 + DualDPT head). Higher quality than Small at ~3× the model size.", |
| "demo": { |
| "template": "depth_visualization", |
| "config": { |
| "input_size": 504, |
| "output_keys": [ |
| "depth", |
| "confidence" |
| ], |
| "depth_unit": "relative" |
| } |
| }, |
| "files": [ |
| { |
| "name": "DepthAnythingV3_base_504.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/depth_anything_v3/DepthAnythingV3_base_504.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 181679139, |
| "sha256": "cd96d12b7d14fb92c312ad1efe771eb1732680578e11bf6b76ab63f4c5d6c51b", |
| "compute_units": "all", |
| "kind": "model" |
| } |
| ], |
| "requirements": { |
| "min_ios": "17.0", |
| "min_ram_mb": 700 |
| }, |
| "license": { |
| "name": "Apache-2.0", |
| "url": "https://github.com/ByteDance-Seed/Depth-Anything-3/blob/main/LICENSE" |
| }, |
| "upstream": { |
| "name": "ByteDance-Seed/Depth-Anything-3", |
| "url": "https://github.com/ByteDance-Seed/Depth-Anything-3", |
| "year": 2025 |
| }, |
| "conversion_script_url": "https://github.com/john-rocky/CoreML-Models/blob/master/conversion_scripts/convert_depth_anything_v3.py" |
| }, |
| { |
| "id": "siglip", |
| "name": "SigLIP", |
| "subtitle": "Zero-Shot Classification, 2023", |
| "category_id": "vision_language", |
| "description_md": "Zero-shot image classification. Dual encoder (image + text). 224×224 input.", |
| "demo": { |
| "template": "zero_shot_classify", |
| "config": { |
| "input_size": 224, |
| "image_encoder": "SigLIP_ImageEncoder.mlpackage.zip", |
| "text_encoder": "SigLIP_TextEncoder.mlpackage.zip", |
| "vocab_file": "siglip_vocab.json", |
| "prompt_template": "{}", |
| "logit_scale": 117.33 |
| } |
| }, |
| "files": [ |
| { |
| "name": "SigLIP_ImageEncoder.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/siglip/SigLIP_ImageEncoder.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 170352400, |
| "sha256": "98f6abf5f4aa145199f4ae22305f9c1d5929eee6b126daad84783b2b2090ee24", |
| "compute_units": "cpuOnly", |
| "kind": "model" |
| }, |
| { |
| "name": "SigLIP_TextEncoder.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/siglip/SigLIP_TextEncoder.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 203975769, |
| "sha256": "9dead2d58705838aef7ad83c3bf4036698c78d872ca1cdd04f2c4a6272009ccf", |
| "compute_units": "cpuOnly", |
| "kind": "model" |
| }, |
| { |
| "name": "siglip_vocab.json", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/siglip/siglip_vocab.json", |
| "size_bytes": 673754, |
| "sha256": "b94b3a58e04f619936b3890804dff7c478522c07515ff748cf127c5443ee5229", |
| "kind": "vocab" |
| } |
| ], |
| "requirements": { |
| "min_ios": "17.0", |
| "min_ram_mb": 800 |
| }, |
| "license": { |
| "name": "Apache-2.0", |
| "url": "https://github.com/google-research/big_vision" |
| }, |
| "upstream": { |
| "name": "google-research/big_vision", |
| "url": "https://github.com/google-research/big_vision", |
| "year": 2023 |
| }, |
| "conversion_script_url": "https://github.com/john-rocky/CoreML-Models/blob/master/conversion_scripts/convert_siglip.py" |
| }, |
| { |
| "id": "florence2", |
| "name": "Florence-2", |
| "subtitle": "Microsoft, 2024", |
| "category_id": "vision_language", |
| "description_md": "Vision-language captioning, OCR, and VQA. Three-stage encoder-decoder. 768×768 input.", |
| "demo": { |
| "template": "image_to_text", |
| "config": { |
| "image_size": 768, |
| "max_tokens": 256, |
| "vision_encoder": "Florence2VisionEncoder.mlpackage.zip", |
| "text_encoder": "Florence2TextEncoder.mlpackage.zip", |
| "decoder": "Florence2Decoder.mlpackage.zip", |
| "vocab_file": "florence2_vocab.json", |
| "tasks": { |
| "caption": [ |
| 0, |
| 2264, |
| 473, |
| 5, |
| 2274, |
| 6190, |
| 116, |
| 2 |
| ], |
| "detailed_caption": [ |
| 0, |
| 2264, |
| 473, |
| 5, |
| 31962, |
| 2274, |
| 6190, |
| 116, |
| 2 |
| ], |
| "ocr": [ |
| 0, |
| 2264, |
| 473, |
| 5, |
| 71307, |
| 116, |
| 2 |
| ] |
| } |
| } |
| }, |
| "files": [ |
| { |
| "name": "Florence2VisionEncoder.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/florence2/Florence2VisionEncoder.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 81198683, |
| "sha256": "9422f189c21220a0f9966eb9d780856772feb55597dcc579fc4e3c88990d0046", |
| "compute_units": "cpuOnly", |
| "kind": "model" |
| }, |
| { |
| "name": "Florence2TextEncoder.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/florence2/Florence2TextEncoder.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 72742890, |
| "sha256": "f985deeef0408ea8aac33ac4f5c6d9635cd9c64c98b53f85031db6e27f3bfd92", |
| "compute_units": "cpuOnly", |
| "kind": "model" |
| }, |
| { |
| "name": "Florence2Decoder.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/florence2/Florence2Decoder.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 85329746, |
| "sha256": "fe85a6faab5281272bcd79dabfbf87d60ba1a78dd9455e2bf71c67a134d61dc5", |
| "compute_units": "cpuOnly", |
| "kind": "model" |
| }, |
| { |
| "name": "florence2_vocab.json", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/florence2/florence2_vocab.json", |
| "size_bytes": 999352, |
| "sha256": "861fee9af5520403f6dbb4940d6af6627f1481b71cdc4a870f1f61344e57e645", |
| "kind": "vocab" |
| } |
| ], |
| "requirements": { |
| "min_ios": "17.0", |
| "min_ram_mb": 1200 |
| }, |
| "license": { |
| "name": "MIT", |
| "url": "https://huggingface.co/microsoft/Florence-2-base" |
| }, |
| "upstream": { |
| "name": "microsoft/Florence-2", |
| "url": "https://huggingface.co/microsoft/Florence-2-base", |
| "year": 2024 |
| }, |
| "conversion_script_url": "https://github.com/john-rocky/CoreML-Models/blob/master/conversion_scripts/convert_florence2.py" |
| }, |
| { |
| "id": "face3d", |
| "name": "3DDFA V2", |
| "subtitle": "3D Face Reconstruction, 2020", |
| "category_id": "face", |
| "description_md": "Single-image 3D face reconstruction. Predicts 6 DoF pose + expression parameters.", |
| "demo": { |
| "template": "face_3d", |
| "config": { |
| "input_size": 120 |
| } |
| }, |
| "files": [ |
| { |
| "name": "3DDFA_V2.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/face3d/3DDFA_V2.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 6083375, |
| "sha256": "0f715dc220c046f558e3b8fc65246df9a2eec77182830a16628783430cdacdc8", |
| "compute_units": "all", |
| "kind": "model" |
| } |
| ], |
| "requirements": { |
| "min_ios": "17.0", |
| "min_ram_mb": 200 |
| }, |
| "license": { |
| "name": "MIT", |
| "url": "https://github.com/cleardusk/3DDFA_V2" |
| }, |
| "upstream": { |
| "name": "cleardusk/3DDFA_V2", |
| "url": "https://github.com/cleardusk/3DDFA_V2", |
| "year": 2020 |
| } |
| }, |
| { |
| "id": "nitroe", |
| "name": "Nitro-E (4-Step)", |
| "subtitle": "AMD, 2025", |
| "category_id": "generation", |
| "description_md": "AMD's 304M E-MMDiT text-to-image model (Oct 2025). 4-step distilled variant, 512×512. Llama 3.2 1B text encoder + E-MMDiT denoiser + DC-AE VAE decoder. ~1.04 GB bundled after INT4/INT8 palettization. ~2–3 s / image on iPhone 15+.", |
| "demo": { |
| "template": "text_to_image_nitroe", |
| "config": { |
| "image_size": 512, |
| "latent_size": 16, |
| "latent_channels": 32, |
| "steps": 4, |
| "guidance_scale": 0.0, |
| "scheduler": "flow_match_euler", |
| "tokenizer": "llama3_bpe", |
| "max_sequence_length": 128, |
| "text_encoder": "NitroE_TextEncoder.mlpackage.zip", |
| "denoiser": "NitroE_EMMDiT.mlpackage.zip", |
| "vae_decoder": "NitroE_VAEDecoder.mlpackage.zip", |
| "vocab_file": "Llama3Vocab.json", |
| "merges_file": "Llama3Merges.txt" |
| } |
| }, |
| "files": [ |
| { |
| "name": "NitroE_TextEncoder.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/nitroe/NitroE_TextEncoder.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 571690371, |
| "sha256": "9b366b29d790ab986c376e55aa78b8f546da46c8b4f66229a40914e267dc4124", |
| "compute_units": "cpuAndNeuralEngine", |
| "kind": "model" |
| }, |
| { |
| "name": "NitroE_EMMDiT.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/nitroe/NitroE_EMMDiT.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 296773203, |
| "sha256": "93a7ed971c5c419de2e9a110d4d5e68c3904a1ead67b3a9ec0638c60240f1706", |
| "compute_units": "cpuAndNeuralEngine", |
| "kind": "model" |
| }, |
| { |
| "name": "NitroE_VAEDecoder.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/nitroe/NitroE_VAEDecoder.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 160754977, |
| "sha256": "4837023736d82b49adbb0c5419376ebee3322fcc15c758ac72854387ef6d2142", |
| "compute_units": "cpuAndNeuralEngine", |
| "kind": "model" |
| }, |
| { |
| "name": "Llama3Vocab.json", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/nitroe/Llama3Vocab.json", |
| "size_bytes": 2492868, |
| "sha256": "f8f40517934d6f5d14842ade2af0161d0c26e0403e277fcf6ef66c113f215466", |
| "kind": "vocab" |
| }, |
| { |
| "name": "Llama3Merges.txt", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/nitroe/Llama3Merges.txt", |
| "size_bytes": 3243735, |
| "sha256": "0cd100e0ab7dbd83c3b668a7cfb2aed1f63f2aec66dae6ec4b63de2c552bb82d", |
| "kind": "vocab" |
| } |
| ], |
| "requirements": { |
| "min_ios": "18.0", |
| "min_ram_mb": 2500 |
| }, |
| "license": { |
| "name": "MIT (Nitro-E) / Llama 3.2 Community License (text encoder)", |
| "url": "https://huggingface.co/amd/Nitro-E" |
| }, |
| "upstream": { |
| "name": "amd/Nitro-E", |
| "url": "https://huggingface.co/amd/Nitro-E", |
| "year": 2025 |
| }, |
| "conversion_script_url": "https://github.com/john-rocky/CoreML-Models/blob/master/conversion_scripts/convert_nitro_e_emmdit.py" |
| }, |
| { |
| "id": "hypersd", |
| "name": "Hyper-SD (1-Step)", |
| "subtitle": "ByteDance, 2024", |
| "category_id": "generation", |
| "description_md": "Single-step text-to-image from SD1.5 via TCD distillation. 512×512. Chunked UNet (6-bit).", |
| "demo": { |
| "template": "text_to_image", |
| "config": { |
| "image_size": 512, |
| "latent_size": 64, |
| "latent_channels": 4, |
| "steps": 1, |
| "guidance_scale": 1.0, |
| "text_encoder": "HyperSDTextEncoder.mlpackage.zip", |
| "unet_chunk1": "HyperSDUnetChunk1.mlpackage.zip", |
| "unet_chunk2": "HyperSDUnetChunk2.mlpackage.zip", |
| "vae_decoder": "HyperSDVAEDecoder.mlpackage.zip", |
| "vocab_file": "vocab.json", |
| "merges_file": "merges.txt" |
| } |
| }, |
| "files": [ |
| { |
| "name": "HyperSDTextEncoder.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/hypersd/HyperSDTextEncoder.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 226397794, |
| "sha256": "201b0fcc3573811aac6a4e8545c695bc4fb2f7710ea0d60c227919d87b37687e", |
| "compute_units": "cpuAndNeuralEngine", |
| "kind": "model" |
| }, |
| { |
| "name": "HyperSDUnetChunk1.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/hypersd/HyperSDUnetChunk1.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 324819653, |
| "sha256": "279da11b8231aeeb9045f6ceabebb3a68c20a1b86ecc81aa6914b77ce76d5203", |
| "compute_units": "cpuAndNeuralEngine", |
| "kind": "model" |
| }, |
| { |
| "name": "HyperSDUnetChunk2.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/hypersd/HyperSDUnetChunk2.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 304530429, |
| "sha256": "0a700d11a105da589bb3e5666e38b9c72fa283149951b253fc11722e70e72faa", |
| "compute_units": "cpuAndNeuralEngine", |
| "kind": "model" |
| }, |
| { |
| "name": "HyperSDVAEDecoder.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/hypersd/HyperSDVAEDecoder.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 91282754, |
| "sha256": "1260371542d845a2261ed2de36c5fe3e9ccb740a6ceb59b1990705d125e8cf66", |
| "compute_units": "cpuAndGPU", |
| "kind": "model" |
| }, |
| { |
| "name": "vocab.json", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/hypersd/vocab.json", |
| "size_bytes": 1059962, |
| "sha256": "e089ad92ba36837a0d31433e555c8f45fe601ab5c221d4f607ded32d9f7a4349", |
| "kind": "vocab" |
| }, |
| { |
| "name": "merges.txt", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/hypersd/merges.txt", |
| "size_bytes": 524619, |
| "sha256": "9fd691f7c8039210e0fced15865466c65820d09b63988b0174bfe25de299051a", |
| "kind": "vocab" |
| } |
| ], |
| "requirements": { |
| "min_ios": "17.0", |
| "min_ram_mb": 1000 |
| }, |
| "license": { |
| "name": "OpenRAIL-M", |
| "url": "https://huggingface.co/ByteDance/Hyper-SD" |
| }, |
| "upstream": { |
| "name": "ByteDance/Hyper-SD", |
| "url": "https://huggingface.co/ByteDance/Hyper-SD", |
| "year": 2024 |
| }, |
| "conversion_script_url": "https://github.com/john-rocky/CoreML-Models/blob/master/conversion_scripts/convert_hypersd.py" |
| }, |
| { |
| "id": "matanyone", |
| "name": "MatAnyone", |
| "subtitle": "Video Matting, 2025", |
| "category_id": "video", |
| "description_md": "Temporally consistent video matting. 5-model pipeline with memory propagation.", |
| "demo": { |
| "template": "video_matting", |
| "config": { |
| "frame_size": 512, |
| "encoder": "MatAnyone_encoder.mlpackage.zip", |
| "mask_encoder": "MatAnyone_mask_encoder.mlpackage.zip", |
| "read_first": "MatAnyone_read_first.mlpackage.zip", |
| "read": "MatAnyone_read.mlpackage.zip", |
| "decoder": "MatAnyone_decoder.mlpackage.zip" |
| } |
| }, |
| "files": [ |
| { |
| "name": "MatAnyone_encoder.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/matanyone/MatAnyone_encoder.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 17306121, |
| "sha256": "97ffd6bc4611f9a3351dc890fc00954ba48171e517e66a39f7a5f1f38110dfda", |
| "compute_units": "cpuAndGPU", |
| "kind": "model" |
| }, |
| { |
| "name": "MatAnyone_mask_encoder.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/matanyone/MatAnyone_mask_encoder.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 16819866, |
| "sha256": "ba67559188ffc64d8e46418c051c6a55815d4482def17519fa518daac7d5a911", |
| "compute_units": "cpuAndGPU", |
| "kind": "model" |
| }, |
| { |
| "name": "MatAnyone_read_first.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/matanyone/MatAnyone_read_first.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 21991849, |
| "sha256": "34daf7227dbcec7373a3fef175259fa7ec631ed8cb91d5595ca57ee9b22df7bb", |
| "compute_units": "cpuOnly", |
| "kind": "model" |
| }, |
| { |
| "name": "MatAnyone_read.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/matanyone/MatAnyone_read.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 22135429, |
| "sha256": "052e52c0ffb7ff9ede448128950cd4c1c9a96589b6900c82b5104d99addb7fa5", |
| "compute_units": "cpuOnly", |
| "kind": "model" |
| }, |
| { |
| "name": "MatAnyone_decoder.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/matanyone/MatAnyone_decoder.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 8807630, |
| "sha256": "67136aa67000e604838fe9aa7de151c514ef84f0b83f1da0f043cf70652d28eb", |
| "compute_units": "cpuAndGPU", |
| "kind": "model" |
| } |
| ], |
| "requirements": { |
| "min_ios": "17.0", |
| "min_ram_mb": 800 |
| }, |
| "license": { |
| "name": "MIT", |
| "url": "https://github.com/pq-yang/MatAnyone" |
| }, |
| "upstream": { |
| "name": "pq-yang/MatAnyone", |
| "url": "https://github.com/pq-yang/MatAnyone", |
| "year": 2025 |
| }, |
| "conversion_script_url": "https://github.com/john-rocky/CoreML-Models/blob/master/conversion_scripts/convert_matanyone.py" |
| }, |
| { |
| "id": "demucs", |
| "name": "HTDemucs", |
| "subtitle": "Audio Source Separation", |
| "category_id": "audio", |
| "description_md": "Split music into 4 stems: drums, bass, vocals, other. 44.1 kHz stereo, FP32.", |
| "demo": { |
| "template": "audio_in_out", |
| "config": { |
| "sample_rate": 44100, |
| "segment_length": 343980, |
| "output_stems": [ |
| "drums", |
| "bass", |
| "vocals", |
| "other" |
| ] |
| } |
| }, |
| "files": [ |
| { |
| "name": "HTDemucs_SourceSeparation_F32.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/demucs/HTDemucs_SourceSeparation_F32.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 79076395, |
| "sha256": "0fbb941e15a5b2fa425d14fe630ed4c14b6dee72780c1f5b2b05f58803bce5f7", |
| "compute_units": "cpuOnly", |
| "kind": "model" |
| } |
| ], |
| "requirements": { |
| "min_ios": "17.0", |
| "min_ram_mb": 1000 |
| }, |
| "license": { |
| "name": "MIT", |
| "url": "https://github.com/adefossez/demucs" |
| }, |
| "upstream": { |
| "name": "adefossez/demucs", |
| "url": "https://github.com/adefossez/demucs", |
| "year": 2021 |
| }, |
| "conversion_script_url": "https://github.com/john-rocky/CoreML-Models/blob/master/conversion_scripts/convert_htdemucs.py" |
| }, |
| { |
| "id": "kokoro", |
| "name": "Kokoro-82M", |
| "subtitle": "Multilingual TTS", |
| "category_id": "speech", |
| "description_md": "English + Japanese text-to-speech. 24 kHz. StyleTTS2 + iSTFTNet vocoder. Multiple voices.", |
| "demo": { |
| "template": "text_to_audio", |
| "config": { |
| "mode": "tts", |
| "sample_rate": 24000, |
| "vocab_file": "kokoro_vocab.json", |
| "voices": [ |
| "af_heart", |
| "af_bella", |
| "am_michael", |
| "bf_emma", |
| "bm_george" |
| ] |
| } |
| }, |
| "files": [ |
| { |
| "name": "Kokoro_Predictor.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/kokoro/Kokoro_Predictor.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 72191470, |
| "sha256": "af1d55dc842980c32b5591a70f603941f11ab60a435bed0c13a107a8ef467bed", |
| "compute_units": "cpuAndGPU", |
| "kind": "model" |
| }, |
| { |
| "name": "Kokoro_Decoder_128.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/kokoro/Kokoro_Decoder_128.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 229120589, |
| "sha256": "cece0d072f5ba6aa3f729cf4c76b4de51823bcc65a26ab363c10441c3cd8b306", |
| "compute_units": "cpuAndGPU", |
| "kind": "model" |
| }, |
| { |
| "name": "Kokoro_Decoder_256.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/kokoro/Kokoro_Decoder_256.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 229123438, |
| "sha256": "36d5e16d5c5ccb500fc96f1b07a1d5ac57b791f8e09e61b78319d76949003efe", |
| "compute_units": "cpuAndGPU", |
| "kind": "model" |
| }, |
| { |
| "name": "Kokoro_Decoder_512.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/kokoro/Kokoro_Decoder_512.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 229128735, |
| "sha256": "0a44484c327e4fe8443b0bcf104d6964fe3f30d628c9e78aee3f31af7f2475dc", |
| "compute_units": "cpuAndGPU", |
| "kind": "model" |
| }, |
| { |
| "name": "kokoro_vocab.json", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/kokoro/kokoro_vocab.json", |
| "size_bytes": 1144, |
| "sha256": "70abefbe8a1c8865e43e0a43bbdc25b91a33e4aa053479d443ccf23e20a59e5d", |
| "kind": "vocab" |
| } |
| ], |
| "requirements": { |
| "min_ios": "17.0", |
| "min_ram_mb": 1000 |
| }, |
| "license": { |
| "name": "Apache-2.0", |
| "url": "https://huggingface.co/hexgrad/Kokoro-82M" |
| }, |
| "upstream": { |
| "name": "hexgrad/Kokoro-82M", |
| "url": "https://huggingface.co/hexgrad/Kokoro-82M", |
| "year": 2024 |
| }, |
| "conversion_script_url": "https://github.com/john-rocky/CoreML-Models/blob/master/conversion_scripts/convert_kokoro.py" |
| }, |
| { |
| "id": "stable_audio", |
| "name": "Stable Audio Open", |
| "subtitle": "Text-to-Music, 2024", |
| "category_id": "speech", |
| "description_md": "Text-to-music. Up to 11.9s stereo 44.1 kHz. Rectified flow DiT + T5 + Oobleck VAE.", |
| "demo": { |
| "template": "text_to_audio", |
| "config": { |
| "mode": "music", |
| "sample_rate": 44100, |
| "max_duration": 11.9 |
| } |
| }, |
| "files": [ |
| { |
| "name": "StableAudioT5Encoder.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/stableaudio/StableAudioT5Encoder.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 98538259, |
| "sha256": "319a8ba775d309240253ced68a03a3923d0aec9a79f608044f9403bdcfe4b741", |
| "compute_units": "cpuAndGPU", |
| "kind": "model" |
| }, |
| { |
| "name": "StableAudioNumberEmbedder.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/stableaudio/StableAudioNumberEmbedder.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 376018, |
| "sha256": "04bdc5de00a2cf1c4a18f80c94f0d74ecfab41f3ad99f2fb7a031d6ff5af75da", |
| "compute_units": "cpuAndGPU", |
| "kind": "model" |
| }, |
| { |
| "name": "StableAudioDiT.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/stableaudio/StableAudioDiT.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 1265748504, |
| "sha256": "b17da4fc4df857821d39dbdf7d3bfe7062a2272ab3e5df1284d545afb54047e4", |
| "compute_units": "cpuOnly", |
| "kind": "model" |
| }, |
| { |
| "name": "StableAudioVAEDecoder.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/stableaudio/StableAudioVAEDecoder.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 144960275, |
| "sha256": "7207544cca9799cc1d6803c5e81badd0bb4527b2d3a64d5cab5700a5f19a9374", |
| "compute_units": "cpuAndGPU", |
| "kind": "model" |
| }, |
| { |
| "name": "t5_vocab.json", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/stableaudio/t5_vocab.json", |
| "size_bytes": 749757, |
| "sha256": "7c9ff3ac1b3dbcaa617ee659f2df68688cfd44f1a5eb3be3fa0a2f56c749d56a", |
| "kind": "vocab" |
| } |
| ], |
| "requirements": { |
| "min_ios": "17.0", |
| "min_ram_mb": 1200 |
| }, |
| "license": { |
| "name": "custom", |
| "url": "https://huggingface.co/stabilityai/stable-audio-open-small" |
| }, |
| "upstream": { |
| "name": "stabilityai/stable-audio-open-small", |
| "url": "https://huggingface.co/stabilityai/stable-audio-open-small", |
| "year": 2024 |
| }, |
| "conversion_script_url": "https://github.com/john-rocky/CoreML-Models/blob/master/conversion_scripts/convert_stable_audio.py" |
| }, |
| { |
| "id": "openvoice", |
| "name": "OpenVoice V2", |
| "subtitle": "Voice Cloning", |
| "category_id": "audio", |
| "description_md": "Zero-shot voice conversion. Clone a speaker from ~10s reference audio.", |
| "demo": { |
| "template": "audio_in_out", |
| "config": { |
| "sample_rate": 22050, |
| "output_stems": [ |
| "converted" |
| ] |
| } |
| }, |
| "files": [ |
| { |
| "name": "OpenVoice_SpeakerEncoder.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/openvoice/OpenVoice_SpeakerEncoder.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 1519880, |
| "sha256": "c3f2a96aaf5ecb5c5afc62b3d3dfbd47dc7ae64bc3edb7aa68befb54aef74459", |
| "compute_units": "cpuAndGPU", |
| "kind": "model" |
| }, |
| { |
| "name": "OpenVoice_VoiceConverter.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/openvoice/OpenVoice_VoiceConverter.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 59799630, |
| "sha256": "ef3ce8a2d1564aefa13830d7d0ca43f85e0aa62d5f59622c8bc456c307ab5e05", |
| "compute_units": "cpuAndGPU", |
| "kind": "model" |
| } |
| ], |
| "requirements": { |
| "min_ios": "17.0", |
| "min_ram_mb": 500 |
| }, |
| "license": { |
| "name": "MIT", |
| "url": "https://github.com/myshell-ai/OpenVoice" |
| }, |
| "upstream": { |
| "name": "myshell-ai/OpenVoice", |
| "url": "https://github.com/myshell-ai/OpenVoice", |
| "year": 2023 |
| }, |
| "conversion_script_url": "https://github.com/john-rocky/CoreML-Models/blob/master/conversion_scripts/convert_openvoice.py" |
| }, |
| { |
| "id": "diarization", |
| "name": "Pyannote Diarization", |
| "subtitle": "Speaker Identification", |
| "category_id": "audio", |
| "description_md": "Speaker diarization: who spoke when. 16 kHz mono, 10s segments.", |
| "demo": { |
| "template": "audio_in_out", |
| "config": { |
| "sample_rate": 16000, |
| "output_stems": [ |
| "speaker_timeline" |
| ] |
| } |
| }, |
| "files": [ |
| { |
| "name": "SpeakerSegmentation.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/diarization/SpeakerSegmentation.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 5327137, |
| "sha256": "dcfa2b98900f2b99029abfb593644b70418186a6ec2e94c9a79c2b3d7a84378a", |
| "compute_units": "cpuAndGPU", |
| "kind": "model" |
| } |
| ], |
| "requirements": { |
| "min_ios": "17.0", |
| "min_ram_mb": 200 |
| }, |
| "license": { |
| "name": "MIT", |
| "url": "https://github.com/pyannote/pyannote-audio" |
| }, |
| "upstream": { |
| "name": "pyannote/pyannote-audio", |
| "url": "https://github.com/pyannote/pyannote-audio", |
| "year": 2021 |
| }, |
| "conversion_script_url": "https://github.com/john-rocky/CoreML-Models/blob/master/conversion_scripts/convert_diarization.py" |
| }, |
| { |
| "id": "realesrgan", |
| "name": "Real-ESRGAN 4x", |
| "subtitle": "Super Resolution, 2021", |
| "category_id": "enhancement", |
| "description_md": "Real-world blind super-resolution. 4× upscale from any input. Handles noise, blur, and JPEG artifacts. 512×512 input → 2048×2048 output.", |
| "demo": { |
| "template": "image_in_out", |
| "config": { |
| "input_size": 512, |
| "output_type": "image" |
| } |
| }, |
| "files": [ |
| { |
| "name": "RealESRGAN_x4.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/realesrgan/RealESRGAN_x4.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 62076106, |
| "sha256": "5fd9d7ea7e6187ffb53a1125d5b20f3e76503e89f994aebf189d965977afdda5", |
| "compute_units": "all", |
| "kind": "model" |
| } |
| ], |
| "requirements": { |
| "min_ios": "17.0", |
| "min_ram_mb": 500 |
| }, |
| "license": { |
| "name": "BSD-3-Clause", |
| "url": "https://github.com/xinntao/Real-ESRGAN/blob/master/LICENSE" |
| }, |
| "upstream": { |
| "name": "xinntao/Real-ESRGAN", |
| "url": "https://github.com/xinntao/Real-ESRGAN", |
| "year": 2021 |
| } |
| }, |
| { |
| "id": "gfpgan", |
| "name": "GFPGAN", |
| "subtitle": "Face Restoration, 2021", |
| "category_id": "restoration", |
| "description_md": "Blind face restoration with generative facial prior. Restores degraded face photos to high quality. 512×512 input/output.", |
| "demo": { |
| "template": "image_in_out", |
| "config": { |
| "input_size": 512, |
| "output_type": "image" |
| } |
| }, |
| "files": [ |
| { |
| "name": "GFPGAN.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/gfpgan/GFPGAN.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 312748176, |
| "sha256": "929d0ab30fa739bd3b49a37055f070ce35d39d62cf043fd93c75e388263bef5d", |
| "compute_units": "all", |
| "kind": "model" |
| } |
| ], |
| "requirements": { |
| "min_ios": "17.0", |
| "min_ram_mb": 600 |
| }, |
| "license": { |
| "name": "Apache-2.0", |
| "url": "https://github.com/TencentARC/GFPGAN/blob/master/LICENSE" |
| }, |
| "upstream": { |
| "name": "TencentARC/GFPGAN", |
| "url": "https://github.com/TencentARC/GFPGAN", |
| "year": 2021 |
| } |
| }, |
| { |
| "id": "rfdetr_n", |
| "name": "RF-DETR Nano", |
| "subtitle": "Object Detection, 2025", |
| "category_id": "detection", |
| "description_md": "End-to-end transformer detector. 384×384 input. 300 queries, 91 class logits (COCO category IDs 1–90, of which 80 are used, plus background at index 0). No NMS needed.", |
| "demo": { |
| "template": "image_detection", |
| "config": { |
| "input_size": 384, |
| "confidence_threshold": 0.5, |
| "output_format": "detr", |
| "num_classes": 91, |
| "background_class": 0 |
| } |
| }, |
| "files": [ |
| { |
| "name": "rfdetr_n_coco.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/rfdetr/rfdetr_n_coco.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 99819094, |
| "sha256": "3cac3793b97aa88d5f79290afee24ba86e30da65e884933e3f8b0ba077ec48b4", |
| "compute_units": "all", |
| "kind": "model" |
| } |
| ], |
| "requirements": { |
| "min_ios": "17.0", |
| "min_ram_mb": 400 |
| }, |
| "license": { |
| "name": "Apache-2.0", |
| "url": "https://github.com/roboflow/rf-detr/blob/main/LICENSE" |
| }, |
| "upstream": { |
| "name": "roboflow/rf-detr", |
| "url": "https://github.com/roboflow/rf-detr", |
| "year": 2025 |
| } |
| }, |
| { |
| "id": "face_parsing", |
| "name": "Face Parsing", |
| "subtitle": "Facial Segmentation, 2019", |
| "category_id": "segmentation", |
| "description_md": "Semantic face parsing into 19 classes: skin, nose, left/right eyes, left/right eyebrows, left/right ears, mouth, upper/lower lips, hair, hat, eyeglasses, earring, necklace, neck, cloth, background. 512×512 input.", |
| "demo": { |
| "template": "image_in_out", |
| "config": { |
| "input_size": 512, |
| "output_type": "segmap", |
| "num_classes": 19 |
| } |
| }, |
| "files": [ |
| { |
| "name": "FaceParsing.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/faceparsing/FaceParsing.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 49431633, |
| "sha256": "a6dd498bb4e19df169903ed4ca8883d256bc7c0c9fc92da1ed0477f0fe34859c", |
| "compute_units": "all", |
| "kind": "model" |
| } |
| ], |
| "requirements": { |
| "min_ios": "17.0", |
| "min_ram_mb": 300 |
| }, |
| "license": { |
| "name": "MIT", |
| "url": "https://github.com/zllrunning/face-parsing.PyTorch/blob/master/LICENSE" |
| }, |
| "upstream": { |
| "name": "zllrunning/face-parsing.PyTorch", |
| "url": "https://github.com/zllrunning/face-parsing.PyTorch", |
| "year": 2019 |
| } |
| }, |
| { |
| "id": "mobilesam", |
| "name": "MobileSAM", |
| "subtitle": "Segment Anything, 2023", |
| "category_id": "segmentation", |
| "description_md": "Lightweight Segment Anything. Tap any point to generate a segmentation mask. ViT-Tiny encoder + lightweight decoder. ~60× smaller than SAM.", |
| "demo": { |
| "template": "segment_anything", |
| "config": { |
| "encoder": "MobileSAM.zip", |
| "decoder": "MobileSAM.zip", |
| "input_size": 1024 |
| } |
| }, |
| "files": [ |
| { |
| "name": "MobileSAM.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/mobilesam/MobileSAM.zip", |
| "archive": "zip", |
| "size_bytes": 20143994, |
| "sha256": "0d8d48cb90a48cd860cc3105f54fdeca2a3cb75876a7c936e7243221e3f24681", |
| "compute_units": "all", |
| "kind": "model" |
| } |
| ], |
| "requirements": { |
| "min_ios": "17.0", |
| "min_ram_mb": 300 |
| }, |
| "license": { |
| "name": "Apache-2.0", |
| "url": "https://github.com/ChaoningZhang/MobileSAM/blob/master/LICENSE" |
| }, |
| "upstream": { |
| "name": "ChaoningZhang/MobileSAM", |
| "url": "https://github.com/ChaoningZhang/MobileSAM", |
| "year": 2023 |
| } |
| }, |
| { |
| "id": "lama", |
| "name": "LaMa", |
| "subtitle": "Image Inpainting, 2022", |
| "category_id": "inpainting", |
| "description_md": "Resolution-robust large mask inpainting. Draw over unwanted objects to remove them. Fast Fourier convolutions for global context. 800×800 input.", |
| "demo": { |
| "template": "inpainting", |
| "config": { |
| "input_size": 800 |
| } |
| }, |
| "files": [ |
| { |
| "name": "LaMa.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/lama/LaMa.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 196237256, |
| "sha256": "b57b8451a1a86c00aea52d75230fb5f49d3076eec67403192758c9d2b59c0e69", |
| "compute_units": "all", |
| "kind": "model" |
| } |
| ], |
| "requirements": { |
| "min_ios": "17.0", |
| "min_ram_mb": 600 |
| }, |
| "license": { |
| "name": "Apache-2.0", |
| "url": "https://github.com/advimman/lama/blob/main/LICENSE" |
| }, |
| "upstream": { |
| "name": "advimman/lama", |
| "url": "https://github.com/advimman/lama", |
| "year": 2022 |
| } |
| }, |
| { |
| "id": "pixelization", |
| "name": "Pixelization", |
| "subtitle": "Cell-Controllable Pixel Art, SIGGRAPH Asia 2022", |
| "category_id": "enhancement", |
| "description_md": "Turn any photo into pixel art. Aliasing-aware generator + anti-alias refinement. Drag the cell-size slider (2–8) to change pixel block size — the network runs once per photo, the slider only re-snaps the grid. 512×512 input. Non-commercial research use only.", |
| "demo": { |
| "template": "image_in_out", |
| "config": { |
| "input_size": 512, |
| "output_type": "pixel_art", |
| "native_cell_size": 4, |
| "cell_size_min": 2, |
| "cell_size_max": 8, |
| "cell_size_default": 4 |
| } |
| }, |
| "files": [ |
| { |
| "name": "Pixelization_512.mlpackage.zip", |
| "url": "https://huggingface.co/mlboydaisuke/coreml-zoo/resolve/main/pixelization/Pixelization_512.mlpackage.zip", |
| "archive": "zip", |
| "size_bytes": 37100167, |
| "sha256": "f9eac7e8fa6487a452ab1506f23bea6990c78dbc532f2d575ef1cb03013c8867", |
| "compute_units": "cpuAndNeuralEngine", |
| "kind": "model" |
| } |
| ], |
| "requirements": { |
| "min_ios": "17.0", |
| "min_ram_mb": 250 |
| }, |
| "license": { |
| "name": "Non-commercial research", |
| "url": "https://github.com/WuZongWei6/Pixelization/blob/main/LICENSE.md" |
| }, |
| "upstream": { |
| "name": "WuZongWei6/Pixelization", |
| "url": "https://github.com/WuZongWei6/Pixelization", |
| "year": 2022 |
| }, |
| "conversion_script_url": "https://github.com/john-rocky/CoreML-Models/blob/master/conversion_scripts/convert_pixelization.py" |
| } |
| ] |
| } |