Spaces:

dreamlessx
/

LandmarkDiff

Running

File size: 12,241 Bytes

"""Facial landmark extraction using MediaPipe Face Mesh v2."""

from __future__ import annotations

import logging
from dataclasses import dataclass
from pathlib import Path

import cv2
import mediapipe as mp
import numpy as np

logger = logging.getLogger(__name__)

# Region color map for visualization (BGR)
REGION_COLORS: dict[str, tuple[int, int, int]] = {
    "jawline": (255, 255, 255),  # white
    "eyebrow_left": (0, 255, 0),  # green
    "eyebrow_right": (0, 255, 0),
    "eye_left": (255, 255, 0),  # cyan
    "eye_right": (255, 255, 0),
    "nose": (0, 255, 255),  # yellow
    "lips": (0, 0, 255),  # red
    "iris_left": (255, 0, 255),  # magenta
    "iris_right": (255, 0, 255),
}

# MediaPipe landmark index groups by anatomical region
LANDMARK_REGIONS: dict[str, list[int]] = {
    "jawline": [
        10,
        338,
        297,
        332,
        284,
        251,
        389,
        356,
        454,
        323,
        361,
        288,
        397,
        365,
        379,
        378,
        400,
        377,
        152,
        148,
        176,
        149,
        150,
        136,
        172,
        58,
        132,
        93,
        234,
        127,
        162,
        21,
        54,
        103,
        67,
        109,
    ],
    "eye_left": [
        33,
        7,
        163,
        144,
        145,
        153,
        154,
        155,
        133,
        173,
        157,
        158,
        159,
        160,
        161,
        246,
    ],
    "eye_right": [
        362,
        382,
        381,
        380,
        374,
        373,
        390,
        249,
        263,
        466,
        388,
        387,
        386,
        385,
        384,
        398,
    ],
    "eyebrow_left": [70, 63, 105, 66, 107, 55, 65, 52, 53, 46],
    "eyebrow_right": [300, 293, 334, 296, 336, 285, 295, 282, 283, 276],
    "nose": [
        1,
        2,
        4,
        5,
        6,
        19,
        94,
        141,
        168,
        195,
        197,
        236,
        240,
        274,
        275,
        278,
        279,
        294,
        326,
        327,
        360,
        363,
        370,
        456,
        460,
    ],
    "lips": [
        61,
        146,
        91,
        181,
        84,
        17,
        314,
        405,
        321,
        375,
        291,
        308,
        324,
        318,
        402,
        317,
        14,
        87,
        178,
        88,
        95,
        78,
    ],
    "iris_left": [468, 469, 470, 471, 472],
    "iris_right": [473, 474, 475, 476, 477],
}


@dataclass(frozen=True)
class FaceLandmarks:
    """Extracted facial landmarks with metadata."""

    landmarks: np.ndarray  # (478, 3) normalized (x, y, z)
    image_width: int
    image_height: int
    confidence: float

    @property
    def pixel_coords(self) -> np.ndarray:
        """Convert normalized landmarks to pixel coordinates (478, 2).

        Coordinates are clamped to valid image bounds so that extreme
        head poses do not produce out-of-range indices.
        """
        coords = self.landmarks[:, :2].copy()
        coords[:, 0] *= self.image_width
        coords[:, 1] *= self.image_height
        coords[:, 0] = np.clip(coords[:, 0], 0, self.image_width - 1)
        coords[:, 1] = np.clip(coords[:, 1], 0, self.image_height - 1)
        return coords

    def pixel_coords_at(self, width: int, height: int) -> np.ndarray:
        """Convert normalized landmarks to pixel coordinates at a given size.

        Use this when the image has been resized after landmark extraction.
        Coordinates are clamped to [0, width-1] x [0, height-1].
        """
        coords = self.landmarks[:, :2].copy()
        coords[:, 0] *= width
        coords[:, 1] *= height
        coords[:, 0] = np.clip(coords[:, 0], 0, width - 1)
        coords[:, 1] = np.clip(coords[:, 1], 0, height - 1)
        return coords

    def rescale(self, width: int, height: int) -> FaceLandmarks:
        """Return a copy with updated image dimensions.

        Landmarks stay in normalized [0,1] space; only the stored
        width/height change, so ``pixel_coords`` returns values at
        the new resolution.
        """
        return FaceLandmarks(
            landmarks=self.landmarks.copy(),
            image_width=width,
            image_height=height,
            confidence=self.confidence,
        )

    def get_region(self, region: str) -> np.ndarray:
        """Get landmark indices for a named region."""
        indices = LANDMARK_REGIONS.get(region, [])
        return self.landmarks[indices]


def extract_landmarks(
    image: np.ndarray,
    min_detection_confidence: float = 0.5,
    min_tracking_confidence: float = 0.5,
) -> FaceLandmarks | None:
    """Extract 478 facial landmarks from an image using MediaPipe Face Mesh.

    Args:
        image: BGR image as numpy array.
        min_detection_confidence: Minimum face detection confidence.
        min_tracking_confidence: Minimum landmark tracking confidence.

    Returns:
        FaceLandmarks if a face is detected, None otherwise.
    """
    h, w = image.shape[:2]
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Try new Tasks API first (mediapipe >= 0.10.20), fall back to legacy solutions API
    try:
        landmarks, confidence = _extract_tasks_api(rgb, min_detection_confidence)
    except Exception:
        logger.debug("Tasks API unavailable, trying Solutions API", exc_info=True)
        try:
            landmarks, confidence = _extract_solutions_api(
                rgb, min_detection_confidence, min_tracking_confidence
            )
        except Exception:
            logger.debug("Both MediaPipe APIs failed", exc_info=True)
            return None

    if landmarks is None:
        return None

    return FaceLandmarks(
        landmarks=landmarks,
        image_width=w,
        image_height=h,
        confidence=confidence,
    )


def _extract_tasks_api(
    rgb: np.ndarray,
    min_confidence: float,
) -> tuple[np.ndarray | None, float]:
    """Extract landmarks using MediaPipe Tasks API (>= 0.10.20)."""
    FaceLandmarker = mp.tasks.vision.FaceLandmarker
    FaceLandmarkerOptions = mp.tasks.vision.FaceLandmarkerOptions
    RunningMode = mp.tasks.vision.RunningMode
    BaseOptions = mp.tasks.BaseOptions
    import tempfile
    import urllib.request

    # Download model if not cached
    model_path = Path(tempfile.gettempdir()) / "face_landmarker_v2_with_blendshapes.task"
    if not model_path.exists():
        url = "https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/1/face_landmarker.task"
        urllib.request.urlretrieve(url, str(model_path))

    options = FaceLandmarkerOptions(
        base_options=BaseOptions(model_asset_path=str(model_path)),
        running_mode=RunningMode.IMAGE,
        num_faces=1,
        min_face_detection_confidence=min_confidence,
        output_face_blendshapes=False,
        output_facial_transformation_matrixes=False,
    )

    with FaceLandmarker.create_from_options(options) as landmarker:
        mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb)
        result = landmarker.detect(mp_image)

    if not result.face_landmarks:
        return None, 0.0

    face_lms = result.face_landmarks[0]
    landmarks = np.array(
        [(lm.x, lm.y, lm.z) for lm in face_lms],
        dtype=np.float32,
    )

    # MediaPipe Tasks API doesn't expose per-landmark detection confidence;
    # return 1.0 to indicate successful detection
    return landmarks, 1.0


def _extract_solutions_api(
    rgb: np.ndarray,
    min_detection_confidence: float,
    min_tracking_confidence: float,
) -> tuple[np.ndarray | None, float]:
    """Extract landmarks using legacy MediaPipe Solutions API."""
    with mp.solutions.face_mesh.FaceMesh(
        static_image_mode=True,
        max_num_faces=1,
        refine_landmarks=True,
        min_detection_confidence=min_detection_confidence,
        min_tracking_confidence=min_tracking_confidence,
    ) as face_mesh:
        results = face_mesh.process(rgb)

    if not results.multi_face_landmarks:
        return None, 0.0

    face = results.multi_face_landmarks[0]
    landmarks = np.array(
        [(lm.x, lm.y, lm.z) for lm in face.landmark],
        dtype=np.float32,
    )
    # Legacy API doesn't expose detection confidence; return 1.0 for success
    return landmarks, 1.0


def visualize_landmarks(
    image: np.ndarray,
    face: FaceLandmarks,
    radius: int = 1,
    draw_regions: bool = True,
) -> np.ndarray:
    """Draw colored landmark dots on image by anatomical region.

    Args:
        image: BGR image to draw on (will be copied).
        face: Extracted face landmarks.
        radius: Dot radius in pixels.
        draw_regions: If True, color by region. Otherwise all white.

    Returns:
        Annotated image copy.
    """
    canvas = image.copy()
    coords = face.pixel_coords

    if draw_regions:
        # Build index -> color mapping
        idx_to_color: dict[int, tuple[int, int, int]] = {}
        for region, indices in LANDMARK_REGIONS.items():
            color = REGION_COLORS.get(region, (255, 255, 255))
            for idx in indices:
                idx_to_color[idx] = color

        for i, (x, y) in enumerate(coords):
            color = idx_to_color.get(i, (128, 128, 128))
            cv2.circle(canvas, (int(x), int(y)), radius, color, -1)
    else:
        for x, y in coords:
            cv2.circle(canvas, (int(x), int(y)), radius, (255, 255, 255), -1)

    return canvas


def render_landmark_image(
    face: FaceLandmarks,
    width: int | None = None,
    height: int | None = None,
    radius: int = 2,
) -> np.ndarray:
    """Render MediaPipe face mesh tessellation on black canvas.

    Draws the full 2556-edge tessellation mesh that CrucibleAI/ControlNetMediaPipeFace
    was pre-trained on. This is critical -- the ControlNet expects dense triangulated
    wireframes, not sparse dots.

    Falls back to colored dots if tessellation connections aren't available.

    Args:
        face: Extracted face landmarks.
        width: Canvas width (defaults to face.image_width).
        height: Canvas height (defaults to face.image_height).
        radius: Dot radius (used for key landmark dots overlay).

    Returns:
        BGR image with face mesh on black background.
    """
    w = width or face.image_width
    h = height or face.image_height
    canvas = np.zeros((h, w, 3), dtype=np.uint8)

    coords = face.landmarks[:, :2].copy()
    coords[:, 0] *= w
    coords[:, 1] *= h
    pts = coords.astype(np.int32)

    # Draw tessellation mesh (what CrucibleAI ControlNet expects)
    try:
        from mediapipe.tasks.python.vision.face_landmarker import FaceLandmarksConnections

        tessellation = FaceLandmarksConnections.FACE_LANDMARKS_TESSELATION
        contours = FaceLandmarksConnections.FACE_LANDMARKS_CONTOURS

        # Draw tessellation edges (thin, gray-white)
        for conn in tessellation:
            p1 = tuple(pts[conn.start])
            p2 = tuple(pts[conn.end])
            cv2.line(canvas, p1, p2, (192, 192, 192), 1, cv2.LINE_AA)

        # Draw contour edges on top (brighter, key features)
        for conn in contours:
            p1 = tuple(pts[conn.start])
            p2 = tuple(pts[conn.end])
            cv2.line(canvas, p1, p2, (255, 255, 255), 1, cv2.LINE_AA)

    except (ImportError, AttributeError):
        # Fallback: draw colored dots if tessellation not available
        idx_to_color: dict[int, tuple[int, int, int]] = {}
        for region, indices in LANDMARK_REGIONS.items():
            color = REGION_COLORS.get(region, (128, 128, 128))
            for idx in indices:
                idx_to_color[idx] = color

        for i, (x, y) in enumerate(coords):
            color = idx_to_color.get(i, (128, 128, 128))
            cv2.circle(canvas, (int(x), int(y)), radius, color, -1)

    return canvas


def load_image(path: str | Path) -> np.ndarray:
    """Load an image from disk as BGR numpy array."""
    img = cv2.imread(str(path))
    if img is None:
        raise FileNotFoundError(f"Could not load image: {path}")
    return img