File size: 3,979 Bytes
c86c45b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import math

import cv2
import numpy as np

_LANDMARK_INDICES = [1, 152, 33, 263, 61, 291]

_MODEL_POINTS = np.array(
    [
        [0.0, 0.0, 0.0],
        [0.0, -330.0, -65.0],
        [-225.0, 170.0, -135.0],
        [225.0, 170.0, -135.0],
        [-150.0, -150.0, -125.0],
        [150.0, -150.0, -125.0],
    ],
    dtype=np.float64,
)


class HeadPoseEstimator:
    def __init__(self, max_angle: float = 30.0, roll_weight: float = 0.5):
        self.max_angle = max_angle
        self.roll_weight = roll_weight
        self._camera_matrix = None
        self._frame_size = None
        self._dist_coeffs = np.zeros((4, 1), dtype=np.float64)
        self._cache_key = None
        self._cache_result = None

    def _get_camera_matrix(self, frame_w: int, frame_h: int) -> np.ndarray:
        if self._camera_matrix is not None and self._frame_size == (frame_w, frame_h):
            return self._camera_matrix
        focal_length = float(frame_w)
        cx, cy = frame_w / 2.0, frame_h / 2.0
        self._camera_matrix = np.array(
            [[focal_length, 0, cx], [0, focal_length, cy], [0, 0, 1]],
            dtype=np.float64,
        )
        self._frame_size = (frame_w, frame_h)
        return self._camera_matrix

    def _solve(self, landmarks: np.ndarray, frame_w: int, frame_h: int):
        key = (landmarks.data.tobytes(), frame_w, frame_h)
        if self._cache_key == key:
            return self._cache_result

        image_points = np.array(
            [
                [landmarks[i, 0] * frame_w, landmarks[i, 1] * frame_h]
                for i in _LANDMARK_INDICES
            ],
            dtype=np.float64,
        )
        camera_matrix = self._get_camera_matrix(frame_w, frame_h)
        success, rvec, tvec = cv2.solvePnP(
            _MODEL_POINTS,
            image_points,
            camera_matrix,
            self._dist_coeffs,
            flags=cv2.SOLVEPNP_ITERATIVE,
        )
        result = (success, rvec, tvec, image_points)
        self._cache_key = key
        self._cache_result = result
        return result

    def estimate(
        self, landmarks: np.ndarray, frame_w: int, frame_h: int
    ) -> tuple[float, float, float] | None:
        success, rvec, tvec, _ = self._solve(landmarks, frame_w, frame_h)
        if not success:
            return None

        rmat, _ = cv2.Rodrigues(rvec)
        nose_dir = rmat @ np.array([0.0, 0.0, 1.0])
        face_up = rmat @ np.array([0.0, 1.0, 0.0])

        yaw = math.degrees(math.atan2(nose_dir[0], -nose_dir[2]))
        pitch = math.degrees(math.asin(np.clip(-nose_dir[1], -1.0, 1.0)))
        roll = math.degrees(math.atan2(face_up[0], -face_up[1]))

        return (yaw, pitch, roll)

    def score(self, landmarks: np.ndarray, frame_w: int, frame_h: int) -> float:
        angles = self.estimate(landmarks, frame_w, frame_h)
        if angles is None:
            return 0.0

        yaw, pitch, roll = angles
        deviation = math.sqrt(yaw**2 + pitch**2 + (self.roll_weight * roll) ** 2)
        t = min(deviation / self.max_angle, 1.0)
        return 0.5 * (1.0 + math.cos(math.pi * t))

    def draw_axes(
        self,
        frame: np.ndarray,
        landmarks: np.ndarray,
        axis_length: float = 50.0,
    ) -> np.ndarray:
        h, w = frame.shape[:2]
        success, rvec, tvec, image_points = self._solve(landmarks, w, h)
        if not success:
            return frame

        camera_matrix = self._get_camera_matrix(w, h)
        nose = tuple(image_points[0].astype(int))

        axes_3d = np.float64(
            [[axis_length, 0, 0], [0, axis_length, 0], [0, 0, axis_length]]
        )
        projected, _ = cv2.projectPoints(
            axes_3d, rvec, tvec, camera_matrix, self._dist_coeffs
        )

        colors = [(0, 0, 255), (0, 255, 0), (255, 0, 0)]
        for i, color in enumerate(colors):
            pt = tuple(projected[i].ravel().astype(int))
            cv2.line(frame, nose, pt, color, 2)

        return frame