// Copyright 2025 The ODML Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "runtime/components/preprocessor/stb_image_preprocessor.h" #include #include #include // NOLINT: Required for path manipulation. #include #include #include #include #include #include #include "absl/status/status.h" // from @com_google_absl #include "litert/cc/litert_layout.h" // from @litert #include "litert/cc/litert_tensor_buffer.h" // from @litert #include "litert/cc/litert_tensor_buffer_types.h" // from @litert #include "runtime/components/preprocessor/image_preprocessor.h" #include "runtime/engine/io_types.h" #include "runtime/util/test_utils.h" // NOLINT namespace litert::lm { namespace { using ::testing::ElementsAre; using ::testing::status::StatusIs; constexpr char kTestdataDir[] = "litert_lm/runtime/components/preprocessor/testdata/"; TEST(StbImagePreprocessorTest, PreprocessSuccess) { StbImagePreprocessor preprocessor; // Load the image file. const std::string image_path = (std::filesystem::path(::testing::SrcDir()) / kTestdataDir / "apple.png") .string(); std::ifstream file_stream(image_path, std::ios::binary); ASSERT_TRUE(file_stream.is_open()) << "Failed to open image file: " << image_path; std::stringstream buffer; buffer << file_stream.rdbuf(); std::string image_bytes = buffer.str(); // Target dimensions: Batch=1, Height=224, Width=224, Channels=3 (RGB) ImagePreprocessParameter parameter; parameter.SetTargetDimensions({1, 224, 224, 3}); auto input_image = InputImage(image_bytes); ASSERT_OK_AND_ASSIGN(auto preprocessed_image, preprocessor.Preprocess(input_image, parameter)); ASSERT_OK_AND_ASSIGN(auto preprocessed_tensor, preprocessed_image.GetPreprocessedImageTensor()); // Verify the output tensor properties. auto buffer_type = preprocessed_tensor->BufferType(); ASSERT_TRUE(buffer_type.HasValue()); EXPECT_EQ(buffer_type.Value(), ::litert::TensorBufferType::kHostMemory); auto tensor_type = preprocessed_tensor->TensorType(); ASSERT_TRUE(tensor_type.HasValue()); EXPECT_THAT(tensor_type.Value().Layout().Dimensions(), ElementsAre(1, 224, 224, 3)); // Verify pixel values are in the range [0.0, 1.0]. auto output_tensor_lock_and_addr = ::litert::TensorBufferScopedLock::Create( *preprocessed_tensor, TensorBuffer::LockMode::kRead); ASSERT_TRUE(output_tensor_lock_and_addr.HasValue()); const float* data = static_cast(output_tensor_lock_and_addr->second); ASSERT_NE(data, nullptr); size_t num_elements = 224 * 224 * 3; for (size_t i = 0; i < num_elements; ++i) { EXPECT_GE(data[i], 0.0f); EXPECT_LE(data[i], 1.0f); } // Sample a few fixed locations to detect issues like incorrect rotation, // color channel order, or scaling. constexpr float kTolerance = 1e-6f; const int height = 224; const int width = 224; const int channels = 3; // Helper to get the starting index for a pixel (y, x). auto get_pixel_index = [&](int y, int x) { return (y * width + x) * channels; }; // --- Sample 1: Top-Left Pixel (0,0) --- const float expected_0_0_r = 0.26666668f; const float expected_0_0_g = 0.20000000f; const float expected_0_0_b = 0.12941177f; int idx_0_0 = get_pixel_index(0, 0); EXPECT_NEAR(data[idx_0_0 + 0], expected_0_0_r, kTolerance) << "R at (0,0)"; EXPECT_NEAR(data[idx_0_0 + 1], expected_0_0_g, kTolerance) << "G at (0,0)"; EXPECT_NEAR(data[idx_0_0 + 2], expected_0_0_b, kTolerance) << "B at (0,0)"; // --- Sample 2: Top-Right Pixel (0, 223) --- const float expected_0_223_r = 0.96078432f; const float expected_0_223_g = 0.96078432f; const float expected_0_223_b = 0.94509804f; int idx_0_223 = get_pixel_index(0, width - 1); EXPECT_NEAR(data[idx_0_223 + 0], expected_0_223_r, kTolerance) << "R at (0,223)"; EXPECT_NEAR(data[idx_0_223 + 1], expected_0_223_g, kTolerance) << "G at (0,223)"; EXPECT_NEAR(data[idx_0_223 + 2], expected_0_223_b, kTolerance) << "B at (0,223)"; // --- Sample 3: Center Pixel (112, 112) --- const float expected_112_112_r = 0.38039216f; const float expected_112_112_g = 0.00784314f; const float expected_112_112_b = 0.00392157f; int idx_112_112 = get_pixel_index(height / 2, width / 2); EXPECT_NEAR(data[idx_112_112 + 0], expected_112_112_r, kTolerance) << "R at (112,112)"; EXPECT_NEAR(data[idx_112_112 + 1], expected_112_112_g, kTolerance) << "G at (112,112)"; EXPECT_NEAR(data[idx_112_112 + 2], expected_112_112_b, kTolerance) << "B at (112,112)"; // --- Sample 4: Bottom-Left Pixel (223, 0) --- const float expected_223_0_r = 0.41960785f; const float expected_223_0_g = 0.26274511f; const float expected_223_0_b = 0.16470589f; int idx_223_0 = get_pixel_index(height - 1, 0); EXPECT_NEAR(data[idx_223_0 + 0], expected_223_0_r, kTolerance) << "R at (223,0)"; EXPECT_NEAR(data[idx_223_0 + 1], expected_223_0_g, kTolerance) << "G at (223,0)"; EXPECT_NEAR(data[idx_223_0 + 2], expected_223_0_b, kTolerance) << "B at (223,0)"; // --- Sample 5: Bottom-Right Pixel (223, 223) --- const float expected_223_223_r = 0.79607844f; const float expected_223_223_g = 0.72941178f; const float expected_223_223_b = 0.65490198f; int idx_223_223 = get_pixel_index(height - 1, width - 1); EXPECT_NEAR(data[idx_223_223 + 0], expected_223_223_r, kTolerance) << "R at (223,223)"; EXPECT_NEAR(data[idx_223_223 + 1], expected_223_223_g, kTolerance) << "G at (223,223)"; EXPECT_NEAR(data[idx_223_223 + 2], expected_223_223_b, kTolerance) << "B at (223,223)"; } TEST(StbImagePreprocessorTest, PreprocessFailedWithInvalidDimensions) { StbImagePreprocessor preprocessor; std::string dummy_bytes = "dummy"; // Invalid dimensions size (e.g., missing channels). ImagePreprocessParameter parameter; parameter.SetTargetDimensions({1, 224, 224}); EXPECT_THAT(preprocessor.Preprocess(InputImage(dummy_bytes), parameter), StatusIs(absl::StatusCode::kInvalidArgument)); } TEST(StbImagePreprocessorTest, PreprocessFailedWithInvalidImage) { StbImagePreprocessor preprocessor; std::string invalid_image_bytes = "invalid_image_bytes"; ImagePreprocessParameter parameter; parameter.SetTargetDimensions({1, 224, 224, 3}); EXPECT_THAT( preprocessor.Preprocess(InputImage(invalid_image_bytes), parameter), StatusIs(absl::StatusCode::kInvalidArgument, "Failed to decode image. Reason: unknown image type")); } TEST(StbImagePreprocessorTest, PreprocessWithPatchify) { StbImagePreprocessor preprocessor; // Load the image file. const std::string image_path = (std::filesystem::path(::testing::SrcDir()) / kTestdataDir / "apple.png") .string(); std::ifstream file_stream(image_path, std::ios::binary); ASSERT_TRUE(file_stream.is_open()) << "Failed to open image file: " << image_path; std::stringstream buffer; buffer << file_stream.rdbuf(); std::string image_bytes = buffer.str(); ImagePreprocessParameter parameter; constexpr int kPatchSize = 16; parameter.SetPatchifyConfig({.patch_width = kPatchSize, .patch_height = kPatchSize, .max_num_patches = 4096}); auto input_image = InputImage(image_bytes); ASSERT_OK_AND_ASSIGN(auto preprocessed_image, preprocessor.Preprocess(input_image, parameter)); ASSERT_TRUE(preprocessed_image.IsTensorBufferMap()); ASSERT_OK_AND_ASSIGN(auto tensor_map, preprocessed_image.GetPreprocessedImageTensorMap()); ASSERT_NE(tensor_map, nullptr); EXPECT_TRUE(tensor_map->contains("images")); EXPECT_TRUE(tensor_map->contains("positions_xy")); const auto& images_tensor = tensor_map->at("images"); auto images_tensor_type = images_tensor.TensorType(); ASSERT_TRUE(images_tensor_type.HasValue()); // The apple.png is 1024x1024. // 1024 / 16 = 64. 64 * 64 = 4096 patches. // 16 * 16 * 3 = 768 elements per patch. EXPECT_THAT(images_tensor_type.Value().Layout().Dimensions(), ElementsAre(1, 4096, 768)); const auto& positions_tensor = tensor_map->at("positions_xy"); auto positions_tensor_type = positions_tensor.TensorType(); ASSERT_TRUE(positions_tensor_type.HasValue()); EXPECT_THAT(positions_tensor_type.Value().Layout().Dimensions(), ElementsAre(1, 4096, 2)); // Verify positions. auto positions_lock = ::litert::TensorBufferScopedLock::Create( positions_tensor, ::litert::TensorBuffer::LockMode::kRead); ASSERT_TRUE(positions_lock.HasValue()); const int32_t* positions_ptr = reinterpret_cast(positions_lock->second); for (int h = 0; h < 64; ++h) { for (int w = 0; w < 64; ++w) { int idx = h * 64 + w; EXPECT_EQ(positions_ptr[idx * 2], w); EXPECT_EQ(positions_ptr[idx * 2 + 1], h); } } // Verify image values. auto images_lock = ::litert::TensorBufferScopedLock::Create( images_tensor, ::litert::TensorBuffer::LockMode::kRead); ASSERT_TRUE(images_lock.HasValue()); const float* data = reinterpret_cast(images_lock->second); constexpr float kTolerance = 1e-6f; constexpr int height = 1024; constexpr int width = 1024; constexpr int channels = 3; // Helper to get the starting index for a pixel (y, x). auto get_pixel_index = [&](int y, int x) { const int stride = kPatchSize * kPatchSize * channels; return ((y / kPatchSize) * (width / kPatchSize) + x / kPatchSize) * stride + ((y % kPatchSize) * kPatchSize + x % kPatchSize) * channels; }; // --- Sample 1: Top-Left Pixel (0,0) --- const float expected_0_0_r = 0.27058824f; const float expected_0_0_g = 0.20392157f; const float expected_0_0_b = 0.12549020f; int idx_0_0 = get_pixel_index(0, 0); EXPECT_NEAR(data[idx_0_0 + 0], expected_0_0_r, kTolerance) << "R at (0,0)"; EXPECT_NEAR(data[idx_0_0 + 1], expected_0_0_g, kTolerance) << "G at (0,0)"; EXPECT_NEAR(data[idx_0_0 + 2], expected_0_0_b, kTolerance) << "B at (0,0)"; // --- Sample 2: Top-Right Pixel (0, 1023) --- const float expected_0_1023_r = 0.97647058f; const float expected_0_1023_g = 0.97254902f; const float expected_0_1023_b = 0.95686275f; int idx_0_1023 = get_pixel_index(0, width - 1); EXPECT_NEAR(data[idx_0_1023 + 0], expected_0_1023_r, kTolerance) << "R at (0,1023)"; EXPECT_NEAR(data[idx_0_1023 + 1], expected_0_1023_g, kTolerance) << "G at (0,1023)"; EXPECT_NEAR(data[idx_0_1023 + 2], expected_0_1023_b, kTolerance) << "B at (0,1023)"; // --- Sample 3: Center Pixel (512, 512) --- const float expected_512_512_r = 0.35294118f; const float expected_512_512_g = 0.00784314f; const float expected_512_512_b = 0.00392157f; int idx_512_512 = get_pixel_index(height / 2, width / 2); EXPECT_NEAR(data[idx_512_512 + 0], expected_512_512_r, kTolerance) << "R at (512,512)"; EXPECT_NEAR(data[idx_512_512 + 1], expected_512_512_g, kTolerance) << "G at (512,512)"; EXPECT_NEAR(data[idx_512_512 + 2], expected_512_512_b, kTolerance) << "B at (512,512)"; // --- Sample 4: Bottom-Left Pixel (1023, 0) --- const float expected_1023_0_r = 0.42745098f; const float expected_1023_0_g = 0.26274511f; const float expected_1023_0_b = 0.16470589f; int idx_1023_0 = get_pixel_index(height - 1, 0); EXPECT_NEAR(data[idx_1023_0 + 0], expected_1023_0_r, kTolerance) << "R at (1023,0)"; EXPECT_NEAR(data[idx_1023_0 + 1], expected_1023_0_g, kTolerance) << "G at (1023,0)"; EXPECT_NEAR(data[idx_1023_0 + 2], expected_1023_0_b, kTolerance) << "B at (1023,0)"; // --- Sample 5: Bottom-Right Pixel (1023, 1023) --- const float expected_1023_1023_r = 0.68235296f; const float expected_1023_1023_g = 0.61568630f; const float expected_1023_1023_b = 0.51372552f; int idx_1023_1023 = get_pixel_index(height - 1, width - 1); EXPECT_NEAR(data[idx_1023_1023 + 0], expected_1023_1023_r, kTolerance) << "R at (1023,1023)"; EXPECT_NEAR(data[idx_1023_1023 + 1], expected_1023_1023_g, kTolerance) << "G at (1023,1023)"; EXPECT_NEAR(data[idx_1023_1023 + 2], expected_1023_1023_b, kTolerance) << "B at (1023,1023)"; } TEST(StbImagePreprocessorTest, PreprocessWithPatchifyResize) { StbImagePreprocessor preprocessor; // Load the image file. const std::string image_path = (std::filesystem::path(::testing::SrcDir()) / kTestdataDir / "apple.png") .string(); std::ifstream file_stream(image_path, std::ios::binary); ASSERT_TRUE(file_stream.is_open()) << "Failed to open image file: " << image_path; std::stringstream buffer; buffer << file_stream.rdbuf(); std::string image_bytes = buffer.str(); ImagePreprocessParameter parameter; // Max 49 patches means it should resize to 112x112 (112/16 = 7, 7*7=49). constexpr int kPatchSize = 16; parameter.SetPatchifyConfig({.patch_width = kPatchSize, .patch_height = kPatchSize, .max_num_patches = 49}); auto input_image = InputImage(image_bytes); ASSERT_OK_AND_ASSIGN(auto preprocessed_image, preprocessor.Preprocess(input_image, parameter)); ASSERT_TRUE(preprocessed_image.IsTensorBufferMap()); ASSERT_OK_AND_ASSIGN(auto tensor_map, preprocessed_image.GetPreprocessedImageTensorMap()); const auto& images_tensor = tensor_map->at("images"); auto images_tensor_type = images_tensor.TensorType(); ASSERT_TRUE(images_tensor_type.HasValue()); EXPECT_THAT(images_tensor_type.Value().Layout().Dimensions(), ElementsAre(1, 49, 768)); const auto& positions_tensor = tensor_map->at("positions_xy"); auto positions_tensor_type = positions_tensor.TensorType(); ASSERT_TRUE(positions_tensor_type.HasValue()); EXPECT_THAT(positions_tensor_type.Value().Layout().Dimensions(), ElementsAre(1, 49, 2)); // Verify image values. auto images_lock = ::litert::TensorBufferScopedLock::Create( images_tensor, ::litert::TensorBuffer::LockMode::kRead); ASSERT_TRUE(images_lock.HasValue()); const float* data = reinterpret_cast(images_lock->second); constexpr float kTolerance = 1e-6f; constexpr int height = 112; constexpr int width = 112; constexpr int channels = 3; // Helper to get the starting index for a pixel (y, x). auto get_pixel_index = [&](int y, int x) { const int stride = kPatchSize * kPatchSize * channels; return ((y / kPatchSize) * (width / kPatchSize) + x / kPatchSize) * stride + ((y % kPatchSize) * kPatchSize + x % kPatchSize) * channels; }; // --- Sample 1: Top-Left Pixel (0,0) --- const float expected_0_0_r = 0.26666668f; const float expected_0_0_g = 0.20000000f; const float expected_0_0_b = 0.13333334f; int idx_0_0 = get_pixel_index(0, 0); EXPECT_NEAR(data[idx_0_0 + 0], expected_0_0_r, kTolerance) << "R at (0,0)"; EXPECT_NEAR(data[idx_0_0 + 1], expected_0_0_g, kTolerance) << "G at (0,0)"; EXPECT_NEAR(data[idx_0_0 + 2], expected_0_0_b, kTolerance) << "B at (0,0)"; // --- Sample 2: Top-Right Pixel (0, 111) --- const float expected_0_111_r = 0.94901961f; const float expected_0_111_g = 0.94509804f; const float expected_0_111_b = 0.92549020f; int idx_0_111 = get_pixel_index(0, width - 1); EXPECT_NEAR(data[idx_0_111 + 0], expected_0_111_r, kTolerance) << "R at (0,111)"; EXPECT_NEAR(data[idx_0_111 + 1], expected_0_111_g, kTolerance) << "G at (0,111)"; EXPECT_NEAR(data[idx_0_111 + 2], expected_0_111_b, kTolerance) << "B at (0,111)"; // --- Sample 3: Center Pixel (56, 56) --- const float expected_56_56_r = 0.41568628f; const float expected_56_56_g = 0.015686275f; const float expected_56_56_b = 0.011764706f; int idx_56_56 = get_pixel_index(height / 2, width / 2); EXPECT_NEAR(data[idx_56_56 + 0], expected_56_56_r, kTolerance) << "R at (56,56)"; EXPECT_NEAR(data[idx_56_56 + 1], expected_56_56_g, kTolerance) << "G at (56,56)"; EXPECT_NEAR(data[idx_56_56 + 2], expected_56_56_b, kTolerance) << "B at (56,56)"; // --- Sample 4: Bottom-Left Pixel (111, 0) --- const float expected_111_0_r = 0.42352942f; const float expected_111_0_g = 0.27058824f; const float expected_111_0_b = 0.16470589f; int idx_111_0 = get_pixel_index(height - 1, 0); EXPECT_NEAR(data[idx_111_0 + 0], expected_111_0_r, kTolerance) << "R at (111,0)"; EXPECT_NEAR(data[idx_111_0 + 1], expected_111_0_g, kTolerance) << "G at (111,0)"; EXPECT_NEAR(data[idx_111_0 + 2], expected_111_0_b, kTolerance) << "B at (111,0)"; // --- Sample 5: Bottom-Right Pixel (111, 111) --- const float expected_111_111_r = 0.82745099f; const float expected_111_111_g = 0.76470589f; const float expected_111_111_b = 0.69803923f; int idx_111_111 = get_pixel_index(height - 1, width - 1); EXPECT_NEAR(data[idx_111_111 + 0], expected_111_111_r, kTolerance) << "R at (111,111)"; EXPECT_NEAR(data[idx_111_111 + 1], expected_111_111_g, kTolerance) << "G at (111,111)"; EXPECT_NEAR(data[idx_111_111 + 2], expected_111_111_b, kTolerance) << "B at (111,111)"; } } // namespace } // namespace litert::lm