import logging
import os
import random
import shutil
from datetime import datetime

import cv2
|
|
| |
# Route INFO-and-above records from the root logger to a log file next to
# the working directory, each line stamped with time and severity.
log_file = "sample_images.log"
logging.basicConfig(
    level=logging.INFO,
    filename=log_file,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
|
|
# Shared classifier instance; populated on the first call to detect_faces so
# the cascade XML is parsed once instead of once per image.
_FACE_CASCADE = None


def _get_face_cascade():
    """Return the frontal-face Haar cascade, loading it on first use."""
    global _FACE_CASCADE
    if _FACE_CASCADE is None:
        _FACE_CASCADE = cv2.CascadeClassifier(
            cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
        )
    return _FACE_CASCADE


def detect_faces(image_path):
    """Report whether at least one frontal face is detected in an image.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        True if the Haar cascade detector reports one or more faces.
        False if it reports none, or if ``cv2.imread`` could not load the
        file (it returned ``None``).
    """
    # The image is loaded as grayscale and handed to the detector as-is.
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        return False

    faces = _get_face_cascade().detectMultiScale(
        image, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30)
    )
    return len(faces) > 0
|
|
def sample_images(input_folder, output_folder, sample_rate=0.2):
    """Mirror every image with a detected face into ``output_folder``.

    Walks ``input_folder`` recursively, recreates each visited directory
    under ``output_folder``, and runs ``detect_faces`` on every file whose
    name ends in a known image extension. Each matching file is hard-linked
    into the mirrored location; if hard-linking is not possible the file is
    copied instead. Destinations that already exist are left untouched, so
    the function can be re-run over the same output tree.

    A progress line is printed after each directory, and a summary is
    written to the log once the walk finishes.

    Args:
        input_folder: Root directory to scan.
        output_folder: Root directory that receives the mirrored tree.
            Created if it does not exist.
        sample_rate: Currently unused; kept so existing callers that pass
            it keep working. Every image with a detected face is selected.

    Returns:
        None.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')

    os.makedirs(output_folder, exist_ok=True)

    total_files = 0
    sampled_files = 0
    start_time = datetime.now()

    for root, _dirs, files in os.walk(input_folder):
        relative_path = os.path.relpath(root, input_folder)
        output_subfolder = os.path.join(output_folder, relative_path)
        os.makedirs(output_subfolder, exist_ok=True)

        # The total counts every file seen, not only images.
        total_files += len(files)

        selected = [
            name for name in files
            if name.lower().endswith(image_extensions)
            and detect_faces(os.path.join(root, name))
        ]
        sampled_files += len(selected)

        for name in selected:
            input_file_path = os.path.join(root, name)
            output_file_path = os.path.join(output_subfolder, name)
            try:
                os.link(input_file_path, output_file_path)
            except FileExistsError:
                # Left over from an earlier run; never overwrite or delete.
                logging.info(
                    f"Skipped {input_file_path}: {output_file_path} already exists"
                )
                continue
            except OSError:
                # Hard links cannot cross filesystems and are not supported
                # everywhere; fall back to a real copy.
                shutil.copy2(input_file_path, output_file_path)

            logging.info(f"Sampled and copied {input_file_path} to {output_file_path}")

        # Per-directory progress report.
        elapsed_time = datetime.now() - start_time
        print(f"Processed {sampled_files}/{total_files} files in {elapsed_time}")

    total_time = datetime.now() - start_time
    logging.info(f"Total time taken: {total_time}")
    logging.info(f"Sampled {sampled_files} out of {total_files} files.")
|
|
if __name__ == "__main__":
    # Script entry point: filter the evaluation set into a sibling folder.
    input_folder, output_folder = "EvalSet", "resampledEvalSet"
    sample_images(input_folder, output_folder)
|
|