AI-Reverse_Face_Search / src /prepare_data.py
Waseem-k's picture
Initial Commit
6b39516 verified
import os
import shutil
import random
from collections import defaultdict
# --- CONFIGURATION ---
# Folder that holds the unzipped CelebA images.
SOURCE_IMG_DIR = "raw_data/img_align_celeba/img_align_celeba"

# Text file mapping images to people, one "<image filename> <celeb id>" per line.
IDENTITY_FILE = "raw_data/identity_CelebA.txt"

# Output folder; prepare_dataset() deletes and recreates it on every run.
DEST_DIR = "stored_images"

# Number of distinct people to sample.
NUM_CELEBS = 50

# Only people with at least this many photos are eligible (to ensure good matching).
MIN_IMAGES_PER_CELEB = 20
def prepare_dataset(
    *,
    source_img_dir=None,
    identity_file=None,
    dest_dir=None,
    num_celebs=None,
    min_images_per_celeb=None,
):
    """Copy images of a random subset of celebrities into a flat folder.

    The identity file is read as whitespace-separated
    "<image filename> <celeb id>" lines. Celebrities with fewer than
    ``min_images_per_celeb`` images are discarded, up to ``num_celebs`` of the
    remaining ones are picked at random, and every image of each picked
    celebrity is copied to ``dest_dir`` as ``Celeb_<celeb id>_<image filename>``.

    Progress, warnings and errors are reported with ``print``; the function
    always returns ``None``.

    Args:
        source_img_dir: Folder containing the source images.
            Defaults to ``SOURCE_IMG_DIR``.
        identity_file: Path of the identity file. Defaults to ``IDENTITY_FILE``.
        dest_dir: Output folder. WARNING: it is deleted and recreated, so it
            must not contain anything worth keeping (in particular, not the
            source images). Defaults to ``DEST_DIR``.
        num_celebs: How many distinct celebrities to pick.
            Defaults to ``NUM_CELEBS``.
        min_images_per_celeb: Minimum number of images a celebrity needs to be
            eligible. Defaults to ``MIN_IMAGES_PER_CELEB``.
    """
    # Resolve defaults at call time so the module-level configuration remains
    # the single source of truth for the plain prepare_dataset() call.
    if source_img_dir is None:
        source_img_dir = SOURCE_IMG_DIR
    if identity_file is None:
        identity_file = IDENTITY_FILE
    if dest_dir is None:
        dest_dir = DEST_DIR
    if num_celebs is None:
        num_celebs = NUM_CELEBS
    if min_images_per_celeb is None:
        min_images_per_celeb = MIN_IMAGES_PER_CELEB

    # Fail fast: without the image folder every copy below would fail, and the
    # previous contents of dest_dir would already have been wiped by then.
    if not os.path.isdir(source_img_dir):
        print(f"Error: Could not find {source_img_dir}. Did you unzip the images?")
        return

    # 1. Parse the identity file
    print("Parsing identity file...")
    celeb_map = defaultdict(list)
    try:
        with open(identity_file, "r", encoding="utf-8") as f:
            for line in f:
                # File format: "000001.jpg 2880"; blank/short lines are skipped.
                parts = line.split()
                if len(parts) >= 2:
                    filename, celeb_id = parts[0], parts[1]
                    celeb_map[celeb_id].append(filename)
    except FileNotFoundError:
        print(f"Error: Could not find {identity_file}. Did you download the dataset?")
        return
    print(f"Found {len(celeb_map)} total celebrities.")

    # 2. Filter: keep only celebs with enough images
    valid_celebs = {
        cid: imgs
        for cid, imgs in celeb_map.items()
        if len(imgs) >= min_images_per_celeb
    }
    print(f"Found {len(valid_celebs)} celebrities with >= {min_images_per_celeb} images.")

    # 3. Sample: pick random celebrities
    if len(valid_celebs) < num_celebs:
        print("Warning: Not enough celebs meet criteria. Using all available.")
        selected_ids = list(valid_celebs)
    else:
        selected_ids = random.sample(list(valid_celebs), num_celebs)

    if not selected_ids:
        # Nothing to copy, so leave any existing dest_dir alone.
        print("Error: No celebrities selected. Nothing was copied.")
        return

    # 4. Copy and rename images (the source files are left in place)
    if os.path.exists(dest_dir):
        shutil.rmtree(dest_dir)  # Clean start
    os.makedirs(dest_dir)

    count = 0
    print(f"Copying images for {len(selected_ids)} selected celebrities...")
    for cid in selected_ids:
        for img_name in valid_celebs[cid]:
            src_path = os.path.join(source_img_dir, img_name)
            # RENAME STRATEGY: Celeb_<CelebID>_<OriginalName>
            # Embedding the id in the file name keeps each person identifiable
            # after all images are flattened into one folder.
            dest_path = os.path.join(dest_dir, f"Celeb_{cid}_{img_name}")
            try:
                shutil.copy2(src_path, dest_path)
            except FileNotFoundError:
                # A missing image should not abort the whole run.
                print(f"Warning: Image {src_path} not found.")
            else:
                count += 1

    if count == 0:
        # Do not claim success when every single copy failed.
        print(f"Warning: No images were copied to '{dest_dir}/'.")
        return
    print(f"Success! Copied {count} images to '{dest_dir}/'.")
    print("You can now run 'python ingestion.py'")
# Script entry point: build the dataset only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    prepare_dataset()