Spaces:
Sleeping
Sleeping
File size: 2,703 Bytes
6b39516 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 | import os
import shutil
import random
from collections import defaultdict
# --- CONFIGURATION ---
# Directory holding the unzipped CelebA images.
SOURCE_IMG_DIR = "raw_data/img_align_celeba/img_align_celeba"
# Text file mapping each image filename to a celebrity id.
IDENTITY_FILE = "raw_data/identity_CelebA.txt"
# Output directory that receives the renamed copies.
DEST_DIR = "stored_images"
# Number of distinct people to keep.
NUM_CELEBS = 50
# Minimum photos a person needs to be eligible (to ensure good matching).
MIN_IMAGES_PER_CELEB = 20
def prepare_dataset(
    source_img_dir=None,
    identity_file=None,
    dest_dir=None,
    num_celebs=None,
    min_images_per_celeb=None,
    seed=None,
):
    """Copy the images of a random subset of celebrities into one folder.

    The identity file is read line by line; each usable line has the form
    ``"000001.jpg 2880"`` (image filename, then celebrity id). Celebrities
    with at least ``min_images_per_celeb`` images are eligible, and
    ``num_celebs`` of them are picked at random (all of them when fewer are
    eligible). Every image of a picked celebrity is copied to ``dest_dir``
    as ``Celeb_<id>_<original filename>``.

    WARNING: an existing ``dest_dir`` is deleted and recreated. This only
    happens after the inputs have been validated, so a missing identity
    file, a missing image directory, or an empty selection leaves the
    previous output untouched.

    Args:
        source_img_dir: Directory containing the source images.
            Defaults to ``SOURCE_IMG_DIR``.
        identity_file: Path of the filename-to-id mapping file.
            Defaults to ``IDENTITY_FILE``.
        dest_dir: Output directory. Defaults to ``DEST_DIR``.
        num_celebs: How many celebrities to select.
            Defaults to ``NUM_CELEBS``.
        min_images_per_celeb: Minimum image count for eligibility.
            Defaults to ``MIN_IMAGES_PER_CELEB``.
        seed: Optional seed for a private random generator, making the
            selection reproducible. When ``None`` the shared ``random``
            module state is used.

    Returns:
        None. Progress, warnings and errors are reported on stdout.
    """
    # Resolve defaults at call time so the module constants remain the
    # single source of truth and explicit arguments never touch them.
    source_img_dir = SOURCE_IMG_DIR if source_img_dir is None else source_img_dir
    identity_file = IDENTITY_FILE if identity_file is None else identity_file
    dest_dir = DEST_DIR if dest_dir is None else dest_dir
    num_celebs = NUM_CELEBS if num_celebs is None else num_celebs
    if min_images_per_celeb is None:
        min_images_per_celeb = MIN_IMAGES_PER_CELEB

    # 1. Parse the Identity File
    print("Parsing identity file...")
    celeb_map = defaultdict(list)
    try:
        with open(identity_file, "r", encoding="utf-8") as f:
            for line in f:
                # File format: "000001.jpg 2880"
                parts = line.split()
                if len(parts) >= 2:
                    filename, celeb_id = parts[0], parts[1]
                    celeb_map[celeb_id].append(filename)
    except FileNotFoundError:
        print(f"Error: Could not find {identity_file}. Did you download the dataset?")
        return
    print(f"Found {len(celeb_map)} total celebrities.")

    # 2. Filter: Keep only celebs with enough images
    valid_celebs = {
        cid: imgs
        for cid, imgs in celeb_map.items()
        if len(imgs) >= min_images_per_celeb
    }
    print(f"Found {len(valid_celebs)} celebrities with >= {min_images_per_celeb} images.")

    # 3. Sample: Pick random celebrities
    candidate_ids = list(valid_celebs)
    if len(candidate_ids) < num_celebs:
        print("Warning: Not enough celebs meet criteria. Using all available.")
        selected_ids = candidate_ids
    else:
        rng = random if seed is None else random.Random(seed)
        selected_ids = rng.sample(candidate_ids, num_celebs)

    # Validate everything BEFORE wiping the destination: a misconfigured run
    # must not destroy the output of a previous successful one.
    if not selected_ids:
        print(f"Error: No celebrities selected. Leaving '{dest_dir}/' untouched.")
        return
    if not os.path.isdir(source_img_dir):
        print(
            f"Error: Could not find image directory {source_img_dir}. "
            f"Leaving '{dest_dir}/' untouched."
        )
        return

    # 4. Copy and Rename Images
    if os.path.exists(dest_dir):
        shutil.rmtree(dest_dir)  # Clean start
    os.makedirs(dest_dir)

    count = 0
    print(f"Copying images for {len(selected_ids)} selected celebrities...")
    for cid in selected_ids:
        for img_name in valid_celebs[cid]:
            src_path = os.path.join(source_img_dir, img_name)
            # RENAME STRATEGY: Celeb_<id>_<original name> keeps the identity
            # recoverable from the filename alone.
            dest_path = os.path.join(dest_dir, f"Celeb_{cid}_{img_name}")
            try:
                shutil.copy2(src_path, dest_path)
            except FileNotFoundError:
                # Best effort: a missing image is reported and skipped.
                print(f"Warning: Image {src_path} not found.")
            else:
                count += 1

    if count == 0:
        # Do not claim success when every listed image was missing.
        print(f"Error: No images were copied to '{dest_dir}/'.")
        return
    print(f"Success! Copied {count} images to '{dest_dir}/'.")
    print("You can now run 'python ingestion.py'")
# Run the dataset preparation only when executed as a script, not on import.
if __name__ == "__main__":
    prepare_dataset()