File size: 2,703 Bytes
6b39516
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import os
import shutil
import random
from collections import defaultdict

# --- CONFIGURATION ---
# Directory that holds the unzipped CelebA images.
SOURCE_IMG_DIR = "raw_data/img_align_celeba/img_align_celeba"
# Text file pairing each image filename with a celebrity ID ("000001.jpg 2880").
IDENTITY_FILE = "raw_data/identity_CelebA.txt"
# Output directory for the renamed copies; existing contents are removed
# when the dataset is rebuilt.
DEST_DIR = "stored_images"
# Number of distinct people to sample.
NUM_CELEBS = 50
# People with fewer photos than this are skipped (to ensure good matching).
MIN_IMAGES_PER_CELEB = 20

def _parse_identity_file(identity_file):
    """Group image filenames by celebrity ID.

    Each useful line has the form ``000001.jpg 2880`` (filename, then ID).
    Lines with fewer than two whitespace-separated fields are skipped.

    Args:
        identity_file: Path to the identity text file.

    Returns:
        A mapping of celebrity ID (str) to the list of its image filenames,
        in the order they appear in the file.

    Raises:
        FileNotFoundError: If *identity_file* does not exist.
    """
    celeb_map = defaultdict(list)
    with open(identity_file, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.split()
            if len(parts) >= 2:
                filename, celeb_id = parts[0], parts[1]
                celeb_map[celeb_id].append(filename)
    return celeb_map


def _select_celebs(valid_celebs, num_celebs, seed=None):
    """Choose which celebrity IDs to export.

    Returns every ID when fewer than *num_celebs* are available; otherwise a
    random sample of exactly *num_celebs* IDs. With ``seed=None`` the shared
    ``random`` module state is used; any other seed gives a reproducible pick.
    """
    candidate_ids = list(valid_celebs)
    if len(candidate_ids) < num_celebs:
        print("Warning: Not enough celebs meet criteria. Using all available.")
        return candidate_ids
    rng = random if seed is None else random.Random(seed)
    return rng.sample(candidate_ids, num_celebs)


def _copy_celeb_images(selected_ids, valid_celebs, source_img_dir, dest_dir):
    """Copy the images of *selected_ids* into *dest_dir*; return how many were copied."""
    count = 0
    for cid in selected_ids:
        for img_name in valid_celebs[cid]:
            src_path = os.path.join(source_img_dir, img_name)
            # Naming scheme: Celeb_<ID>_<original name>, so the identity can
            # be recovered from the filename alone.
            dest_path = os.path.join(dest_dir, f"Celeb_{cid}_{img_name}")
            try:
                shutil.copy2(src_path, dest_path)
            except FileNotFoundError:
                # A listed image may be absent from disk; skip it and go on.
                print(f"Warning: Image {src_path} not found.")
            else:
                count += 1
    return count


def prepare_dataset(
    source_img_dir=None,
    identity_file=None,
    dest_dir=None,
    num_celebs=None,
    min_images_per_celeb=None,
    seed=None,
):
    """Build a small per-celebrity image set from the CelebA download.

    Reads the identity file, keeps celebrities with at least
    *min_images_per_celeb* images, samples up to *num_celebs* of them, and
    copies their images into a freshly recreated *dest_dir* under the name
    ``Celeb_<ID>_<original filename>``.

    The destination is only wiped once the inputs are known to be usable:
    the identity file was read, at least one celebrity was selected, the
    source image directory exists, and the source is not the destination
    (or inside it). Any failed check prints an error and returns, leaving
    an existing destination untouched.

    Args:
        source_img_dir: Directory with the source images.
            Defaults to ``SOURCE_IMG_DIR``.
        identity_file: Path to the filename/ID listing.
            Defaults to ``IDENTITY_FILE``.
        dest_dir: Output directory (deleted and recreated).
            Defaults to ``DEST_DIR``.
        num_celebs: How many distinct celebrities to sample.
            Defaults to ``NUM_CELEBS``.
        min_images_per_celeb: Minimum images a celebrity needs to qualify.
            Defaults to ``MIN_IMAGES_PER_CELEB``.
        seed: Optional seed for a reproducible selection. ``None`` uses the
            shared ``random`` module state.

    Returns:
        None. Progress, warnings and errors are reported with ``print``.
    """
    # Defaults are resolved at call time so the module constants can still be
    # changed before the call.
    if source_img_dir is None:
        source_img_dir = SOURCE_IMG_DIR
    if identity_file is None:
        identity_file = IDENTITY_FILE
    if dest_dir is None:
        dest_dir = DEST_DIR
    if num_celebs is None:
        num_celebs = NUM_CELEBS
    if min_images_per_celeb is None:
        min_images_per_celeb = MIN_IMAGES_PER_CELEB

    # 1. Parse the identity file
    print("Parsing identity file...")
    try:
        celeb_map = _parse_identity_file(identity_file)
    except FileNotFoundError:
        print(f"Error: Could not find {identity_file}. Did you download the dataset?")
        return

    print(f"Found {len(celeb_map)} total celebrities.")

    # 2. Filter: keep only celebrities with enough images
    valid_celebs = {
        cid: imgs
        for cid, imgs in celeb_map.items()
        if len(imgs) >= min_images_per_celeb
    }
    print(f"Found {len(valid_celebs)} celebrities with >= {min_images_per_celeb} images.")

    # 3. Sample: pick random celebrities
    selected_ids = _select_celebs(valid_celebs, num_celebs, seed)

    # 4. Validate before the destructive step, so a bad configuration never
    #    replaces an existing dataset with an empty folder.
    if not selected_ids:
        print(f"Error: No celebrities selected. Leaving '{dest_dir}/' untouched.")
        return
    if not os.path.isdir(source_img_dir):
        print(f"Error: Could not find image directory {source_img_dir}. Did you unzip the dataset?")
        return
    src_abs = os.path.abspath(source_img_dir)
    dst_abs = os.path.abspath(dest_dir)
    if src_abs == dst_abs or src_abs.startswith(os.path.join(dst_abs, "")):
        # Wiping the destination would delete the source images as well.
        print(f"Error: '{dest_dir}' is or contains the source image directory. Refusing to delete it.")
        return

    # 5. Recreate the destination and copy (not move) the renamed images
    if os.path.exists(dest_dir):
        shutil.rmtree(dest_dir)  # Clean start
    os.makedirs(dest_dir)

    print(f"Copying images for {len(selected_ids)} selected celebrities...")
    count = _copy_celeb_images(selected_ids, valid_celebs, source_img_dir, dest_dir)

    if count == 0:
        print(f"Warning: No images were copied to '{dest_dir}/'. Check the contents of {source_img_dir}.")
        return

    print(f"Success! Copied {count} images to '{dest_dir}/'.")
    print("You can now run 'python ingestion.py'")

# Build the dataset only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    prepare_dataset()