Spaces:
Running
Running
| """ | |
| Synthetic dataset generation with a fixed seed for full reproducibility. | |
| All datasets are generated purely from numpy/random β no external downloads. | |
| """ | |
| import random | |
| import numpy as np | |
| import pandas as pd | |
| SEED = 42 | |
| # --------------------------------------------------------------------------- | |
| # Task 1 β Employee records with missing values | |
| # --------------------------------------------------------------------------- | |
| def generate_task1_datasets(): | |
| """Returns (dirty_df, clean_df) for Task 1.""" | |
| rng = np.random.default_rng(SEED) | |
| random.seed(SEED) | |
| n = 100 | |
| departments = ["Engineering", "Marketing", "Sales", "HR", "Finance"] | |
| first_names = ["Alice", "Bob", "Carol", "David", "Eve", "Frank", "Grace", | |
| "Heidi", "Ivan", "Judy", "Karl", "Laura", "Mallory", "Niaj", | |
| "Oscar", "Peggy", "Quinn", "Romeo", "Sybil", "Trent"] | |
| last_names = ["Smith", "Jones", "Brown", "Taylor", "Wilson", "Davis", | |
| "Miller", "Anderson", "Thomas", "Jackson"] | |
| names = [f"{random.choice(first_names)} {random.choice(last_names)}" for _ in range(n)] | |
| ages = rng.integers(22, 60, size=n).astype(float) | |
| salaries = rng.integers(40_000, 120_000, size=n).astype(float) | |
| depts = rng.choice(departments, size=n) | |
| experience = rng.integers(0, 30, size=n).astype(float) | |
| clean_df = pd.DataFrame({ | |
| "name": names, | |
| "age": ages, | |
| "salary": salaries, | |
| "department": depts, | |
| "experience": experience, | |
| }) | |
| dirty_df = clean_df.copy() | |
| # Inject ~20 % NaN into age, salary, department | |
| for col, frac in [("age", 0.20), ("salary", 0.20), ("department", 0.10)]: | |
| idx = rng.choice(n, size=int(n * frac), replace=False) | |
| dirty_df.loc[idx, col] = np.nan | |
| return dirty_df.reset_index(drop=True), clean_df.reset_index(drop=True) | |
| # --------------------------------------------------------------------------- | |
| # Task 2 β Product catalog with format & duplicate issues | |
| # --------------------------------------------------------------------------- | |
| def _scramble_phone(phone: str, rng) -> str: | |
| digits = phone.replace("-", "") | |
| fmt = rng.integers(0, 3) | |
| if fmt == 0: | |
| return digits # 5551234567 | |
| elif fmt == 1: | |
| return f"({digits[:3]}){digits[3:]}" # (555)1234567 | |
| else: | |
| return phone # 555-123-4567 (canonical) | |
| def _scramble_date(date_str: str, rng) -> str: | |
| dt = pd.to_datetime(date_str) | |
| fmt = rng.integers(0, 3) | |
| if fmt == 0: | |
| return dt.strftime("%Y-%m-%d") | |
| elif fmt == 1: | |
| return dt.strftime("%b %d %Y") | |
| else: | |
| return dt.strftime("%d/%m/%Y") | |
| def generate_task2_datasets(): | |
| """Returns (dirty_df, clean_df) for Task 2.""" | |
| rng = np.random.default_rng(SEED) | |
| random.seed(SEED) | |
| n = 200 | |
| categories = ["Electronics", "Clothing", "Food", "Books", "Toys"] | |
| product_ids = [f"P{str(i).zfill(4)}" for i in range(1, n + 1)] | |
| product_names = [f"Product_{i}" for i in range(1, n + 1)] | |
| prices = np.round(rng.uniform(5.0, 500.0, size=n), 2) | |
| categories_col = rng.choice(categories, size=n) | |
| phones = [ | |
| f"{rng.integers(100,999)}-{rng.integers(100,999)}-{rng.integers(1000,9999)}" | |
| for _ in range(n) | |
| ] | |
| days_offset = rng.integers(0, 1000, size=n) | |
| dates = [ | |
| (pd.Timestamp("2020-01-01") + pd.Timedelta(days=int(d))).strftime("%Y-%m-%d") | |
| for d in days_offset | |
| ] | |
| clean_df = pd.DataFrame({ | |
| "product_id": product_ids, | |
| "product_name": product_names, | |
| "price": prices, | |
| "category": categories_col, | |
| "phone": phones, | |
| "listed_date": dates, | |
| }) | |
| dirty_df = clean_df.copy() | |
| # Scramble ~60 % of phone formats | |
| phone_idx = rng.choice(n, size=int(n * 0.6), replace=False) | |
| dirty_df.loc[phone_idx, "phone"] = [ | |
| _scramble_phone(dirty_df.loc[i, "phone"], rng) for i in phone_idx | |
| ] | |
| # Scramble ~60 % of date formats | |
| date_idx = rng.choice(n, size=int(n * 0.6), replace=False) | |
| dirty_df.loc[date_idx, "listed_date"] = [ | |
| _scramble_date(dirty_df.loc[i, "listed_date"], rng) for i in date_idx | |
| ] | |
| # Add 15 duplicate rows | |
| dup_idx = rng.choice(n, size=15, replace=False) | |
| dup_rows = dirty_df.iloc[dup_idx].copy() | |
| dirty_df = pd.concat([dirty_df, dup_rows], ignore_index=True) | |
| return dirty_df.reset_index(drop=True), clean_df.reset_index(drop=True) | |
| # --------------------------------------------------------------------------- | |
| # Task 3 β Customer database: full pipeline | |
| # --------------------------------------------------------------------------- | |
| def generate_task3_datasets(): | |
| """Returns (dirty_df, clean_df) for Task 3.""" | |
| rng = np.random.default_rng(SEED) | |
| random.seed(SEED) | |
| n = 300 | |
| countries = ["USA", "UK", "Canada", "Australia", "Germany"] | |
| first_names = ["Alice", "Bob", "Carol", "David", "Eve", "Frank", "Grace", | |
| "Heidi", "Ivan", "Judy"] | |
| last_names = ["Smith", "Jones", "Brown", "Taylor", "Wilson"] | |
| names = [f"{random.choice(first_names)} {random.choice(last_names)}" for _ in range(n)] | |
| ages = rng.integers(18, 75, size=n).astype(float) | |
| purchase_amounts = np.round(rng.uniform(10.0, 500.0, size=n), 2) | |
| countries_col = rng.choice(countries, size=n) | |
| emails = [f"user{i}@example.com" for i in range(1, n + 1)] | |
| days_offset = rng.integers(0, 730, size=n) | |
| signup_dates = [ | |
| (pd.Timestamp("2022-01-01") + pd.Timedelta(days=int(d))).strftime("%Y-%m-%d") | |
| for d in days_offset | |
| ] | |
| clean_df = pd.DataFrame({ | |
| "name": names, | |
| "age": ages, | |
| "purchase_amount": purchase_amounts, | |
| "country": countries_col, | |
| "email": emails, | |
| "signup_date": signup_dates, | |
| }) | |
| dirty_df = clean_df.copy() | |
| # Missing values (~15 % in age, purchase_amount, country, signup_date) | |
| for col, frac in [("age", 0.15), ("purchase_amount", 0.15), | |
| ("country", 0.10), ("signup_date", 0.10)]: | |
| idx = rng.choice(n, size=int(n * frac), replace=False) | |
| dirty_df.loc[idx, col] = np.nan | |
| # Outliers in purchase_amount (~3 %) | |
| out_idx = rng.choice(n, size=int(n * 0.03), replace=False) | |
| dirty_df.loc[out_idx, "purchase_amount"] = ( | |
| dirty_df.loc[out_idx, "purchase_amount"] * 10 | |
| ) | |
| # Mixed case in country (~40 %) | |
| case_idx = rng.choice(n, size=int(n * 0.40), replace=False) | |
| dirty_df.loc[case_idx, "country"] = dirty_df.loc[case_idx, "country"].str.lower() | |
| # Mixed date formats (~50 %) β only scramble non-null entries | |
| date_idx = rng.choice(n, size=int(n * 0.50), replace=False) | |
| valid_date_idx = [i for i in date_idx if pd.notna(dirty_df.loc[i, "signup_date"])] | |
| for i in valid_date_idx: | |
| dirty_df.loc[i, "signup_date"] = _scramble_date(dirty_df.loc[i, "signup_date"], rng) | |
| # 20 duplicate rows | |
| dup_idx = rng.choice(n, size=20, replace=False) | |
| dup_rows = dirty_df.iloc[dup_idx].copy() | |
| dirty_df = pd.concat([dirty_df, dup_rows], ignore_index=True) | |
| return dirty_df.reset_index(drop=True), clean_df.reset_index(drop=True) | |
| # --------------------------------------------------------------------------- | |
| # Task 4 β Multi-source merge pipeline (Expert) | |
| # --------------------------------------------------------------------------- | |
| # Two independently generated "source" DataFrames with misaligned schemas | |
| # that must be aligned and merged before the standard cleaning pipeline. | |
| # | |
| # Source A β CRM export (150 rows): | |
| # cust_id, full_name, Age, purchase_amt, Country, signup | |
| # | |
| # Source B β Marketing export (100 rows): | |
| # customer_id, name, age_years, spend, country_name, registration_date, email | |
| # | |
| # Target schema after align_schema + merge_sources (250 rows): | |
| # customer_id, name, age, purchase_amount, country, signup_date, email | |
| # | |
| # Additional dirty issues injected after merge: | |
| # - Missing values in age, purchase_amount, country (~10%) | |
| # - Mixed country capitalisation (~30%) | |
| # - Mixed date formats in signup_date (~40%) | |
| # - 10 duplicate rows | |
| def generate_task4_datasets(): | |
| """ | |
| Returns (source_a, source_b, clean_merged_df). | |
| source_a and source_b have misaligned schemas. | |
| clean_merged_df is the ground-truth after alignment + merge + cleaning. | |
| """ | |
| rng = np.random.default_rng(SEED + 4) # distinct seed offset | |
| random.seed(SEED + 4) | |
| countries = ["USA", "UK", "Canada", "Australia", "Germany"] | |
| first_names = ["Alice", "Bob", "Carol", "David", "Eve", "Frank", | |
| "Grace", "Heidi", "Ivan", "Judy", "Karl", "Laura"] | |
| last_names = ["Smith", "Jones", "Brown", "Taylor", "Wilson", | |
| "Davis", "Miller", "Anderson", "Thomas", "Jackson"] | |
| # ---- Source A β CRM (150 rows) ---- | |
| n_a = 150 | |
| names_a = [f"{random.choice(first_names)} {random.choice(last_names)}" for _ in range(n_a)] | |
| ages_a = rng.integers(18, 75, size=n_a).astype(float) | |
| amounts_a = np.round(rng.uniform(10.0, 500.0, size=n_a), 2) | |
| countries_a = rng.choice(countries, size=n_a) | |
| days_a = rng.integers(0, 730, size=n_a) | |
| dates_a = [(pd.Timestamp("2022-01-01") + pd.Timedelta(days=int(d))).strftime("%Y-%m-%d") | |
| for d in days_a] | |
| emails_a = [f"crm_{i}@example.com" for i in range(1, n_a + 1)] | |
| source_a = pd.DataFrame({ | |
| "cust_id": [f"A{str(i).zfill(4)}" for i in range(1, n_a + 1)], | |
| "full_name": names_a, # β name | |
| "Age": ages_a, # β age (capital A β schema mismatch) | |
| "purchase_amt": amounts_a, # β purchase_amount (truncated name) | |
| "Country": countries_a, # β country (capital C) | |
| "signup": dates_a, # β signup_date (truncated name) | |
| "email": emails_a, | |
| }) | |
| # ---- Source B β Marketing (100 rows) ---- | |
| n_b = 100 | |
| names_b = [f"{random.choice(first_names)} {random.choice(last_names)}" for _ in range(n_b)] | |
| ages_b = rng.integers(18, 75, size=n_b).astype(float) | |
| amounts_b = np.round(rng.uniform(10.0, 500.0, size=n_b), 2) | |
| countries_b = rng.choice(countries, size=n_b) | |
| days_b = rng.integers(0, 730, size=n_b) | |
| dates_b = [(pd.Timestamp("2022-01-01") + pd.Timedelta(days=int(d))).strftime("%Y-%m-%d") | |
| for d in days_b] | |
| emails_b = [f"mkt_{i}@example.com" for i in range(1, n_b + 1)] | |
| source_b = pd.DataFrame({ | |
| "customer_id": [f"B{str(i).zfill(4)}" for i in range(1, n_b + 1)], | |
| "name": names_b, | |
| "age_years": ages_b, # β age (suffix mismatch) | |
| "spend": amounts_b, # β purchase_amount (synonym) | |
| "country_name": countries_b, # β country (suffix mismatch) | |
| "registration_date": dates_b, # β signup_date (synonym) | |
| "email": emails_b, | |
| }) | |
| # ---- Ground-truth clean merged DataFrame ---- | |
| clean_a = pd.DataFrame({ | |
| "customer_id": source_a["cust_id"], | |
| "name": source_a["full_name"], | |
| "age": source_a["Age"], | |
| "purchase_amount":source_a["purchase_amt"], | |
| "country": source_a["Country"], | |
| "signup_date": source_a["signup"], | |
| "email": source_a["email"], | |
| }) | |
| clean_b = pd.DataFrame({ | |
| "customer_id": source_b["customer_id"], | |
| "name": source_b["name"], | |
| "age": source_b["age_years"], | |
| "purchase_amount":source_b["spend"], | |
| "country": source_b["country_name"], | |
| "signup_date": source_b["registration_date"], | |
| "email": source_b["email"], | |
| }) | |
| clean_merged = pd.concat([clean_a, clean_b], ignore_index=True).reset_index(drop=True) | |
| return source_a.copy(), source_b.copy(), clean_merged |