Create tpch generator · py
Browse files- tpch generator · py +162 -0
tpch generator · py
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
data/tpch_generator.py
|
| 3 |
+
Gerador de dados TPC-H sintéticos com rótulos de fraude.
|
| 4 |
+
Simula as tabelas relacionais: customers, orders, lineitem, supplier, nation, part.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
import pandas as pd
|
| 9 |
+
from typing import Dict
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def generate_tpch_data(
    n_customers: int = 500,
    n_orders: int = 2000,
    fraud_rate: float = 0.05,
    seed: int = 42,
) -> Dict[str, pd.DataFrame]:
    """Generate a synthetic TPC-H-style dataset with fraud labels on orders.

    Six related tables are produced: ``customers``, ``orders``, ``lineitem``,
    ``supplier``, ``nation`` and ``part``. All key columns are 0-based and
    contiguous, so every foreign key refers to an existing row.

    Args:
        n_customers: Number of customer rows. Also drives the supplier count
            (``max(10, n_customers // 20)``).
        n_orders: Number of order rows. Also drives the part count
            (``max(50, n_orders // 5)``). Each order gets 1-7 line items.
        fraud_rate: Target fraction of orders labelled ``is_fraud == 1``,
            in ``[0, 1]``. The highest-scoring orders are labelled; the
            realised fraction is approximate.
        seed: Seed for ``numpy.random.default_rng``; identical arguments give
            identical tables.

    Returns:
        Dict mapping table name to DataFrame, with keys ``"customers"``,
        ``"orders"``, ``"lineitem"``, ``"supplier"``, ``"nation"``, ``"part"``.

    Raises:
        ValueError: If a size is negative, if orders are requested without
            any customers, or if ``fraud_rate`` is outside ``[0, 1]``.
    """
    if n_customers < 0 or n_orders < 0:
        raise ValueError("n_customers and n_orders must be non-negative")
    if n_orders > 0 and n_customers == 0:
        raise ValueError("n_customers must be positive when n_orders > 0")
    if not 0.0 <= fraud_rate <= 1.0:
        raise ValueError("fraud_rate must be in the range [0, 1]")

    # NOTE: the order of the rng.* calls below defines the random stream.
    # Reordering them changes the generated data for a given seed.
    rng = np.random.default_rng(seed)

    # -- NATION ---------------------------------------------------------------
    n_nations = 25
    nations = pd.DataFrame({
        "n_nationkey": np.arange(n_nations),
        "n_name": [f"NATION_{i}" for i in range(n_nations)],
        "n_regionkey": rng.integers(0, 5, n_nations),
        "n_comment": [f"comment_{i}" for i in range(n_nations)],
    })

    # -- SUPPLIER -------------------------------------------------------------
    n_suppliers = max(10, n_customers // 20)
    suppliers = pd.DataFrame({
        "s_suppkey": np.arange(n_suppliers),
        "s_name": [f"Supplier#{i:05d}" for i in range(n_suppliers)],
        "s_nationkey": rng.integers(0, n_nations, n_suppliers),
        "s_acctbal": rng.uniform(-999, 9999, n_suppliers).round(2),
    })
    # Derived feature: suppliers with a low balance (below 100, which includes
    # every negative balance) are flagged as risky.
    suppliers["s_risk_flag"] = (suppliers["s_acctbal"] < 100).astype(int)

    # -- PART -----------------------------------------------------------------
    n_parts = max(50, n_orders // 5)
    parts = pd.DataFrame({
        "p_partkey": np.arange(n_parts),
        "p_name": [f"part_{i}" for i in range(n_parts)],
        "p_mfgr": rng.choice(["Manufacturer#1", "Manufacturer#2", "Manufacturer#3"], n_parts),
        "p_retailprice": rng.uniform(5, 2000, n_parts).round(2),
        "p_type": rng.choice(["ECONOMY", "STANDARD", "PROMO", "SMALL", "LARGE"], n_parts),
    })

    # -- CUSTOMERS ------------------------------------------------------------
    customers = pd.DataFrame({
        "c_custkey": np.arange(n_customers),
        "c_name": [f"Customer#{i:08d}" for i in range(n_customers)],
        "c_nationkey": rng.integers(0, n_nations, n_customers),
        "c_acctbal": rng.uniform(-999, 9999, n_customers).round(2),
        "c_mktsegment": rng.choice(
            ["AUTOMOBILE", "BUILDING", "FURNITURE", "MACHINERY", "HOUSEHOLD"],
            n_customers,
        ),
        # Derived features
        "c_account_age_days": rng.integers(1, 3650, n_customers),
        "c_num_prev_orders": rng.poisson(5, n_customers),
    })

    # -- ORDERS ---------------------------------------------------------------
    # Fraud label: a combination of relational signals plus noise.
    customer_keys = rng.integers(0, n_customers, n_orders)
    order_dates = pd.to_datetime("2020-01-01") + pd.to_timedelta(
        rng.integers(0, 1095, n_orders), unit="D"
    )

    totalprice = rng.exponential(scale=5000, size=n_orders).round(2)

    # c_custkey equals the row position, so a positional lookup fetches the
    # ordering customer's attributes for every order.
    cust_acctbal = customers["c_acctbal"].to_numpy()[customer_keys]
    cust_age = customers["c_account_age_days"].to_numpy()[customer_keys]

    fraud_score = (
        0.4 * (cust_acctbal < 0).astype(float)      # negative account balance
        + 0.3 * (totalprice > 15000).astype(float)  # large order
        + 0.2 * (cust_age < 30).astype(float)       # newly created account
        + 0.1 * rng.random(n_orders)                # noise
    )

    if n_orders == 0 or fraud_rate == 0:
        # A quantile threshold would always label the top-scoring order (and
        # is undefined for an empty score array), so handle these explicitly.
        is_fraud = np.zeros(n_orders, dtype=int)
    else:
        # Label the top `fraud_rate` share of scores as fraud.
        threshold = np.quantile(fraud_score, 1 - fraud_rate)
        is_fraud = (fraud_score >= threshold).astype(int)

    orders = pd.DataFrame({
        "o_orderkey": np.arange(n_orders),
        "o_custkey": customer_keys,
        "o_orderstatus": rng.choice(["F", "O", "P"], n_orders),
        "o_totalprice": totalprice,
        "o_orderdate": order_dates,
        "o_orderpriority": rng.choice(
            ["1-URGENT", "2-HIGH", "3-MEDIUM", "4-NOT SPECIFIED", "5-LOW"], n_orders
        ),
        "o_shippriority": rng.integers(0, 3, n_orders),
        "is_fraud": is_fraud,  # TARGET
    })

    # -- LINEITEM -------------------------------------------------------------
    # Each order has 1-7 lines.
    n_lines_per_order = rng.integers(1, 8, n_orders)
    total_lines = n_lines_per_order.sum()

    order_keys_expanded = np.repeat(np.arange(n_orders), n_lines_per_order)
    # Per-order 1..k numbering without a Python loop: subtract from each
    # global row position the position where its order's lines start.
    first_line_pos = np.cumsum(n_lines_per_order) - n_lines_per_order
    linenumbers = (
        np.arange(total_lines) - np.repeat(first_line_pos, n_lines_per_order) + 1
    )

    lineitem = pd.DataFrame({
        "l_orderkey": order_keys_expanded,
        "l_partkey": rng.integers(0, n_parts, total_lines),
        "l_suppkey": rng.integers(0, n_suppliers, total_lines),
        "l_linenumber": linenumbers,
        "l_quantity": rng.integers(1, 51, total_lines).astype(float),
        "l_extendedprice": rng.uniform(10, 5000, total_lines).round(2),
        "l_discount": rng.uniform(0, 0.1, total_lines).round(2),
        "l_tax": rng.uniform(0, 0.08, total_lines).round(2),
        "l_returnflag": rng.choice(["A", "N", "R"], total_lines),
        "l_linestatus": rng.choice(["F", "O"], total_lines),
        "l_shipmode": rng.choice(
            ["AIR", "MAIL", "RAIL", "REG AIR", "SHIP", "TRUCK", "FOB"], total_lines
        ),
    })

    tables = {
        "customers": customers,
        "orders": orders,
        "lineitem": lineitem,
        "supplier": suppliers,
        "nation": nations,
        "part": parts,
    }

    return tables
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def get_schema_info(tables: Dict[str, pd.DataFrame]) -> Dict:
    """Describe the relational schema of the generated tables.

    Args:
        tables: Mapping from table name to its DataFrame.

    Returns:
        A dict with three entries:

        * ``"shapes"``: each table name in *tables* mapped to its ``.shape``.
        * ``"primary_keys"``: table name mapped to its key column, or to a
          tuple of columns for a composite key.
        * ``"foreign_keys"``: list of ``(child_table, child_column,
          parent_table, parent_column)`` tuples.
    """
    shapes = {}
    for table_name, frame in tables.items():
        shapes[table_name] = frame.shape

    # The key metadata is static; it does not depend on the tables passed in.
    primary_keys = dict(
        customers="c_custkey",
        orders="o_orderkey",
        lineitem=("l_orderkey", "l_linenumber"),
        supplier="s_suppkey",
        nation="n_nationkey",
        part="p_partkey",
    )

    foreign_keys = [
        ("orders", "o_custkey", "customers", "c_custkey"),
        ("lineitem", "l_orderkey", "orders", "o_orderkey"),
        ("lineitem", "l_suppkey", "supplier", "s_suppkey"),
        ("lineitem", "l_partkey", "part", "p_partkey"),
        ("customers", "c_nationkey", "nation", "n_nationkey"),
        ("supplier", "s_nationkey", "nation", "n_nationkey"),
    ]

    schema = {}
    schema["shapes"] = shapes
    schema["primary_keys"] = primary_keys
    schema["foreign_keys"] = foreign_keys
    return schema
|