Danielfonseca1212 committed on
Commit
0e321eb
·
verified ·
1 Parent(s): cf49f9c

Create tpch generator · py

Browse files
Files changed (1) hide show
  1. tpch generator · py +162 -0
tpch generator · py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ data/tpch_generator.py
3
+ Gerador de dados TPC-H sintéticos com rótulos de fraude.
4
+ Simula as tabelas relacionais: customers, orders, lineitem, supplier, nation, part.
5
+ """
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ from typing import Dict
10
+
11
+
12
def generate_tpch_data(
    n_customers: int = 500,
    n_orders: int = 2000,
    fraud_rate: float = 0.05,
    seed: int = 42,
) -> Dict[str, pd.DataFrame]:
    """Generate a synthetic TPC-H-like dataset with fraud labels on orders.

    Random draws happen in a fixed order from a single generator seeded with
    ``seed``, so the same arguments always produce the same tables.

    Args:
        n_customers: Number of customer rows; must be at least 1.
        n_orders: Number of order rows; must be non-negative. Each order gets
            between 1 and 7 line items.
        fraud_rate: Target fraction of orders labelled as fraud, in [0, 1].
            Orders whose fraud score is at or above the ``1 - fraud_rate``
            quantile are labelled 1; a rate of 0 labels nothing.
        seed: Seed for ``numpy.random.default_rng``.

    Returns:
        Dict with the DataFrames ``customers``, ``orders``, ``lineitem``,
        ``supplier``, ``nation`` and ``part``. ``orders["is_fraud"]`` is the
        target column.

    Raises:
        ValueError: If ``n_customers < 1``, ``n_orders < 0`` or ``fraud_rate``
            is outside [0, 1].
    """
    if n_customers < 1:
        raise ValueError(f"n_customers must be >= 1, got {n_customers}")
    if n_orders < 0:
        raise ValueError(f"n_orders must be >= 0, got {n_orders}")
    # Written as a chained comparison so that NaN is rejected as well.
    if not 0.0 <= fraud_rate <= 1.0:
        raise ValueError(f"fraud_rate must be within [0, 1], got {fraud_rate}")

    rng = np.random.default_rng(seed)

    # ── NATION ──────────────────────────────────────────────────────────────
    n_nations = 25
    nations = pd.DataFrame({
        "n_nationkey": np.arange(n_nations),
        "n_name": [f"NATION_{i}" for i in range(n_nations)],
        "n_regionkey": rng.integers(0, 5, n_nations),
        "n_comment": [f"comment_{i}" for i in range(n_nations)],
    })

    # ── SUPPLIER ────────────────────────────────────────────────────────────
    n_suppliers = max(10, n_customers // 20)
    suppliers = pd.DataFrame({
        "s_suppkey": np.arange(n_suppliers),
        "s_name": [f"Supplier#{i:05d}" for i in range(n_suppliers)],
        "s_nationkey": rng.integers(0, n_nations, n_suppliers),
        "s_acctbal": rng.uniform(-999, 9999, n_suppliers).round(2),
    })
    # Derived feature: a supplier is flagged as risky when its account balance
    # is below 100 (which includes every negative balance).
    suppliers["s_risk_flag"] = (suppliers["s_acctbal"] < 100).astype(int)

    # ── PART ────────────────────────────────────────────────────────────────
    n_parts = max(50, n_orders // 5)
    parts = pd.DataFrame({
        "p_partkey": np.arange(n_parts),
        "p_name": [f"part_{i}" for i in range(n_parts)],
        "p_mfgr": rng.choice(
            ["Manufacturer#1", "Manufacturer#2", "Manufacturer#3"], n_parts
        ),
        "p_retailprice": rng.uniform(5, 2000, n_parts).round(2),
        "p_type": rng.choice(
            ["ECONOMY", "STANDARD", "PROMO", "SMALL", "LARGE"], n_parts
        ),
    })

    # ── CUSTOMERS ───────────────────────────────────────────────────────────
    customers = pd.DataFrame({
        "c_custkey": np.arange(n_customers),
        "c_name": [f"Customer#{i:08d}" for i in range(n_customers)],
        "c_nationkey": rng.integers(0, n_nations, n_customers),
        "c_acctbal": rng.uniform(-999, 9999, n_customers).round(2),
        "c_mktsegment": rng.choice(
            ["AUTOMOBILE", "BUILDING", "FURNITURE", "MACHINERY", "HOUSEHOLD"],
            n_customers,
        ),
        # Derived features
        "c_account_age_days": rng.integers(1, 3650, n_customers),
        "c_num_prev_orders": rng.poisson(5, n_customers),
    })

    # ── ORDERS ──────────────────────────────────────────────────────────────
    # Fraud label: combination of relational signals plus noise.
    customer_keys = rng.integers(0, n_customers, n_orders)
    order_dates = pd.to_datetime("2020-01-01") + pd.to_timedelta(
        rng.integers(0, 1095, n_orders), unit="D"
    )

    totalprice = rng.exponential(scale=5000, size=n_orders).round(2)

    # c_custkey equals the row position, so positional indexing fetches the
    # attributes of the customer that placed each order.
    cust_acctbal = customers["c_acctbal"].to_numpy()[customer_keys]
    cust_age = customers["c_account_age_days"].to_numpy()[customer_keys]

    # The noise term is always drawn, even when no thresholding happens, so
    # the random stream consumed by the later columns does not depend on
    # fraud_rate.
    fraud_score = (
        0.4 * (cust_acctbal < 0).astype(float)      # negative account balance
        + 0.3 * (totalprice > 15000).astype(float)  # large order
        + 0.2 * (cust_age < 30).astype(float)       # new account
        + 0.1 * rng.random(n_orders)                # noise
    )
    if n_orders == 0 or fraud_rate == 0:
        # A quantile of an empty array is undefined, and a 100% quantile
        # combined with ">=" would still flag the top-scoring order.
        is_fraud = np.zeros(n_orders, dtype=int)
    else:
        # Threshold on the score quantile to hit the target fraud rate.
        threshold = np.quantile(fraud_score, 1 - fraud_rate)
        is_fraud = (fraud_score >= threshold).astype(int)

    orders = pd.DataFrame({
        "o_orderkey": np.arange(n_orders),
        "o_custkey": customer_keys,
        "o_orderstatus": rng.choice(["F", "O", "P"], n_orders),
        "o_totalprice": totalprice,
        "o_orderdate": order_dates,
        "o_orderpriority": rng.choice(
            ["1-URGENT", "2-HIGH", "3-MEDIUM", "4-NOT SPECIFIED", "5-LOW"],
            n_orders,
        ),
        "o_shippriority": rng.integers(0, 3, n_orders),
        "is_fraud": is_fraud,  # TARGET
    })

    # ── LINEITEM ────────────────────────────────────────────────────────────
    # Each order has between 1 and 7 lines.
    n_lines_per_order = rng.integers(1, 8, n_orders)
    total_lines = int(n_lines_per_order.sum())

    order_keys_expanded = np.repeat(np.arange(n_orders), n_lines_per_order)
    # Line numbers restart at 1 for every order: subtract from a global
    # running index the position at which each order's first line sits.
    first_line_pos = np.cumsum(n_lines_per_order) - n_lines_per_order
    linenumbers = (
        np.arange(total_lines, dtype=n_lines_per_order.dtype)
        - np.repeat(first_line_pos, n_lines_per_order)
        + 1
    )

    lineitem = pd.DataFrame({
        "l_orderkey": order_keys_expanded,
        "l_partkey": rng.integers(0, n_parts, total_lines),
        "l_suppkey": rng.integers(0, n_suppliers, total_lines),
        "l_linenumber": linenumbers,
        "l_quantity": rng.integers(1, 51, total_lines).astype(float),
        "l_extendedprice": rng.uniform(10, 5000, total_lines).round(2),
        "l_discount": rng.uniform(0, 0.1, total_lines).round(2),
        "l_tax": rng.uniform(0, 0.08, total_lines).round(2),
        "l_returnflag": rng.choice(["A", "N", "R"], total_lines),
        "l_linestatus": rng.choice(["F", "O"], total_lines),
        "l_shipmode": rng.choice(
            ["AIR", "MAIL", "RAIL", "REG AIR", "SHIP", "TRUCK", "FOB"],
            total_lines,
        ),
    })

    return {
        "customers": customers,
        "orders": orders,
        "lineitem": lineitem,
        "supplier": suppliers,
        "nation": nations,
        "part": parts,
    }
140
+
141
+
142
def get_schema_info(tables: Dict[str, pd.DataFrame]) -> Dict:
    """Describe the schema of the generated tables.

    Args:
        tables: Mapping of table name to DataFrame.

    Returns:
        Dict with three entries: ``shapes`` (the ``.shape`` of every table
        passed in), ``primary_keys`` (key column, or tuple of columns, per
        table name) and ``foreign_keys`` (a list of
        ``(child_table, child_column, parent_table, parent_column)`` tuples).
        The key listings are fixed and do not depend on ``tables``.
    """
    table_shapes = {}
    for table_name, frame in tables.items():
        table_shapes[table_name] = frame.shape

    key_columns = {
        "customers": "c_custkey",
        "orders": "o_orderkey",
        # Composite key: line numbers restart for every order.
        "lineitem": ("l_orderkey", "l_linenumber"),
        "supplier": "s_suppkey",
        "nation": "n_nationkey",
        "part": "p_partkey",
    }

    references = [
        ("orders", "o_custkey", "customers", "c_custkey"),
        ("lineitem", "l_orderkey", "orders", "o_orderkey"),
        ("lineitem", "l_suppkey", "supplier", "s_suppkey"),
        ("lineitem", "l_partkey", "part", "p_partkey"),
        ("customers", "c_nationkey", "nation", "n_nationkey"),
        ("supplier", "s_nationkey", "nation", "n_nationkey"),
    ]

    schema = {}
    schema["shapes"] = table_shapes
    schema["primary_keys"] = key_columns
    schema["foreign_keys"] = references
    return schema