Spaces:

Danielfonseca1212
/

RelGNNDeepRelationalLearning

Sleeping

App Files Files Community

Danielfonseca1212 committed on Mar 4

Commit

0e321eb

verified ·

1 Parent(s): cf49f9c

Create tpch generator · py

Browse files

Files changed (1) hide show

tpch generator · py +162 -0

tpch generator · py ADDED Viewed

	@@ -0,0 +1,162 @@

+"""
+data/tpch_generator.py
+Gerador de dados TPC-H sintéticos com rótulos de fraude.
+Simula as tabelas relacionais: customers, orders, lineitem, supplier, nation, part.
+"""
+import numpy as np
+import pandas as pd
+from typing import Dict
def generate_tpch_data(
    n_customers: int = 500,
    n_orders: int = 2000,
    fraud_rate: float = 0.05,
    seed: int = 42,
) -> Dict[str, pd.DataFrame]:
    """
    Generate a synthetic TPC-H-style dataset with per-order fraud labels.

    Six related tables are produced. Keys are 0-based consecutive integers,
    so every foreign key value is a valid key of the referenced table.

    The fraud label on ``orders`` is derived from a score that combines:
      * 0.4 if the ordering customer's ``c_acctbal`` is negative,
      * 0.3 if the order's ``o_totalprice`` exceeds 15000,
      * 0.2 if the customer's ``c_account_age_days`` is below 30,
      * 0.1 * uniform noise in [0, 1).
    Orders whose score reaches the ``1 - fraud_rate`` quantile are labelled 1.

    Args:
        n_customers: Number of customer rows (must be >= 1).
        n_orders: Number of order rows (must be >= 1).
        fraud_rate: Target share of orders labelled as fraud, in [0, 1].
            ``0`` yields no fraudulent orders.
        seed: Seed for ``numpy.random.default_rng``; the same arguments
            always yield the same tables.

    Returns:
        Dict with the DataFrames ``customers``, ``orders``, ``lineitem``,
        ``supplier``, ``nation`` and ``part``.

    Raises:
        ValueError: If ``n_customers`` or ``n_orders`` is smaller than 1, or
            ``fraud_rate`` is outside [0, 1].
    """
    # Fail early with a clear message instead of an obscure numpy error
    # raised halfway through generation.
    if n_customers < 1:
        raise ValueError(f"n_customers must be >= 1, got {n_customers}")
    if n_orders < 1:
        raise ValueError(f"n_orders must be >= 1, got {n_orders}")
    if not 0.0 <= fraud_rate <= 1.0:
        raise ValueError(f"fraud_rate must be in [0, 1], got {fraud_rate}")

    # NOTE: the order of the `rng` draws below determines the output for a
    # given seed; reordering them changes the generated data.
    rng = np.random.default_rng(seed)

    # ── NATION ────────────────────────────────────────────────────────────────
    n_nations = 25
    nations = pd.DataFrame({
        "n_nationkey": np.arange(n_nations),
        "n_name": [f"NATION_{i}" for i in range(n_nations)],
        "n_regionkey": rng.integers(0, 5, n_nations),
        "n_comment": [f"comment_{i}" for i in range(n_nations)],
    })

    # ── SUPPLIER ──────────────────────────────────────────────────────────────
    # One supplier per 20 customers, but never fewer than 10.
    n_suppliers = max(10, n_customers // 20)
    suppliers = pd.DataFrame({
        "s_suppkey": np.arange(n_suppliers),
        "s_name": [f"Supplier#{i:05d}" for i in range(n_suppliers)],
        "s_nationkey": rng.integers(0, n_nations, n_suppliers),
        "s_acctbal": rng.uniform(-999, 9999, n_suppliers).round(2),
    })
    # Derived feature: suppliers with an account balance below 100 are flagged.
    # NOTE(review): the original comment described risky suppliers as having a
    # *negative* balance, yet the threshold in the code is 100 — confirm which
    # one is intended. The code's behavior is kept unchanged here.
    suppliers["s_risk_flag"] = (suppliers["s_acctbal"] < 100).astype(int)

    # ── PART ──────────────────────────────────────────────────────────────────
    # One part per 5 orders, but never fewer than 50.
    n_parts = max(50, n_orders // 5)
    parts = pd.DataFrame({
        "p_partkey": np.arange(n_parts),
        "p_name": [f"part_{i}" for i in range(n_parts)],
        "p_mfgr": rng.choice(["Manufacturer#1", "Manufacturer#2", "Manufacturer#3"], n_parts),
        "p_retailprice": rng.uniform(5, 2000, n_parts).round(2),
        "p_type": rng.choice(["ECONOMY", "STANDARD", "PROMO", "SMALL", "LARGE"], n_parts),
    })

    # ── CUSTOMERS ─────────────────────────────────────────────────────────────
    customers = pd.DataFrame({
        "c_custkey": np.arange(n_customers),
        "c_name": [f"Customer#{i:08d}" for i in range(n_customers)],
        "c_nationkey": rng.integers(0, n_nations, n_customers),
        "c_acctbal": rng.uniform(-999, 9999, n_customers).round(2),
        "c_mktsegment": rng.choice(
            ["AUTOMOBILE", "BUILDING", "FURNITURE", "MACHINERY", "HOUSEHOLD"],
            n_customers,
        ),
        # Derived features
        "c_account_age_days": rng.integers(1, 3650, n_customers),
        "c_num_prev_orders": rng.poisson(5, n_customers),
    })

    # ── ORDERS ────────────────────────────────────────────────────────────────
    customer_keys = rng.integers(0, n_customers, n_orders)
    # Order dates fall within 1095 days starting at 2020-01-01.
    order_dates = pd.to_datetime("2020-01-01") + pd.to_timedelta(
        rng.integers(0, 1095, n_orders), unit="D"
    )
    totalprice = rng.exponential(scale=5000, size=n_orders).round(2)

    # Customer keys are 0-based row positions, so a positional lookup pulls
    # each order's customer attributes without depending on index labels.
    cust_acctbal = customers["c_acctbal"].to_numpy()[customer_keys]
    cust_age = customers["c_account_age_days"].to_numpy()[customer_keys]

    # Fraud signal: relational features of the customer and order, plus noise.
    fraud_score = (
        0.4 * (cust_acctbal < 0).astype(float)       # negative account balance
        + 0.3 * (totalprice > 15000).astype(float)   # high-value order
        + 0.2 * (cust_age < 30).astype(float)        # newly created account
        + 0.1 * rng.random(n_orders)                 # noise
    )

    if fraud_rate == 0:
        # The quantile threshold would equal the maximum score and `>=` would
        # still label that order as fraud; a zero rate must give no fraud.
        is_fraud = np.zeros(n_orders, dtype=int)
    else:
        # Label the top `fraud_rate` share of scores as fraud.
        threshold = np.quantile(fraud_score, 1 - fraud_rate)
        is_fraud = (fraud_score >= threshold).astype(int)

    orders = pd.DataFrame({
        "o_orderkey": np.arange(n_orders),
        "o_custkey": customer_keys,
        "o_orderstatus": rng.choice(["F", "O", "P"], n_orders),
        "o_totalprice": totalprice,
        "o_orderdate": order_dates,
        "o_orderpriority": rng.choice(
            ["1-URGENT", "2-HIGH", "3-MEDIUM", "4-NOT SPECIFIED", "5-LOW"], n_orders
        ),
        "o_shippriority": rng.integers(0, 3, n_orders),
        "is_fraud": is_fraud,  # TARGET
    })

    # ── LINEITEM ──────────────────────────────────────────────────────────────
    # Each order has between 1 and 7 line items.
    n_lines_per_order = rng.integers(1, 8, n_orders)
    total_lines = int(n_lines_per_order.sum())
    order_keys_expanded = np.repeat(np.arange(n_orders), n_lines_per_order)
    # Line numbers restart at 1 for every order: subtract from each global row
    # position the position at which its order's first line sits.
    first_line_pos = np.cumsum(n_lines_per_order) - n_lines_per_order
    linenumbers = (
        np.arange(total_lines) - np.repeat(first_line_pos, n_lines_per_order) + 1
    )

    lineitem = pd.DataFrame({
        "l_orderkey": order_keys_expanded,
        "l_partkey": rng.integers(0, n_parts, total_lines),
        "l_suppkey": rng.integers(0, n_suppliers, total_lines),
        "l_linenumber": linenumbers,
        "l_quantity": rng.integers(1, 51, total_lines).astype(float),
        "l_extendedprice": rng.uniform(10, 5000, total_lines).round(2),
        "l_discount": rng.uniform(0, 0.1, total_lines).round(2),
        "l_tax": rng.uniform(0, 0.08, total_lines).round(2),
        "l_returnflag": rng.choice(["A", "N", "R"], total_lines),
        "l_linestatus": rng.choice(["F", "O"], total_lines),
        "l_shipmode": rng.choice(
            ["AIR", "MAIL", "RAIL", "REG AIR", "SHIP", "TRUCK", "FOB"], total_lines
        ),
    })

    return {
        "customers": customers,
        "orders": orders,
        "lineitem": lineitem,
        "supplier": suppliers,
        "nation": nations,
        "part": parts,
    }
def get_schema_info(tables: Dict[str, pd.DataFrame]) -> Dict:
    """Describe the schema of the tables: shapes, primary keys, foreign keys.

    Args:
        tables: Mapping from table name to its DataFrame.

    Returns:
        Dict with three entries:
          * ``"shapes"``: ``(rows, columns)`` of every DataFrame in ``tables``.
          * ``"primary_keys"``: key column per table name; ``lineitem`` uses a
            composite key given as a tuple of two column names.
          * ``"foreign_keys"``: list of
            ``(table, column, referenced_table, referenced_column)`` tuples.

        The key listings are fixed and do not depend on which tables are
        actually present in ``tables``; only ``"shapes"`` is computed from it.
    """
    table_shapes = {}
    for table_name, frame in tables.items():
        table_shapes[table_name] = frame.shape

    pk_by_table = dict(
        customers="c_custkey",
        orders="o_orderkey",
        lineitem=("l_orderkey", "l_linenumber"),  # composite key
        supplier="s_suppkey",
        nation="n_nationkey",
        part="p_partkey",
    )

    # Each entry: (table, column, referenced_table, referenced_column)
    fk_edges = [
        ("orders", "o_custkey", "customers", "c_custkey"),
        ("lineitem", "l_orderkey", "orders", "o_orderkey"),
        ("lineitem", "l_suppkey", "supplier", "s_suppkey"),
        ("lineitem", "l_partkey", "part", "p_partkey"),
        ("customers", "c_nationkey", "nation", "n_nationkey"),
        ("supplier", "s_nationkey", "nation", "n_nationkey"),
    ]

    schema = {}
    schema["shapes"] = table_shapes
    schema["primary_keys"] = pk_by_table
    schema["foreign_keys"] = fk_edges
    return schema