Spaces:
Sleeping
Sleeping
| """ | |
| labeler.py — Supervised learning target construction for crypto trading. | |
| Target definition (binary): | |
| y = 1 if the trade hits RR=1:2 target BEFORE stop within LABEL_FORWARD_BARS | |
| y = 0 if stop is hit first OR neither hits within the window | |
| Design decisions: | |
| - Stop and target computed from ATR at signal bar (no lookahead) | |
| - Realistic costs (fees + slippage) are added to the target distance, so price must move further than the naive RR target to count as a win | |
| - Both long and short labeling supported (direction from rule engine) | |
| - Time-series integrity: labeling uses only forward prices from bar+1 | |
| - NaN label produced when insufficient forward bars exist (dropped later) | |
| Target horizon N = 24 bars (1H timeframe = 1 full trading day): | |
| - Short enough to avoid regime change within the trade | |
| - Long enough for 1:2 RR to fully play out | |
| - Empirically: >24 bars introduces too many confounding events | |
| - <12 bars under-samples legitimate continuation moves | |
| """ | |
| import numpy as np | |
| import pandas as pd | |
| from typing import Optional | |
| from ml_config import ( | |
| LABEL_FORWARD_BARS, | |
| STOP_MULT, | |
| TARGET_RR, | |
| ROUND_TRIP_COST, | |
| ) | |
def label_single_trade(
    df: pd.DataFrame,
    signal_idx: int,
    atr: float,
    direction: int,  # +1 = long, -1 = short
    forward_bars: int = LABEL_FORWARD_BARS,
) -> Optional[int]:
    """
    Label a single trade signal as a win (1) or a loss (0).

    The trade is entered at the close of the signal bar. The stop sits
    ``atr * STOP_MULT`` away from entry; the target sits
    ``stop_distance * TARGET_RR`` away plus a cost buffer of
    ``entry_price * ROUND_TRIP_COST``. Only bars strictly after the signal
    bar are inspected.

    Args:
        df: OHLCV DataFrame with "high", "low" and "close" columns, sorted
            ascending.
        signal_idx: Integer position of the signal bar in df.
        atr: ATR value at the signal bar.
        direction: +1 long, -1 short.
        forward_bars: Maximum number of bars after the signal bar to check.

    Returns:
        1 = win (target reached before the stop)
        0 = loss (stop reached first, stop and target reached on the same
            bar, or neither reached within a full forward_bars window)
        None = the trade cannot be labeled: signal_idx is out of range,
            direction is not +1/-1, atr is NaN/inf/non-positive, the entry
            price is not finite, there are no forward bars, or the data ends
            before the window is complete and neither level was reached.
    """
    # A negative position would silently wrap around through .iloc.
    if signal_idx < 0 or signal_idx + 1 >= len(df):
        return None
    # Anything other than an explicit long/short must not fall through to the
    # short branch.
    if direction not in (1, -1):
        return None

    entry_price = float(df["close"].iloc[signal_idx])
    atr = float(atr)
    # A NaN or non-positive ATR gives a degenerate (or never-reachable) stop,
    # so the outcome would be meaningless rather than a genuine loss.
    if not (np.isfinite(atr) and atr > 0 and np.isfinite(entry_price)):
        return None

    stop_distance = atr * STOP_MULT
    # Cost-adjusted threshold: price has to travel further than the naive RR
    # target. The cost is expressed in price units.
    round_trip_cost = entry_price * ROUND_TRIP_COST
    target_distance = stop_distance * TARGET_RR + round_trip_cost

    if direction == 1:  # long
        stop_price = entry_price - stop_distance
        target_price = entry_price + target_distance
    else:  # short
        stop_price = entry_price + stop_distance
        target_price = entry_price - target_distance

    end_idx = min(signal_idx + 1 + forward_bars, len(df))
    highs = df["high"].iloc[signal_idx + 1 : end_idx].to_numpy(dtype="float64")
    lows = df["low"].iloc[signal_idx + 1 : end_idx].to_numpy(dtype="float64")
    if highs.size == 0:
        return None

    if direction == 1:
        stop_hit = lows <= stop_price
        target_hit = highs >= target_price
    else:
        stop_hit = highs >= stop_price
        target_hit = lows <= target_price

    resolved = stop_hit | target_hit
    if resolved.any():
        first = int(np.argmax(resolved))  # earliest bar touching either level
        # Pessimistic ordering: if both levels are touched on the same bar,
        # the stop is assumed to have been hit first.
        return 0 if stop_hit[first] else 1

    # Neither level was reached. If the data ran out before the full window
    # elapsed, the outcome is unknown rather than a timeout.
    if highs.size < forward_bars:
        return None

    # Neither hit within a complete window = loss (opportunity cost + fees)
    return 0
def label_dataframe(
    df: pd.DataFrame,
    signal_mask: pd.Series,
    atr_series: pd.Series,
    direction_series: pd.Series,
    forward_bars: int = LABEL_FORWARD_BARS,
    min_bars_remaining: int = LABEL_FORWARD_BARS,
) -> pd.Series:
    """
    Label all signal bars in a DataFrame.

    The three input series are read positionally, so each must have exactly
    one entry per row of df.

    Args:
        df: Full OHLCV DataFrame
        signal_mask: Boolean series, True where a setup was flagged. Missing
            entries are treated as "no signal".
        atr_series: ATR at each bar (aligned to df index)
        direction_series: +1/-1 for each signal bar
        forward_bars: Max forward window
        min_bars_remaining: Signals at position pos are skipped when
            pos + min_bars_remaining >= len(df). If this is smaller than
            forward_bars, signals near the end of the data are evaluated on
            a shortened window.

    Returns:
        Series of {1, 0, NaN} aligned to df.index. NaN marks bars without a
        signal and signals that were skipped (too close to the end of the
        data, missing ATR, missing or zero direction, or no label returned by
        label_single_trade).

    Raises:
        ValueError: If signal_mask, atr_series or direction_series does not
            have the same length as df.
    """
    n = len(df)
    for name, series in (
        ("signal_mask", signal_mask),
        ("atr_series", atr_series),
        ("direction_series", direction_series),
    ):
        if len(series) != n:
            raise ValueError(
                f"{name} has length {len(series)}, expected {n} "
                "(must align with df)"
            )

    # NaN is truthy, so a mask with missing entries would otherwise flag
    # those bars as signals.
    is_signal = signal_mask.to_numpy(dtype=object, na_value=False).astype(bool)

    # Collect results in a NumPy buffer; one Series is built at the end
    # instead of writing through .iloc once per signal.
    label_values = np.full(n, np.nan, dtype="float64")

    for pos in np.flatnonzero(is_signal):
        pos = int(pos)
        # Drop signals too close to end of data (insufficient forward bars)
        if pos + min_bars_remaining >= n:
            continue

        atr_val = float(atr_series.iloc[pos])
        raw_direction = direction_series.iloc[pos]
        # Check for a missing direction before int(): int(NaN) raises.
        if np.isnan(atr_val) or pd.isna(raw_direction):
            continue
        direction = int(raw_direction)
        if direction == 0:
            continue

        label = label_single_trade(df, pos, atr_val, direction, forward_bars)
        if label is not None:
            label_values[pos] = float(label)

    return pd.Series(label_values, index=df.index, dtype="float64")
def compute_label_stats(labels: pd.Series) -> dict:
    """Summarise a label series for diagnostics.

    NaN entries are ignored. The result holds the number of non-NaN labels,
    the counts of labels equal to 1 (wins) and 0 (losses), the win rate
    rounded to 4 decimals (0.0 when there are no labels), and the wins/losses
    ratio rounded to 3 decimals (infinity when there are no losses).
    """
    observed = labels[labels.notna()]
    n_labeled = observed.shape[0]
    n_wins = int(observed.eq(1).sum())
    n_losses = int(observed.eq(0).sum())

    # Guard both divisions against an empty denominator.
    hit_rate = 0.0
    if n_labeled > 0:
        hit_rate = n_wins / n_labeled

    if n_losses > 0:
        win_loss_ratio = n_wins / n_losses
    else:
        win_loss_ratio = float("inf")

    stats = {}
    stats["total_labels"] = n_labeled
    stats["wins"] = n_wins
    stats["losses"] = n_losses
    stats["win_rate"] = round(hit_rate, 4)
    stats["class_imbalance_ratio"] = round(win_loss_ratio, 3)
    return stats