File size: 2,549 Bytes
2f560eb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import pandas as pd # type: ignore
import numpy as np # type: ignore
import torch # type: ignore

import os
# The project root is taken to be the parent of the directory containing this
# module; the data directory lives directly under it.
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
data_dir = os.path.join(project_root, "data")
# exist_ok=True makes directory creation idempotent and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(data_dir, exist_ok=True)
# CSV file that get_processed_tensors() reads.
CACHE_FILE = os.path.join(data_dir, "vfv_market_data.csv")

# Number of consecutive log returns in each normalized window.
WINDOW_SIZE = 15

def get_processed_tensors(cache_file=None, window_size=None):
    """
    Build z-score-normalized sliding windows of log returns from a price CSV.

    The CSV is read with two header rows (the Price/Ticker layout this code
    expects from yfinance). The 'Close' column is converted to log returns
    r_t = ln(P_t / P_{t-1}), which are then cut into overlapping windows;
    each window is shifted to zero mean and scaled to unit standard deviation.

    Args:
        cache_file: Path of the CSV to read. When None (the default), the
            module-level CACHE_FILE is used.
        window_size: Number of log returns per window. When None (the
            default), the module-level WINDOW_SIZE is used.

    Returns:
        A float32 torch tensor of shape [N, window_size], or None (after
        printing an error message) when the file does not exist or contains
        fewer than window_size + 1 usable prices.

    Raises:
        ValueError: If window_size is smaller than 1.
    """
    # Resolve defaults at call time so the module constants stay the single
    # source of truth for callers that pass nothing.
    if cache_file is None:
        cache_file = CACHE_FILE
    if window_size is None:
        window_size = WINDOW_SIZE
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    if not os.path.exists(cache_file):
        print(f"Error: {cache_file} not found. Run your fetcher script first.")
        return None

    # Load CSV with MultiIndex columns (Price/Ticker).
    # header=[0, 1] captures both header rows.
    df = pd.read_csv(cache_file, header=[0, 1], index_col=0, parse_dates=True)

    # Flatten the MultiIndex: (Close, <ticker>) -> Close, so df['Close'] works.
    df.columns = df.columns.get_level_values(0)

    # errors='coerce' turns any non-numeric cell into NaN, which is dropped.
    prices = pd.to_numeric(df['Close'], errors='coerce').dropna().values

    # ln(P_t / P_{t-1}) is undefined for zero/negative prices and useless for
    # infinite ones; a single such value would put inf/NaN into every window
    # that touches it, so those prices are discarded up front.
    prices = prices[np.isfinite(prices) & (prices > 0)]

    if len(prices) < window_size + 1:
        print(f"Error: Not enough data. Need at least {window_size + 1} points.")
        return None

    # Log returns: a vector of length len(prices) - 1.
    log_returns = np.log(prices[1:] / prices[:-1])

    # All overlapping windows at once, shape [N, window_size] (read-only view).
    windows = np.lib.stride_tricks.sliding_window_view(log_returns, window_size)

    # Per-window z-score: (x - mean) / std_dev.
    mu = windows.mean(axis=1, keepdims=True)
    std = windows.std(axis=1, keepdims=True)
    # Zero-variance windows are only centered (divided by 1) to avoid a
    # division by ~0.
    safe_std = np.where(std > 1e-9, std, 1.0)
    normalized = (windows - mu) / safe_std

    return torch.tensor(normalized, dtype=torch.float32)

def _main():
    """Run the preprocessing once and print a short summary of the result."""
    tensors = get_processed_tensors()
    if tensors is None:
        # get_processed_tensors() has already printed why it gave up.
        return

    print("--- Processing Complete ---")
    print(f"Tensor Shape: {tensors.shape}")  # Should be [N, WINDOW_SIZE]
    print("\nFirst Window Example:")
    print(tensors[0])
    # The count comes from WINDOW_SIZE so the message cannot drift from the
    # actual window length if the constant is changed.
    print(f"\nLatest Window Example (Last {WINDOW_SIZE} minutes):")
    print(tensors[-1])


if __name__ == "__main__":
    _main()