# Hosting-page residue kept for provenance (not part of the program):
# 42Cummer's picture
# Upload 15 files
# 2f560eb verified
import pandas as pd # type: ignore
import numpy as np # type: ignore
import torch # type: ignore
import os
# The project root is the directory two levels above this file, i.e. the
# parent of the directory that contains this module.
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
data_dir = os.path.join(project_root, "data")

# exist_ok=True makes the creation idempotent and removes the
# check-then-create race of testing os.path.exists() first (another process
# could create the directory between the check and the makedirs call).
os.makedirs(data_dir, exist_ok=True)

# CSV that get_processed_tensors() reads; it is expected to be written by a
# separate fetcher script.
CACHE_FILE = os.path.join(data_dir, "vfv_market_data.csv")

# Number of consecutive log returns in each normalized window.
WINDOW_SIZE = 15
def _zscore_windows(series, window_size):
    """Return all length-`window_size` sliding windows of `series`, z-scored per window."""
    # Shape (len(series) - window_size + 1, window_size). This is a strided
    # view onto `series`, so no per-window copies are made here.
    stacked = np.lib.stride_tricks.sliding_window_view(series, window_size)
    mu = stacked.mean(axis=1, keepdims=True)
    sigma = stacked.std(axis=1, keepdims=True)
    # Windows with (near-)zero variance are only mean-centred: dividing by
    # 1.0 instead of sigma avoids amplifying floating-point noise.
    safe_sigma = np.where(sigma > 1e-9, sigma, 1.0)
    return (stacked - mu) / safe_sigma


def get_processed_tensors() -> "torch.Tensor | None":
    """
    Read the cached yfinance CSV and build normalized log-return windows.

    The CSV at CACHE_FILE is read with its two header rows (Price / Ticker),
    the header is flattened to the first level, and the 'Close' column is
    converted to log returns r_t = ln(P_t / P_{t-1}). Every run of
    WINDOW_SIZE consecutive returns becomes one row, z-score normalized
    independently (rows with std <= 1e-9 are only mean-centred).

    Returns:
        A float32 tensor of shape (n_prices - WINDOW_SIZE, WINDOW_SIZE),
        where n_prices is the number of usable close prices, or None (after
        printing an error) when the cache file is missing, has no 'Close'
        column, or holds fewer than WINDOW_SIZE + 1 usable prices.
    """
    if not os.path.exists(CACHE_FILE):
        print(f"Error: {CACHE_FILE} not found. Run your fetcher script first.")
        return None

    # yfinance saves two header rows; header=[0, 1] captures both as a
    # MultiIndex (Price, Ticker).
    df = pd.read_csv(CACHE_FILE, header=[0, 1], index_col=0, parse_dates=True)

    # Flatten (Price, Ticker) -> Price so that df['Close'] works.
    df.columns = df.columns.get_level_values(0)

    if 'Close' not in df.columns:
        print(f"Error: no 'Close' column found in {CACHE_FILE}.")
        return None

    close = df['Close']
    # With more than one ticker in the file the flattened header contains
    # duplicate 'Close' labels and the selection is a DataFrame, which
    # pd.to_numeric rejects. Use the first 'Close' column in that case.
    if isinstance(close, pd.DataFrame):
        close = close.iloc[:, 0]

    # errors='coerce' turns any non-numeric strings into NaN, which are then
    # dropped.
    prices = pd.to_numeric(close, errors='coerce').dropna().to_numpy(dtype=np.float64)

    # A log return is only defined for finite, strictly positive prices; a
    # single zero/negative/inf price would otherwise put NaN or inf into
    # every window that touches it.
    prices = prices[np.isfinite(prices) & (prices > 0)]

    if len(prices) < WINDOW_SIZE + 1:
        print(f"Error: Not enough data. Need at least {WINDOW_SIZE + 1} points.")
        return None

    # r_t = ln(P_t / P_{t-1}); the result has len(prices) - 1 elements.
    log_returns = np.log(prices[1:] / prices[:-1])

    # Per-window z-scoring; the original comment notes this is needed for
    # quantum angle embedding to avoid saturation.
    normalized = _zscore_windows(log_returns, WINDOW_SIZE)

    return torch.tensor(normalized, dtype=torch.float32)
# Script entry point: build the tensors and print a short summary.
if __name__ == "__main__":
    tensors = get_processed_tensors()
    if tensors is not None:
        print("--- Processing Complete ---")
        # One row per window, WINDOW_SIZE columns.
        print(f"Tensor Shape: {tensors.shape}")
        # Show the oldest and the most recent window, each under a heading.
        for heading, sample in (
            ("\nFirst Window Example:", tensors[0]),
            ("\nLatest Window Example (Last 15 minutes):", tensors[-1]),
        ):
            print(heading)
            print(sample)