"""
Data Validation Script

Tests the fetched economic indicators and reports potential math issues
(unit scale, frequency mismatches, missing data).
"""

import os
import sys
import traceback
from datetime import datetime

import numpy as np
import pandas as pd

# Make the sibling "src" directory importable before loading project modules.
sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))

from src.core.enhanced_fred_client import EnhancedFREDClient


def test_data_validation():
    """Test data validation and identify issues.

    Fetches a fixed set of indicators through ``EnhancedFREDClient`` and
    prints a nine-step report (shape, sample, statistics, missing values,
    frequencies, growth rates, unit checks, quality report, frequency
    alignment) followed by a summary of potential issues.

    The API key is read from the ``FRED_API_KEY`` environment variable and
    falls back to ``"demo"`` when it is not set.

    Any exception raised while validating is caught, printed together with
    its traceback, and not propagated. The function always returns ``None``.
    """
    api_key = os.environ.get("FRED_API_KEY", "demo")

    print("=== ECONOMIC DATA VALIDATION TEST ===\n")

    try:
        _run_validation(api_key)
    except Exception as e:
        # Deliberately best-effort: report the failure instead of crashing.
        print(f"Error during validation: {e}")
        traceback.print_exc()


# Interest-rate series: a mean below 1 suggests decimal rather than percent form.
_RATE_INDICATORS = ('FEDFUNDS', 'DGS10')
# Series means above this value are reported as a possible unit-scale problem.
_LARGE_MEAN_THRESHOLD = 1000000
# pd.infer_freq raises ValueError when given fewer dates than this.
_MIN_DATES_FOR_FREQ = 3


def _unit_issue_kind(indicator, mean_val):
    """Classify a series mean as a 'scale' issue, a 'format' issue, or None."""
    if mean_val > _LARGE_MEAN_THRESHOLD:
        return "scale"
    if mean_val < 1 and indicator in _RATE_INDICATORS:
        return "format"
    return None


def _infer_frequency(index):
    """Return ``pd.infer_freq(index)``, or None when it cannot be inferred."""
    if len(index) < _MIN_DATES_FOR_FREQ:
        # Too few observations; infer_freq would raise and abort the report.
        return None
    try:
        return pd.infer_freq(index)
    except (TypeError, ValueError):
        return None


def _run_validation(api_key):
    """Run every validation step and print the report; exceptions propagate."""
    client = EnhancedFREDClient(api_key)

    indicators = ['GDPC1', 'CPIAUCSL', 'INDPRO', 'RSAFS', 'FEDFUNDS', 'DGS10']

    print("1. Testing data fetching...")
    data = client.fetch_economic_data(
        indicators=indicators,
        start_date='2020-01-01',
        end_date='2024-12-31',
        frequency='auto'
    )

    print(f"Data shape: {data.shape}")
    print(f"Date range: {data.index.min()} to {data.index.max()}")
    print(f"Columns: {list(data.columns)}")

    print("\n2. Raw data sample (last 5 observations):")
    print(data.tail())

    print("\n3. Data statistics:")
    print(data.describe())

    print("\n4. Missing data analysis:")
    missing_data = data.isnull().sum()
    print(missing_data)

    # Drop missing values once per fetched indicator; the per-series steps
    # below all work on these cleaned series, in the order of `indicators`.
    available = {
        name: data[name].dropna() for name in indicators if name in data.columns
    }

    print("\n5. Testing frequency standardization...")
    for indicator, series in available.items():
        print(f"{indicator}: {len(series)} observations, freq: {series.index.freq}")

    print("\n6. Testing growth rate calculation...")
    for indicator, series in available.items():
        if len(series) > 1:
            pct_change = series.pct_change().dropna()
            latest_change = pct_change.iloc[-1] * 100 if len(pct_change) > 0 else 0
            print(f"{indicator}: Latest change = {latest_change:.2f}%")
            print(f" Raw values: {series.iloc[-2]:.2f} -> {series.iloc[-1]:.2f}")

    print("\n7. Testing unit normalization...")
    means = {}
    for indicator, series in available.items():
        if len(series) > 0:
            mean_val = series.mean()
            std_val = series.std()
            means[indicator] = mean_val
            print(f"{indicator}: Mean={mean_val:.2f}, Std={std_val:.2f}")

            kind = _unit_issue_kind(indicator, mean_val)
            if kind == "scale":
                print(f" WARNING: {indicator} has very large values - may need unit conversion")
            elif kind == "format":
                print(f" WARNING: {indicator} has small values - may be in decimal form instead of percentage")

    print("\n8. Testing data quality validation...")
    quality_report = client.validate_data_quality(data)
    print("Quality report summary:")
    for series_name, metrics in quality_report['missing_data'].items():
        print(f" {series_name}: {metrics['completeness']:.1f}% complete")

    print("\n9. Testing frequency alignment...")
    frequencies = {}
    for indicator, series in available.items():
        if len(series) > 0:
            freq = _infer_frequency(series.index)
            frequencies[indicator] = freq
            print(f" {indicator}: {freq}")

    # An undetermined frequency (None) counts as its own value here.
    unique_freqs = set(frequencies.values())
    if len(unique_freqs) > 1:
        print(f" WARNING: Multiple frequencies detected: {unique_freqs}")
        print(" This may cause issues in modeling and forecasting")

    print("\n=== VALIDATION COMPLETE ===")

    print("\n=== POTENTIAL ISSUES IDENTIFIED ===")

    issues = []

    # Unit problems, using the same classification as step 7.
    for indicator, mean_val in means.items():
        kind = _unit_issue_kind(indicator, mean_val)
        if kind == "scale":
            issues.append(f"Unit scale issue: {indicator} has very large values ({mean_val:.0f})")
        elif kind == "format":
            issues.append(f"Unit format issue: {indicator} may be in decimal form instead of percentage")

    if len(unique_freqs) > 1:
        issues.append(f"Frequency mismatch: Series have different frequencies {unique_freqs}")

    for series_name, metrics in quality_report['missing_data'].items():
        if metrics['missing_percentage'] > 10:
            issues.append(f"Missing data: {series_name} has {metrics['missing_percentage']:.1f}% missing values")

    if issues:
        for issue in issues:
            print(f" • {issue}")
    else:
        print(" No major issues detected")


if __name__ == "__main__":
    # Run the validation only when executed as a script, not on import.
    test_data_validation()