File size: 4,930 Bytes
2797314
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ca2e0e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2797314
 
 
 
 
1ca2e0e
2797314
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7d775b3
 
 
 
 
2797314
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
"""
Local Schema Extraction (No LLM)
Fast, cheap extraction of dataset metadata without sending to LLM.
"""

import polars as pl
from pathlib import Path
from typing import Dict, Any, Optional


def _summarize_column(col_series) -> Dict[str, Any]:
    """Build the metadata entry (dtype, missing values, cardinality, stats) for one column."""
    num_rows = len(col_series)
    dtype_str = str(col_series.dtype)
    missing_count = col_series.null_count()

    col_info = {
        'dtype': dtype_str,
        'missing_count': missing_count,
        # A header-only file has zero rows; report 0% instead of dividing by zero.
        'missing_pct': round(missing_count / num_rows * 100, 2) if num_rows else 0.0,
        # Cardinality is skipped (None) for columns with 100000+ rows.
        'unique_count': col_series.n_unique() if num_rows < 100000 else None,
    }

    # Same dtype test as the `numeric_columns` classification, so every column
    # reported as numeric (Int8, UInt32, Float32, ...) also gets its stats.
    if 'Int' in dtype_str or 'Float' in dtype_str:
        for stat_name in ('min', 'max', 'mean'):
            try:
                value = getattr(col_series, stat_name)()
                # Empty or all-null columns yield None; leave the key out.
                if value is not None:
                    col_info[stat_name] = float(value)
            except Exception:
                # Stats are best-effort: a failure here must not abort the
                # whole extraction, so the statistic is simply omitted.
                continue

    return col_info


def extract_schema_local(file_path: str, sample_rows: int = 5) -> Dict[str, Any]:
    """
    Extract dataset schema and basic stats locally without LLM.

    Args:
        file_path: Path to the dataset. Files ending in ``.csv`` or
            ``.parquet`` (case-insensitive) are read with Polars; any other
            extension is read as CSV through pandas and converted to Polars.
        sample_rows: Number of leading rows to include under ``sample_rows``.

    Returns:
        On success, a dict with:
        - ``file_path``, ``file_size_mb``, ``num_rows``, ``num_columns``
        - ``columns``: per-column ``dtype``, ``missing_count``,
          ``missing_pct``, ``unique_count`` (``None`` for 100000+ rows) and,
          for numeric columns, ``min``/``max``/``mean`` when defined
        - ``sample_rows``: the first ``sample_rows`` rows as dicts
        - ``numeric_columns``, ``categorical_columns``, ``datetime_columns``

        On failure, a dict with only ``error`` (message) and ``file_path``;
        this function does not raise for read or extraction failures.
    """
    try:
        lowered_path = file_path.lower()

        # Read with Polars (faster than pandas)
        if lowered_path.endswith('.csv'):
            # infer_schema_length / ignore_errors handle mixed-type columns and
            # prevent failures like: could not parse `835.159865` as dtype `i64`
            try:
                df = pl.read_csv(file_path, infer_schema_length=10000, ignore_errors=True)
            except Exception:
                # Final fallback: let pandas parse the file, then convert to Polars.
                try:
                    import pandas as pd
                    df = pl.from_pandas(pd.read_csv(file_path, low_memory=False))
                except Exception as e2:
                    return {
                        'error': f"Failed to read CSV: {str(e2)}",
                        'file_path': file_path
                    }
        elif lowered_path.endswith('.parquet'):
            df = pl.read_parquet(file_path)
        else:
            # Unknown extension: assume CSV-like content and read via pandas.
            import pandas as pd
            df = pl.from_pandas(pd.read_csv(file_path, low_memory=False))

        # Basic metadata
        schema_info = {
            'file_path': file_path,
            'file_size_mb': round(Path(file_path).stat().st_size / (1024 * 1024), 2),
            'num_rows': df.shape[0],
            'num_columns': df.shape[1],
            'columns': {col: _summarize_column(df[col]) for col in df.columns}
        }

        # Small sample for LLM context (only first few rows)
        schema_info['sample_rows'] = df.head(sample_rows).to_dicts()

        # Categorize columns
        columns = schema_info['columns']
        numeric_columns = [
            col for col, info in columns.items()
            if 'Int' in info['dtype'] or 'Float' in info['dtype']
        ]
        schema_info['numeric_columns'] = numeric_columns
        # Strings are always categorical; other non-numeric columns qualify
        # only when their cardinality is known and below 50.
        schema_info['categorical_columns'] = [
            col for col, info in columns.items()
            if info['dtype'] in ['Utf8', 'String'] or (
                info.get('unique_count') is not None and
                info.get('unique_count') < 50 and
                col not in numeric_columns
            )
        ]
        schema_info['datetime_columns'] = [
            col for col, info in columns.items()
            if 'Date' in info['dtype'] or 'Time' in info['dtype']
        ]

        return schema_info

    except Exception as e:
        # Top-level boundary: callers receive an error dict instead of an exception.
        return {
            'error': f"Failed to extract schema: {str(e)}",
            'file_path': file_path
        }


def infer_task_type(target_column: str, schema_info: Dict[str, Any]) -> Optional[str]:
    """
    Infer ML task type from target column without LLM.

    Args:
        target_column: Name of the target column.
        schema_info: Schema dict with a ``columns`` mapping whose entries hold
            ``dtype`` (string) and ``unique_count`` (int, or ``None`` when the
            cardinality was not computed).

    Returns:
        ``'classification'`` for string targets or targets with at most 20
        unique values, ``'regression'`` for integer/float targets with more
        than 20 unique values, and ``None`` when the column is unknown or the
        task cannot be determined (e.g. non-string target whose cardinality
        is ``None``).
    """
    columns = schema_info.get('columns', {})
    if not target_column or target_column not in columns:
        return None

    target_info = columns[target_column]
    dtype = target_info['dtype']

    # String targets are categorical regardless of cardinality.
    if dtype in ('Utf8', 'String'):
        return 'classification'

    # A missing key keeps the historical default of 0 (treated as low
    # cardinality); an explicit None means "not computed" and must not be
    # compared with an int (that would raise TypeError).
    unique_count = target_info.get('unique_count', 0)
    if unique_count is None:
        return None

    # Low cardinality → classification, whatever the dtype.
    if unique_count <= 20:
        return 'classification'

    # Numeric with many unique values → regression. The substring test covers
    # every integer/float width (Int8, UInt32, Float32, ...).
    if 'Int' in dtype or 'Float' in dtype:
        return 'regression'

    return None