File size: 3,325 Bytes
5a58b2b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import logging
import math
from typing import Any, Dict, List

logger = logging.getLogger(__name__)

# The 18 model input features, in EXACT ORDER.
# PreprocessingService.process_input walks this list front to back, so the
# sequence below fixes the position of every value in the output vector
# (presumably the column order the model was trained with -- confirm before
# reordering anything).
MODEL_FEATURES = [
    "General_Health",
    "Checkup",
    "Exercise",
    "Skin_Cancer",
    "Other_Cancer",
    "Depression",
    "Diabetes",
    "Arthritis",
    "Sex",
    "Age",
    "Height",
    "Weight",
    "BMI",
    "Smoking",
    "Alcohol",
    "Fruit",
    "Green_Vegetables",
    "Fried_Potato",
]

# Per-feature encoding rules, keyed by feature name.
#   * An entry that carries 'min'/'max' describes a numeric range that is
#     min-max scaled and clamped to [0, 1] by PreprocessingService.
#   * Any other entry maps each accepted raw label to its encoded value.
SCALERS = {
    # --- numeric ranges ---
    'Weight': dict(min=29.94, max=136.0),
    'Height': dict(min=142.0, max=200.0),
    'BMI': dict(min=12.87, max=43.28),
    'Age': dict(min=21, max=82),
    'Fruit': dict(min=0, max=56.0),
    'Green_Vegetables': dict(min=0, max=44.0),
    'Fried_Potato': dict(min=0, max=17.0),
    'Alcohol': dict(min=0, max=15.0),
    # --- multi-level categories ---
    'General_Health': {
        'Excellent': 1.0,
        'Very_Good': 0.75,
        'Good': 0.5,
        'Fair': 0.25,
        'Poor': 0.0,
    },
    'Checkup': {
        'Within 1 year': 1.0,
        '1-2 years': 0.75,
        '2-5 years': 0.5,
        '5+ years': 0.25,
        'Never': 0.0,
    },
    'Diabetes': {
        'No': 0.0,
        'Borderline': 0.33,
        'During Pregnancy': 0.66,
        'Yes': 1.0,
    },
    'Sex': {'Female': 0.0, 'Male': 1.0, '0': 0.0, '1': 1.0},
    # --- binary flags: each gets its own copy of the same '0'/'1' table ---
    **{
        flag: {'0': 0.0, '1': 1.0}
        for flag in (
            'Exercise',
            'Smoking',
            'Skin_Cancer',
            'Other_Cancer',
            'Depression',
            'Arthritis',
        )
    },
}

class PreprocessingService:
    """Convert raw feature values into the normalized vector fed to the model.

    Each feature named in ``MODEL_FEATURES`` is encoded according to its entry
    in ``SCALERS``. Values that cannot be interpreted are logged and encoded
    as ``0.0`` instead of raising, so one bad field never aborts a request.
    """

    # Spellings accepted as a yes/no answer for categorical features.
    _TRUTHY_WORDS = frozenset({'on', 'true', 'yes'})
    _FALSY_WORDS = frozenset({'off', 'false', 'no'})

    @staticmethod
    def process_input(data: Dict[str, Any]) -> List[float]:
        """Build the model input vector from a mapping of raw feature values.

        Args:
            data: Raw values keyed by feature name. Missing features are
                looked up as ``None`` and therefore encoded as ``0.0``.

        Returns:
            One float per entry of ``MODEL_FEATURES``, in that exact order.
        """
        return [
            PreprocessingService._process_single_value(data.get(feature), feature)
            for feature in MODEL_FEATURES
        ]

    @staticmethod
    def _normalize_label(text: str) -> str:
        """Lower-case *text*, treat underscores as spaces, collapse whitespace."""
        return ' '.join(text.replace('_', ' ').split()).lower()

    @staticmethod
    def _process_single_value(value: Any, feature_name: str) -> float:
        """Encode one raw value according to the scaler of ``feature_name``.

        Args:
            value: Raw value supplied for the feature (any type, may be None).
            feature_name: Key used to look up the encoding rule in ``SCALERS``.

        Returns:
            * Numeric range feature: the min-max scaled value clamped to
              ``[0.0, 1.0]``.
            * Categorical feature: the encoded value of the matching label.
            * Feature without a scaler: ``float(value)``.
            In every case ``0.0`` is returned when the value is unusable.
        """
        config = SCALERS.get(feature_name)

        # No (usable) scaler for this feature: pass a plain number through.
        if not isinstance(config, dict):
            try:
                return float(value)
            except (ValueError, TypeError, OverflowError):
                return 0.0

        # Numeric range scaling.
        if 'min' in config:
            try:
                number = float(value)
            except (ValueError, TypeError, OverflowError):
                number = math.nan
            # NaN must be rejected explicitly: min()/max() would otherwise
            # quietly turn it into 1.0, the top of the range.
            if math.isnan(number):
                logger.warning(
                    "Failed to convert numeric value '%s' for feature '%s'. Defaulting to 0.0.",
                    value, feature_name,
                )
                return 0.0
            scaled = (number - config['min']) / (config['max'] - config['min'])
            return max(0.0, min(1.0, scaled))

        # Categorical: an exact label match wins.
        key = str(value)
        if key in config:
            return config[key]

        # A flag that arrives as an integral float (1.0) should match the
        # '1' / '0' labels rather than the never-defined '1.0'.
        if isinstance(value, float) and value.is_integer():
            key = str(int(value))

        # Tolerant match: ignore case and treat '_' like a space, so that
        # "Very Good", "very_good" and "Very_Good" all hit the same label.
        wanted = PreprocessingService._normalize_label(key)
        for label, encoded in config.items():
            if PreprocessingService._normalize_label(label) == wanted:
                return encoded

        # Generic boolean spellings (e.g. HTML checkbox "on", Python True).
        if wanted in PreprocessingService._TRUTHY_WORDS:
            return 1.0
        if wanted in PreprocessingService._FALSY_WORDS:
            return 0.0

        logger.warning(
            "Unknown categorical value '%s' for feature '%s'. Defaulting to 0.0.",
            value, feature_name,
        )
        return 0.0