Spaces:

Learnerbegginer
/

Auto-ML-Preprocessing

Sleeping

App Files Files Community

Learnerbegginer committed on 3 days ago

Commit

ebbb600

1 Parent(s): c3c3195

Update Streamlit app to support deployed backend with fallback to localhost

Browse files

Files changed (1) hide show

app.py +182 -227

app.py CHANGED Viewed

@@ -54,48 +54,67 @@ st.markdown("""
 </style>
 """, unsafe_allow_html=True)
-# API base URL
-API_BASE = "http://localhost:8000"
 def check_backend_health():
-    """Check if backend is running"""
-    try:
-        response = requests.get(f"{API_BASE}/health", timeout=5)
-        return response.status_code == 200
-    except:
-        return False
 def upload_dataset(uploaded_file):
     """Upload dataset to backend"""
     try:
         files = {'file': uploaded_file}
-        response = requests.post(f"{API_BASE}/api/upload-dataset", files=files, timeout=30)
         if response.status_code == 200:
-            return response.json()
         else:
-            return None
     except Exception as e:
-        st.error(f"Upload error: {str(e)}")
-        return None
-def process_pipeline(uploaded_file, prompt):
-    """Process dataset with ML pipeline"""
     try:
-        files = {'file': uploaded_file}
-        data = {'prompt': prompt}
-        response = requests.post(f"{API_BASE}/process-pipeline", files=files, data=data, timeout=120)
         if response.status_code == 200:
-            return response.json()
         else:
-            st.error(f"Processing error: {response.text}")
-            return None
     except Exception as e:
-        st.error(f"Processing error: {str(e)}")
-        return None
 def download_file(filename):
-    """Generate download link for file"""
-    return f"{API_BASE}/api/download/{filename}"
 def main():
     # Main header
@@ -103,16 +122,21 @@ def main():
     st.markdown('<p style="text-align: center; color: #6b7280; font-size: 1.1rem;">Convert natural language prompts into ML-ready datasets</p>', unsafe_allow_html=True)
     # Check backend health
-    if not check_backend_health():
-        st.error("❌ Backend is not running! Please start the backend first:")
         st.code("""
 cd promptprepml/backend
 venv\\Scripts\\activate
 python app/main.py
 """)
         return
-    st.success("✅ Backend is connected and ready!")
     # Sidebar for navigation
     st.sidebar.title("📋 Processing Steps")
@@ -120,248 +144,179 @@ python app/main.py
     # Initialize session state
     if 'step' not in st.session_state:
         st.session_state.step = 'upload'
-    if 'dataset_info' not in st.session_state:
-        st.session_state.dataset_info = None
-    if 'processing_results' not in st.session_state:
-        st.session_state.processing_results = None
-    if 'uploaded_file' not in st.session_state:
-        st.session_state.uploaded_file = None
     # Step indicators
-    steps = ['📤 Upload Dataset', '💬 Enter Prompt', '⚡ Processing', '📊 Results']
-    step_mapping = {
-        'upload': 0,
-        'prompt': 1,
-        'processing': 2,
-        'results': 3
-    }
-    current_step_idx = step_mapping.get(st.session_state.step, 0)
     for i, step in enumerate(steps):
-        if i <= current_step_idx:
-            st.sidebar.markdown(f"✅ {step}")
         else:
-            st.sidebar.markdown(f"⏳ {step}")
-    # Main content based on current step
     if st.session_state.step == 'upload':
-        st.markdown('<h2 class="step-header">📤 Upload Your Dataset</h2>', unsafe_allow_html=True)
-        # File upload
         uploaded_file = st.file_uploader(
             "Choose a CSV file",
             type=['csv'],
-            help="Upload your dataset in CSV format. Maximum file size: 200MB"
         )
         if uploaded_file is not None:
-            # Display file info
-            st.markdown('<div class="info-box">', unsafe_allow_html=True)
-            st.write(f"**Filename:** {uploaded_file.name}")
-            st.write(f"**Size:** {uploaded_file.size / 1024 / 1024:.2f} MB")
-            st.markdown('</div>', unsafe_allow_html=True)
-            # Preview data
             try:
                 df = pd.read_csv(uploaded_file)
-                st.write("**Data Preview:**")
-                st.dataframe(df.head(), use_container_width=True)
-                st.write(f"**Shape:** {df.shape[0]} rows × {df.shape[1]} columns")
-                # Upload button
-                if st.button("🚀 Upload Dataset", type="primary"):
                     with st.spinner("Uploading dataset..."):
-                        # Reset file pointer
-                        uploaded_file.seek(0)
-                        result = upload_dataset(uploaded_file)
-                        if result:
-                            st.session_state.dataset_info = result
-                            st.session_state.uploaded_file = uploaded_file  # Store the file
-                            st.session_state.step = 'prompt'
-                            st.rerun()
                         else:
-                            st.error("Upload failed. Please try again.")
             except Exception as e:
-                st.error(f"Error reading CSV file: {str(e)}")
-    elif st.session_state.step == 'prompt':
-        st.markdown('<h2 class="step-header">💬 Describe Your Preprocessing Needs</h2>', unsafe_allow_html=True)
-        # Show dataset info
-        if st.session_state.dataset_info:
-            info = st.session_state.dataset_info['dataset_info']
             st.markdown('<div class="info-box">', unsafe_allow_html=True)
-            col1, col2, col3, col4 = st.columns(4)
-            with col1:
-                st.metric("Rows", info['shape'][0])
-            with col2:
-                st.metric("Columns", info['shape'][1])
-            with col3:
-                st.metric("Missing Values", sum(info['missing_values'].values()))
-            with col4:
-                st.metric("Duplicates", info['duplicates'])
             st.markdown('</div>', unsafe_allow_html=True)
-            # Show file info
-            if st.session_state.uploaded_file:
-                st.info(f"📁 File loaded: {st.session_state.uploaded_file.name} ({st.session_state.uploaded_file.size / 1024 / 1024:.2f} MB)")
-        # Prompt input
-        st.write("**Enter your preprocessing instructions in natural language:**")
-        # Example prompts
-        example_prompts = [
-            "Prepare this dataset for fraud classification, handle missing values, encode categorical variables, remove outliers, and scale numeric features.",
-            "Clean this dataset for customer churn prediction, fill missing values with median, one-hot encode categories, and apply standard scaling.",
-            "Preprocess data for regression analysis, handle null values, remove duplicates, and normalize numerical features.",
-            "Get this dataset ready for machine learning, handle missing data, encode categorical variables, and scale features.",
-            "Analyze this customer dataset and prepare it for machine learning. Remove duplicate rows and unnecessary identifier columns. Handle missing values appropriately. Encode categorical variables such as country, city, and company. Extract useful features from the subscription date. Scale any numerical features if present. Remove low-variance features and prepare the dataset for clustering or classification."
-        ]
-        # Prompt text area
         prompt = st.text_area(
-            "Your prompt:",
-            height=120,
-            placeholder="e.g., Handle missing values, encode categorical variables, remove outliers, and scale numeric features",
-            help="Describe how you want to preprocess your dataset in plain English"
         )
-        # Example prompts section
-        with st.expander("💡 Example Prompts"):
-            for i, example in enumerate(example_prompts, 1):
-                if st.button(f"Use Example {i}", key=f"example_{i}"):
-                    prompt = example
-                    st.rerun()
-                st.write(f"{i}. {example}")
-        # Supported operations info
-        with st.expander("🔧 Supported Operations"):
-            st.write("""
-            **Missing Values:**
-            - Mean/median/mode imputation
-            - Constant value filling
-            - Row deletion
-            **Categorical Encoding:**
-            - One-hot encoding
-            - Label encoding
-            **Feature Scaling:**
-            - Standard scaling (Z-score)
-            - Min-max scaling
-            - Robust scaling
-            **Outlier Detection:**
-            - Isolation Forest
-            - IQR method
-            - Z-score method
-            **Feature Engineering:**
-            - Variance threshold selection
-            - Correlation filtering
-            - Interaction features
-            """)
-        # Process button
-        if prompt and st.button("🚀 Process Dataset", type="primary"):
-            if st.session_state.uploaded_file:
-                with st.spinner("Processing dataset... This may take a few minutes."):
-                    # Reset file pointer
-                    st.session_state.uploaded_file.seek(0)
-                    result = process_pipeline(st.session_state.uploaded_file, prompt)
-                    if result:
-                        st.session_state.processing_results = result
-                        st.session_state.step = 'results'
-                        st.rerun()
-            else:
-                st.warning("No file found. Please upload your dataset again.")
     elif st.session_state.step == 'results':
-        st.markdown('<h2 class="step-header">🎉 Processing Complete!</h2>', unsafe_allow_html=True)
-        if st.session_state.processing_results:
-            results = st.session_state.processing_results
             # Success message
             st.markdown('<div class="success-box">', unsafe_allow_html=True)
-            st.success("✅ Your dataset has been successfully preprocessed and is ready for machine learning!")
             st.markdown('</div>', unsafe_allow_html=True)
-            # Dataset information
-            st.write("### 📊 Dataset Information")
-            info = results['dataset_info']['basic_info']
-            col1, col2, col3, col4 = st.columns(4)
-            with col1:
-                st.metric("Original Shape", f"{info['shape'][0]} × {info['shape'][1]}")
-            with col2:
-                st.metric("Numeric Columns", len(info['numeric_columns']))
-            with col3:
-                st.metric("Categorical Columns", len(info['categorical_columns']))
-            with col4:
-                missing_total = sum(results['dataset_info']['missing_values']['counts'].values())
-                st.metric("Missing Values", missing_total)
-            # Applied preprocessing steps
-            st.write("### 🔧 Applied Preprocessing Steps")
-            for i, step in enumerate(results['preprocessing_steps'], 1):
-                st.markdown(f"""
-                <div style="padding: 1rem; margin: 0.5rem 0; background-color: #f8fafc; border-left: 4px solid #3b82f6; border-radius: 0.25rem;">
-                    <strong>Step {i}:</strong> {step['description']}<br>
-                    <small>Method: {step.get('method', 'N/A')}</small>
-                </div>
-                """, unsafe_allow_html=True)
-            # Download files
-            st.write("### 📁 Download Files")
-            files_to_download = [
-                ("processed_dataset.csv", "📊 Processed Dataset", "Fully preprocessed dataset ready for ML"),
-                ("train.csv", "🚂 Training Set", "80% of data for model training"),
-                ("test.csv", "🧪 Test Set", "20% of data for model testing"),
-                ("pipeline.pkl", "⚙️ Pipeline", "Scikit-learn pipeline for reuse"),
-                ("eda_report.html", "📈 EDA Report", "Exploratory Data Analysis report")
-            ]
-            col1, col2 = st.columns(2)
-            for i, (filename, title, description) in enumerate(files_to_download):
-                with col1 if i % 2 == 0 else col2:
-                    st.markdown(f"""
-                    <div style="padding: 1rem; margin: 0.5rem 0; border: 1px solid #e5e7eb; border-radius: 0.5rem;">
-                        <h4>{title}</h4>
-                        <p><small>{description}</small></p>
-                        <a href="{download_file(filename)}" download="{filename}" style="text-decoration: none;">
-                            <button style="background-color: #3b82f6; color: white; padding: 0.5rem 1rem; border: none; border-radius: 0.25rem; cursor: pointer;">
-                                📥 Download {filename}
-                            </button>
-                        </a>
-                    </div>
-                    """, unsafe_allow_html=True)
-            # Quick actions
-            st.write("### ⚡ Quick Actions")
-            col1, col2, col3 = st.columns(3)
             with col1:
-                if st.button("📈 View EDA Report", type="secondary"):
-                    st.info(f"EDA Report will be available at: {download_file('eda_report.html')}")
             with col2:
-                if st.button("⚙️ Download Pipeline", type="secondary"):
-                    st.info(f"Pipeline file: {download_file('pipeline.pkl')}")
-            with col3:
-                if st.button("🔄 Process Another Dataset", type="primary"):
-                    # Reset session state
-                    for key in st.session_state.keys():
-                        del st.session_state[key]
-                    st.session_state.step = 'upload'
-                    st.rerun()
-        else:
-            st.error("No processing results available. Please start over.")
     # Footer
     st.markdown("---")

 </style>
 """, unsafe_allow_html=True)
+# API base URLs - try deployed backend first, fallback to localhost
+# check_backend_health() probes these in order and stores the first one that
+# answers GET /health with HTTP 200 in st.session_state.backend_url.
+DEPLOYED_BACKEND = "https://promptprepml-backend.railway.app"
+LOCAL_BACKEND = "http://localhost:8000"
 def check_backend_health():
+    """Find the first reachable backend (deployed first, then local).
+
+    Sends GET <backend>/health to each candidate with a 5-second timeout.
+    The first backend that answers with HTTP 200 is stored in
+    st.session_state.backend_url so the other API helpers can use it.
+
+    Returns:
+        (True, backend_url) when a backend is healthy, otherwise (False, None).
+    """
+    backends = [DEPLOYED_BACKEND, LOCAL_BACKEND]
+    for backend_url in backends:
+        try:
+            response = requests.get(f"{backend_url}/health", timeout=5)
+        except requests.RequestException:
+            # Connection refused, DNS failure, timeout, ...: try the next
+            # candidate. A bare except would also hide KeyboardInterrupt
+            # and programming errors.
+            continue
+        if response.status_code == 200:
+            st.session_state.backend_url = backend_url
+            return True, backend_url
+    return False, None
 def upload_dataset(uploaded_file):
     """Upload dataset to backend"""
+    # Returns (response_json, None) on HTTP 200, otherwise (None, error_message).
+    if 'backend_url' not in st.session_state:
+        return None, "Backend not connected"
     try:
+        # The caller previews the file with pd.read_csv() before uploading,
+        # which leaves the stream at EOF; rewind so the full content is sent.
+        uploaded_file.seek(0)
         files = {'file': uploaded_file}
+        response = requests.post(
+            f"{st.session_state.backend_url}/api/upload",
+            files=files,
+            timeout=30,  # without a timeout a stalled backend hangs the app
+        )
         if response.status_code == 200:
+            return response.json(), None
         else:
+            return None, f"Upload failed: {response.text}"
     except Exception as e:
+        return None, f"Upload error: {str(e)}"
+def process_pipeline(file_path, prompt):
+    """Ask the backend to run the preprocessing pipeline on an uploaded file.
+
+    POSTs {'file_path': ..., 'prompt': ...} as JSON to
+    <backend_url>/process-pipeline.
+
+    Returns:
+        (response_json, None) on HTTP 200, otherwise (None, error_message).
+    """
+    if 'backend_url' not in st.session_state:
+        return None, "Backend not connected"
     try:
+        data = {'file_path': file_path, 'prompt': prompt}
+        response = requests.post(
+            f"{st.session_state.backend_url}/process-pipeline",
+            json=data,
+            timeout=120,  # processing is slow, but must not block forever
+        )
         if response.status_code == 200:
+            return response.json(), None
         else:
+            return None, f"Processing failed: {response.text}"
     except Exception as e:
+        return None, f"Processing error: {str(e)}"
 def download_file(filename):
+    """Fetch a processed file from the backend.
+
+    Sends GET <backend_url>/api/download/<filename>.
+
+    Returns:
+        (file_bytes, None) on HTTP 200, otherwise (None, error_message).
+    """
+    if 'backend_url' not in st.session_state:
+        return None, "Backend not connected"
+    try:
+        response = requests.get(
+            f"{st.session_state.backend_url}/api/download/{filename}",
+            timeout=60,  # without a timeout a stalled backend hangs the app
+        )
+        if response.status_code == 200:
+            return response.content, None
+        else:
+            return None, f"Download failed: {response.text}"
+    except Exception as e:
+        return None, f"Download error: {str(e)}"
 def main():
     # Main header
     st.markdown('<p style="text-align: center; color: #6b7280; font-size: 1.1rem;">Convert natural language prompts into ML-ready datasets</p>', unsafe_allow_html=True)
     # Check backend health
+    backend_healthy, backend_url = check_backend_health()
+    if not backend_healthy:
+        st.error("❌ Backend is not running! Please start the backend:")
         st.code("""
 cd promptprepml/backend
 venv\\Scripts\\activate
 python app/main.py
+# OR wait for deployed backend to be ready
 """)
+        st.info("🚀 **Deploying backend to cloud...** This will make the app work standalone!")
         return
+    st.success(f"✅ Backend connected at: {backend_url}")
     # Sidebar for navigation
     st.sidebar.title("📋 Processing Steps")
     # Initialize session state
     if 'step' not in st.session_state:
         st.session_state.step = 'upload'
+    if 'upload_result' not in st.session_state:
+        st.session_state.upload_result = None
+    if 'processing_result' not in st.session_state:
+        st.session_state.processing_result = None
     # Step indicators
+    steps = ['📤 Upload', '⚙️ Configure', '🚀 Process', '📊 Results']
+    current_step_index = 0
+    if st.session_state.step == 'upload':
+        current_step_index = 0
+    elif st.session_state.step == 'configure':
+        current_step_index = 1
+    elif st.session_state.step == 'process':
+        current_step_index = 2
+    elif st.session_state.step == 'results':
+        current_step_index = 3
+    # Display step indicators
     for i, step in enumerate(steps):
+        if i < current_step_index:
+            st.sidebar.success(f"✅ {step}")
+        elif i == current_step_index:
+            st.sidebar.info(f"🔄 {step}")
         else:
+            st.sidebar.write(f"⏳ {step}")
+    # Step 1: Upload Dataset
     if st.session_state.step == 'upload':
+        st.markdown('<h2 class="step-header">📤 Step 1: Upload Dataset</h2>', unsafe_allow_html=True)
         uploaded_file = st.file_uploader(
             "Choose a CSV file",
             type=['csv'],
+            help="Upload your dataset for preprocessing"
         )
         if uploaded_file is not None:
+            st.info(f"📄 File uploaded: `{uploaded_file.name}`")
+            # Show file preview
             try:
                 df = pd.read_csv(uploaded_file)
+                st.markdown('<div class="info-box">', unsafe_allow_html=True)
+                st.markdown(f"**Dataset Shape:** {df.shape}")
+                st.markdown(f"**Columns:** {', '.join(df.columns)}")
+                st.dataframe(df.head())
+                st.markdown('</div>', unsafe_allow_html=True)
+                if st.button("🚀 Continue to Configuration", type="primary"):
+                    # Upload to backend
                     with st.spinner("Uploading dataset..."):
+                        result, error = upload_dataset(uploaded_file)
+                        if error:
+                            st.error(f"❌ Upload failed: {error}")
                         else:
+                            st.session_state.upload_result = result
+                            st.session_state.step = 'configure'
+                            st.rerun()
             except Exception as e:
+                st.error(f"❌ Error reading file: {str(e)}")
+    # Step 2: Configure Processing
+    elif st.session_state.step == 'configure':
+        st.markdown('<h2 class="step-header">⚙️ Step 2: Configure Processing</h2>', unsafe_allow_html=True)
+        if st.session_state.upload_result:
+            file_info = st.session_state.upload_result
             st.markdown('<div class="info-box">', unsafe_allow_html=True)
+            st.markdown(f"**File:** {file_info.get('filename', 'Unknown')}")
+            st.markdown(f"**Size:** {file_info.get('size', 'Unknown')} bytes")
             st.markdown('</div>', unsafe_allow_html=True)
+        # Processing options
         prompt = st.text_area(
+            "Describe your preprocessing needs:",
+            value="Prepare this dataset for machine learning. Handle missing values, remove identifier columns, extract date features, encode categorical variables, and scale numeric features.",
+            height=100,
+            help="Describe what you want to do with your dataset in natural language"
         )
+        col1, col2 = st.columns([1, 1])
+        with col1:
+            if st.button("⬅️ Back", type="secondary"):
+                st.session_state.step = 'upload'
+                st.rerun()
+        with col2:
+            if st.button("🚀 Start Processing", type="primary"):
+                if st.session_state.upload_result:
+                    file_path = st.session_state.upload_result.get('file_path')
+                    with st.spinner("Processing dataset... This may take a few minutes."):
+                        result, error = process_pipeline(file_path, prompt)
+                        if error:
+                            st.error(f"❌ Processing failed: {error}")
+                        else:
+                            st.session_state.processing_result = result
+                            st.session_state.step = 'results'
+                            st.rerun()
+    # Step 3: Results
     elif st.session_state.step == 'results':
+        st.markdown('<h2 class="step-header">📊 Step 3: Results</h2>', unsafe_allow_html=True)
+        if st.session_state.processing_result:
+            result = st.session_state.processing_result
             # Success message
             st.markdown('<div class="success-box">', unsafe_allow_html=True)
+            st.success("✅ Dataset processed successfully!")
             st.markdown('</div>', unsafe_allow_html=True)
+            # Results summary
+            col1, col2 = st.columns([2, 1])
             with col1:
+                st.markdown("### 📈 Processing Summary")
+                dataset_info = result.get('dataset_info', {})
+                if dataset_info:
+                    basic_info = dataset_info.get('basic_info', {})
+                    st.markdown(f"- **Original Shape:** {basic_info.get('shape', 'Unknown')}")
+                    st.markdown(f"- **Columns:** {basic_info.get('columns', 'Unknown')}")
+                preprocessing_info = result.get('preprocessing_info', {})
+                if preprocessing_info:
+                    st.markdown(f"- **Processed Shape:** {preprocessing_info.get('processed_shape', 'Unknown')}")
+                # Dataset preview
+                st.markdown("### 👀 Dataset Preview")
+                preview_data = result.get('preview_data', [])
+                if preview_data:
+                    df_preview = pd.DataFrame(preview_data)
+                    st.dataframe(df_preview)
             with col2:
+                st.markdown("### 📥 Download Files")
+                download_links = [
+                    ("Processed Dataset", "processed_dataset.csv"),
+                    ("Training Set", "train.csv"),
+                    ("Test Set", "test.csv"),
+                    ("Pipeline", "pipeline.pkl"),
+                    ("EDA Report", "eda_report.html")
+                ]
+                for name, filename in download_links:
+                    if st.button(f"📥 {name}", key=f"download_{filename}"):
+                        with st.spinner(f"Downloading {filename}..."):
+                            file_content, error = download_file(filename)
+                            if error:
+                                st.error(f"❌ Download failed: {error}")
+                            else:
+                                st.download_button(
+                                    label=f"💾 Save {filename}",
+                                    data=file_content,
+                                    file_name=filename,
+                                    mime="application/octet-stream"
+                                )
+        # Action buttons
+        col1, col2 = st.columns([1, 1])
+        with col1:
+            if st.button("🔄 Process New Dataset", type="secondary"):
+                # Reset session state
+                for key in list(st.session_state.keys()):
+                    del st.session_state[key]
+                st.session_state.step = 'upload'
+                st.rerun()
+        with col2:
+            if st.button("📈 View EDA Report", type="primary"):
+                st.info("📊 EDA Report feature coming soon!")
     # Footer
     st.markdown("---")