beemabee committed on
Commit
9fb1c2a
·
1 Parent(s): 91c8ed7

add required file

Browse files
Data.xlsx ADDED
Binary file (47.6 kB). View file
 
README.md CHANGED
@@ -9,4 +9,87 @@ app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  pinned: false
10
  ---
11
 
12
+ ## Project Structure
13
+ - `src/`: Contains the source code
14
+ - `training.ipynb`: Jupyter notebook for data preprocessing and model training
15
+ - `app.py`: Streamlit application for predictions and SHAP analysis
16
+ - `model/`: Stores trained models and scalers
17
+ - `data/`: Contains the dataset (not included in the repository)
18
+
19
+ ## Installation
20
+ 1. Clone this repository:
21
+ ```
22
+ git clone https://github.com/yourusername/Factory_Prediction.git
23
+ cd Factory_Prediction
24
+ ```
25
+
26
+ 2. Create a virtual environment (optional but recommended):
27
+ ```
28
+ python -m venv venv
29
+ source venv/bin/activate # On Windows use `venv\Scripts\activate`
30
+ ```
31
+
32
+ 3. Install the required packages:
33
+ ```
34
+ pip install -r requirements.txt
35
+ ```
36
+
37
+ ## Data Preprocessing and Model Training
38
+ 1. Open and run the `src/training.ipynb` notebook in Jupyter or any compatible environment.
39
+ 2. The notebook covers:
40
+ - Data loading and cleaning
41
+ - Exploratory Data Analysis (EDA)
42
+ - Feature selection and engineering
43
+ - Model training (Linear Regression, Random Forest, XGBoost)
44
+ - Hyperparameter tuning
45
+ - Model evaluation
46
+
47
+ ## Running the Streamlit Application
48
+ 1. Ensure you have completed the model training step.
49
+ 2. Run the Streamlit app:
50
+ ```
51
+ streamlit run src/app.py
52
+ ```
53
+ 3. Open the provided URL in your web browser.
54
+
55
+ ## Using the Streamlit Application
56
+ 1. Input values for each feature (SamplingNC, SamplingChek, QTY, TimeProduce, Years).
57
+ 2. Click the "Prediksi dan Analisis" button.
58
+ 3. View the predicted NC percentage and SHAP analysis visualizations.
59
+
60
+ ## Project Methodology
61
+ 1. Data Cleaning:
62
+ - Handled missing values and outliers
63
+ - Standardized data formats
64
+
65
+ 2. Feature Selection:
66
+ - Used correlation analysis to identify relevant features
67
+ - Applied domain knowledge to select meaningful predictors
68
+
69
+ 3. Model Training:
70
+ - Experimented with Linear Regression, Random Forest, and XGBoost
71
+ - Performed hyperparameter tuning using GridSearchCV
72
+ - Selected the best performing model based on evaluation metrics
73
+
74
+ 4. Model Interpretation:
75
+ - Utilized SHAP (SHapley Additive exPlanations) for model interpretability
76
+ - Implemented visualizations to explain feature importance and impact
77
+
78
+ ## Technologies Used
79
+ - Python
80
+ - Pandas for data manipulation
81
+ - Scikit-learn for model training and evaluation
82
+ - XGBoost for advanced modeling
83
+ - Streamlit for web application development
84
+ - SHAP for model interpretation
85
+
86
+ ## Future Improvements
87
+ - Incorporate more advanced feature engineering techniques
88
+ - Experiment with ensemble methods for improved predictions
89
+ - Enhance the Streamlit UI for better user experience
90
+
91
+ ## Contributors
92
+ - Andika Atmanegara Putra
93
+
94
+ ## License
95
+ This project is licensed under the [MIT License](LICENSE).
model/best_model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c591d53365f52836505f3fb3bf5fce5bda1fe03e90520f440d1239c91bb6129f
3
+ size 658382
model/scaler.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14a9cf41df424ab093205e64fac0ff81c03ef70770ab23ebf8394c06f2eef3af
3
+ size 1087
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ numpy
2
+ pandas
3
+ seaborn
4
+ matplotlib
5
+ scikit-learn
6
+ xgboost
7
+ klib
8
+ openpyxl
9
+ streamlit
10
+ joblib
11
+ shap
src/app.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import joblib
3
+ import pandas as pd
4
+ import shap
5
+ import matplotlib.pyplot as plt
6
+ import numpy as np
7
+ from datetime import time, datetime
8
+
9
# Load the model and scaler
@st.cache_resource
def load_model_and_scaler():
    """Load the trained model and its fitted scaler from the ``model/`` directory.

    The artefact paths are resolved relative to this source file instead of
    the current working directory. The previous ``'../model/...'`` paths only
    worked when the app was started from inside ``src/``; launching it as
    ``streamlit run src/app.py`` from the repository root made them point
    outside the repository.

    Returns:
        tuple: ``(model, scaler)`` exactly as returned by ``joblib.load``.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    from pathlib import Path

    # <repo>/src/app.py -> <repo>/model
    model_dir = Path(__file__).resolve().parent.parent / 'model'
    model = joblib.load(model_dir / 'best_model.joblib')
    scaler = joblib.load(model_dir / 'scaler.joblib')
    return model, scaler
15
+
16
# Fetch the artefacts at script start; the loader is wrapped in
# ``st.cache_resource``, so this call is the single place they are obtained.
model, scaler = load_model_and_scaler()

# Features used by the model
model_features = [
    'SamplingNC',
    'SamplingChek',
    'QTY',
    'TimeProduce',
    'Years',
]
20
+
21
# Run a prediction together with its SHAP analysis
def predict_and_explain(features):
    """Predict from raw feature values and compute their SHAP attributions.

    The input is passed through the module-level ``scaler`` first; both the
    prediction and the SHAP values are computed on the scaled data.

    Args:
        features: Raw feature values accepted by ``scaler.transform``
            (the caller in this file passes a one-row DataFrame).

    Returns:
        tuple: ``(prediction, shap_values, explainer)`` where ``prediction``
        is the output of ``model.predict``, ``shap_values`` is the output of
        ``explainer.shap_values`` and ``explainer`` is the
        ``shap.TreeExplainer`` built for the module-level ``model``.
    """
    scaled_inputs = scaler.transform(features)
    predicted = model.predict(scaled_inputs)

    tree_explainer = shap.TreeExplainer(model)
    return predicted, tree_explainer.shap_values(scaled_inputs), tree_explainer
30
+
31
# Streamlit UI
st.title('Prediksi NC dengan Analisis SHAP')

# One input widget per model feature; the collected values are keyed by
# feature name so they can be turned into a one-row DataFrame later.
input_data = {}
for col in model_features:
    if col == 'Years':
        value = st.number_input(col, min_value=2000, max_value=2100, value=2000)
    elif col == 'TimeProduce':
        # The time of day is stored as fractional hours (e.g. 01:30 -> 1.5).
        picked_time = st.time_input(f"Pilih {col}", value=time(0, 0))
        value = picked_time.hour + picked_time.minute / 60.0
    else:
        value = st.number_input(col, min_value=0, value=0)
    input_data[col] = value
44
+
45
# Runs only on the rerun triggered by the button: predict, echo the inputs,
# then render the two SHAP plots.
if st.button('Prediksi dan Analisis'):
    features = pd.DataFrame([input_data])
    prediction, shap_values, explainer = predict_and_explain(features)

    st.write(f'Prediksi NC %: {prediction[0]:.2f}%')

    # Show the input data that was used
    st.write("Data Input:")
    display_features = features.copy()
    # Convert fractional hours back to HH:MM for display.
    # - Read the scalar with .iloc[0]: calling int() on a Series is deprecated
    #   in pandas.
    # - Round to whole minutes instead of truncating: float error otherwise
    #   drops a minute (1.2 h -> (1.2 % 1) * 60 == 11.99... -> "01:11").
    total_minutes = int(round(float(features['TimeProduce'].iloc[0]) * 60))
    hour_part, minute_part = divmod(total_minutes, 60)
    display_features['TimeProduce'] = time(hour_part, minute_part).strftime("%H:%M")
    st.write(display_features)

    # SHAP visualisation (bar summary plot)
    # NOTE(review): the interpretation text below mentions red/blue colours,
    # while plot_type="bar" is requested here - confirm the wording matches
    # the rendered plot.
    st.write("Analisis SHAP (Pengaruh Fitur):")
    fig, _ = plt.subplots(figsize=(10, 6))
    shap.summary_plot(shap_values, features, plot_type="bar", show=False)
    plt.title("Pengaruh Fitur terhadap Prediksi")
    plt.xlabel("Rata-rata dampak pada prediksi")
    plt.tight_layout()
    st.pyplot(fig)
    # Close the figure so pyplot does not keep one alive per button click.
    plt.close(fig)
    st.write("Interpretasi: Panjang bar menunjukkan seberapa besar pengaruh fitur terhadap prediksi. "
             "Warna merah menunjukkan pengaruh positif (meningkatkan NC %), "
             "sedangkan warna biru menunjukkan pengaruh negatif (menurunkan NC %).")

    # Waterfall plot of the per-feature contributions for this prediction
    st.write("Kontribusi Fitur untuk Prediksi Ini:")
    fig, _ = plt.subplots(figsize=(10, 6))
    # The waterfall plot needs a scalar base value; depending on the SHAP
    # version/model, expected_value may be a length-1 array, so take its
    # first element either way (assumes a single-output model, as the
    # shap_values[0] row indexing below already does).
    base_value = float(np.ravel(explainer.expected_value)[0])
    shap.plots._waterfall.waterfall_legacy(base_value, shap_values[0], features.iloc[0], max_display=10, show=False)
    plt.title("Kontribusi Setiap Fitur terhadap Prediksi")
    plt.tight_layout()
    st.pyplot(fig)
    plt.close(fig)
    st.write("Interpretasi: Plot ini menunjukkan bagaimana setiap fitur berkontribusi terhadap prediksi akhir. "
             "Batang merah menunjukkan peningkatan NC %, sedangkan batang biru menunjukkan penurunan NC %. "
             "Nilai awal adalah rata-rata prediksi, dan nilai akhir adalah prediksi untuk input ini.")
src/training.ipynb ADDED
The diff for this file is too large to render. See raw diff