import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
data = pd.read_excel("data.xlsx")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56 entries, 0 to 55
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   ID                          56 non-null     int64
 1   C                           56 non-null     float64
 2   Mn                          56 non-null     float64
 3   Si                          56 non-null     float64
 4   Al                          56 non-null     float64
 5   Nb                          56 non-null     float64
 6   Ti                          56 non-null     float64
 7   V                           56 non-null     float64
 8   Pearlite fraction (%)       56 non-null     float64
 9   Ferrite grain size (μm)     56 non-null     float64
 10  Interlamellar spacing (μm)  56 non-null     float64
 11  Cementite thickness (μm)    56 non-null     float64
 12  Yield strength (MPa)        56 non-null     int64
 13  Tensile strength (MPa)      56 non-null     int64
dtypes: float64(11), int64(3)
memory usage: 6.3 KB

# Check for missing values
if data.isnull().sum().sum() > 0:
    data = data.dropna()  # Drop rows with missing values

# Separate features and targets; the last two columns are
# 'Yield strength (MPa)' and 'Tensile strength (MPa)'
X = data.iloc[:, 1:-2]  # Inputs: composition and microstructure features (ID column excluded)
y = data.iloc[:, -2:]   # Outputs: yield strength and tensile strength

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Function for evaluation (metrics are averaged over both targets)
def evaluate_model(y_true, y_pred, model_name):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true, y_pred)
    print(f"{model_name} - MSE: {mse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}, MAPE: {mape:.4f}")
    return mse, mae, r2

# Model 1: Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
evaluate_model(y_test, lr_pred, "Linear Regression")

Linear Regression - MSE: 2956.5988, MAE: 36.8101, R2: 0.4945, MAPE: 0.0592
(2956.598838744695, 36.81014941631003, 0.4945428555105083)
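# Added illustration (not part of the original run): with only 56 samples, the
# 80/20 split above leaves roughly 12 test rows, so single-split metrics are
# noisy. A minimal sketch of repeated K-fold cross-validation on the linear
# baseline for a steadier estimate; keeping the scaler inside a Pipeline means
# each fold is rescaled independently.
from sklearn.model_selection import RepeatedKFold, cross_val_score

cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)
lr_cv = Pipeline([('scaler', StandardScaler()), ('lr', LinearRegression())])
cv_scores = cross_val_score(lr_cv, X, y, cv=cv, scoring='r2')  # X, y: full unscaled data
print(f"Linear Regression CV R2: {cv_scores.mean():.4f} +/- {cv_scores.std():.4f}")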
# Model 2: Nonlinear Regression (Polynomial Features with Linear Regression)
poly_pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=3, include_bias=True)),
    ('lr', LinearRegression())
])
poly_pipeline.fit(X_train, y_train)
poly_pred = poly_pipeline.predict(X_test)
evaluate_model(y_test, poly_pred, "Nonlinear Regression (Polynomial)")

Nonlinear Regression (Polynomial) - MSE: 1792.6273, MAE: 29.8251, R2: 0.6935, MAPE: 0.0487
(1792.627278530938, 29.825132174539338, 0.693503141176109)

# Model 3: Random Forest Regressor
rf_model = RandomForestRegressor(random_state=42)
param_grid = {'n_estimators': [50, 100, 200, 300, 400],
              'max_depth': [None, 10, 20, 30, 40]}
grid_search = GridSearchCV(rf_model, param_grid, cv=3, scoring='r2')
grid_search.fit(X_train, y_train)
rf_best = grid_search.best_estimator_
rf_pred = rf_best.predict(X_test)
evaluate_model(y_test, rf_pred, "Random Forest")

Random Forest - MSE: 1605.8832, MAE: 28.6347, R2: 0.7252, MAPE: 0.0469
(1605.8831527777775, 28.634722222222223, 0.7251631310033735)

# Model 4: Artificial Neural Network (ANN) with TensorFlow
ann_model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(2, activation='linear')  # 2 outputs: yield strength and tensile strength
])
ann_model.compile(optimizer='adam', loss='mse', metrics=['mae'])
history = ann_model.fit(X_train, y_train, validation_split=0.2, epochs=200, batch_size=5, verbose=0)
ann_pred = ann_model.predict(X_test)
evaluate_model(y_test, ann_pred, "ANN")

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 107ms/step
ANN - MSE: 5806.9870, MAE: 53.5173, R2: 0.0098, MAPE: 0.0772
(5806.987028628471, 53.51728185017903, 0.009785771369934082)

# Model 5: Extra Trees Regressor
et_model = ExtraTreesRegressor(random_state=42)
param_grid = {'n_estimators': [50, 100, 200, 300, 400],
              'max_depth': [None, 10, 20, 30, 40]}
grid_search = GridSearchCV(et_model, param_grid, cv=3, scoring='r2')
grid_search.fit(X_train, y_train)
best_et = grid_search.best_estimator_
et_pred = best_et.predict(X_test)
evaluate_model(y_test, et_pred, "Extra Trees")

Extra Trees - MSE: 1519.4255, MAE: 24.2181, R2: 0.7411, MAPE: 0.0379
(1519.4254842592584, 24.218055555555555, 0.7411494870144324)

# Model 6: Decision Tree Regressor
dt_model = DecisionTreeRegressor(random_state=42)
param_grid = {'max_depth': [None, 10, 20, 30, 40, 50],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4]}
grid_search = GridSearchCV(dt_model, param_grid, cv=3, scoring='r2')
grid_search.fit(X_train, y_train)
dt_best = grid_search.best_estimator_
dt_pred = dt_best.predict(X_test)
evaluate_model(y_test, dt_pred, "Decision Tree")

Decision Tree - MSE: 1837.7995, MAE: 29.5104, R2: 0.6849, MAPE: 0.0501
(1837.7994791666667, 29.510416666666664, 0.6849293529099796)

# Model 7: K-Nearest Neighbors Regressor
knn_model = KNeighborsRegressor()
param_grid = {'n_neighbors': [3, 5, 10, 20],
              'weights': ['uniform', 'distance'],
              'metric': ['euclidean', 'manhattan']}
grid_search = GridSearchCV(knn_model, param_grid, cv=3, scoring='r2')
grid_search.fit(X_train, y_train)
knn_best = grid_search.best_estimator_
knn_pred = knn_best.predict(X_test)
evaluate_model(y_test, knn_pred, "K-Nearest Neighbors")

K-Nearest Neighbors - MSE: 2188.1334, MAE: 31.5887, R2: 0.6266, MAPE: 0.0512
(2188.1333734738328, 31.588736293230642, 0.6266350442194297)
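# Added illustration (not part of the original run): the grid searches above
# cross-validate on features already scaled with full-training-set statistics,
# which leaks information across CV folds. A minimal leak-free sketch for the
# scale-sensitive KNN model: put the scaler inside the searched pipeline and
# prefix parameter names with the step name. The variables X_tr_raw etc. are
# illustrative; X and y here are the unscaled DataFrames from before
# normalization, re-split with the same random_state.
X_tr_raw, X_te_raw, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)
knn_pipe = Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsRegressor())])
pipe_grid = {'knn__n_neighbors': [3, 5, 10, 20],
             'knn__weights': ['uniform', 'distance'],
             'knn__metric': ['euclidean', 'manhattan']}
knn_pipe_search = GridSearchCV(knn_pipe, pipe_grid, cv=3, scoring='r2')
knn_pipe_search.fit(X_tr_raw, y_tr)  # each CV fold is scaled independently
print(f"Leak-free KNN CV R2: {knn_pipe_search.best_score_:.4f}")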
# Model 8: XGBoost Regressor
xgb_model = XGBRegressor(random_state=42)
param_grid = {'n_estimators': [50, 100, 200, 300],
              'learning_rate': [0.01, 0.05, 0.1, 0.2],
              'max_depth': [3, 5, 10]}
grid_search = GridSearchCV(xgb_model, param_grid, cv=3, scoring='r2')
grid_search.fit(X_train, y_train)
xgb_best = grid_search.best_estimator_
xgb_pred = xgb_best.predict(X_test)
evaluate_model(y_test, xgb_pred, "XGBoost")

XGBoost - MSE: 1998.8179, MAE: 33.5360, R2: 0.6581, MAPE: 0.0555
(1998.8179343499942, 33.536014556884766, 0.6581072211265564)

# Plot ANN training history
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('ANN Training History')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Summarize results
print("Comparison of Models:")
print("Linear Regression:", evaluate_model(y_test, lr_pred, "Linear Regression"))
print("Polynomial Regression:", evaluate_model(y_test, poly_pred, "Polynomial Regression"))
print("Random Forest:", evaluate_model(y_test, rf_pred, "Random Forest"))
print("ANN:", evaluate_model(y_test, ann_pred, "ANN"))
print("Extra Trees:", evaluate_model(y_test, et_pred, "Extra Trees"))
print("Decision Tree:", evaluate_model(y_test, dt_pred, "Decision Tree"))
print("K-Nearest Neighbors:", evaluate_model(y_test, knn_pred, "K-Nearest Neighbors"))
print("XGBoost:", evaluate_model(y_test, xgb_pred, "XGBoost"))

Comparison of Models:
Linear Regression - MSE: 2956.5988, MAE: 36.8101, R2: 0.4945, MAPE: 0.0592
Linear Regression: (2956.598838744695, 36.81014941631003, 0.4945428555105083)
Polynomial Regression - MSE: 1792.6273, MAE: 29.8251, R2: 0.6935, MAPE: 0.0487
Polynomial Regression: (1792.627278530938, 29.825132174539338, 0.693503141176109)
Random Forest - MSE: 1605.8832, MAE: 28.6347, R2: 0.7252, MAPE: 0.0469
Random Forest: (1605.8831527777775, 28.634722222222223, 0.7251631310033735)
ANN - MSE: 5806.9870, MAE: 53.5173, R2: 0.0098, MAPE: 0.0772
ANN: (5806.987028628471, 53.51728185017903, 0.009785771369934082)
Extra Trees - MSE: 1519.4255, MAE: 24.2181, R2: 0.7411, MAPE: 0.0379
Extra Trees: (1519.4254842592584, 24.218055555555555, 0.7411494870144324)
Decision Tree - MSE: 1837.7995, MAE: 29.5104, R2: 0.6849, MAPE: 0.0501
Decision Tree: (1837.7994791666667, 29.510416666666664, 0.6849293529099796)
K-Nearest Neighbors - MSE: 2188.1334, MAE: 31.5887, R2: 0.6266, MAPE: 0.0512
K-Nearest Neighbors: (2188.1333734738328, 31.588736293230642, 0.6266350442194297)
XGBoost - MSE: 1998.8179, MAE: 33.5360, R2: 0.6581, MAPE: 0.0555
XGBoost: (1998.8179343499942, 33.536014556884766, 0.6581072211265564)

# Apply the best model (Extra Trees, highest R²) to the full dataset
X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)
best_model = best_et
best_pred = best_model.predict(X)

# Separate predictions per target; per the dataset, column 0 of y is
# yield strength and column 1 is tensile strength
actual_yield = y[:, 0]
actual_tensile = y[:, 1]
pred_yield = best_pred[:, 0]
pred_tensile = best_pred[:, 1]

# Plot 1: Prediction vs Actual for Tensile Strength
plt.figure(figsize=(10, 5))
plt.scatter(actual_tensile, pred_tensile, alpha=0.7, label="Tensile Strength")
plt.plot([actual_tensile.min(), actual_tensile.max()],
         [actual_tensile.min(), actual_tensile.max()], 'r--', label="Perfect Fit")
plt.title("Prediction vs Actual: Tensile Strength")
plt.xlabel("Actual Tensile Strength (MPa)")
plt.ylabel("Predicted Tensile Strength (MPa)")
plt.legend()
plt.grid()
plt.show()
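# Added illustration (not part of the original run): evaluate_model averages
# metrics over both targets (scikit-learn's default uniform_average). A quick
# per-target breakdown for the best model on the held-out test set puts
# numbers on the parity plots:
r2_per_target = r2_score(y_test, et_pred, multioutput='raw_values')
mae_per_target = mean_absolute_error(y_test, et_pred, multioutput='raw_values')
print(f"Extra Trees R2 per target (yield, tensile): {r2_per_target}")
print(f"Extra Trees MAE per target (yield, tensile): {mae_per_target}")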
# Plot 2: Prediction vs Actual for Yield Strength
plt.figure(figsize=(10, 5))
plt.scatter(actual_yield, pred_yield, alpha=0.7, label="Yield Strength", color="orange")
plt.plot([actual_yield.min(), actual_yield.max()],
         [actual_yield.min(), actual_yield.max()], 'r--', label="Perfect Fit")
plt.title("Prediction vs Actual: Yield Strength")
plt.xlabel("Actual Yield Strength (MPa)")
plt.ylabel("Predicted Yield Strength (MPa)")
plt.legend()
plt.grid()
plt.show()

# Plot 3: Residuals for Tensile Strength
residuals_tensile = actual_tensile - pred_tensile
plt.figure(figsize=(10, 5))
sns.histplot(residuals_tensile, kde=True, bins=20, color="blue")
plt.title("Residuals: Tensile Strength")
plt.xlabel("Residuals (MPa)")
plt.ylabel("Frequency")
plt.grid()
plt.show()

# Plot 4: Residuals for Yield Strength
residuals_yield = actual_yield - pred_yield
plt.figure(figsize=(10, 5))
sns.histplot(residuals_yield, kde=True, bins=20, color="green")
plt.title("Residuals: Yield Strength")
plt.xlabel("Residuals (MPa)")
plt.ylabel("Frequency")
plt.grid()
plt.show()

# Feature importance plot (impurity-based, from the fitted Extra Trees model)
feature_importances = best_model.feature_importances_
feature_names = data.columns[1:-2]  # The last two columns are the targets
plt.figure(figsize=(12, 6))
sns.barplot(x=feature_importances, y=feature_names)
plt.title("Feature Importance (Extra Trees)")
plt.xlabel("Importance")
plt.ylabel("Features")
plt.grid()
plt.show()

# Collect R² scores for all models
model_scores = {
    "Random Forest": r2_score(y_test, rf_pred),
    "Linear Regression": r2_score(y_test, lr_pred),
    "Nonlinear Regression": r2_score(y_test, poly_pred),
    "ANN": r2_score(y_test, ann_pred),
    "Extra Trees": r2_score(y_test, et_pred),
    "Decision Tree": r2_score(y_test, dt_pred),
    "K-Nearest Neighbors": r2_score(y_test, knn_pred),
    "XGBoost": r2_score(y_test, xgb_pred)
}

# Plot bar chart of R² scores
plt.figure(figsize=(10, 6))
plt.bar(model_scores.keys(), model_scores.values())
plt.title("Model Performance Comparison (R² Scores)")
plt.ylabel("R² Score")
plt.xlabel("Model")
plt.ylim(0, 1)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.xticks(rotation=90)  # Rotate x-axis labels vertically
plt.show()

# Collect MAPE for all models
model_scores_mape = {
    "Random Forest": mean_absolute_percentage_error(y_test, rf_pred),
    "Linear Regression": mean_absolute_percentage_error(y_test, lr_pred),
    "Nonlinear Regression": mean_absolute_percentage_error(y_test, poly_pred),
    "ANN": mean_absolute_percentage_error(y_test, ann_pred),
    "Extra Trees": mean_absolute_percentage_error(y_test, et_pred),
    "Decision Tree": mean_absolute_percentage_error(y_test, dt_pred),
    "K-Nearest Neighbors": mean_absolute_percentage_error(y_test, knn_pred),
    "XGBoost": mean_absolute_percentage_error(y_test, xgb_pred)
}

# Plot bar chart of MAPE
plt.figure(figsize=(10, 6))
plt.bar(model_scores_mape.keys(), model_scores_mape.values())
plt.title("Model Performance Comparison (MAPE)")
plt.ylabel("MAPE")
plt.xlabel("Model")
plt.ylim(0, 0.1)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.xticks(rotation=90)  # Rotate x-axis labels vertically
plt.show()
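# Added illustration (not part of the original run): impurity-based feature
# importances are computed from the training data and can be biased toward
# high-variance features. A minimal sketch of permutation importance on the
# held-out test set as a cross-check; n_repeats=20 is an arbitrary choice.
from sklearn.inspection import permutation_importance

perm = permutation_importance(best_model, X_test, y_test, n_repeats=20,
                              random_state=42, scoring='r2')
for name, mean, std in sorted(zip(feature_names, perm.importances_mean,
                                  perm.importances_std), key=lambda t: -t[1]):
    print(f"{name}: {mean:.4f} +/- {std:.4f}")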