
Regression Analysis: Predicting Material Strength

A comparison of eight regression models for predicting the yield and tensile strength of steel samples from composition and microstructure features.

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import (mean_squared_error, r2_score,
                             mean_absolute_error, mean_absolute_percentage_error)
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
import matplotlib.pyplot as plt
import seaborn as sns
# Load dataset
data = pd.read_excel("data.xlsx")
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56 entries, 0 to 55
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   ID                          56 non-null     int64
 1   C                           56 non-null     float64
 2   Mn                          56 non-null     float64
 3   Si                          56 non-null     float64
 4   Al                          56 non-null     float64
 5   Nb                          56 non-null     float64
 6   Ti                          56 non-null     float64
 7   V                           56 non-null     float64
 8   Pearlite fraction (%)       56 non-null     float64
 9   Ferrite grain size (μm)     56 non-null     float64
 10  Interlamellar spacing (μm)  56 non-null     float64
 11  Cementite thickness (μm)    56 non-null     float64
 12  Yield strength (MPa)        56 non-null     int64
 13  Tensile strength (MPa)      56 non-null     int64
dtypes: float64(11), int64(3)
memory usage: 6.3 KB
# Check for missing values
if data.isnull().sum().sum() > 0:
    data = data.dropna()  # Drop rows with missing values
# Separate features and targets
X = data.iloc[:, 1:-2]  # Features: drop the ID column and the two strength targets
y = data.iloc[:, -2:]   # Targets: Yield strength and Tensile strength (dataset column order)
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
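Note that the scaler is fit on the training split only and merely applied to the test split, so no test-set statistics leak into the features.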
# Function for evaluation
def evaluate_model(y_true, y_pred, model_name):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true, y_pred)
    print(f"{model_name} - MSE: {mse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}, MAPE: {mape:.4f}")
    return mse, mae, r2  # Note: MAPE is printed but not returned
# Model 1: Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
evaluate_model(y_test, lr_pred, "Linear Regression")
Linear Regression - MSE: 2956.5988, MAE: 36.8101, R2: 0.4945, MAPE: 0.0592
(2956.598838744695, 36.81014941631003, 0.4945428555105083)
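With two target columns, r2_score and the other scikit-learn metrics above silently average over both outputs (the default multioutput='uniform_average'). A minimal sketch of per-target scores for the fitted linear model:

# Per-target R2: multioutput='raw_values' returns one score per
# output column (yield strength, tensile strength) instead of the average
print("R2 per target:", r2_score(y_test, lr_pred, multioutput='raw_values'))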
# Model 2: Nonlinear Regression (Polynomial Features with Linear Regression)
poly_pipeline = Pipeline([
('poly', PolynomialFeatures(degree=3, include_bias=True)),
('lr', LinearRegression())
])
poly_pipeline.fit(X_train, y_train)
poly_pred = poly_pipeline.predict(X_test)
evaluate_model(y_test, poly_pred, "Nonlinear Regression (Polynomial)")
Nonlinear Regression (Polynomial) - MSE: 1792.6273, MAE: 29.8251, R2: 0.6935, MAPE: 0.0487
(1792.627278530938, 29.825132174539338, 0.693503141176109)
# Model 3: Random Forest Regressor
rf_model = RandomForestRegressor(random_state=42)
param_grid = {'n_estimators': [50, 100, 200, 300, 400],
'max_depth': [None, 10, 20, 30, 40],
}
grid_search = GridSearchCV(rf_model, param_grid, cv=3, scoring='r2')
grid_search.fit(X_train, y_train)
rf_best = grid_search.best_estimator_
rf_pred = rf_best.predict(X_test)
evaluate_model(y_test, rf_pred, "Random Forest")
Random Forest - MSE: 1605.8832, MAE: 28.6347, R2: 0.7252, MAPE: 0.0469
(1605.8831527777775, 28.634722222222223, 0.7251631310033735)
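GridSearchCV keeps the winning hyperparameter combination; the original run does not print it, but it can be inspected as below (run this right after the random-forest fit, since grid_search is reused for later models):

# Hyperparameters chosen by the 3-fold search and their mean CV R2
print("Best RF params:", grid_search.best_params_)
print("Best CV R2:", grid_search.best_score_)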
# Model 4: Artificial Neural Network (ANN) with TensorFlow
ann_model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(2, activation='linear')  # 2 outputs: yield and tensile strength
])
ann_model.compile(optimizer='adam', loss='mse', metrics=['mae'])
history = ann_model.fit(X_train, y_train, validation_split=0.2,
epochs=200, batch_size=5, verbose=0)
ann_pred = ann_model.predict(X_test)
evaluate_model(y_test, ann_pred, "ANN")
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 107ms/step
ANN - MSE: 5806.9870, MAE: 53.5173, R2: 0.0098, MAPE: 0.0772
(5806.987028628471, 53.51728185017903, 0.009785771369934082)
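The ANN's weak score is unsurprising: roughly 44 training rows is very little data for a four-layer network, and the targets are left in raw MPa while the inputs are standardized, which makes the mse loss awkward to optimize. A minimal sketch of one common remedy, standardizing the targets as well (the y_scaler and ann2 names are ours, not from the original):

# Standardize the targets; a network regressing ~zero-mean values
# converges far more easily than one predicting raw MPa
y_scaler = StandardScaler()
y_train_s = y_scaler.fit_transform(y_train)

ann2 = Sequential([Input(shape=(X_train.shape[1],)),
                   Dense(64, activation='relu'),
                   Dense(64, activation='relu'),
                   Dense(2)])  # linear output by default
ann2.compile(optimizer='adam', loss='mse')
ann2.fit(X_train, y_train_s, epochs=200, batch_size=5, verbose=0)

# Undo the target scaling so the metrics come out in MPa again
ann2_pred = y_scaler.inverse_transform(ann2.predict(X_test))
evaluate_model(y_test, ann2_pred, "ANN (scaled targets)")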
# Model 5: ExtraTreesRegressor
et_model = ExtraTreesRegressor(random_state=42)
param_grid = {'n_estimators': [50, 100, 200, 300, 400],
'max_depth': [None, 10, 20, 30, 40]}
grid_search = GridSearchCV(et_model, param_grid, cv=3, scoring='r2')
grid_search.fit(X_train, y_train)
best_et = grid_search.best_estimator_
et_pred = best_et.predict(X_test)
evaluate_model(y_test, et_pred, "Extra Trees")
Extra Trees - MSE: 1519.4255, MAE: 24.2181, R2: 0.7411, MAPE: 0.0379
(1519.4254842592584, 24.218055555555555, 0.7411494870144324)
# Model 6: Decision Tree Regressor
dt_model = DecisionTreeRegressor(random_state=42)
param_grid = {'max_depth': [None, 10, 20, 30, 40, 50],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4]}
grid_search = GridSearchCV(dt_model, param_grid, cv=3, scoring='r2')
grid_search.fit(X_train, y_train)
dt_best = grid_search.best_estimator_
dt_pred = dt_best.predict(X_test)
evaluate_model(y_test, dt_pred, "Decision Tree")
Decision Tree - MSE: 1837.7995, MAE: 29.5104, R2: 0.6849, MAPE: 0.0501
(1837.7994791666667, 29.510416666666664, 0.6849293529099796)
# Model 7: K-Nearest Neighbors Regressor
knn_model = KNeighborsRegressor()
param_grid = {'n_neighbors': [3, 5, 10, 20],
'weights': ['uniform', 'distance'],
'metric': ['euclidean', 'manhattan']}
grid_search = GridSearchCV(knn_model, param_grid, cv=3, scoring='r2')
grid_search.fit(X_train, y_train)
knn_best = grid_search.best_estimator_
knn_pred = knn_best.predict(X_test)
evaluate_model(y_test, knn_pred, "K-Nearest Neighbors")
K-Nearest Neighbors - MSE: 2188.1334, MAE: 31.5887, R2: 0.6266, MAPE: 0.0512
(2188.1333734738328, 31.588736293230642, 0.6266350442194297)
# Model 8: XGBoost Regressor
xgb_model = XGBRegressor(random_state=42)
param_grid = {'n_estimators': [50, 100, 200, 300],
'learning_rate': [0.01, 0.05, 0.1, 0.2],
'max_depth': [3, 5, 10]}
grid_search = GridSearchCV(xgb_model, param_grid, cv=3, scoring='r2')
grid_search.fit(X_train, y_train)
xgb_best = grid_search.best_estimator_
xgb_pred = xgb_best.predict(X_test)
evaluate_model(y_test, xgb_pred, "XGBoost")
XGBoost - MSE: 1998.8179, MAE: 33.5360, R2: 0.6581, MAPE: 0.0555
(1998.8179343499942, 33.536014556884766, 0.6581072211265564)
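A portability note: passing a two-column y directly to XGBRegressor relies on XGBoost's native multi-output support (added around release 1.6). On older versions, the usual workaround is scikit-learn's MultiOutputRegressor wrapper, which fits one independent booster per target; a sketch:

from sklearn.multioutput import MultiOutputRegressor

# One XGBoost model per target column (yield, tensile)
xgb_multi = MultiOutputRegressor(XGBRegressor(random_state=42))
xgb_multi.fit(X_train, y_train)
evaluate_model(y_test, xgb_multi.predict(X_test), "XGBoost (one model per target)")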
# Plot training history
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('ANN Training History')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()
# Summarize Results
print("Comparison of Models:")
print("Linear Regression:", evaluate_model(y_test, lr_pred, "Linear Regression"))
print("Polynomial Regression:", evaluate_model(y_test, poly_pred, "Polynomial Regression"))
print("Random Forest:", evaluate_model(y_test, rf_pred, "Random Forest"))
print("ANN:", evaluate_model(y_test, ann_pred, "ANN"))
print("ExtraTreesRegressor:", evaluate_model(y_test, et_pred, "ExtraTreesRegressor"))
print("Decision Tree:", evaluate_model(y_test, dt_pred, "Decision Tree"))
print("K-Nearest Neighbors Regression:", evaluate_model(y_test, knn_pred, "K-Nearest Neighbors Regression"))
print("XGBoost Regression:", evaluate_model(y_test, xgb_pred, "XGBoost Regression"))
Comparison of Models:
Linear Regression - MSE: 2956.5988, MAE: 36.8101, R2: 0.4945, MAPE: 0.0592
Linear Regression: (2956.598838744695, 36.81014941631003, 0.4945428555105083)
Polynomial Regression - MSE: 1792.6273, MAE: 29.8251, R2: 0.6935, MAPE: 0.0487
Polynomial Regression: (1792.627278530938, 29.825132174539338, 0.693503141176109)
Random Forest - MSE: 1605.8832, MAE: 28.6347, R2: 0.7252, MAPE: 0.0469
Random Forest: (1605.8831527777775, 28.634722222222223, 0.7251631310033735)
ANN - MSE: 5806.9870, MAE: 53.5173, R2: 0.0098, MAPE: 0.0772
ANN: (5806.987028628471, 53.51728185017903, 0.009785771369934082)
ExtraTreesRegressor - MSE: 1519.4255, MAE: 24.2181, R2: 0.7411, MAPE: 0.0379
ExtraTreesRegressor: (1519.4254842592584, 24.218055555555555, 0.7411494870144324)
Decision Tree - MSE: 1837.7995, MAE: 29.5104, R2: 0.6849, MAPE: 0.0501
Decision Tree: (1837.7994791666667, 29.510416666666664, 0.6849293529099796)
K-Nearest Neighbors Regression - MSE: 2188.1334, MAE: 31.5887, R2: 0.6266, MAPE: 0.0512
K-Nearest Neighbors Regression: (2188.1333734738328, 31.588736293230642, 0.6266350442194297)
XGBoost Regression - MSE: 1998.8179, MAE: 33.5360, R2: 0.6581, MAPE: 0.0555
XGBoost Regression: (1998.8179343499942, 33.536014556884766, 0.6581072211265564)
X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)
best_model = best_et  # Replace with the best model
best_pred = best_model.predict(X)
# Separate predictions: per the dataset column order, the first target
# column is Yield strength and the second is Tensile strength
actual_yield = y[:, 0]
actual_tensile = y[:, 1]
pred_yield = best_pred[:, 0]
pred_tensile = best_pred[:, 1]
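Keep in mind that X here recombines the training and test rows, so most points in the plots below were seen during training; a tree ensemble fits its own training data almost perfectly, which makes these diagnostics look better than the held-out scores above.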
# Plot 1: Prediction vs Actual for Tensile Strength
plt.figure(figsize=(10, 5))
plt.scatter(actual_tensile, pred_tensile, alpha=0.7, label="Tensile Strength")
plt.plot([actual_tensile.min(), actual_tensile.max()],
         [actual_tensile.min(), actual_tensile.max()], 'r--', label="Perfect Fit")
plt.title("Prediction vs Actual: Tensile Strength")
plt.xlabel("Actual Tensile Strength")
plt.ylabel("Predicted Tensile Strength")
plt.legend()
plt.grid()
plt.show()
# Plot 2: Prediction vs Actual for Yield Strength
plt.figure(figsize=(10, 5))
plt.scatter(actual_yield, pred_yield, alpha=0.7, label="Yield Strength", color="orange")
plt.plot([actual_yield.min(), actual_yield.max()],
         [actual_yield.min(), actual_yield.max()], 'r--', label="Perfect Fit")
plt.title("Prediction vs Actual: Yield Strength")
plt.xlabel("Actual Yield Strength")
plt.ylabel("Predicted Yield Strength")
plt.legend()
plt.grid()
plt.show()
# Plot 3: Residuals for Tensile Strength
residuals_tensile = actual_tensile - pred_tensile
plt.figure(figsize=(10, 5))
sns.histplot(residuals_tensile, kde=True, bins=20, color="blue")
plt.title("Residuals: Tensile Strength")
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.grid()
plt.show()
# Plot 4: Residuals for Yield Strength
residuals_yield = actual_yield - pred_yield
plt.figure(figsize=(10, 5))
sns.histplot(residuals_yield, kde=True, bins=20, color="green")
plt.title("Residuals: Yield Strength")
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.grid()
plt.show()
# Feature importance plot
feature_importances = best_model.feature_importances_
feature_names = data.columns[1:-2]  # Skip ID and the two strength targets
plt.figure(figsize=(12, 6))
sns.barplot(x=feature_importances, y=feature_names)
plt.title("Feature Importance (Extra trees)")
plt.xlabel("Importance")
plt.ylabel("Features")
plt.grid()
plt.show()
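Impurity-based importances from tree ensembles can overstate correlated or high-cardinality features. As a cross-check, scikit-learn's permutation_importance measures how much the held-out score drops when each feature is shuffled; a minimal sketch on the test split:

from sklearn.inspection import permutation_importance

# Mean R2 drop over 20 shuffles of each (scaled) feature on the test split
perm = permutation_importance(best_model, X_test, y_test,
                              n_repeats=20, random_state=42, scoring='r2')
for name, imp in sorted(zip(feature_names, perm.importances_mean),
                        key=lambda t: -t[1]):
    print(f"{name}: {imp:.3f}")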
# Collect R² scores for all models
model_scores = {
"Random Forest": r2_score(y_test, rf_pred),
"Linear Regression": r2_score(y_test, lr_pred),
"Nonlinear Regression": r2_score(y_test, poly_pred),
"ANN": r2_score(y_test, ann_pred),
"ExtraTreesRegressor": r2_score(y_test, et_pred),
"Decision tree regressor": r2_score(y_test, dt_pred),
"K-Nearest Neighbors Regressor": r2_score(y_test, knn_pred),
"XGBoost Regressor": r2_score(y_test, xgb_pred)
}
# Plot bar chart
plt.figure(figsize=(10, 6))
plt.bar(model_scores.keys(), model_scores.values())
plt.title("Model Performance Comparison (R² Scores)")
plt.ylabel("R² Score")
plt.xlabel("Model")
plt.ylim(0, 1)
plt.grid(axis='y', linestyle='--', alpha=0.7)
# Rotate x-axis labels vertically
plt.xticks(rotation=90)
plt.show()
# Collect MAPE for all models
model_scores_mape = {
    "Random Forest": mean_absolute_percentage_error(y_test, rf_pred),
    "Linear Regression": mean_absolute_percentage_error(y_test, lr_pred),
    "Nonlinear Regression": mean_absolute_percentage_error(y_test, poly_pred),
    "ANN": mean_absolute_percentage_error(y_test, ann_pred),
    "ExtraTreesRegressor": mean_absolute_percentage_error(y_test, et_pred),
    "Decision Tree Regressor": mean_absolute_percentage_error(y_test, dt_pred),
    "K-Nearest Neighbors Regressor": mean_absolute_percentage_error(y_test, knn_pred),
    "XGBoost Regressor": mean_absolute_percentage_error(y_test, xgb_pred)
}
# Plot bar chart
plt.figure(figsize=(10, 6))
plt.bar(model_scores_mape.keys(), model_scores_mape.values())
plt.title("Model Performance Comparison (MAPE)")
plt.ylabel("MAPE")
plt.xlabel("Model")
plt.ylim(0, 0.1)
plt.grid(axis='y', linestyle='--', alpha=0.7)
# Rotate x-axis labels vertically
plt.xticks(rotation=90)
plt.show()