Resultado 03
In [1]:
Copied!
# librerias
from loguru import logger
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# librerias
from loguru import logger
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
Lectura de datos¶
In [2]:
Copied!
logger.info("Leer Datos")
# paths
path_raw = "../../data/raw/"
path_procesed = "../../data/procesed/"
path_final = "../../data/final/"
logger.info("Leer Datos")
# paths
path_raw = "../../data/raw/"
path_procesed = "../../data/procesed/"
path_final = "../../data/final/"
2024-04-02 22:32:36.747 | INFO | __main__:<module>:1 - Leer Datos
In [3]:
Copied!
# leer datos
train = pd.read_csv(path_procesed + "train.csv")
test = pd.read_csv(path_procesed + "test.csv")
# leer datos
train = pd.read_csv(path_procesed + "train.csv")
test = pd.read_csv(path_procesed + "test.csv")
In [4]:
Copied!
columns_to_convert = ['Pclass', 'SibSp','Parch']
train[columns_to_convert] = train[columns_to_convert].astype(str)
test[columns_to_convert] = test[columns_to_convert].astype(str)
columns_to_convert = ['Pclass', 'SibSp','Parch']
train[columns_to_convert] = train[columns_to_convert].astype(str)
test[columns_to_convert] = test[columns_to_convert].astype(str)
In [5]:
Copied!
# Obtener nombres de columnas según tipos de datos
variable_objetivo = 'Survived'
columnas_flotantes = [x for x in list(train.select_dtypes(include=['float64']).columns) if x!=variable_objetivo]
columnas_enteras = [x for x in list(train.select_dtypes(include=['int32', 'int64']).columns) if x!=variable_objetivo]
columnas_objetos = [x for x in list(train.select_dtypes(include=['object']).columns) if x!=variable_objetivo]
# Obtener nombres de columnas según tipos de datos
variable_objetivo = 'Survived'
columnas_flotantes = [x for x in list(train.select_dtypes(include=['float64']).columns) if x!=variable_objetivo]
columnas_enteras = [x for x in list(train.select_dtypes(include=['int32', 'int64']).columns) if x!=variable_objetivo]
columnas_objetos = [x for x in list(train.select_dtypes(include=['object']).columns) if x!=variable_objetivo]
Modelos¶
In [6]:
Copied!
import joblib
import time
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import (
accuracy_score,
precision_score,
recall_score,
f1_score,
roc_curve,
roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import joblib
import time
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import (
accuracy_score,
precision_score,
recall_score,
f1_score,
roc_curve,
roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
In [7]:
Copied!
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
start_time = time.time()
model.fit(X_train, y_train)
execution_time = time.time() - start_time
y_pred = model.predict(X_test)
accuracy = round(accuracy_score(y_test, y_pred), 3)
precision = round(precision_score(y_test, y_pred), 3)
recall = round(recall_score(y_test, y_pred), 3)
f1 = round(f1_score(y_test, y_pred), 3)
y_prob = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_prob, pos_label=-1)
auc = round(roc_auc_score(y_test, y_prob), 3)
evaluation_metrics = {
"Accuracy": accuracy,
"Precision": precision,
"Recall": recall,
"F1-Score": f1,
"AUC": auc,
"Time": round(execution_time, 3),
}
return evaluation_metrics
# Función para entrenar y evaluar cada modelo
def train_and_evaluate_all_models(models_dict, X_train, y_train, X_test, y_test):
evaluation_results = {}
for model_name, model in models_dict.items():
evaluation_metrics = train_and_evaluate_model(
model, X_train, y_train, X_test, y_test
)
evaluation_results[model_name] = evaluation_metrics
# Convertir los resultados en un DataFrame
results_df = pd.DataFrame.from_dict(evaluation_results, orient="index")
return results_df
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
start_time = time.time()
model.fit(X_train, y_train)
execution_time = time.time() - start_time
y_pred = model.predict(X_test)
accuracy = round(accuracy_score(y_test, y_pred), 3)
precision = round(precision_score(y_test, y_pred), 3)
recall = round(recall_score(y_test, y_pred), 3)
f1 = round(f1_score(y_test, y_pred), 3)
y_prob = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_prob, pos_label=-1)
auc = round(roc_auc_score(y_test, y_prob), 3)
evaluation_metrics = {
"Accuracy": accuracy,
"Precision": precision,
"Recall": recall,
"F1-Score": f1,
"AUC": auc,
"Time": round(execution_time, 3),
}
return evaluation_metrics
# Función para entrenar y evaluar cada modelo
def train_and_evaluate_all_models(models_dict, X_train, y_train, X_test, y_test):
evaluation_results = {}
for model_name, model in models_dict.items():
evaluation_metrics = train_and_evaluate_model(
model, X_train, y_train, X_test, y_test
)
evaluation_results[model_name] = evaluation_metrics
# Convertir los resultados en un DataFrame
results_df = pd.DataFrame.from_dict(evaluation_results, orient="index")
return results_df
In [8]:
Copied!
logger.info("Dividir el conjunto de datos en entrenamiento y prueba")
# Dividir los datos en entrenamiento y prueba
vo = 'Survived'
set_index='PassengerId'
features = [x for x in train.columns if x not in [vo,set_index]]
X = train[features]
y = train[vo]
# Dividir los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)
logger.info("Dividir el conjunto de datos en entrenamiento y prueba")
# Dividir los datos en entrenamiento y prueba
vo = 'Survived'
set_index='PassengerId'
features = [x for x in train.columns if x not in [vo,set_index]]
X = train[features]
y = train[vo]
# Dividir los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)
2024-04-02 22:32:37.121 | INFO | __main__:<module>:1 - Dividir el conjunto de datos en entrenamiento y prueba
In [9]:
Copied!
# Ejemplo de variables numéricas y categóricas
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
# Crear los transformadores para las variables numéricas y categóricas
numeric_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='most_frequent')),
('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# Crear el ColumnTransformer para aplicar las transformaciones en un pipeline
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features)
])
# Aplicar el preprocesamiento a los datos de entrenamiento y prueba
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
# Obtener los nombres de las columnas después del preprocesamiento
numeric_feature_names = preprocessor.transformers_[0][-1]
categorical_feature_names = preprocessor.transformers_[1][-1]
# Obtener las categorías únicas de las variables categóricas
unique_categories = preprocessor.named_transformers_['cat']['onehot'].categories_
# Crear los nombres de las columnas después del OneHotEncoding
encoded_categorical_feature_names = []
for i, categories in enumerate(unique_categories):
for category in categories:
encoded_categorical_feature_names.append(f'{categorical_feature_names[i]}_{category}')
# Convertir la matriz dispersa a un DataFrame de Pandas
transformed_train_df = pd.DataFrame(X_train_processed.toarray(), columns=numeric_feature_names+ encoded_categorical_feature_names)
transformed_test_df = pd.DataFrame(X_test_processed.toarray(), columns=numeric_feature_names+ encoded_categorical_feature_names)
# Ejemplo de variables numéricas y categóricas
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
# Crear los transformadores para las variables numéricas y categóricas
numeric_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='most_frequent')),
('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# Crear el ColumnTransformer para aplicar las transformaciones en un pipeline
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features)
])
# Aplicar el preprocesamiento a los datos de entrenamiento y prueba
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
# Obtener los nombres de las columnas después del preprocesamiento
numeric_feature_names = preprocessor.transformers_[0][-1]
categorical_feature_names = preprocessor.transformers_[1][-1]
# Obtener las categorías únicas de las variables categóricas
unique_categories = preprocessor.named_transformers_['cat']['onehot'].categories_
# Crear los nombres de las columnas después del OneHotEncoding
encoded_categorical_feature_names = []
for i, categories in enumerate(unique_categories):
for category in categories:
encoded_categorical_feature_names.append(f'{categorical_feature_names[i]}_{category}')
# Convertir la matriz dispersa a un DataFrame de Pandas
transformed_train_df = pd.DataFrame(X_train_processed.toarray(), columns=numeric_feature_names+ encoded_categorical_feature_names)
transformed_test_df = pd.DataFrame(X_test_processed.toarray(), columns=numeric_feature_names+ encoded_categorical_feature_names)
In [10]:
Copied!
# Modelos con mejores hiperparámetros
# Inicialización del clasificador RandomForest con hiperparámetros optimizados
random_forest = RandomForestClassifier(random_state=42,
n_estimators=100,
max_depth=None,
min_samples_split=2,
min_samples_leaf=1)
# Inicialización del clasificador LGBM con hiperparámetros optimizados
lgbm = LGBMClassifier(random_state=42,
n_estimators=100,
learning_rate=0.1,
max_depth=-1)
# Inicialización del clasificador DecisionTree con hiperparámetros optimizados
decision_tree = DecisionTreeClassifier(random_state=42,
max_depth=None,
min_samples_split=2,
min_samples_leaf=1)
# Inicialización del clasificador KNeighbors con hiperparámetros optimizados
knn = KNeighborsClassifier(n_neighbors=5,
weights='uniform',
p=2)
# Inicialización del clasificador LogisticRegression con hiperparámetros optimizados
logistic_regression = LogisticRegression(random_state=42,
C=1.0,
penalty='l2')
# Inicialización del clasificador GaussianNB con hiperparámetros optimizados
gaussian_nb = GaussianNB(var_smoothing=1e-9)
# Inicialización del clasificador AdaBoost con hiperparámetros optimizados
ada_boost = AdaBoostClassifier(random_state=42,
n_estimators=50,
learning_rate=0.1)
# Crear un diccionario de modelos con sus parámetros para facilitar la iteración
models = {
'Random Forest': random_forest,
'LGBM': lgbm,
'Decision Tree': decision_tree,
'KNN': knn,
'Logistic Regression': logistic_regression,
'GaussianNB': gaussian_nb,
'AdaBoost': ada_boost
}
# Modelos con mejores hiperparámetros
# Inicialización del clasificador RandomForest con hiperparámetros optimizados
random_forest = RandomForestClassifier(random_state=42,
n_estimators=100,
max_depth=None,
min_samples_split=2,
min_samples_leaf=1)
# Inicialización del clasificador LGBM con hiperparámetros optimizados
lgbm = LGBMClassifier(random_state=42,
n_estimators=100,
learning_rate=0.1,
max_depth=-1)
# Inicialización del clasificador DecisionTree con hiperparámetros optimizados
decision_tree = DecisionTreeClassifier(random_state=42,
max_depth=None,
min_samples_split=2,
min_samples_leaf=1)
# Inicialización del clasificador KNeighbors con hiperparámetros optimizados
knn = KNeighborsClassifier(n_neighbors=5,
weights='uniform',
p=2)
# Inicialización del clasificador LogisticRegression con hiperparámetros optimizados
logistic_regression = LogisticRegression(random_state=42,
C=1.0,
penalty='l2')
# Inicialización del clasificador GaussianNB con hiperparámetros optimizados
gaussian_nb = GaussianNB(var_smoothing=1e-9)
# Inicialización del clasificador AdaBoost con hiperparámetros optimizados
ada_boost = AdaBoostClassifier(random_state=42,
n_estimators=50,
learning_rate=0.1)
# Crear un diccionario de modelos con sus parámetros para facilitar la iteración
models = {
'Random Forest': random_forest,
'LGBM': lgbm,
'Decision Tree': decision_tree,
'KNN': knn,
'Logistic Regression': logistic_regression,
'GaussianNB': gaussian_nb,
'AdaBoost': ada_boost
}
In [11]:
Copied!
# Llamada a la función para entrenar y evaluar todos los modelos
logger.info("Entrenar y evaluar todos los modelos")
results_df = train_and_evaluate_all_models(models, transformed_train_df, y_train, transformed_test_df, y_test)
# Llamada a la función para entrenar y evaluar todos los modelos
logger.info("Entrenar y evaluar todos los modelos")
results_df = train_and_evaluate_all_models(models, transformed_train_df, y_train, transformed_test_df, y_test)
2024-04-02 22:32:37.183 | INFO | __main__:<module>:2 - Entrenar y evaluar todos los modelos
[LightGBM] [Info] Number of positive: 268, number of negative: 444 [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000146 seconds. You can set `force_col_wise=true` to remove the overhead. [LightGBM] [Info] Total Bins 208 [LightGBM] [Info] Number of data points in the train set: 712, number of used features: 20 [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838 [LightGBM] [Info] Start training from score -0.504838 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
In [12]:
Copied!
# Mostrar el DataFrame con los resultados
logger.info("Ordenar los resultados por la métrica AUC")
results_df.sort_values('AUC',ascending = False)
# Mostrar el DataFrame con los resultados
logger.info("Ordenar los resultados por la métrica AUC")
results_df.sort_values('AUC',ascending = False)
2024-04-02 22:32:37.603 | INFO | __main__:<module>:2 - Ordenar los resultados por la métrica AUC
Out[12]:
| Accuracy | Precision | Recall | F1-Score | AUC | Time | |
|---|---|---|---|---|---|---|
| Random Forest | 0.810 | 0.794 | 0.730 | 0.761 | 0.886 | 0.103 |
| Logistic Regression | 0.810 | 0.786 | 0.743 | 0.764 | 0.875 | 0.014 |
| KNN | 0.821 | 0.809 | 0.743 | 0.775 | 0.872 | 0.001 |
| LGBM | 0.793 | 0.768 | 0.716 | 0.741 | 0.871 | 0.101 |
| AdaBoost | 0.788 | 0.757 | 0.716 | 0.736 | 0.866 | 0.056 |
| Decision Tree | 0.788 | 0.743 | 0.743 | 0.743 | 0.788 | 0.003 |
| GaussianNB | 0.419 | 0.415 | 0.986 | 0.584 | 0.778 | 0.001 |
In [13]:
Copied!
logger.info("Seleccionar modelo")
#model = LGBMClassifier(random_state=42)
model = RandomForestClassifier(random_state=42)
model.fit(transformed_train_df, y_train)
logger.info("Seleccionar modelo")
#model = LGBMClassifier(random_state=42)
model = RandomForestClassifier(random_state=42)
model.fit(transformed_train_df, y_train)
2024-04-02 22:32:37.619 | INFO | __main__:<module>:1 - Seleccionar modelo
Out[13]:
RandomForestClassifier(random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(random_state=42)
In [14]:
Copied!
# Obtener los hiperparámetros
hiperparametros = model.get_params()
print(hiperparametros)
# Obtener los hiperparámetros
hiperparametros = model.get_params()
print(hiperparametros)
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}
In [15]:
Copied!
# Obtener la importancia de las características
feature_importance = model.feature_importances_
# Obtener los nombres de las características
feature_names = transformed_train_df.columns
# Crear un DataFrame con las características y su importancia
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
# Ordenar el DataFrame por importancia en orden descendente
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
# Obtener la importancia de las características
feature_importance = model.feature_importances_
# Obtener los nombres de las características
feature_names = transformed_train_df.columns
# Crear un DataFrame con las características y su importancia
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
# Ordenar el DataFrame por importancia en orden descendente
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
In [16]:
Copied!
# Mostrar las características más importantes
feature_importance_df
# Mostrar las características más importantes
feature_importance_df
Out[16]:
| Feature | Importance | |
|---|---|---|
| 0 | Age | 0.220 |
| 1 | Fare | 0.220 |
| 5 | Sex_female | 0.135 |
| 6 | Sex_male | 0.134 |
| 4 | Pclass_3 | 0.044 |
| 28 | Cabin_N | 0.032 |
| 3 | Pclass_2 | 0.020 |
| 2 | Pclass_1 | 0.020 |
| 8 | SibSp_1 | 0.017 |
| 32 | Embarked_S | 0.017 |
| 14 | Parch_0 | 0.016 |
| 30 | Embarked_C | 0.016 |
| 7 | SibSp_0 | 0.014 |
| 15 | Parch_1 | 0.011 |
| 16 | Parch_2 | 0.011 |
| 25 | Cabin_E | 0.010 |
| 31 | Embarked_Q | 0.009 |
| 22 | Cabin_B | 0.008 |
| 23 | Cabin_C | 0.006 |
| 10 | SibSp_3 | 0.006 |
| 24 | Cabin_D | 0.006 |
| 9 | SibSp_2 | 0.005 |
| 11 | SibSp_4 | 0.004 |
| 21 | Cabin_A | 0.003 |
| 26 | Cabin_F | 0.003 |
| 13 | SibSp_8 | 0.002 |
| 27 | Cabin_G | 0.002 |
| 18 | Parch_4 | 0.002 |
| 19 | Parch_5 | 0.001 |
| 12 | SibSp_5 | 0.001 |
| 17 | Parch_3 | 0.001 |
| 29 | Cabin_T | 0.000 |
| 20 | Parch_6 | 0.000 |
In [17]:
Copied!
# Gráfico de barras para visualizar las características más importantes
plt.figure(figsize=(10, 6))
plt.barh(feature_importance_df['Feature'][:10], feature_importance_df['Importance'][:10])
plt.xlabel('Importance')
plt.title('Top 10 Feature Importance')
plt.gca().invert_yaxis() # Invertir el eje y para que las características más importantes estén en la parte superior
plt.show()
# Gráfico de barras para visualizar las características más importantes
plt.figure(figsize=(10, 6))
plt.barh(feature_importance_df['Feature'][:10], feature_importance_df['Importance'][:10])
plt.xlabel('Importance')
plt.title('Top 10 Feature Importance')
plt.gca().invert_yaxis() # Invertir el eje y para que las características más importantes estén en la parte superior
plt.show()
In [18]:
Copied!
logger.info("Realizar predicciones")
def preprocess_applier(preprocessor, X_data):
# Aplicar el preprocesamiento a los datos
X_data_processed = preprocessor.transform(X_data)
# Obtener los nombres de las columnas después del preprocesamiento
numeric_feature_names = preprocessor.transformers_[0][-1]
categorical_feature_names = preprocessor.transformers_[1][-1]
# Obtener las categorías únicas de las variables categóricas
unique_categories = preprocessor.named_transformers_["cat"]["onehot"].categories_
# Crear los nombres de las columnas después del OneHotEncoding
encoded_categorical_feature_names = []
for i, categories in enumerate(unique_categories):
for category in categories:
encoded_categorical_feature_names.append(
f"{categorical_feature_names[i]}_{category}"
)
# Convertir la matriz dispersa a un DataFrame de Pandas
transformed_df = pd.DataFrame(
X_data_processed.toarray(),
columns=numeric_feature_names + encoded_categorical_feature_names,
)
return transformed_df
X_test_processed2 = preprocess_applier(preprocessor, test.drop('PassengerId',axis=1))
predictions = model.predict(X_test_processed2)
test["Survived"] = predictions
test.head()
logger.info("Realizar predicciones")
def preprocess_applier(preprocessor, X_data):
# Aplicar el preprocesamiento a los datos
X_data_processed = preprocessor.transform(X_data)
# Obtener los nombres de las columnas después del preprocesamiento
numeric_feature_names = preprocessor.transformers_[0][-1]
categorical_feature_names = preprocessor.transformers_[1][-1]
# Obtener las categorías únicas de las variables categóricas
unique_categories = preprocessor.named_transformers_["cat"]["onehot"].categories_
# Crear los nombres de las columnas después del OneHotEncoding
encoded_categorical_feature_names = []
for i, categories in enumerate(unique_categories):
for category in categories:
encoded_categorical_feature_names.append(
f"{categorical_feature_names[i]}_{category}"
)
# Convertir la matriz dispersa a un DataFrame de Pandas
transformed_df = pd.DataFrame(
X_data_processed.toarray(),
columns=numeric_feature_names + encoded_categorical_feature_names,
)
return transformed_df
X_test_processed2 = preprocess_applier(preprocessor, test.drop('PassengerId',axis=1))
predictions = model.predict(X_test_processed2)
test["Survived"] = predictions
test.head()
2024-04-02 22:32:37.902 | INFO | __main__:<module>:1 - Realizar predicciones
Out[18]:
| PassengerId | Pclass | Sex | Age | SibSp | Parch | Fare | Cabin | Embarked | Survived | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 892 | 3 | male | 34.500 | 0 | 0 | 7.829 | N | Q | 0 |
| 1 | 893 | 3 | female | 47.000 | 1 | 0 | 7.000 | N | S | 0 |
| 2 | 894 | 2 | male | 62.000 | 0 | 0 | 9.688 | N | Q | 0 |
| 3 | 895 | 3 | male | 27.000 | 0 | 0 | 8.662 | N | S | 1 |
| 4 | 896 | 3 | female | 22.000 | 1 | 1 | 12.287 | N | S | 0 |
In [19]:
Copied!
logger.info("Guardar Resultados")
logger.info("Guardar Resultados")
2024-04-02 22:32:37.931 | INFO | __main__:<module>:1 - Guardar Resultados
In [20]:
Copied!
logger.info("Guardar resultados de las predicciones")
test.to_csv(path_final + "predictions.csv",index=False,sep=',')
logger.info("Guardar resultados de las predicciones")
test.to_csv(path_final + "predictions.csv",index=False,sep=',')
2024-04-02 22:32:37.946 | INFO | __main__:<module>:1 - Guardar resultados de las predicciones
In [21]:
Copied!
logger.info("Guardar resultados de los distintos modelos")
results_df.to_csv(path_final + "models_metrics.csv",index=False,sep=',')
logger.info("Guardar resultados de los distintos modelos")
results_df.to_csv(path_final + "models_metrics.csv",index=False,sep=',')
2024-04-02 22:32:37.961 | INFO | __main__:<module>:1 - Guardar resultados de los distintos modelos
In [22]:
Copied!
logger.info("Guardar Modelo" )
path_models = "../models/"
joblib.dump(model, path_models + 'modelo_titanic_rf.pkl')
logger.info("Guardar Modelo" )
path_models = "../models/"
joblib.dump(model, path_models + 'modelo_titanic_rf.pkl')
2024-04-02 22:32:37.976 | INFO | __main__:<module>:1 - Guardar Modelo
Out[22]:
['../models/modelo_titanic_rf.pkl']