diff --git a/app/preprocessing/transform_dataset.py b/app/preprocessing/transform_dataset.py index bc44285e9b19c9f0ac22236b60f13833a6a289cb..702f92776b928ffe243d13cda8bdc01701d4c127 100644 --- a/app/preprocessing/transform_dataset.py +++ b/app/preprocessing/transform_dataset.py @@ -178,16 +178,16 @@ def _remove_unimportant_predictors(train_files: List[Path], all_files: List[Path columns_to_analyze = [col for col in df.columns if col not in columns_to_keep] columns_to_keep.update([col for col in columns_to_analyze if np.var(df[col]) > VARIANCE_THRESHOLD]) - df: pd.DataFrame = df.drop(columns=columns_to_keep) + # df: pd.DataFrame = df.drop(columns=columns_to_keep) - correlation_matrix = df.corr().abs() + # correlation_matrix = df.corr().abs() - # Select the upper triangle of the correlation matrix - upper_tri = correlation_matrix.where( - np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool) - ) + # # Select the upper triangle of the correlation matrix + # upper_tri = correlation_matrix.where( + # np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool) + # ) - columns_to_keep.update([col for col in upper_tri.columns if all(upper_tri[col] <= CORRELATION_THRESHOLD)]) + # columns_to_keep.update([col for col in upper_tri.columns if all(upper_tri[col] <= CORRELATION_THRESHOLD)]) for file in all_files: print(f'Removing not important predictors from {file.name}')