From 3fa701ef4cbff8f4a388ff70949e6f69699e2b3f Mon Sep 17 00:00:00 2001 From: Andri Joos <andri@joos.io> Date: Fri, 15 Nov 2024 00:16:34 +0100 Subject: [PATCH] set variance threshold to ~0 --- app/preprocessing/transform_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/app/preprocessing/transform_dataset.py b/app/preprocessing/transform_dataset.py index 914be5a..bc44285 100644 --- a/app/preprocessing/transform_dataset.py +++ b/app/preprocessing/transform_dataset.py @@ -18,7 +18,7 @@ from .json_maneuver_data import JsonManeuverData DOUBLE_PATTERN = r'Double(\d+)' MAX_DATASET_MEMORY_SIZE = 16602933278 MIN_JOBS = 2 -VARIANCE_THRESHOLD = 0.01 +VARIANCE_THRESHOLD = 1e-10 CORRELATION_THRESHOLD = 0.9 Y_CLASS_COLUMN = 'Maneuver' MANUALLY_EXCLUDED_COLUMNS = [ @@ -176,7 +176,7 @@ def _remove_unimportant_predictors(train_files: List[Path], all_files: List[Path df = pd.read_parquet(file) columns_to_analyze = [col for col in df.columns if col not in columns_to_keep] - columns_to_keep.update([col for col in columns_to_analyze if np.std(df[col]) >= VARIANCE_THRESHOLD]) + columns_to_keep.update([col for col in columns_to_analyze if np.var(df[col]) > VARIANCE_THRESHOLD]) df: pd.DataFrame = df.drop(columns=columns_to_keep) -- GitLab