From 87e929259a22b714aed9cc62a2e4bdc04d498cf6 Mon Sep 17 00:00:00 2001
From: Andri Joos <andri@joos.io>
Date: Wed, 20 Nov 2024 15:20:03 +0100
Subject: [PATCH] add covariance matrix calculation

---
 .vscode/launch.json                      |  7 +++
 app/preprocessing/covariance_matrices.py | 66 ++++++++++++++++++++++++
 pyproject.toml                           |  3 +
 3 files changed, 76 insertions(+)
 create mode 100644 app/preprocessing/covariance_matrices.py

diff --git a/.vscode/launch.json b/.vscode/launch.json
index d27f7ce..5a76904 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -19,5 +19,12 @@
                 "--no-parallelization",
             ],
         },
+        {
+            "name": "Python Debugger: [test] covariance matrices",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${workspaceFolder}/app/preprocessing/covariance_matrices.py",
+            "console": "integratedTerminal"
+        },
     ]
 }
diff --git a/app/preprocessing/covariance_matrices.py b/app/preprocessing/covariance_matrices.py
new file mode 100644
index 0000000..a954af9
--- /dev/null
+++ b/app/preprocessing/covariance_matrices.py
@@ -0,0 +1,66 @@
+"""Compute and plot the average absolute covariance matrix per maneuver class.
+
+Every parquet file in ``parquet_dir`` is loaded, a covariance matrix is
+computed per value of the ``Maneuver`` column, and the per-file matrices are
+averaged per class. Each averaged matrix is printed and saved as a heatmap
+in ``output_dir``.
+"""
+
+import os
+from collections import defaultdict
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from sklearn.preprocessing import StandardScaler
+
+# Directory containing the preprocessed parquet files.
+parquet_dir = 'dataset/preprocessing/transformed_data/full_dataset_transformation/removed_unimportant_predictors'
+
+# Directory the heatmaps are written to.
+output_dir = 'out'
+
+# Per-file covariance matrices, grouped by maneuver class.
+covariances = defaultdict(list)
+
+# Sorted so that the processing order does not depend on the file system.
+parquet_files = sorted(f for f in os.listdir(parquet_dir) if f.endswith('.parquet'))  # TODO: use utils
+
+for file in parquet_files:
+    file_path = os.path.join(parquet_dir, file)
+    print(f'Collecting covariance from {file}')
+    df = pd.read_parquet(file_path)
+
+    # One covariance matrix per class (Maneuver) in the current file.
+    for maneuver_class in df['Maneuver'].unique():
+        class_data: pd.DataFrame = df[df['Maneuver'] == maneuver_class].drop(columns=['Maneuver'])
+        # Absolute values are stored, so opposite signs in different files
+        # cannot cancel each other out in the average.
+        cov_matrix = class_data.cov().abs()
+        covariances[maneuver_class].append(cov_matrix)
+
+# Average the per-file matrices of each class element-wise.
+avg_covariances = {}
+for maneuver_class, cov_matrices in covariances.items():
+    # Keep the feature labels: np.mean on a list of DataFrames returns a bare
+    # ndarray, which would leave the heatmap axes without feature names.
+    labels = cov_matrices[0].columns
+    stacked = np.stack([m.reindex(index=labels, columns=labels).to_numpy() for m in cov_matrices])
+    avg_covariances[maneuver_class] = pd.DataFrame(stacked.mean(axis=0), index=labels, columns=labels)
+
+for maneuver_class, cov_matrix in avg_covariances.items():
+    print(f"Average Covariance Matrix for class {maneuver_class}:\n{cov_matrix}\n")
+
+# savefig does not create missing directories.
+os.makedirs(output_dir, exist_ok=True)
+
+for maneuver_class, cov_matrix in avg_covariances.items():
+    plt.figure(figsize=(10, 8))
+    sns.heatmap(cov_matrix, annot=False, fmt=".2f", cmap="viridis", cbar=True)
+    plt.title(f"Average Covariance Matrix for Class '{maneuver_class}'")
+    plt.xlabel("Features")
+    plt.ylabel("Features")
+    plt.savefig(os.path.join(output_dir, f'{maneuver_class}.png'))
+    # Release the figure; pyplot otherwise keeps every figure alive.
+    plt.close()
diff --git a/pyproject.toml b/pyproject.toml
index ecccbe7..b15d586 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,6 +7,9 @@ dependencies = [
     "pyarrow >= 18.0.0, < 19.0.0",
     "joblib >= 1.4.2, < 2.0.0",
     "psutil >= 6.1.0, < 7.0.0",
+    "scikit-learn >= 1.5.2, < 2.0.0",
+    "matplotlib >= 3.9.2, < 4.0.0",
+    "seaborn >= 0.13.2, < 1.0.0",
 ]
 maintainers = [
     { name = "Andri Joos" },
-- 
GitLab