diff --git a/.vscode/launch.json b/.vscode/launch.json
index d27f7ce6ce1aca86a170f9dfcecc720eafb26dd9..5a7690422ec58358efbe6e573bd2716a80ed926b 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -19,5 +19,12 @@
                 "--no-parallelization",
             ],
         },
+        {
+            "name": "Python Debugger: [test] covariance matrices",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${workspaceFolder}/app/preprocessing/covariance_matrices.py",
+            "console": "integratedTerminal"
+        },
     ]
 }
diff --git a/app/preprocessing/covariance_matrices.py b/app/preprocessing/covariance_matrices.py
new file mode 100644
index 0000000000000000000000000000000000000000..a954af994f264dd68a2534b67b6011a53053dd86
--- /dev/null
+++ b/app/preprocessing/covariance_matrices.py
@@ -0,0 +1,89 @@
+"""Compute and plot the average per-class covariance matrix of the dataset.
+
+Every parquet file in ``parquet_dir`` contributes one absolute covariance
+matrix per ``Maneuver`` class. The matrices of a class are averaged over all
+files, printed, and saved as one heatmap per class in ``output_dir``.
+"""
+import pandas as pd
+import numpy as np
+import os
+from collections import defaultdict
+from sklearn.preprocessing import StandardScaler
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+# Define the path to your parquet files directory
+parquet_dir = 'dataset/preprocessing/transformed_data/full_dataset_transformation/removed_unimportant_predictors'
+
+# Directory the heatmap images are written to
+output_dir = 'out'
+
+
+def collect_covariances(directory):
+    """Return the per-file absolute covariance matrices of every class.
+
+    The result maps each ``Maneuver`` value to a list holding one covariance
+    DataFrame per parquet file in which that class occurs.
+    """
+    covariances = defaultdict(list)
+    # Sorted because os.listdir returns entries in arbitrary order
+    parquet_files = sorted(f for f in os.listdir(directory) if f.endswith('.parquet'))  # TODO: use utils
+
+    # Process each parquet file individually
+    for file in parquet_files:
+        print(f'Collecting covariance from {file}')
+        df = pd.read_parquet(os.path.join(directory, file))
+
+        # Compute covariance for each class (Maneuver) in the current file
+        for maneuver_class in df['Maneuver'].unique():
+            class_data: pd.DataFrame = df[df['Maneuver'] == maneuver_class].drop(columns=['Maneuver'])
+            covariances[maneuver_class].append(class_data.cov().abs())
+
+    return covariances
+
+
+def average_covariance(cov_matrices):
+    """Return the element-wise mean of a list of covariance DataFrames.
+
+    The matrices are aligned on their row and column labels, so the result
+    keeps the feature names. NaN entries are ignored by the mean.
+    """
+    # np.mean over a list of DataFrames would return an unlabelled ndarray
+    return pd.concat(cov_matrices).groupby(level=0, sort=False).mean()
+
+
+def plot_covariance(maneuver_class, cov_matrix, directory):
+    """Save the heatmap of cov_matrix as a PNG named after the class in directory."""
+    plt.figure(figsize=(10, 8))
+    sns.heatmap(cov_matrix, annot=False, fmt=".2f", cmap="viridis", cbar=True)
+    plt.title(f"Average Covariance Matrix for Class '{maneuver_class}'")
+    plt.xlabel("Features")
+    plt.ylabel("Features")
+    plt.tight_layout()
+    plt.savefig(os.path.join(directory, f'{maneuver_class}.png'))
+    # Close the figure; pyplot otherwise keeps every figure in memory
+    plt.close()
+
+
+def main():
+    """Collect, average, print and plot the covariance matrices."""
+    covariances = collect_covariances(parquet_dir)
+
+    # Now, average the covariances for each class across all files
+    avg_covariances = {
+        maneuver_class: average_covariance(cov_matrices)
+        for maneuver_class, cov_matrices in covariances.items()
+    }
+
+    # Display results
+    for maneuver_class, cov_matrix in avg_covariances.items():
+        print(f"Average Covariance Matrix for class {maneuver_class}:\n{cov_matrix}\n")
+
+    # savefig does not create missing directories
+    os.makedirs(output_dir, exist_ok=True)
+    for maneuver_class, cov_matrix in avg_covariances.items():
+        plot_covariance(maneuver_class, cov_matrix, output_dir)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/pyproject.toml b/pyproject.toml
index ecccbe70c040824f9a3ab5e4d1af7e93fdd5be94..b15d5863b4c2decc4b0a839d5c69cdb595606321 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,6 +7,9 @@ dependencies = [
     "pyarrow >= 18.0.0, < 19.0.0",
     "joblib >= 1.4.2, < 2.0.0",
     "psutil >= 6.1.0, < 7.0.0",
+    "scikit-learn >= 1.5.2, < 2.0.0",
+    "matplotlib >= 3.9.2, < 4.0.0",
+    "seaborn >= 0.13.2, < 1.0.0",
 ]
 maintainers = [
     { name = "Andri Joos" },