diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index 0c5c160674bc408afdab31b9b64cf67d14027d3c..07fdc2b2e3353714c87e4bdcda39db7657275e5c 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -14,7 +14,8 @@
         }
     },
     "containerEnv": {
-        "SHELL": "/bin/bash"
+        "SHELL": "/bin/bash",
+        "MPLBACKEND": "Agg"
     },
     "remoteEnv": {
         "PATH": "/home/vscode/.local/bin:${containerEnv:PATH}"
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 9aeb88aa8b2be32a2e4ca920fddfa5d5881cb9a5..540ac63ddff3d1258c88aa227cc9fa2c9445af46 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -19,3 +19,12 @@ correlation_analysis:
     - out/
   script:
     - vqcfim correlation-analysis --train-data dataset/InjectionMolding_Train.csv --out out --correlation-threshold 0.9
+
+best_single_feature_regression:
+  extends: .run_script
+  artifacts:
+    expire_in: 1d
+    paths:
+      - out/
+  script:
+    - vqcfim best-single-feature-regression --train-data dataset/InjectionMolding_Train.csv --out out --target 'mass' --p-value-threshold 0.05
diff --git a/.vscode/launch.json b/.vscode/launch.json
index d478b9d80d1985421ebfa0fafb66f7d55d20b1da..1c202b64322dfd0aec4c5265a4c968a619c3e328 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -19,6 +19,24 @@
                 "-c",
                 "0.9"
             ]
+        },
+        {
+            "name": "Python Debugger: Best Single Feature Regression",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${workspaceFolder}/src/app.py",
+            "console": "integratedTerminal",
+            "args": [
+                "best-single-feature-regression",
+                "-t",
+                "dataset/InjectionMolding_Train.csv",
+                "--target",
+                "mass",
+                "-o",
+                "out",
+                "--p-value-threshold",
+                "0.05"
+            ]
         }
     ]
 }
diff --git a/pyproject.toml b/pyproject.toml
index 68b42c6b58d0fcc098daa6808d7521e90ead2593..96bd934250ea9c5bf9faf748e62552449dd54356 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,8 @@ description = "Virtual Quality Control for Injection Molding"
 dependencies = [
     "pandas >= 2.2.3, < 3.0.0",
     "seaborn >= 0.13.2, < 1.0.0",
-    "matplotlib >= 3.9.2, < 4.0.0"
+    "matplotlib >= 3.9.2, < 4.0.0",
+    "statsmodels >= 0.14.4, < 1.0.0",
 ]
 maintainers = [
     {name = "Andri Joos"},
diff --git a/src/app.py b/src/app.py
index a8ee507f1ccbe1798131a72c20e056b21eaa4c41..328a6be30c5b0169abd5607afef5f777951ed502 100644
--- a/src/app.py
+++ b/src/app.py
@@ -5,6 +5,7 @@ import seaborn as sns
 import matplotlib.pyplot as plt
 import math
 from typing import List, Tuple
+import statsmodels.api as sm
 
 TRAIN_DATA_ARG = '--train-data'
 TRAIN_DATA_ARG_SHORT = '-t'
@@ -14,6 +15,12 @@ DEFAULT_OUT_DIR = 'out/'
 CORRELATION_THRESHOLD_ARG = '--correlation-threshold'
 CORRELATION_THRESHOLD_ARG_SHORT = '-c'
 DEFAULT_CORRELATION_THRESHOLD = 0.9
+TARGET_ARG = '--target'
+P_VALUE_THRESHOLD_ARG = '--p-value-threshold'
+DEFAULT_P_VALUE_THRESHOLD = 0.05
+
+PVALUE_COLUMN_NAME = 'p-value'
+RSQUARED_COLUMN_NAME = 'R^2'
 
 def ensure_directory(directory: Path):
     directory.mkdir(parents=True, exist_ok=True)
@@ -25,7 +32,7 @@ def correlation_analysis(train_data_file: Path, out_dir: Path, correlation_thres
     # Display correlation matrix
     correlation_matrix = train_data.corr()
     plt.figure(figsize=(10, 8))
-    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
+    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
     plt.title('Correlation Matrix')
 
     ensure_directory(out_dir)
@@ -55,16 +62,87 @@ def correlation_analysis(train_data_file: Path, out_dir: Path, correlation_thres
     with open(correlations_file, 'w') as f:
         f.writelines(correlations)
 
+def single_feature_regression(data: pd.DataFrame, feature: str, target: str):
+    X = sm.add_constant(data[[feature]]) # Add constant for intercept
+    y = data[target]
+    model = sm.OLS(y, X).fit()
+    return model.pvalues.iloc[1], model.rsquared # index 0 is the intercept, index 1 the feature coefficient
+
+def best_single_feature_regression(train_data_file: Path, target: str, p_value_threshold: float, out_dir: Path):
+    train_data = pd.read_csv(train_data_file)
+    features = train_data.columns
+    features = features.drop(target)
+
+    evaluated_features = pd.DataFrame({
+        PVALUE_COLUMN_NAME: pd.Series(dtype='float'),
+        RSQUARED_COLUMN_NAME: pd.Series(dtype='float'),
+    })
+    for feature in features:
+        pvalue, rsquared = single_feature_regression(train_data, feature, target)
+        evaluated_features.loc[feature] = {PVALUE_COLUMN_NAME: pvalue, RSQUARED_COLUMN_NAME: rsquared}
+
+    print('Evaluated features')
+    print(evaluated_features)
+
+    plt.figure(figsize=(1.75, 4.8))
+    evaluated_pvalues = evaluated_features[[PVALUE_COLUMN_NAME]]
+    sns.heatmap(evaluated_pvalues, annot=True, cmap='coolwarm', vmin=0, vmax=1)
+
+    ensure_directory(out_dir)
+    evaluated_pvalues_file_path = out_dir / "evaluated_pvalues.png"
+    plt.savefig(evaluated_pvalues_file_path, bbox_inches='tight')
+
+    plt.figure(figsize=(1.75, 4.8))
+    evaluated_rsquares = evaluated_features[[RSQUARED_COLUMN_NAME]]
+    sns.heatmap(evaluated_rsquares, annot=True, cmap='coolwarm', vmin=0, vmax=1)
+
+    ensure_directory(out_dir)
+    evaluated_rsquares_file_path = out_dir / "evaluated_rsquares.png"
+    plt.savefig(evaluated_rsquares_file_path, bbox_inches='tight')
+
+    possible_features = evaluated_features.where(evaluated_features[PVALUE_COLUMN_NAME] < p_value_threshold).dropna()
+    possible_features = possible_features.sort_values(RSQUARED_COLUMN_NAME, ascending=False)
+
+    print()
+    print('Features matching p-value-threshold')
+    print(possible_features)
+
+    best_feature: pd.Series = None
+    if possible_features.shape[0] > 0:
+        best_feature = possible_features.iloc[0]
+    else:
+        print('No feature meets all criteria')
+        return
+
+    print()
+    print('Best Feature')
+    print(best_feature)
+
+    ensure_directory(out_dir)
+    best_feature_file = out_dir / 'best_feature.txt'
+    with open(best_feature_file, 'w') as f:
+        f.write(f'''Name: {best_feature.name}
+p-value: {best_feature[PVALUE_COLUMN_NAME]}
+R^2: {best_feature[RSQUARED_COLUMN_NAME]}
+''')
+
 def main():
     argument_parser = ArgumentParser('vqcfim', description='Virtual Quality Control for Injection Molding')
     subparsers = argument_parser.add_subparsers(title='action')
 
-    correlation_analysis_subparser = subparsers.add_parser('correlation-analysis', aliases=['c'], description='Gets variables with correlation coefficients >= 0.9')
+    correlation_analysis_subparser = subparsers.add_parser('correlation-analysis', aliases=['ca'], description='Gets variables with correlation coefficients >= --correlation-threshold')
     correlation_analysis_subparser.add_argument(TRAIN_DATA_ARG, TRAIN_DATA_ARG_SHORT, action='store', type=Path, required=True)
     correlation_analysis_subparser.add_argument(OUT_DIR_ARG, OUT_DIR_ARG_SHORT, action='store', type=Path, required=False, default=DEFAULT_OUT_DIR)
     correlation_analysis_subparser.add_argument(CORRELATION_THRESHOLD_ARG, CORRELATION_THRESHOLD_ARG_SHORT, action='store', type=float, required=False, default=DEFAULT_CORRELATION_THRESHOLD)
     correlation_analysis_subparser.set_defaults(func=lambda train_data, out, correlation_threshold, func: correlation_analysis(train_data, out, correlation_threshold))
 
+    best_single_feature_regression_subparser = subparsers.add_parser('best-single-feature-regression', aliases=['bsfr'], description='Evaluates single-feature regressions and selects the best feature for the target')
+    best_single_feature_regression_subparser.add_argument(TRAIN_DATA_ARG, TRAIN_DATA_ARG_SHORT, action='store', type=Path, required=True)
+    best_single_feature_regression_subparser.add_argument(TARGET_ARG, action='store', type=str, required=True)
+    best_single_feature_regression_subparser.add_argument(OUT_DIR_ARG, OUT_DIR_ARG_SHORT, action='store', type=Path, required=False, default=DEFAULT_OUT_DIR)
+    best_single_feature_regression_subparser.add_argument(P_VALUE_THRESHOLD_ARG, action='store', type=float, required=False, default=DEFAULT_P_VALUE_THRESHOLD)
+    best_single_feature_regression_subparser.set_defaults(func=lambda train_data, target, out, p_value_threshold, func: best_single_feature_regression(train_data, target, p_value_threshold, out))
+
     parsed_args = argument_parser.parse_args()
     args = vars(parsed_args)
     parsed_args.func(**args)
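
For reference, the feature selection added to src/app.py boils down to fitting one ordinary-least-squares model per candidate feature with statsmodels, reading the p-value of the feature coefficient and the R^2, discarding features above the p-value threshold, and keeping the feature with the highest R^2. A minimal standalone sketch of that idea follows; the column names ('pressure', 'temperature', 'mass') are hypothetical placeholders, not taken from InjectionMolding_Train.csv:

import pandas as pd
import statsmodels.api as sm

# Hypothetical toy data; in the CLI these columns come from the training CSV.
data = pd.DataFrame({
    'pressure': [10.0, 12.0, 11.5, 13.0, 12.5],
    'temperature': [200.0, 210.0, 205.0, 215.0, 212.0],
    'mass': [5.1, 5.5, 5.3, 5.8, 5.6],
})
target = 'mass'

results = {}
for feature in data.columns.drop(target):
    X = sm.add_constant(data[[feature]])  # column 0: intercept, column 1: feature
    model = sm.OLS(data[target], X).fit()
    # pvalues index 0 belongs to the intercept; index 1 is the feature coefficient
    results[feature] = (model.pvalues.iloc[1], model.rsquared)

# Keep features whose coefficient is significant, then pick the highest R^2.
significant = {f: (p, r2) for f, (p, r2) in results.items() if p < 0.05}
best = max(significant, key=lambda f: significant[f][1]) if significant else None
print(results)
print('best feature:', best)

Because sm.add_constant keeps an intercept in every fit, pvalues.iloc[1] always refers to the feature coefficient rather than the intercept, which is what the patched single_feature_regression relies on as well.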