diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9aeb88aa8b2be32a2e4ca920fddfa5d5881cb9a5..540ac63ddff3d1258c88aa227cc9fa2c9445af46 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -19,3 +19,12 @@ correlation_analysis: - out/ script: - vqcfim correlation-analysis --train-data dataset/InjectionMolding_Train.csv --out out --correlation-threshold 0.9 + +best_single_feature_regression: + extends: .run_script + artifacts: + expire_in: 1d + paths: + - out/ + script: + - vqcfim best-single-feature-regression --train-data dataset/InjectionMolding_Train.csv --out out --target 'mass' --p-value-threshold 0.05 diff --git a/.vscode/launch.json b/.vscode/launch.json index d478b9d80d1985421ebfa0fafb66f7d55d20b1da..1c202b64322dfd0aec4c5265a4c968a619c3e328 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -19,6 +19,24 @@ "-c", "0.9" ] + }, + { + "name": "Python Debugger: Best Single Feature Regression", + "type": "debugpy", + "request": "launch", + "program": "${workspaceFolder}/src/app.py", + "console": "integratedTerminal", + "args": [ + "best-single-feature-regression", + "-t", + "dataset/InjectionMolding_Train.csv", + "--target", + "mass", + "-o", + "out", + "--p-value-threshold", + "0.05" + ] } ] } diff --git a/pyproject.toml b/pyproject.toml index 4575b5f9b16fc087ad33f90a51d2671b1e3f6d5d..96bd934250ea9c5bf9faf748e62552449dd54356 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,7 @@ dependencies = [ "pandas >= 2.2.3, < 3.0.0", "seaborn >= 0.13.2, < 1.0.0", "matplotlib >= 3.9.2, < 4.0.0", + "statsmodels >= 0.14.4, < 1.0.0", ] maintainers = [ {name = "Andri Joos"}, diff --git a/src/app.py b/src/app.py index 49c94ebed45ae64fe4fc078d87b2cc7fb99c5bac..d58463d3f22753fb75e635bf65091eafe2f12a51 100644 --- a/src/app.py +++ b/src/app.py @@ -5,6 +5,7 @@ import seaborn as sns import matplotlib.pyplot as plt import math from typing import List, Tuple +import statsmodels.api as sm TRAIN_DATA_ARG = '--train-data' TRAIN_DATA_ARG_SHORT = '-t' @@ -14,6 +15,12 @@ DEFAULT_OUT_DIR = 'out/' CORRELATION_THRESHOLD_ARG = '--correlation-threshold' CORRELATION_THRESHOLD_ARG_SHORT = '-c' DEFAULT_CORRELATION_THRESHOLD = 0.9 +TARGET_ARG = '--target' +P_VALUE_THRESHOLD_ARG = '--p-value-threshold' +DEFAULT_P_VALUE_THRESHOLD = 0.05 + +PVALUE_COLUMN_NAME = 'p-value' +RSQUARED_COLUMN_NAME = 'R^2' def ensure_directory(directory: Path): directory.mkdir(parents=True, exist_ok=True) @@ -55,6 +62,64 @@ def correlation_analysis(train_data_file: Path, out_dir: Path, correlation_thres with open(correlations_file, 'w') as f: f.writelines(correlations) +def single_feature_regression(data: pd.DataFrame, feature: str, target: str): + X = sm.add_constant(data[[feature]]) # Add constant for intercept + y = data[target] + model = sm.OLS(y, X).fit() + return model.pvalues.iloc[1], model.rsquared + +def best_single_feature_regression(train_data_file: Path, target: str, p_value_threshold: float, out_dir: Path): + train_data = pd.read_csv(train_data_file) + features = train_data.columns + features = features.drop(target) + + evaluated_features = pd.DataFrame({ + PVALUE_COLUMN_NAME: pd.Series(dtype='float'), + RSQUARED_COLUMN_NAME: pd.Series(dtype='float'), + }) + for feature in features: + pvalue, rsquared = single_feature_regression(train_data, feature, target) + evaluated_features.loc[feature] = {PVALUE_COLUMN_NAME: pvalue, RSQUARED_COLUMN_NAME: rsquared} + + print('Evaluated features') + print(evaluated_features) + + plt.figure(figsize=(1.75, 4.8)) + evaluated_pvalues = evaluated_features[[PVALUE_COLUMN_NAME]] + sns.heatmap(evaluated_pvalues, annot=True, cmap='coolwarm', vmin=0, vmax=1) + + ensure_directory(out_dir) + evaluated_pvalues_file_path = out_dir / "evaluated_pvalues.png" + plt.savefig(evaluated_pvalues_file_path, bbox_inches='tight') + + plt.figure(figsize=(1.75, 4.8)) + evaluated_rsquares = evaluated_features[[RSQUARED_COLUMN_NAME]] + sns.heatmap(evaluated_rsquares, annot=True, cmap='coolwarm', vmin=0, vmax=1) + + ensure_directory(out_dir) + evaluated_rsquares_file_path = out_dir / "evaluated_rsquares.png" + plt.savefig(evaluated_rsquares_file_path, bbox_inches='tight') + + best_feature: pd.Series = None + for _, row in evaluated_features.iterrows(): + pvalue = row[PVALUE_COLUMN_NAME] + rsquared = row[RSQUARED_COLUMN_NAME] + + if best_feature is None or (pvalue < best_feature[PVALUE_COLUMN_NAME] and rsquared > best_feature[RSQUARED_COLUMN_NAME]): + best_feature = row + + print() + print('Best Feature') + print(best_feature) + + ensure_directory(out_dir) + best_feature_file = out_dir / 'best_feature.txt' + with open(best_feature_file, 'w') as f: + f.write(f'''Name: {best_feature.name} +p-value: {best_feature[PVALUE_COLUMN_NAME]} +R^2: {best_feature[RSQUARED_COLUMN_NAME]} +''') + def main(): argument_parser = ArgumentParser('vqcfim', description='Virtual Quality Control for Injection Molding') subparsers = argument_parser.add_subparsers(title='action') @@ -65,6 +130,13 @@ def main(): correlation_analysis_subparser.add_argument(CORRELATION_THRESHOLD_ARG, CORRELATION_THRESHOLD_ARG_SHORT, action='store', type=float, required=False, default=DEFAULT_CORRELATION_THRESHOLD) correlation_analysis_subparser.set_defaults(func=lambda train_data, out, correlation_threshold, func: correlation_analysis(train_data, out, correlation_threshold)) + best_single_feature_regression_subparser = subparsers.add_parser('best-single-feature-regression', aliases=['bsfr'], description='Evaluates the best single feature regression feature from the dataset') + best_single_feature_regression_subparser.add_argument(TRAIN_DATA_ARG, TRAIN_DATA_ARG_SHORT, action='store', type=Path, required=True) + best_single_feature_regression_subparser.add_argument(TARGET_ARG, action='store', type=str, required=True) + best_single_feature_regression_subparser.add_argument(OUT_DIR_ARG, OUT_DIR_ARG_SHORT, action='store', type=Path, required=False, default=DEFAULT_OUT_DIR) + best_single_feature_regression_subparser.add_argument(P_VALUE_THRESHOLD_ARG, action='store', type=float, required=False, default=DEFAULT_P_VALUE_THRESHOLD) + best_single_feature_regression_subparser.set_defaults(func=lambda train_data, target, out, p_value_threshold, func: best_single_feature_regression(train_data, target, p_value_threshold, out)) + parsed_args = argument_parser.parse_args() args = vars(parsed_args) parsed_args.func(**args)