From 175985230b99ab170b9cd94ed343577f9f3bd326 Mon Sep 17 00:00:00 2001 From: Andri Joos <andri@joos.io> Date: Fri, 11 Oct 2024 20:29:54 +0200 Subject: [PATCH] add multi feature regression --- .gitlab-ci.yml | 19 ++++++++++++++ .vscode/launch.json | 38 +++++++++++++++++++++++++++ src/app.py | 64 +++++++++++++++++++++++++++++++++++++-------- 3 files changed, 110 insertions(+), 11 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 540ac63..87a75ee 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -28,3 +28,22 @@ best_single_feature_regression: - out/ script: - vqcfim best-single-feature-regression --train-data dataset/InjectionMolding_Train.csv --out out --target 'mass' --p-value-threshold 0.05 + +multi_feature_regression_p_value: + extends: .run_script + artifacts: + expire_in: 1d + paths: + - out/ + script: + - vqcfim multi-feature-regression --train-data dataset/InjectionMolding_Train.csv --out out --target 'mass' --p-value-threshold 0.05 + +multi_feature_regression_manual_features: + extends: .run_script + artifacts: + expire_in: 1d + paths: + - out/ + script: + - vqcfim multi-feature-regression --train-data dataset/InjectionMolding_Train.csv --out out --target 'mass' + --features Inj1PosVolAct_Var Inj1PrsAct_meanOfInjPhase ClpFceAct_1stPCscore diff --git a/.vscode/launch.json b/.vscode/launch.json index 1c202b6..7095acd 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -37,6 +37,44 @@ "--p-value-threshold", "0.05" ] + }, + { + "name": "Python Debugger: Multi Feature Regression with p-value", + "type": "debugpy", + "request": "launch", + "program": "${workspaceFolder}/src/app.py", + "console": "integratedTerminal", + "args": [ + "multi-feature-regression", + "-t", + "dataset/InjectionMolding_Train.csv", + "--target", + "mass", + "-o", + "out", + "--p-value-threshold", + "0.05" + ] + }, + { + "name": "Python Debugger: Multi Feature Regression with features", + "type": "debugpy", + "request": "launch", + "program": "${workspaceFolder}/src/app.py", + "console": "integratedTerminal", + "args": [ + "multi-feature-regression", + "-t", + "dataset/InjectionMolding_Train.csv", + "--target", + "mass", + "-o", + "out", + "-f", + "Inj1PosVolAct_Var", + "Inj1PrsAct_meanOfInjPhase", + "ClpFceAct_1stPCscore" + ] } ] } diff --git a/src/app.py b/src/app.py index 328a6be..8502fb2 100644 --- a/src/app.py +++ b/src/app.py @@ -18,6 +18,8 @@ DEFAULT_CORRELATION_THRESHOLD = 0.9 TARGET_ARG = '--target' P_VALUE_THRESHOLD_ARG = '--p-value-threshold' DEFAULT_P_VALUE_THRESHOLD = 0.05 +FEATURES_ARG = '--features' +FEATURES_ARG_SHORT = '-f' PVALUE_COLUMN_NAME = 'p-value' RSQUARED_COLUMN_NAME = 'R^2' @@ -25,6 +27,34 @@ RSQUARED_COLUMN_NAME = 'R^2' def ensure_directory(directory: Path): directory.mkdir(parents=True, exist_ok=True) +def get_possible_features_from_p_value(data: pd.DataFrame, target: str, p_value_threshold: float) -> pd.DataFrame: + features = data.columns + features = features.drop(target) + possible_features = pd.DataFrame({ + PVALUE_COLUMN_NAME: pd.Series(dtype='float'), + RSQUARED_COLUMN_NAME: pd.Series(dtype='float'), + }) + for feature in features: + pvalue, rsquared = single_feature_regression(data, feature, target) + possible_features.loc[feature] = {PVALUE_COLUMN_NAME: pvalue, RSQUARED_COLUMN_NAME: rsquared} + + return possible_features.where(possible_features[PVALUE_COLUMN_NAME] < p_value_threshold).dropna() + +def multi_feature_regression_model(train_data: pd.DataFrame, selected_features: List[str] | None, p_value_threshold: float | None, target: str) -> sm.OLS: + features: List[str] = None + if selected_features is not None and p_value_threshold is None: + features = selected_features + elif p_value_threshold is not None and selected_features is None: + features = list(get_possible_features_from_p_value(train_data, target, p_value_threshold).index) + else: + raise ValueError(f'selected_features is {selected_features} and p_value_threshold is {p_value_threshold}, but expected exactly one to be set') + + X = sm.add_constant(train_data[features]) + y = train_data[target] + model = sm.OLS(y, X).fit() + + return model, features + def correlation_analysis(train_data_file: Path, out_dir: Path, correlation_threshold: float): # Load training and test data train_data = pd.read_csv(train_data_file) @@ -62,7 +92,7 @@ def correlation_analysis(train_data_file: Path, out_dir: Path, correlation_thres with open(correlations_file, 'w') as f: f.writelines(correlations) -def single_feature_regression(data: pd.DataFrame, feature: str, target: str): +def single_feature_regression(data: pd.DataFrame, feature: str, target: str) -> Tuple[float, float]: X = sm.add_constant(data[[feature]]) # Add constant for intercept y = data[target] model = sm.OLS(y, X).fit() @@ -70,16 +100,7 @@ def single_feature_regression(data: pd.DataFrame, feature: str, target: str): def best_single_feature_regression(train_data_file: Path, target: str, p_value_threshold: float, out_dir: Path): train_data = pd.read_csv(train_data_file) - features = train_data.columns - features = features.drop(target) - - evaluated_features = pd.DataFrame({ - PVALUE_COLUMN_NAME: pd.Series(dtype='float'), - RSQUARED_COLUMN_NAME: pd.Series(dtype='float'), - }) - for feature in features: - pvalue, rsquared = single_feature_regression(train_data, feature, target) - evaluated_features.loc[feature] = {PVALUE_COLUMN_NAME: pvalue, RSQUARED_COLUMN_NAME: rsquared} + evaluated_features = get_possible_features_from_p_value(train_data, target, p_value_threshold) print('Evaluated features') print(evaluated_features) @@ -125,6 +146,18 @@ def best_single_feature_regression(train_data_file: Path, target: str, p_value_t p-value: {best_feature[PVALUE_COLUMN_NAME]} R^2: {best_feature[RSQUARED_COLUMN_NAME]} ''') + +def multi_feature_regression(train_data_file: Path, target: str, selected_features: List[str] | None, p_value_threshold: float | None, out_dir: Path): + train_data = pd.read_csv(train_data_file) + model, features = multi_feature_regression_model(train_data, selected_features, p_value_threshold, target) + print(model.summary()) + + ensure_directory(out_dir) + multi_feature_regression_results_file = out_dir / 'multi_feature_regression_results.txt' + with open(multi_feature_regression_results_file, 'w') as f: + f.write(f'''features: {features} +rsquared: {model.rsquared} +''') def main(): argument_parser = ArgumentParser('vqcfim', description='Virtual Quality Control for Injection Molding') @@ -143,6 +176,15 @@ def main(): best_single_feature_regression_subparser.add_argument(P_VALUE_THRESHOLD_ARG, action='store', type=float, required=False, default=DEFAULT_P_VALUE_THRESHOLD) best_single_feature_regression_subparser.set_defaults(func=lambda train_data, target, out, p_value_threshold, func: best_single_feature_regression(train_data, target, p_value_threshold, out)) + multi_feature_regression_subparser = subparsers.add_parser('multi-feature-regression', aliases=['mfr'], description='Performs a linear regression using multiple features') + multi_feature_regression_subparser.add_argument(TRAIN_DATA_ARG, TRAIN_DATA_ARG_SHORT, action='store', type=Path, required=True) + multi_feature_regression_subparser.add_argument(TARGET_ARG, action='store', type=str, required=True) + multi_feature_regression_subparser.add_argument(OUT_DIR_ARG, OUT_DIR_ARG_SHORT, action='store', type=Path, required=False, default=DEFAULT_OUT_DIR) + multi_feature_regression_features_group = multi_feature_regression_subparser.add_mutually_exclusive_group(required=True) + multi_feature_regression_features_group.add_argument(FEATURES_ARG, FEATURES_ARG_SHORT, action='store', type=str, required=False, nargs='+') + multi_feature_regression_features_group.add_argument(P_VALUE_THRESHOLD_ARG, action='store', type=float, required=False, default=None) + multi_feature_regression_subparser.set_defaults(func=lambda train_data, target, out, features, p_value_threshold, func: multi_feature_regression(train_data, target, features, p_value_threshold, out)) + parsed_args = argument_parser.parse_args() args = vars(parsed_args) parsed_args.func(**args) -- GitLab