diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 60a71e69a0ef8c27c13e2f9be1d0e2e8a5b895b6..156899f384734b871abcd8c869c94d27406fa211 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -60,3 +60,13 @@ model_evaluation: script: - vqcfim model-evaluation --model out/multi_feature_regression_model.pickle --test-data dataset/InjectionMolding_Test.csv --target mass --out out --features Inj1PosVolAct_Var Inj1PrsAct_meanOfInjPhase Inj1HtgEd3Act_1stPCscore ClpFceAct_1stPCscore + +quadratic_term: + extends: .run_script + artifacts: + expire_in: 1d + paths: + - out/ + script: + - vqcfim higher-order-terms --train-data dataset/InjectionMolding_Train.csv --test-data dataset/InjectionMolding_Test.csv --target mass --out out + --features Inj1PosVolAct_Var Inj1PrsAct_meanOfInjPhase Inj1HtgEd3Act_1stPCscore ClpFceAct_1stPCscore --degree 2 diff --git a/.vscode/launch.json b/.vscode/launch.json index dc0c266e3137de7a184e21d8ab0969c99353307d..0a103a2083f2b83200829e500c88b61b0be222fe 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -98,6 +98,58 @@ "Inj1HtgEd3Act_1stPCscore", "ClpFceAct_1stPCscore" ] + }, + { + "name": "Python Debugger: Higher Order Terms", + "type": "debugpy", + "request": "launch", + "program": "${workspaceFolder}/src/app.py", + "console": "integratedTerminal", + "args": [ + "higher-order-terms", + "--train-data", + "dataset/InjectionMolding_Train.csv", + "--test-data", + "dataset/InjectionMolding_Test.csv", + "--target", + "mass", + "-o", + "out", + "-f", + "Inj1PosVolAct_Var", + "Inj1PrsAct_meanOfInjPhase", + "Inj1HtgEd3Act_1stPCscore", + "ClpFceAct_1stPCscore", + "--degree", + "2" + ] + }, + { + "name": "Python Debugger: Best Higher Order Terms", + "type": "debugpy", + "request": "launch", + "program": "${workspaceFolder}/src/app.py", + "console": "integratedTerminal", + "args": [ + "best-higher-order-terms", + "--train-data", + "dataset/InjectionMolding_Train.csv", + "--test-data", + "dataset/InjectionMolding_Test.csv", + "--target", + "mass", + "-o", + "out/higher-order-terms", + "-f", + "Inj1PosVolAct_Var", + "Inj1PrsAct_meanOfInjPhase", + "Inj1HtgEd3Act_1stPCscore", + "ClpFceAct_1stPCscore", + "--degree-bottom-boundary", + "1", + "--degree-top-boundary", + "50" + ] } ] } diff --git a/src/app.py b/src/app.py index 248c8d42119ec8e52d51a7c5c3c7cbcc6cacfabc..6fc71bdce151d33a4673f2cfa7946ebf53fedfc7 100644 --- a/src/app.py +++ b/src/app.py @@ -7,6 +7,9 @@ import math from typing import List, Tuple import statsmodels.api as sm from sklearn.metrics import mean_squared_error # is not deprecated, only squared param +from sklearn.linear_model import LinearRegression +from sklearn.preprocessing import PolynomialFeatures +from sklearn.pipeline import Pipeline TRAIN_DATA_ARG = '--train-data' TRAIN_DATA_ARG_SHORT = '-t' @@ -24,9 +27,17 @@ FEATURES_ARG_SHORT = '-f' MODEL_ARG = '--model' MODEL_ARG_SHORT = '-m' TEST_DATA_ARG = '--test-data' +DEGREE_ARG = '--degree' +INTERACTION_ONLY_ARG = '--interaction-only' +DEGREE_BOTTOM_BOUNDARY_ARG = '--degree-bottom-boundary' +DEGREE_TOP_ARG = '--degree-top-boundary' PVALUE_COLUMN_NAME = 'p-value' RSQUARED_COLUMN_NAME = 'R^2' +DEGREE_COLUMN_NAME = 'degree' +INTERACTION_ONLY_COLUMN_NAME = 'interaction_only' +TRAIN_MSE_COLUMN_NAME = 'train_mse' +TEST_MSE_COLUMN_NAME = 'test_mse' def ensure_directory(directory: Path): directory.mkdir(parents=True, exist_ok=True) @@ -185,6 +196,79 @@ def model_evaluation(model_file: Path, test_data_file: Path, features: List[str] with open(model_evaluation_file, 'w') as f: f.write(f'''test_MSE: {test_mse} ''') + +def higher_order_terms(train_data_file: Path, test_data_file: Path, features: List[str], target: str, interaction_only: bool, degree: int, out_dir: Path): + train_data = pd.read_csv(train_data_file) + test_data = pd.read_csv(test_data_file) + + X_train = train_data[features] + y_train = train_data[target] + + X_test = test_data[features] + y_test = test_data[target] + + pipeline = Pipeline([ + ('poly', PolynomialFeatures(degree=degree, include_bias=False, interaction_only=interaction_only)), + ('linear', LinearRegression()) + ]) + + pipeline.fit(X_train, y_train) + y_pred = pipeline.predict(X_train) + train_mse = mean_squared_error(y_train, y_pred) + rsquared = pipeline.score(X_train, y_train) + + y_pred = pipeline.predict(X_test) + test_mse = mean_squared_error(y_test, y_pred) + + print(f'train MSE: {train_mse}, test mse: {test_mse}, R^2: {rsquared}') + + ensure_directory(out_dir) + output_file = out_dir / f'higher_order_terms_{degree}_{interaction_only}.txt' + with open(output_file, 'w') as f: + f.write(f'''train MSE: {train_mse} +test MSE: {test_mse} +rsquared: {rsquared} +''') + + return train_mse, test_mse, rsquared + +def best_higher_order_terms(train_data_file: Path, test_data_file: Path, features: List[str], target: str, degree_bottom_boundary: int, degree_top_boundary: int, out_dir: Path): + if degree_bottom_boundary > degree_top_boundary: + raise ValueError('degree bottom boundary must be smaller or equal than degree top boundary') + + configurations = pd.DataFrame({ + DEGREE_COLUMN_NAME: pd.Series(dtype=int), + INTERACTION_ONLY_COLUMN_NAME: pd.Series(dtype=bool), + TRAIN_MSE_COLUMN_NAME: pd.Series(dtype=float), + TEST_MSE_COLUMN_NAME: pd.Series(dtype=float), + RSQUARED_COLUMN_NAME: pd.Series(dtype='float'), + }) + for degree in range(degree_bottom_boundary, degree_top_boundary + 1): + interaction_only = False + train_mse, test_mse, rsquared = higher_order_terms(train_data_file, test_data_file, features, target, interaction_only, degree, out_dir) + configurations.loc[-1] = { + DEGREE_COLUMN_NAME: degree, + INTERACTION_ONLY_COLUMN_NAME: interaction_only, + TRAIN_MSE_COLUMN_NAME: train_mse, + TEST_MSE_COLUMN_NAME: test_mse, + RSQUARED_COLUMN_NAME: rsquared, + } + configurations.index = configurations.index + 1 + + interaction_only = True + train_mse, test_mse, rsquared = higher_order_terms(train_data_file, test_data_file, features, target, interaction_only, degree, out_dir) + configurations.loc[-1] = { + DEGREE_COLUMN_NAME: degree, + INTERACTION_ONLY_COLUMN_NAME: interaction_only, + TRAIN_MSE_COLUMN_NAME: train_mse, + TEST_MSE_COLUMN_NAME: test_mse, + RSQUARED_COLUMN_NAME: rsquared, + } + configurations.index = configurations.index + 1 + + configurations = configurations.sort_values(RSQUARED_COLUMN_NAME, ascending=False) + + print(configurations) def main(): argument_parser = ArgumentParser('vqcfim', description='Virtual Quality Control for Injection Molding') @@ -220,6 +304,26 @@ def main(): model_evaluation_subparser.add_argument(FEATURES_ARG, FEATURES_ARG_SHORT, action='store', type=str, required=True, nargs='+') model_evaluation_subparser.set_defaults(func=lambda model, test_data, target, out, features, func: model_evaluation(model, test_data, features, target, out)) + higher_order_terms_subparser = subparsers.add_parser('higher-order-terms', aliases=['hot'], description='Creates and evaluates a model with higher order terms') + higher_order_terms_subparser.add_argument(TRAIN_DATA_ARG, TRAIN_DATA_ARG_SHORT, action='store', type=Path, required=True) + higher_order_terms_subparser.add_argument(TEST_DATA_ARG, action='store', type=Path, required=True) + higher_order_terms_subparser.add_argument(FEATURES_ARG, FEATURES_ARG_SHORT, action='store', type=str, required=True, nargs='+') + higher_order_terms_subparser.add_argument(TARGET_ARG, action='store', type=str, required=True) + higher_order_terms_subparser.add_argument(OUT_DIR_ARG, OUT_DIR_ARG_SHORT, action='store', type=Path, required=False, default=DEFAULT_OUT_DIR) + higher_order_terms_subparser.add_argument(DEGREE_ARG, action='store', type=int, required=True) + higher_order_terms_subparser.add_argument(INTERACTION_ONLY_ARG, action='store_true', required=False) + higher_order_terms_subparser.set_defaults(func=lambda train_data, test_data, features, target, degree, interaction_only, out, func: higher_order_terms(train_data, test_data, features, target, interaction_only, degree, out)) + + best_higher_order_terms_subparser = subparsers.add_parser('best-higher-order-terms', aliases=['bhot'], description='Creates and evaluates models with higher order terms') + best_higher_order_terms_subparser.add_argument(TRAIN_DATA_ARG, TRAIN_DATA_ARG_SHORT, action='store', type=Path, required=True) + best_higher_order_terms_subparser.add_argument(TEST_DATA_ARG, action='store', type=Path, required=True) + best_higher_order_terms_subparser.add_argument(FEATURES_ARG, FEATURES_ARG_SHORT, action='store', type=str, required=True, nargs='+') + best_higher_order_terms_subparser.add_argument(TARGET_ARG, action='store', type=str, required=True) + best_higher_order_terms_subparser.add_argument(OUT_DIR_ARG, OUT_DIR_ARG_SHORT, action='store', type=Path, required=False, default=DEFAULT_OUT_DIR) + best_higher_order_terms_subparser.add_argument(DEGREE_BOTTOM_BOUNDARY_ARG, action='store', type=int, required=True) + best_higher_order_terms_subparser.add_argument(DEGREE_TOP_ARG, action='store', type=int, required=True) + best_higher_order_terms_subparser.set_defaults(func=lambda train_data, test_data, features, degree_bottom_boundary, degree_top_boundary, target, out, func: best_higher_order_terms(train_data, test_data, features, target, degree_bottom_boundary, degree_top_boundary, out)) + parsed_args = argument_parser.parse_args() args = vars(parsed_args) parsed_args.func(**args)