diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 87a75eee642b4b6fc5cfacfa2ec6ba9a385c72f1..60a71e69a0ef8c27c13e2f9be1d0e2e8a5b895b6 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -3,9 +3,9 @@ stages:
 
 .run_script:
   stage: run_scripts
-  image: python:3-alpine
+  image: python:3-bookworm
   tags:
-    - amd64 # needed for matplotlib
+    - amd64 # needed for matplotlib & scikit-learn
   variables:
     PIP_ROOT_USER_ACTION: ignore
   before_script:
@@ -47,3 +47,16 @@ multi_feature_regression_manual_features:
   script:
     - vqcfim multi-feature-regression --train-data dataset/InjectionMolding_Train.csv --out out --target 'mass'
       --features Inj1PosVolAct_Var Inj1PrsAct_meanOfInjPhase ClpFceAct_1stPCscore
+
+model_evaluation:
+  extends: .run_script
+  needs:
+    - job: multi_feature_regression_p_value
+      artifacts: true
+  artifacts:
+    expire_in: 1d
+    paths:
+      - out/
+  script:
+    - vqcfim model-evaluation --model out/multi_feature_regression_model.pickle --test-data dataset/InjectionMolding_Test.csv --target mass --out out
+      --features Inj1PosVolAct_Var Inj1PrsAct_meanOfInjPhase Inj1HtgEd3Act_1stPCscore ClpFceAct_1stPCscore
diff --git a/.vscode/launch.json b/.vscode/launch.json
index 7095acd0afa4655ca6828c956b177f15a774d751..dc0c266e3137de7a184e21d8ab0969c99353307d 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -75,6 +75,29 @@
                 "Inj1PrsAct_meanOfInjPhase",
                 "ClpFceAct_1stPCscore"
             ]
+        },
+        {
+            "name": "Python Debugger: Model Evaluation",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${workspaceFolder}/src/app.py",
+            "console": "integratedTerminal",
+            "args": [
+                "model-evaluation",
+                "-m",
+                "out/multi_feature_regression_model.pickle",
+                "--test-data",
+                "dataset/InjectionMolding_Test.csv",
+                "--target",
+                "mass",
+                "-o",
+                "out",
+                "-f",
+                "Inj1PosVolAct_Var",
+                "Inj1PrsAct_meanOfInjPhase",
+                "Inj1HtgEd3Act_1stPCscore",
+                "ClpFceAct_1stPCscore"
+            ]
         }
     ]
 }
diff --git a/pyproject.toml b/pyproject.toml
index 96bd934250ea9c5bf9faf748e62552449dd54356..2a8a3d189d3594d6122af6483c8895dc53beb33d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,6 +7,7 @@ dependencies = [
     "seaborn >= 0.13.2, < 1.0.0",
     "matplotlib >= 3.9.2, < 4.0.0",
     "statsmodels >= 0.14.4, < 1.0.0",
+    "scikit-learn >= 1.5.2, < 2.0.0",
 ]
 maintainers = [
     {name = "Andri Joos"},
diff --git a/src/app.py b/src/app.py
index 8502fb2702134e5ca0e3c504131fc95821805195..248c8d42119ec8e52d51a7c5c3c7cbcc6cacfabc 100644
--- a/src/app.py
+++ b/src/app.py
@@ -6,6 +6,7 @@ import matplotlib.pyplot as plt
 import math
 from typing import List, Tuple
 import statsmodels.api as sm
+from sklearn.metrics import mean_squared_error # is not deprecated, only squared param
 
 TRAIN_DATA_ARG = '--train-data'
 TRAIN_DATA_ARG_SHORT = '-t'
@@ -20,6 +21,9 @@ P_VALUE_THRESHOLD_ARG = '--p-value-threshold'
 DEFAULT_P_VALUE_THRESHOLD = 0.05
 FEATURES_ARG = '--features'
 FEATURES_ARG_SHORT = '-f'
+MODEL_ARG = '--model'
+MODEL_ARG_SHORT = '-m'
+TEST_DATA_ARG = '--test-data'
 
 PVALUE_COLUMN_NAME = 'p-value'
 RSQUARED_COLUMN_NAME = 'R^2'
@@ -40,7 +44,7 @@ def get_possible_features_from_p_value(data: pd.DataFrame, target: str, p_value_
 
     return possible_features.where(possible_features[PVALUE_COLUMN_NAME] < p_value_threshold).dropna()
 
-def multi_feature_regression_model(train_data: pd.DataFrame, selected_features: List[str] | None, p_value_threshold: float | None, target: str) -> sm.OLS:
+def multi_feature_regression_model(train_data: pd.DataFrame, selected_features: List[str] | None, p_value_threshold: float | None, target: str) -> Tuple[sm.OLS, List[str]]:
     features: List[str] = None
     if selected_features is not None and p_value_threshold is None:
         features = selected_features
@@ -149,14 +153,37 @@ R^2: {best_feature[RSQUARED_COLUMN_NAME]}
 
 def multi_feature_regression(train_data_file: Path, target: str, selected_features: List[str] | None, p_value_threshold: float | None, out_dir: Path):
     train_data = pd.read_csv(train_data_file)
+    y = train_data[target]
 
     model, features = multi_feature_regression_model(train_data, selected_features, p_value_threshold, target)
     print(model.summary())
     ensure_directory(out_dir)
-    multi_feature_regression_results_file = out_dir / 'multi_feature_regression_results.txt'
+    multi_feature_regression_results_file = out_dir / 'multi_feature_regression_train_results.txt'
     with open(multi_feature_regression_results_file, 'w') as f:
         f.write(f'''features: {features}
-rsquared: {model.rsquared}
+rsquared: {model.rsquared},
+train_MSE: {mean_squared_error(y, model.fittedvalues)}
+''')
+
+    ensure_directory(out_dir)
+    model_file = out_dir / 'multi_feature_regression_model.pickle'
+    model.save(model_file)
+
+def model_evaluation(model_file: Path, test_data_file: Path, features: List[str], target: str, out_dir: Path):
+    test_data = pd.read_csv(test_data_file)
+    model: sm.OLS = sm.load(model_file)
+
+    X_test = sm.add_constant(test_data[features])
+    y_test = test_data[target]
+
+    y_pred = model.predict(X_test)
+    test_mse = mean_squared_error(y_test, y_pred)
+    print(f'Test MSE: {test_mse}')
+
+    ensure_directory(out_dir)
+    model_evaluation_file = out_dir / 'model_evaluation.txt'
+    with open(model_evaluation_file, 'w') as f:
+        f.write(f'''test_MSE: {test_mse}
 ''')
 
 def main():
@@ -185,6 +212,14 @@ def main():
     multi_feature_regression_features_group.add_argument(P_VALUE_THRESHOLD_ARG, action='store', type=float, required=False, default=None)
     multi_feature_regression_subparser.set_defaults(func=lambda train_data, target, out, features, p_value_threshold, func: multi_feature_regression(train_data, target, features, p_value_threshold, out))
 
+    model_evaluation_subparser = subparsers.add_parser('model-evaluation', aliases=['me'], description='Evaluates a model')
+    model_evaluation_subparser.add_argument(MODEL_ARG, MODEL_ARG_SHORT, action='store', type=Path, required=True)
+    model_evaluation_subparser.add_argument(TEST_DATA_ARG, action='store', type=Path, required=True)
+    model_evaluation_subparser.add_argument(TARGET_ARG, action='store', type=str, required=True)
+    model_evaluation_subparser.add_argument(OUT_DIR_ARG, OUT_DIR_ARG_SHORT, action='store', type=Path, required=False, default=DEFAULT_OUT_DIR)
+    model_evaluation_subparser.add_argument(FEATURES_ARG, FEATURES_ARG_SHORT, action='store', type=str, required=True, nargs='+')
+    model_evaluation_subparser.set_defaults(func=lambda model, test_data, target, out, features, func: model_evaluation(model, test_data, features, target, out))
+
     parsed_args = argument_parser.parse_args()
     args = vars(parsed_args)
     parsed_args.func(**args)
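Usage sketch (not part of the patch): the new model-evaluation flow can also be replayed directly in Python with the same statsmodels and scikit-learn calls the diff adds. The pickle path, test CSV, target and feature names below are copied from the CI job above; the sketch assumes the training job has already written out/multi_feature_regression_model.pickle.

import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error

features = ['Inj1PosVolAct_Var', 'Inj1PrsAct_meanOfInjPhase', 'Inj1HtgEd3Act_1stPCscore', 'ClpFceAct_1stPCscore']
test_data = pd.read_csv('dataset/InjectionMolding_Test.csv')

# sm.load unpickles the fitted OLS results saved via model.save() during training
model = sm.load('out/multi_feature_regression_model.pickle')

# add_constant prepends the intercept column, matching model_evaluation() in the patch
X_test = sm.add_constant(test_data[features])
y_pred = model.predict(X_test)

print('Test MSE:', mean_squared_error(test_data['mass'], y_pred))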