Compare revisions

Andri Joos · Andri Joos · 6ba32d31 · 6ba32d31 · 6ba32d31
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -60,3 +60,13 @@ model_evaluation:
  script:
  - vqcfim model-evaluation --model out/multi_feature_regression_model.pickle --test-data dataset/InjectionMolding_Test.csv --target mass --out out
    --features Inj1PosVolAct_Var Inj1PrsAct_meanOfInjPhase Inj1HtgEd3Act_1stPCscore ClpFceAct_1stPCscore
+
+quadratic_term:
+  extends: .run_script
+  artifacts:
+    expire_in: 1d
+    paths:
+    - out/
+  script:
+  - vqcfim higher-order-terms --train-data dataset/InjectionMolding_Train.csv --test-data dataset/InjectionMolding_Test.csv --target mass --out out
+    --features Inj1PosVolAct_Var Inj1PrsAct_meanOfInjPhase Inj1HtgEd3Act_1stPCscore ClpFceAct_1stPCscore --degree 2
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -98,6 +98,58 @@
                "Inj1HtgEd3Act_1stPCscore",
                "ClpFceAct_1stPCscore"
            ]
+        },
+        {
+            "name": "Python Debugger: Higher Order Terms",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${workspaceFolder}/src/app.py",
+            "console": "integratedTerminal",
+            "args": [
+                "higher-order-terms",
+                "--train-data",
+                "dataset/InjectionMolding_Train.csv",
+                "--test-data",
+                "dataset/InjectionMolding_Test.csv",
+                "--target",
+                "mass",
+                "-o",
+                "out",
+                "-f",
+                "Inj1PosVolAct_Var",
+                "Inj1PrsAct_meanOfInjPhase",
+                "Inj1HtgEd3Act_1stPCscore",
+                "ClpFceAct_1stPCscore",
+                "--degree",
+                "2"
+            ]
+        },
+        {
+            "name": "Python Debugger: Best Higher Order Terms",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${workspaceFolder}/src/app.py",
+            "console": "integratedTerminal",
+            "args": [
+                "best-higher-order-terms",
+                "--train-data",
+                "dataset/InjectionMolding_Train.csv",
+                "--test-data",
+                "dataset/InjectionMolding_Test.csv",
+                "--target",
+                "mass",
+                "-o",
+                "out/higher-order-terms",
+                "-f",
+                "Inj1PosVolAct_Var",
+                "Inj1PrsAct_meanOfInjPhase",
+                "Inj1HtgEd3Act_1stPCscore",
+                "ClpFceAct_1stPCscore",
+                "--degree-bottom-boundary",
+                "1",
+                "--degree-top-boundary",
+                "50"
+            ]
        }
    ]
 }
--- a/src/app.py
+++ b/src/app.py
@@ -7,6 +7,9 @@ import math
 from typing import List, Tuple
 import statsmodels.api as sm
 from sklearn.metrics import mean_squared_error # is not deprecated, only squared param
+from sklearn.linear_model import LinearRegression
+from sklearn.preprocessing import PolynomialFeatures
+from sklearn.pipeline import Pipeline

 TRAIN_DATA_ARG = '--train-data'
 TRAIN_DATA_ARG_SHORT = '-t'
@@ -24,9 +27,17 @@ FEATURES_ARG_SHORT = '-f'
 MODEL_ARG = '--model'
 MODEL_ARG_SHORT = '-m'
 TEST_DATA_ARG = '--test-data'
+DEGREE_ARG = '--degree'
+INTERACTION_ONLY_ARG = '--interaction-only'
+DEGREE_BOTTOM_BOUNDARY_ARG = '--degree-bottom-boundary'
+DEGREE_TOP_ARG = '--degree-top-boundary'

 PVALUE_COLUMN_NAME = 'p-value'
 RSQUARED_COLUMN_NAME = 'R^2'
+DEGREE_COLUMN_NAME = 'degree'
+INTERACTION_ONLY_COLUMN_NAME = 'interaction_only'
+TRAIN_MSE_COLUMN_NAME = 'train_mse'
+TEST_MSE_COLUMN_NAME = 'test_mse'

 def ensure_directory(directory: Path):
    directory.mkdir(parents=True, exist_ok=True)
@@ -185,6 +196,79 @@ def model_evaluation(model_file: Path, test_data_file: Path, features: List[str]
    with open(model_evaluation_file, 'w') as f:
        f.write(f'''test_MSE: {test_mse}
 ''')
+        
+def higher_order_terms(train_data_file: Path, test_data_file: Path, features: List[str], target: str, interaction_only: bool, degree: int, out_dir: Path):
+    train_data = pd.read_csv(train_data_file)
+    test_data = pd.read_csv(test_data_file)
+
+    X_train = train_data[features]
+    y_train = train_data[target]
+
+    X_test = test_data[features]
+    y_test = test_data[target]
+
+    pipeline = Pipeline([
+        ('poly', PolynomialFeatures(degree=degree, include_bias=False, interaction_only=interaction_only)),
+        ('linear', LinearRegression())
+    ])
+
+    pipeline.fit(X_train, y_train)
+    y_pred = pipeline.predict(X_train)
+    train_mse = mean_squared_error(y_train, y_pred)
+    rsquared = pipeline.score(X_train, y_train)
+
+    y_pred = pipeline.predict(X_test)
+    test_mse = mean_squared_error(y_test, y_pred)
+
+    print(f'train MSE: {train_mse}, test mse: {test_mse}, R^2: {rsquared}')
+
+    ensure_directory(out_dir)
+    output_file = out_dir / f'higher_order_terms_{degree}_{interaction_only}.txt'
+    with open(output_file, 'w') as f:
+        f.write(f'''train MSE: {train_mse}
+test MSE: {test_mse}
+rsquared: {rsquared}
+''')
+    
+    return train_mse, test_mse, rsquared
+        
+def best_higher_order_terms(train_data_file: Path, test_data_file: Path, features: List[str], target: str, degree_bottom_boundary: int, degree_top_boundary: int, out_dir: Path):
+    if degree_bottom_boundary > degree_top_boundary:
+        raise ValueError('degree bottom boundary must be smaller or equal than degree top boundary')
+    
+    configurations = pd.DataFrame({
+        DEGREE_COLUMN_NAME: pd.Series(dtype=int),
+        INTERACTION_ONLY_COLUMN_NAME: pd.Series(dtype=bool),
+        TRAIN_MSE_COLUMN_NAME: pd.Series(dtype=float),
+        TEST_MSE_COLUMN_NAME: pd.Series(dtype=float),
+        RSQUARED_COLUMN_NAME: pd.Series(dtype='float'),
+    })
+    for degree in range(degree_bottom_boundary, degree_top_boundary + 1):
+        interaction_only = False
+        train_mse, test_mse, rsquared = higher_order_terms(train_data_file, test_data_file, features, target, interaction_only, degree, out_dir)
+        configurations.loc[-1] = {
+            DEGREE_COLUMN_NAME: degree,
+            INTERACTION_ONLY_COLUMN_NAME: interaction_only,
+            TRAIN_MSE_COLUMN_NAME: train_mse,
+            TEST_MSE_COLUMN_NAME: test_mse,
+            RSQUARED_COLUMN_NAME: rsquared,
+        }        
+        configurations.index = configurations.index + 1
+
+        interaction_only = True
+        train_mse, test_mse, rsquared = higher_order_terms(train_data_file, test_data_file, features, target, interaction_only, degree, out_dir)
+        configurations.loc[-1] = {
+            DEGREE_COLUMN_NAME: degree,
+            INTERACTION_ONLY_COLUMN_NAME: interaction_only,
+            TRAIN_MSE_COLUMN_NAME: train_mse,
+            TEST_MSE_COLUMN_NAME: test_mse,
+            RSQUARED_COLUMN_NAME: rsquared,
+        }
+        configurations.index = configurations.index + 1
+
+    configurations = configurations.sort_values(RSQUARED_COLUMN_NAME, ascending=False)
+    
+    print(configurations)

 def main():
    argument_parser = ArgumentParser('vqcfim', description='Virtual Quality Control for Injection Molding')
@@ -220,6 +304,26 @@ def main():
    model_evaluation_subparser.add_argument(FEATURES_ARG, FEATURES_ARG_SHORT, action='store', type=str, required=True, nargs='+')
    model_evaluation_subparser.set_defaults(func=lambda model, test_data, target, out, features, func: model_evaluation(model, test_data, features, target, out))

+    higher_order_terms_subparser = subparsers.add_parser('higher-order-terms', aliases=['hot'], description='Creates and evaluates a model with higher order terms')
+    higher_order_terms_subparser.add_argument(TRAIN_DATA_ARG, TRAIN_DATA_ARG_SHORT, action='store', type=Path, required=True)
+    higher_order_terms_subparser.add_argument(TEST_DATA_ARG, action='store', type=Path, required=True)
+    higher_order_terms_subparser.add_argument(FEATURES_ARG, FEATURES_ARG_SHORT, action='store', type=str, required=True, nargs='+')
+    higher_order_terms_subparser.add_argument(TARGET_ARG, action='store', type=str, required=True)
+    higher_order_terms_subparser.add_argument(OUT_DIR_ARG, OUT_DIR_ARG_SHORT, action='store', type=Path, required=False, default=DEFAULT_OUT_DIR)
+    higher_order_terms_subparser.add_argument(DEGREE_ARG, action='store', type=int, required=True)
+    higher_order_terms_subparser.add_argument(INTERACTION_ONLY_ARG, action='store_true', required=False)
+    higher_order_terms_subparser.set_defaults(func=lambda train_data, test_data, features, target, degree, interaction_only, out, func: higher_order_terms(train_data, test_data, features, target, interaction_only, degree, out))
+
+    best_higher_order_terms_subparser = subparsers.add_parser('best-higher-order-terms', aliases=['bhot'], description='Creates and evaluates models with higher order terms')
+    best_higher_order_terms_subparser.add_argument(TRAIN_DATA_ARG, TRAIN_DATA_ARG_SHORT, action='store', type=Path, required=True)
+    best_higher_order_terms_subparser.add_argument(TEST_DATA_ARG, action='store', type=Path, required=True)
+    best_higher_order_terms_subparser.add_argument(FEATURES_ARG, FEATURES_ARG_SHORT, action='store', type=str, required=True, nargs='+')
+    best_higher_order_terms_subparser.add_argument(TARGET_ARG, action='store', type=str, required=True)
+    best_higher_order_terms_subparser.add_argument(OUT_DIR_ARG, OUT_DIR_ARG_SHORT, action='store', type=Path, required=False, default=DEFAULT_OUT_DIR)
+    best_higher_order_terms_subparser.add_argument(DEGREE_BOTTOM_BOUNDARY_ARG, action='store', type=int, required=True)
+    best_higher_order_terms_subparser.add_argument(DEGREE_TOP_ARG, action='store', type=int, required=True)
+    best_higher_order_terms_subparser.set_defaults(func=lambda train_data, test_data, features, degree_bottom_boundary, degree_top_boundary, target, out, func: best_higher_order_terms(train_data, test_data, features, target, degree_bottom_boundary, degree_top_boundary, out))
+
    parsed_args = argument_parser.parse_args()
    args = vars(parsed_args)
    parsed_args.func(**args)
No results found