Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • ost/ml/virtual-quality-control-for-injection-molding/project
1 result
Show changes
Commits on Source (7)
......@@ -14,7 +14,8 @@
}
},
"containerEnv": {
"SHELL": "/bin/bash"
"SHELL": "/bin/bash",
"MPLBACKEND": "Agg"
},
"remoteEnv": {
"PATH": "/home/vscode/.local/bin:${containerEnv:PATH}"
......
......@@ -19,3 +19,12 @@ correlation_analysis:
- out/
script:
- vqcfim correlation-analysis --train-data dataset/InjectionMolding_Train.csv --out out --correlation-threshold 0.9
# CI job: runs the `vqcfim best-single-feature-regression` action on the
# training CSV with target 'mass' and a p-value threshold of 0.05.
# Everything written to out/ is kept as a job artifact for one day.
best_single_feature_regression:
extends: .run_script
artifacts:
expire_in: 1d
paths:
- out/
script:
- vqcfim best-single-feature-regression --train-data dataset/InjectionMolding_Train.csv --out out --target 'mass' --p-value-threshold 0.05
......@@ -19,6 +19,24 @@
"-c",
"0.9"
]
},
{
"name": "Python Debugger: Best Single Feature Regression",
"type": "debugpy",
"request": "launch",
"program": "${workspaceFolder}/src/app.py",
"console": "integratedTerminal",
"args": [
"best-single-feature-regression",
"-t",
"dataset/InjectionMolding_Train.csv",
"--target",
"mass",
"-o",
"out",
"--p-value-threshold",
"0.05"
]
}
]
}
......@@ -5,7 +5,8 @@ description = "Virtual Quality Control for Injection Molding"
dependencies = [
"pandas >= 2.2.3, < 3.0.0",
"seaborn >= 0.13.2, < 1.0.0",
"matplotlib >= 3.9.2, < 4.0.0"
"matplotlib >= 3.9.2, < 4.0.0",
"statsmodels >= 0.14.4, < 1.0.0",
]
maintainers = [
{name = "Andri Joos"},
......
......@@ -5,6 +5,7 @@ import seaborn as sns
import matplotlib.pyplot as plt
import math
from typing import List, Tuple
import statsmodels.api as sm
# Long and short command-line option names for the training-data file path.
TRAIN_DATA_ARG = '--train-data'
TRAIN_DATA_ARG_SHORT = '-t'
......@@ -14,6 +15,12 @@ DEFAULT_OUT_DIR = 'out/'
# Option names and default value for the correlation-analysis threshold.
CORRELATION_THRESHOLD_ARG = '--correlation-threshold'
CORRELATION_THRESHOLD_ARG_SHORT = '-c'
DEFAULT_CORRELATION_THRESHOLD = 0.9

# Option naming the target column for the single-feature regression.
TARGET_ARG = '--target'

# Option name and default for the p-value cut-off: only features whose
# p-value is strictly below the threshold are considered as best feature.
P_VALUE_THRESHOLD_ARG = '--p-value-threshold'
DEFAULT_P_VALUE_THRESHOLD = 0.05

# Column labels of the per-feature regression results table.
PVALUE_COLUMN_NAME = 'p-value'
RSQUARED_COLUMN_NAME = 'R^2'
def ensure_directory(directory: Path):
    """Create ``directory`` together with any missing parent directories.

    A directory that already exists is left as it is and raises no error.
    """
    directory.mkdir(exist_ok=True, parents=True)
......@@ -25,7 +32,7 @@ def correlation_analysis(train_data_file: Path, out_dir: Path, correlation_thres
# Display correlation matrix
correlation_matrix = train_data.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix')
ensure_directory(out_dir)
......@@ -55,16 +62,87 @@ def correlation_analysis(train_data_file: Path, out_dir: Path, correlation_thres
with open(correlations_file, 'w') as f:
f.writelines(correlations)
def single_feature_regression(data: pd.DataFrame, feature: str, target: str):
    """Fit an ordinary least squares model of ``target`` on one ``feature``.

    Args:
        data: Table containing both the feature and the target column.
        feature: Name of the explanatory column.
        target: Name of the response column.

    Returns:
        A ``(p_value, r_squared)`` pair: the entry at position 1 of the
        fitted model's p-values and the model's R^2.
    """
    # add_constant prepends the intercept column, so the feature's
    # coefficient sits at position 1 of the fitted parameters.
    # NOTE(review): by default add_constant skips input that already is a
    # constant column; position 1 would then not exist - confirm the data.
    design_matrix = sm.add_constant(data[[feature]])
    response = data[target]
    fitted = sm.OLS(response, design_matrix).fit()
    feature_p_value = fitted.pvalues.iloc[1]
    return feature_p_value, fitted.rsquared
def _save_single_column_heatmap(values: pd.DataFrame, file_path: Path):
    """Draw ``values`` as an annotated heatmap on a fixed 0..1 scale and save it to ``file_path``."""
    figure = plt.figure(figsize=(1.75, 4.8))
    try:
        sns.heatmap(values, annot=True, cmap='coolwarm', vmin=0, vmax=1)
        figure.savefig(file_path, bbox_inches='tight')
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(figure)


def best_single_feature_regression(train_data_file: Path, target: str, p_value_threshold: float, out_dir: Path):
    """Find the single feature that best explains ``target`` in a linear regression.

    Every column of the training data except ``target`` is regressed against
    ``target`` on its own. The p-values and R^2 values of all features are
    printed and saved as heatmaps (``evaluated_pvalues.png`` and
    ``evaluated_rsquares.png``). Among the features whose p-value is strictly
    below ``p_value_threshold`` the one with the highest R^2 is printed and
    written to ``best_feature.txt``. If no feature qualifies, a message is
    printed and no ``best_feature.txt`` is written.

    Args:
        train_data_file: CSV file with the training data.
        target: Name of the column to predict.
        p_value_threshold: Exclusive upper bound for a feature's p-value.
        out_dir: Directory receiving the output files; created if missing.

    Raises:
        KeyError: If ``target`` is not a column of the training data.
    """
    train_data = pd.read_csv(train_data_file)
    features = train_data.columns.drop(target)

    # Collect all results first and build the table once instead of growing
    # the DataFrame row by row.
    regression_results = [
        single_feature_regression(train_data, feature, target)
        for feature in features
    ]
    evaluated_features = pd.DataFrame(
        regression_results,
        index=features,
        columns=[PVALUE_COLUMN_NAME, RSQUARED_COLUMN_NAME],
        dtype='float',
    )

    print('Evaluated features')
    print(evaluated_features)

    ensure_directory(out_dir)
    _save_single_column_heatmap(
        evaluated_features[[PVALUE_COLUMN_NAME]],
        out_dir / "evaluated_pvalues.png",
    )
    _save_single_column_heatmap(
        evaluated_features[[RSQUARED_COLUMN_NAME]],
        out_dir / "evaluated_rsquares.png",
    )

    # Keep significant features only; dropna() additionally discards rows
    # with an undefined statistic, as before.
    is_significant = evaluated_features[PVALUE_COLUMN_NAME] < p_value_threshold
    possible_features = evaluated_features[is_significant].dropna()
    possible_features = possible_features.sort_values(RSQUARED_COLUMN_NAME, ascending=False)

    print()
    print('Features matching p-value-threshold')
    print(possible_features)

    if possible_features.empty:
        print('No feature meets all criteria')
        return

    # Rows are sorted by descending R^2, so the first one is the best.
    best_feature = possible_features.iloc[0]

    print()
    print('Best Feature')
    print(best_feature)

    best_feature_file = out_dir / 'best_feature.txt'
    with open(best_feature_file, 'w', encoding='utf-8') as f:
        f.write(
            f'Name: {best_feature.name}\n'
            f'p-value: {best_feature[PVALUE_COLUMN_NAME]}\n'
            f'R^2: {best_feature[RSQUARED_COLUMN_NAME]}\n'
        )
def main():
    """Parse the command line and run the selected ``vqcfim`` action.

    Two actions are registered: ``correlation-analysis`` (alias ``ca``) and
    ``best-single-feature-regression`` (alias ``bsfr``). Each sub-parser
    stores a handler under ``func`` that maps the parsed options onto the
    parameters of the function implementing the action. When no action is
    given, the parser reports a usage error instead of failing with an
    ``AttributeError``.
    """
    argument_parser = ArgumentParser('vqcfim', description='Virtual Quality Control for Injection Molding')
    subparsers = argument_parser.add_subparsers(title='action')

    # Registered exactly once: a second add_parser() call for the same
    # action name is redundant and rejected by recent argparse versions.
    correlation_analysis_subparser = subparsers.add_parser(
        'correlation-analysis',
        aliases=['ca'],
        description=f'Gets variables with correlation coefficients >= {CORRELATION_THRESHOLD_ARG}',
    )
    correlation_analysis_subparser.add_argument(TRAIN_DATA_ARG, TRAIN_DATA_ARG_SHORT, action='store', type=Path, required=True)
    correlation_analysis_subparser.add_argument(OUT_DIR_ARG, OUT_DIR_ARG_SHORT, action='store', type=Path, required=False, default=DEFAULT_OUT_DIR)
    correlation_analysis_subparser.add_argument(CORRELATION_THRESHOLD_ARG, CORRELATION_THRESHOLD_ARG_SHORT, action='store', type=float, required=False, default=DEFAULT_CORRELATION_THRESHOLD)
    correlation_analysis_subparser.set_defaults(
        func=lambda train_data, out, correlation_threshold: correlation_analysis(train_data, out, correlation_threshold)
    )

    best_single_feature_regression_subparser = subparsers.add_parser(
        'best-single-feature-regression',
        aliases=['bsfr'],
        description='Evaluates the best single feature regression feature from the dataset',
    )
    best_single_feature_regression_subparser.add_argument(TRAIN_DATA_ARG, TRAIN_DATA_ARG_SHORT, action='store', type=Path, required=True)
    best_single_feature_regression_subparser.add_argument(TARGET_ARG, action='store', type=str, required=True)
    best_single_feature_regression_subparser.add_argument(OUT_DIR_ARG, OUT_DIR_ARG_SHORT, action='store', type=Path, required=False, default=DEFAULT_OUT_DIR)
    best_single_feature_regression_subparser.add_argument(P_VALUE_THRESHOLD_ARG, action='store', type=float, required=False, default=DEFAULT_P_VALUE_THRESHOLD)
    best_single_feature_regression_subparser.set_defaults(
        func=lambda train_data, target, out, p_value_threshold: best_single_feature_regression(train_data, target, p_value_threshold, out)
    )

    parsed_args = argument_parser.parse_args()
    # Work on a copy so the handler can be separated from its own keyword
    # arguments without mutating the parsed namespace.
    args = dict(vars(parsed_args))
    handler = args.pop('func', None)
    if handler is None:
        # No action selected: print usage and exit with status 2.
        argument_parser.error('no action given')
    handler(**args)
......