Skip to content
Snippets Groups Projects
Commit 28f554b1 authored by Andri Joos's avatar Andri Joos :blush:
Browse files

add single feature regression

parent dc6265c2
No related merge requests found
Pipeline #6715 passed with stage
in 2 minutes and 23 seconds
......@@ -19,3 +19,12 @@ correlation_analysis:
- out/
script:
- vqcfim correlation-analysis --train-data dataset/InjectionMolding_Train.csv --out out --correlation-threshold 0.9
best_single_feature_regression:
extends: .run_script
artifacts:
expire_in: 1d
paths:
- out/
script:
- vqcfim best-single-feature-regression --train-data dataset/InjectionMolding_Train.csv --out out --target 'mass' --p-value-threshold 0.05
......@@ -19,6 +19,24 @@
"-c",
"0.9"
]
},
{
"name": "Python Debugger: Best Single Feature Regression",
"type": "debugpy",
"request": "launch",
"program": "${workspaceFolder}/src/app.py",
"console": "integratedTerminal",
"args": [
"best-single-feature-regression",
"-t",
"dataset/InjectionMolding_Train.csv",
"--target",
"mass",
"-o",
"out",
"--p-value-threshold",
"0.05"
]
}
]
}
......@@ -6,6 +6,7 @@ dependencies = [
"pandas >= 2.2.3, < 3.0.0",
"seaborn >= 0.13.2, < 1.0.0",
"matplotlib >= 3.9.2, < 4.0.0",
"statsmodels >= 0.14.4, < 1.0.0",
]
maintainers = [
{name = "Andri Joos"},
......
......@@ -5,6 +5,7 @@ import seaborn as sns
import matplotlib.pyplot as plt
import math
from typing import List, Tuple
import statsmodels.api as sm
# Long and short command-line flags for the training-data CSV file.
TRAIN_DATA_ARG = '--train-data'
TRAIN_DATA_ARG_SHORT = '-t'
......@@ -14,6 +15,12 @@ DEFAULT_OUT_DIR = 'out/'
# Long and short command-line flags for the correlation threshold, plus the
# value used when the flag is omitted.
CORRELATION_THRESHOLD_ARG = '--correlation-threshold'
CORRELATION_THRESHOLD_ARG_SHORT = '-c'
DEFAULT_CORRELATION_THRESHOLD = 0.9
# Command-line flag naming the target column of the regression.
TARGET_ARG = '--target'
# Command-line flag for the p-value threshold, plus the value used when the
# flag is omitted.
P_VALUE_THRESHOLD_ARG = '--p-value-threshold'
DEFAULT_P_VALUE_THRESHOLD = 0.05
# Column labels of the per-feature regression results table.
PVALUE_COLUMN_NAME = 'p-value'
RSQUARED_COLUMN_NAME = 'R^2'
def ensure_directory(directory: Path):
    """Create ``directory`` together with any missing parent directories.

    No error is raised when the directory already exists.
    """
    directory.mkdir(parents=True, exist_ok=True)
......@@ -55,6 +62,64 @@ def correlation_analysis(train_data_file: Path, out_dir: Path, correlation_thres
with open(correlations_file, 'w') as f:
f.writelines(correlations)
def single_feature_regression(data: pd.DataFrame, feature: str, target: str) -> Tuple[float, float]:
    """Fit an ordinary least squares model of ``target`` on a single ``feature``.

    Args:
        data: Frame containing both the ``feature`` and the ``target`` column.
        feature: Name of the column used as the only regressor.
        target: Name of the column used as the response.

    Returns:
        ``(p_value, r_squared)``: the p-value of the feature's coefficient and
        the R^2 of the fitted model.
    """
    # add_constant prepends the intercept column, but with its default
    # has_constant='skip' it adds nothing when the feature column is itself
    # constant. The feature is therefore always the LAST column, whereas
    # position 1 only exists when an intercept was actually added.
    X = sm.add_constant(data[[feature]])
    y = data[target]
    model = sm.OLS(y, X).fit()
    return model.pvalues.iloc[-1], model.rsquared
def _save_column_heatmap(values: pd.DataFrame, file_path: Path) -> None:
    """Draw ``values`` as an annotated heatmap on a fixed 0..1 colour scale and save it."""
    figure = plt.figure(figsize=(1.75, 4.8))
    try:
        sns.heatmap(values, annot=True, cmap='coolwarm', vmin=0, vmax=1)
        plt.savefig(file_path, bbox_inches='tight')
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(figure)


def best_single_feature_regression(train_data_file: Path, target: str, p_value_threshold: float, out_dir: Path):
    """Regress ``target`` on every other column separately and report the best one.

    Each feature is evaluated with ``single_feature_regression``. The best
    feature is the one with the highest R^2 among those whose p-value is
    strictly below ``p_value_threshold``.

    Files written to ``out_dir``:
        * ``evaluated_pvalues.png``: heatmap of the per-feature p-values.
        * ``evaluated_rsquares.png``: heatmap of the per-feature R^2 values.
        * ``best_feature.txt``: name, p-value and R^2 of the best feature, or
          a note that no feature met the threshold.

    Args:
        train_data_file: CSV file holding the training data.
        target: Name of the column to predict.
        p_value_threshold: Upper (exclusive) bound on a feature's p-value for
            the feature to be considered.
        out_dir: Directory receiving the output files; created if missing.

    Raises:
        KeyError: If ``target`` is not a column of the training data.
        ValueError: If the training data has no column besides ``target``.
    """
    train_data = pd.read_csv(train_data_file)
    features = train_data.columns.drop(target)
    if features.empty:
        raise ValueError(f'{train_data_file} has no feature columns besides target {target!r}')

    evaluated_features = pd.DataFrame(
        [single_feature_regression(train_data, feature, target) for feature in features],
        index=features,
        columns=[PVALUE_COLUMN_NAME, RSQUARED_COLUMN_NAME],
        dtype='float',
    )

    print('Evaluated features')
    print(evaluated_features)

    ensure_directory(out_dir)
    _save_column_heatmap(evaluated_features[[PVALUE_COLUMN_NAME]], out_dir / 'evaluated_pvalues.png')
    _save_column_heatmap(evaluated_features[[RSQUARED_COLUMN_NAME]], out_dir / 'evaluated_rsquares.png')

    # NaN p-values compare as False and are dropped here; NaN R^2 values are
    # dropped explicitly so that idxmax below is well defined.
    is_significant = evaluated_features[PVALUE_COLUMN_NAME] < p_value_threshold
    candidates = evaluated_features[is_significant].dropna(subset=[RSQUARED_COLUMN_NAME])

    best_feature_file = out_dir / 'best_feature.txt'

    print()
    print('Best Feature')

    if candidates.empty:
        message = f'No feature has a p-value below {p_value_threshold}'
        print(message)
        with open(best_feature_file, 'w') as f:
            f.write(message + '\n')
        return

    best_feature: pd.Series = candidates.loc[candidates[RSQUARED_COLUMN_NAME].idxmax()]
    print(best_feature)

    with open(best_feature_file, 'w') as f:
        f.write(
            f'Name: {best_feature.name}\n'
            f'p-value: {best_feature[PVALUE_COLUMN_NAME]}\n'
            f'R^2: {best_feature[RSQUARED_COLUMN_NAME]}\n'
        )
def main():
argument_parser = ArgumentParser('vqcfim', description='Virtual Quality Control for Injection Molding')
subparsers = argument_parser.add_subparsers(title='action')
......@@ -65,6 +130,13 @@ def main():
correlation_analysis_subparser.add_argument(CORRELATION_THRESHOLD_ARG, CORRELATION_THRESHOLD_ARG_SHORT, action='store', type=float, required=False, default=DEFAULT_CORRELATION_THRESHOLD)
correlation_analysis_subparser.set_defaults(func=lambda train_data, out, correlation_threshold, func: correlation_analysis(train_data, out, correlation_threshold))
best_single_feature_regression_subparser = subparsers.add_parser('best-single-feature-regression', aliases=['bsfr'], description='Evaluates the best single feature regression feature from the dataset')
best_single_feature_regression_subparser.add_argument(TRAIN_DATA_ARG, TRAIN_DATA_ARG_SHORT, action='store', type=Path, required=True)
best_single_feature_regression_subparser.add_argument(TARGET_ARG, action='store', type=str, required=True)
best_single_feature_regression_subparser.add_argument(OUT_DIR_ARG, OUT_DIR_ARG_SHORT, action='store', type=Path, required=False, default=DEFAULT_OUT_DIR)
best_single_feature_regression_subparser.add_argument(P_VALUE_THRESHOLD_ARG, action='store', type=float, required=False, default=DEFAULT_P_VALUE_THRESHOLD)
best_single_feature_regression_subparser.set_defaults(func=lambda train_data, target, out, p_value_threshold, func: best_single_feature_regression(train_data, target, p_value_threshold, out))
parsed_args = argument_parser.parse_args()
args = vars(parsed_args)
parsed_args.func(**args)
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment