diff --git a/app/preprocessing/blocksize_finding.py b/app/preprocessing/blocksize_finding.py
new file mode 100644
index 0000000000000000000000000000000000000000..1943d1899bc852f1f55081628b8d5b4421db22c0
--- /dev/null
+++ b/app/preprocessing/blocksize_finding.py
@@ -0,0 +1,163 @@
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import accuracy_score, classification_report
+from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
+import matplotlib.pyplot as plt
+import json
+from typing import List, Dict, Tuple
+
+def load_data(parquet_file: str, json_file: str) -> Tuple[pd.DataFrame, Dict]:
+    """
+    Load the flight data and the corresponding maneuver labels.
+    """
+    # Load flight data
+    df = pd.read_parquet(parquet_file)
+
+    # Load maneuver labels
+    with open(json_file, 'r') as f:
+        labels = json.load(f)
+
+    return df, labels
+
+def create_blocks(df: pd.DataFrame, block_size: int, overlap: float = 0.5) -> List[pd.DataFrame]:
+    """
+    Divide time series data into overlapping blocks.
+
+    Args:
+        df: Input DataFrame with time series data
+        block_size: Number of time steps in each block
+        overlap: Fraction of overlap between consecutive blocks (0 to 1)
+    """
+    blocks = []
+    # Guard against a zero step (range() raises on step=0) when overlap -> 1
+    step_size = max(1, int(block_size * (1 - overlap)))
+
+    for i in range(0, len(df) - block_size + 1, step_size):
+        block = df.iloc[i:i + block_size]
+        blocks.append(block)
+
+    return blocks
+
+def extract_features(block: pd.DataFrame) -> Dict:
+    """
+    Extract statistical features from a block of time series data.
+    """
+    features = {}
+
+    # Select numeric columns only; 'Maneuver' holds the label, not a feature
+    numeric_cols = block.drop(columns=['Maneuver']).select_dtypes(include=[np.number]).columns
+
+    for col in numeric_cols:
+        # Basic statistical features
+        features[f'{col}_mean'] = block[col].mean()
+        features[f'{col}_std'] = block[col].std()
+        features[f'{col}_min'] = block[col].min()
+        features[f'{col}_max'] = block[col].max()
+        features[f'{col}_median'] = block[col].median()
+
+        # Higher-order statistical features
+        features[f'{col}_skew'] = block[col].skew()
+        features[f'{col}_kurtosis'] = block[col].kurtosis()
+
+        # Range and rate-of-change features
+        features[f'{col}_range'] = features[f'{col}_max'] - features[f'{col}_min']
+        features[f'{col}_roc'] = block[col].diff().mean()  # Mean rate of change
+
+    return features
+
+def evaluate_block_size(data_files: List[Tuple[str, str]], block_sizes: List[int]) -> Dict:
+    """
+    Evaluate different block sizes and return performance metrics.
+
+    Args:
+        data_files: List of tuples containing (parquet_file, json_file) paths
+        block_sizes: List of block sizes to evaluate
+    """
+    results = {}
+
+    for block_size in block_sizes:
+        print(f"Evaluating block size: {block_size}")
+
+        # Collect features and labels across all files
+        X = []
+        y = []
+
+        # Process each file
+        for parquet_file, json_file in data_files:
+            # The 'Maneuver' column of the parquet file already carries the
+            # per-timestep labels, so the JSON file is not needed here;
+            # load_data() is available when the raw JSON labels are required.
+            df = pd.read_parquet(parquet_file)
+
+            # Create blocks
+            blocks = create_blocks(df, block_size)
+
+            # Extract features from each block and label the block with its
+            # dominant (most frequent) maneuver
+            for block in blocks:
+                X.append(extract_features(block))
+                y.append(block['Maneuver'].mode().iloc[0])
+
+        # Convert to DataFrame
+        X_df = pd.DataFrame(X)
+
+        # Skew/kurtosis are NaN for zero-variance columns and QDA cannot
+        # handle NaNs, so fill them in
+        X_df = X_df.fillna(0.0)
+
+        # Split data
+        X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.2, random_state=42)
+
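+        # NOTE (an added caution, not in the original code): QDA estimates one
+        # covariance matrix per class, so when block_size is large the feature
+        # count can rival the per-class sample count and the fit can become
+        # ill-conditioned. sklearn's QuadraticDiscriminantAnalysis exposes a
+        # reg_param shrinkage knob for exactly this case; the 0.1 below is an
+        # illustrative placeholder, not a tuned value:
+        # qda = QuadraticDiscriminantAnalysis(reg_param=0.1)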
+        # Train the classifier. A random forest is a drop-in alternative
+        # (requires `from sklearn.ensemble import RandomForestClassifier`):
+        # clf = RandomForestClassifier(n_estimators=100, random_state=42)
+        # clf.fit(X_train, y_train)
+        qda = QuadraticDiscriminantAnalysis()
+        qda.fit(X_train, y_train)
+
+        # Evaluate
+        y_pred = qda.predict(X_test)
+        accuracy = accuracy_score(y_test, y_pred)
+
+        # Store results
+        results[block_size] = {
+            'accuracy': accuracy,
+            'report': classification_report(y_test, y_pred)
+        }
+
+    return results
+
+def plot_results(results: Dict):
+    """
+    Plot the performance metrics for different block sizes.
+    """
+    block_sizes = list(results.keys())
+    accuracies = [results[size]['accuracy'] for size in block_sizes]
+
+    plt.figure(figsize=(10, 6))
+    plt.plot(block_sizes, accuracies, marker='o')
+    plt.xlabel('Block Size (time steps)')
+    plt.ylabel('Accuracy')
+    plt.title('Performance vs Block Size')
+    plt.grid(True)
+    plt.show()
+
+if __name__ == '__main__':
+    # Example usage
+    data_files = [
+        ('path/to/file1.parquet', 'path/to/file1.json'),
+        ('path/to/file2.parquet', 'path/to/file2.json')
+    ]
+
+    block_sizes = [10, 20, 50, 100, 200, 500]
+
+    # Evaluate different block sizes
+    results = evaluate_block_size(data_files, block_sizes)
+
+    # Plot results
+    plot_results(results)
+
+    # Print detailed results
+    for block_size, metrics in results.items():
+        print(f"\nBlock Size: {block_size}")
+        print(f"Accuracy: {metrics['accuracy']:.4f}")
+        print("Classification Report:")
+        print(metrics['report'])
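+
+    # Hypothetical follow-up (an assumption, not part of the original script):
+    # pick the block size with the highest held-out accuracy. Accuracy alone
+    # ignores class imbalance between maneuvers, so the per-class reports
+    # printed above are worth checking before committing to a size.
+    best_size = max(results, key=lambda size: results[size]['accuracy'])
+    print(f"\nBest block size by accuracy: {best_size} "
+          f"({results[best_size]['accuracy']:.4f})")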