Skip to content
Snippets Groups Projects
Commit 58cce594 authored by Andri Joos's avatar Andri Joos :blush:
Browse files

add blocksize_finding

parent 198c1ce0
No related branches found
No related tags found
No related merge requests found
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import matplotlib.pyplot as plt
import json
from typing import List, Dict, Tuple
def load_data(parquet_file: str, json_file: str) -> Tuple[pd.DataFrame, Dict]:
    """
    Load the flight data and corresponding maneuver labels

    Args:
        parquet_file: Path to the parquet file holding the flight data
        json_file: Path to the JSON file holding the maneuver labels

    Returns:
        Tuple of (DataFrame read from ``parquet_file``,
        object decoded from ``json_file``)
    """
    # Load flight data
    df = pd.read_parquet(parquet_file)
    # Load maneuver labels. JSON is UTF-8 by specification, so decode it
    # explicitly instead of relying on the platform's locale encoding.
    with open(json_file, 'r', encoding='utf-8') as f:
        labels = json.load(f)
    return df, labels
def create_blocks(df: pd.DataFrame, block_size: int, overlap: float = 0.5) -> List[pd.DataFrame]:
    """
    Divide time series data into overlapping blocks

    Args:
        df: Input DataFrame with time series data
        block_size: Number of time steps in each block (must be >= 1)
        overlap: Fraction of overlap between consecutive blocks
            (0 <= overlap < 1)

    Returns:
        List of row slices of ``df``, each exactly ``block_size`` rows long.
        A trailing partial block is dropped; the list is empty when ``df``
        has fewer than ``block_size`` rows.

    Raises:
        ValueError: If ``block_size`` is smaller than 1 or ``overlap`` is
            outside the half-open interval [0, 1).
    """
    if block_size < 1:
        raise ValueError(f"block_size must be >= 1, got {block_size}")
    if not 0 <= overlap < 1:
        raise ValueError(f"overlap must satisfy 0 <= overlap < 1, got {overlap}")

    # int() truncates, so small blocks (e.g. block_size=1, overlap=0.5) would
    # give a step of 0, which range() rejects. Always advance at least one row.
    step_size = max(1, int(block_size * (1 - overlap)))

    return [
        df.iloc[start:start + block_size]
        for start in range(0, len(df) - block_size + 1, step_size)
    ]
def extract_features(block: pd.DataFrame) -> Dict:
    """
    Extract statistical features from a block of time series data

    Args:
        block: One block of time series rows. A 'Maneuver' label column, if
            present, is excluded from the features.

    Returns:
        Dict mapping '<column>_<statistic>' to the statistic's value for every
        numeric column of ``block``. The statistics are mean, std, min, max,
        median, skew, kurtosis, range and roc.
    """
    features = {}
    # Drop the label first (it may itself be numerically encoded), then keep
    # numerical columns only so text/object columns cannot break the
    # statistics below.
    numeric_block = (
        block.drop(columns=['Maneuver'], errors='ignore')
        .select_dtypes(include='number')
    )
    for col in numeric_block.columns:
        series = numeric_block[col]
        col_min = series.min()
        col_max = series.max()
        # Basic statistical features
        features[f'{col}_mean'] = series.mean()
        features[f'{col}_std'] = series.std()
        features[f'{col}_min'] = col_min
        features[f'{col}_max'] = col_max
        features[f'{col}_median'] = series.median()
        # Additional statistical features
        features[f'{col}_skew'] = series.skew()
        features[f'{col}_kurtosis'] = series.kurtosis()
        # Range and rate of change features
        features[f'{col}_range'] = col_max - col_min
        # Mean of the first differences between consecutive rows
        features[f'{col}_roc'] = series.diff().mean()
    return features
def evaluate_block_size(parquet_files: List[str], block_sizes: List[int]) -> Dict:
    """
    Evaluate different block sizes and return performance metrics

    For every block size, each file is cut into blocks, statistical features
    are extracted per block, and a Quadratic Discriminant Analysis classifier
    is trained and scored on an 80/20 train/test split of the blocks.

    Args:
        parquet_files: List of parquet file paths. Each file must contain a
            'Maneuver' column, which supplies the label of every row.
        block_sizes: List of block sizes to evaluate

    Returns:
        Dict mapping each evaluated block size to
        ``{'accuracy': <accuracy score>, 'report': <classification report>}``.
        Block sizes for which no block could be formed (every file is shorter
        than the block) are skipped and do not appear in the result.
    """
    # Read every file once up front; the data does not depend on block size.
    frames = [pd.read_parquet(parquet_file) for parquet_file in parquet_files]

    results = {}
    for block_size in block_sizes:
        print(f"Evaluating block size: {block_size}")
        # Store features and labels for all files
        X = []
        y = []
        for df in frames:
            for block in create_blocks(df, block_size):
                X.append(extract_features(block))
                # Label the block with its most frequent maneuver
                y.append(block['Maneuver'].mode()[0])

        if not X:
            # train_test_split raises on an empty sample set; nothing to score.
            print(f"Skipping block size {block_size}: no blocks could be formed")
            continue

        # Convert to DataFrame
        X_df = pd.DataFrame(X)
        # Split data (fixed seed keeps block sizes comparable across runs)
        X_train, X_test, y_train, y_test = train_test_split(
            X_df, y, test_size=0.2, random_state=42
        )
        # Train the classifier
        qda = QuadraticDiscriminantAnalysis()
        qda.fit(X_train, y_train)
        # Evaluate
        y_pred = qda.predict(X_test)
        # Store results
        results[block_size] = {
            'accuracy': accuracy_score(y_test, y_pred),
            'report': classification_report(y_test, y_pred)
        }
    return results
def plot_results(results: Dict):
    """
    Draw accuracy against block size as a line chart and display it

    Args:
        results: Dict mapping block size to a metrics dict that contains an
            'accuracy' entry
    """
    # Collect x/y values in the dict's iteration order
    sizes = []
    scores = []
    for size, metrics in results.items():
        sizes.append(size)
        scores.append(metrics['accuracy'])

    plt.figure(figsize=(10, 6))
    plt.plot(sizes, scores, marker='o')
    plt.xlabel('Block Size (time steps)')
    plt.ylabel('Accuracy')
    plt.title('Performance vs Block Size')
    plt.grid(True)
    plt.show()
# Example usage. Guarded so that importing this module does not start an
# evaluation run or open a plot window.
if __name__ == "__main__":
    data_files = [
        ('path/to/file1.parquet', 'path/to/file1.json'),
        ('path/to/file2.parquet', 'path/to/file2.json')
    ]
    # evaluate_block_size passes each entry straight to pd.read_parquet, so
    # hand it the parquet paths only, not the (parquet, json) pairs.
    parquet_files = [parquet_file for parquet_file, _ in data_files]
    block_sizes = [10, 20, 50, 100, 200, 500]
    # Evaluate different block sizes
    results = evaluate_block_size(parquet_files, block_sizes)
    # Plot results
    plot_results(results)
    # Print detailed results
    for block_size, metrics in results.items():
        print(f"\nBlock Size: {block_size}")
        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print("Classification Report:")
        print(metrics['report'])
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment