Skip to content
Snippets Groups Projects
Commit 66876851 authored by Andri Joos's avatar Andri Joos :blush:
Browse files

drop string columns

parent 55ba8aee
No related branches found
No related tags found
No related merge requests found
...@@ -55,7 +55,7 @@ def _cast_columns(df: pd.DataFrame, column_type_mapping: Dict[str | int, str]) - ...@@ -55,7 +55,7 @@ def _cast_columns(df: pd.DataFrame, column_type_mapping: Dict[str | int, str]) -
return df return df
def _split_array_column(df: pd.DataFrame) -> pd.DataFrame: def _split_array_column(df: pd.DataFrame) -> pd.DataFrame:
array_columns = [col for col in df.columns if isinstance(df[col].values[0], np.ndarray)] # Data is consistent in each row array_columns = [col for col in df.columns if preprocessing_utils.is_column_of_type(df[col], np.ndarray)]
for column in array_columns: for column in array_columns:
array_dtype = df[column].iloc[0].dtype # First row must have a value array_dtype = df[column].iloc[0].dtype # First row must have a value
stacked_arrays = np.stack(df[column].values, dtype=array_dtype) # is faster than df[column].apply(lambda vec: pd.Series(vec, dtype=array_dtype)) stacked_arrays = np.stack(df[column].values, dtype=array_dtype) # is faster than df[column].apply(lambda vec: pd.Series(vec, dtype=array_dtype))
...@@ -66,6 +66,10 @@ def _split_array_column(df: pd.DataFrame) -> pd.DataFrame: ...@@ -66,6 +66,10 @@ def _split_array_column(df: pd.DataFrame) -> pd.DataFrame:
return df return df
def _remove_string_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` without the columns that are detected as string columns.

    A column counts as a string column when
    ``preprocessing_utils.is_column_of_type(df[col], str)`` is truthy for it.
    The input frame is not modified; the result comes from ``DataFrame.drop``.
    """
    to_drop = []
    for name in df.columns:
        if preprocessing_utils.is_column_of_type(df[name], str):
            to_drop.append(name)
    return df.drop(columns=to_drop)
def _drop_non_shared_columns(df: pd.DataFrame, shared_columns: Set[str]) -> pd.DataFrame: def _drop_non_shared_columns(df: pd.DataFrame, shared_columns: Set[str]) -> pd.DataFrame:
columns_to_drop = [column for column in df.columns if str(column) not in shared_columns] columns_to_drop = [column for column in df.columns if str(column) not in shared_columns]
df = df.drop(columns=columns_to_drop) df = df.drop(columns=columns_to_drop)
...@@ -102,6 +106,9 @@ def _transform_parquet_file( ...@@ -102,6 +106,9 @@ def _transform_parquet_file(
# Split arrays # Split arrays
df = _split_array_column(df) df = _split_array_column(df)
# Drop string columns
df = _remove_string_columns(df)
print(f'Saving {filename}') print(f'Saving {filename}')
df.to_parquet(out_dir / filename) df.to_parquet(out_dir / filename)
# df.to_csv(out_dir / f'{file.stem}.csv') # df.to_csv(out_dir / f'{file.stem}.csv')
......
...@@ -2,6 +2,8 @@ from typing import List ...@@ -2,6 +2,8 @@ from typing import List
import shutil import shutil
import os import os
from pathlib import Path from pathlib import Path
import pandas as pd
from .file_type import FileType from .file_type import FileType
def recreate_dir(dir: Path) -> None: def recreate_dir(dir: Path) -> None:
...@@ -12,3 +14,6 @@ def recreate_dir(dir: Path) -> None: ...@@ -12,3 +14,6 @@ def recreate_dir(dir: Path) -> None:
def files_from_dataset(dataset_dir: Path, dataType: FileType):
    """List the files directly inside ``dataset_dir`` matching ``dataType``.

    Entries are matched with the glob pattern ``*<dataType.file_extension>``;
    anything that is not a regular file (e.g. a directory) is left out.
    """
    pattern = f'*{dataType.file_extension}'
    matches = []
    for candidate in dataset_dir.glob(pattern):
        if candidate.is_file():
            matches.append(candidate)
    return matches
def is_column_of_type(column: pd.Series, type: type) -> bool:
    """Report whether the first value of ``column`` is an instance of ``type``.

    Only the first element is inspected; the remaining values are not checked.

    Args:
        column: The series to inspect.
        type: The class (or tuple of classes) to test the first value against.

    Returns:
        ``True`` if the first value is an instance of ``type``; ``False``
        otherwise, including when the column is empty.
    """
    # An empty column has no first value; indexing it would raise IndexError.
    if column.empty:
        return False
    return isinstance(column.values[0], type)
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment