From b9fbb7093dee1b5f6f35f1848ffab1e535cc707d Mon Sep 17 00:00:00 2001 From: Andri Joos <andri@joos.io> Date: Sat, 9 Nov 2024 01:15:50 +0100 Subject: [PATCH] allow nan values in column casting --- app/preprocessing/transform_dataset.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/app/preprocessing/transform_dataset.py b/app/preprocessing/transform_dataset.py index 899e74f..ee8e110 100644 --- a/app/preprocessing/transform_dataset.py +++ b/app/preprocessing/transform_dataset.py @@ -32,20 +32,22 @@ def _cast_columns(df: pd.DataFrame, column_type_mapping: Dict[str | int, str]) - double_match = re.match(DOUBLE_PATTERN, column_type) if column_type == 'Double': - df[column] = df[column].astype(np.float64) + df[column] = df[column].astype('Float64') elif column_type == 'Int32': - df[column] = df[column].astype(np.int32) + df[column] = df[column].astype('Int32') elif column_type == 'Boolean': - df[column] = df[column].astype(np.int32) + df[column] = df[column].astype('Int32') elif column_type == 'String': df[column] = df[column].astype(str) elif double_match: vector_rows = int(double_match.group(1)) # it is certain that this is an int because of the pattern - df[column] = df[column].apply(lambda vec: np.array(vec, dtype=np.float64)).apply(lambda arr: _ensure_shape(arr, (vector_rows,))) + df[column] = df[column] \ + .apply(lambda vec: np.array(vec, dtype=np.float64)) \ + .apply(lambda arr: _ensure_shape(arr, (vector_rows,)) if isinstance(arr, np.ndarray) else arr) # if it is not instance of np.ndarray, it is NaN (empty cell) else: raise ValueError(f'Unexpected type {column_type}') -- GitLab