diff --git a/app/preprocessing/transform_dataset.py b/app/preprocessing/transform_dataset.py index 899e74f21f21c2eb853dcd194a884c5303abc193..ee8e110ee3f0aa1915f9a42e63f2920b8b9348af 100644 --- a/app/preprocessing/transform_dataset.py +++ b/app/preprocessing/transform_dataset.py @@ -32,20 +32,22 @@ def _cast_columns(df: pd.DataFrame, column_type_mapping: Dict[str | int, str]) - double_match = re.match(DOUBLE_PATTERN, column_type) if column_type == 'Double': - df[column] = df[column].astype(np.float64) + df[column] = df[column].astype('Float64') elif column_type == 'Int32': - df[column] = df[column].astype(np.int32) + df[column] = df[column].astype('Int32') elif column_type == 'Boolean': - df[column] = df[column].astype(np.int32) + df[column] = df[column].astype('Int32') elif column_type == 'String': df[column] = df[column].astype(str) elif double_match: vector_rows = int(double_match.group(1)) # it is certain that this is an int because of the pattern - df[column] = df[column].apply(lambda vec: np.array(vec, dtype=np.float64)).apply(lambda arr: _ensure_shape(arr, (vector_rows,))) + df[column] = df[column] \ + .apply(lambda vec: np.array(vec, dtype=np.float64)) \ + .apply(lambda arr: _ensure_shape(arr, (vector_rows,)) if isinstance(arr, np.ndarray) else arr) # if it is not instance of np.ndarray, it is NaN (empty cell) else: raise ValueError(f'Unexpected type {column_type}')