From b9fbb7093dee1b5f6f35f1848ffab1e535cc707d Mon Sep 17 00:00:00 2001
From: Andri Joos <andri@joos.io>
Date: Sat, 9 Nov 2024 01:15:50 +0100
Subject: [PATCH] allow nan values in column casting

---
 app/preprocessing/transform_dataset.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/app/preprocessing/transform_dataset.py b/app/preprocessing/transform_dataset.py
index 899e74f..ee8e110 100644
--- a/app/preprocessing/transform_dataset.py
+++ b/app/preprocessing/transform_dataset.py
@@ -32,20 +32,22 @@ def _cast_columns(df: pd.DataFrame, column_type_mapping: Dict[str | int, str]) -
         double_match = re.match(DOUBLE_PATTERN, column_type)
 
         if column_type == 'Double':
-            df[column] = df[column].astype(np.float64)
+            df[column] = df[column].astype('Float64')
 
         elif column_type == 'Int32':
-            df[column] = df[column].astype(np.int32)
+            df[column] = df[column].astype('Int32')
 
         elif column_type == 'Boolean':
-            df[column] = df[column].astype(np.int32)
+            df[column] = df[column].astype('Int32')
         
         elif column_type == 'String':
             df[column] = df[column].astype(str)
 
         elif double_match:
             vector_rows = int(double_match.group(1)) # it is certain that this is an int because of the pattern
-            df[column] = df[column].apply(lambda vec: np.array(vec, dtype=np.float64)).apply(lambda arr: _ensure_shape(arr, (vector_rows,)))
+            df[column] = df[column] \
+                .apply(lambda vec: np.array(vec, dtype=np.float64)) \
+                .apply(lambda arr: _ensure_shape(arr, (vector_rows,)) if isinstance(arr, np.ndarray) else arr) # if it is not an instance of np.ndarray, it is NaN (empty cell)
         
         else:
             raise ValueError(f'Unexpected type {column_type}')
-- 
GitLab