From 6bfb896279dcd17510b66b48d3ad3261ff273a15 Mon Sep 17 00:00:00 2001 From: Andri Joos <andri@joos.io> Date: Mon, 11 Nov 2024 13:14:33 +0100 Subject: [PATCH] fix memory usage --- app/preprocessing/transform_dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/app/preprocessing/transform_dataset.py b/app/preprocessing/transform_dataset.py index e06441b..c087805 100644 --- a/app/preprocessing/transform_dataset.py +++ b/app/preprocessing/transform_dataset.py @@ -16,7 +16,7 @@ from .file_type import FileType from .json_maneuver_data import JsonManeuverData DOUBLE_PATTERN = r'Double(\d+)' -MAX_DATASET_MEMORY_SIZE = 7408802660 +MAX_DATASET_MEMORY_SIZE = 16602933278 MIN_JOBS = 2 VARIANCE_THRESHOLD = 0.01 CORRELATION_THRESHOLD = 0.9 @@ -223,6 +223,7 @@ def transform_dataset(dataset_dir: Path, out_dir: Path, state_description_file: print('Your system may run out of memory. In this case, don\'t use parallelization.') n_jobs = max(MIN_JOBS, min(n_jobs_based_on_cpu, n_jobs_based_on_memory)) + print(f'Using {n_jobs} jobs') Parallel(n_jobs=n_jobs)(delayed(_transform_parquet_file_function_with_args)(parquet_file, json_file) for parquet_file, json_file in file_tuples) else: for parquet_file, json_file in file_tuples: -- GitLab