diff --git a/app/preprocessing/transform_dataset.py b/app/preprocessing/transform_dataset.py index e06441b4f3861ab7be801b95a22059659d589808..c087805da051863680bb5cd1800fa5ccb60ba772 100644 --- a/app/preprocessing/transform_dataset.py +++ b/app/preprocessing/transform_dataset.py @@ -16,7 +16,7 @@ from .file_type import FileType from .json_maneuver_data import JsonManeuverData DOUBLE_PATTERN = r'Double(\d+)' -MAX_DATASET_MEMORY_SIZE = 7408802660 +MAX_DATASET_MEMORY_SIZE = 16602933278 MIN_JOBS = 2 VARIANCE_THRESHOLD = 0.01 CORRELATION_THRESHOLD = 0.9 @@ -223,6 +223,7 @@ def transform_dataset(dataset_dir: Path, out_dir: Path, state_description_file: print('Your system may run out of memory. In this case, don\'t use parallelization.') n_jobs = max(MIN_JOBS, min(n_jobs_based_on_cpu, n_jobs_based_on_memory)) + print(f'Using {n_jobs} jobs') Parallel(n_jobs=n_jobs)(delayed(_transform_parquet_file_function_with_args)(parquet_file, json_file) for parquet_file, json_file in file_tuples) else: for parquet_file, json_file in file_tuples: