Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
M
Maneuver Detection
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Wiki
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
OST
ML
Maneuver Detection
Commits
66876851
Commit
66876851
authored
5 months ago
by
Andri Joos
Browse files
Options
Downloads
Patches
Plain Diff
drop string columns
parent
55ba8aee
No related branches found
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
app/preprocessing/transform_dataset.py
+8
-1
8 additions, 1 deletion
app/preprocessing/transform_dataset.py
app/preprocessing/utils.py
+5
-0
5 additions, 0 deletions
app/preprocessing/utils.py
with
13 additions
and
1 deletion
app/preprocessing/transform_dataset.py
+
8
−
1
View file @
66876851
...
@@ -55,7 +55,7 @@ def _cast_columns(df: pd.DataFrame, column_type_mapping: Dict[str | int, str]) -
...
@@ -55,7 +55,7 @@ def _cast_columns(df: pd.DataFrame, column_type_mapping: Dict[str | int, str]) -
return
df
return
df
def
_split_array_column
(
df
:
pd
.
DataFrame
)
->
pd
.
DataFrame
:
def
_split_array_column
(
df
:
pd
.
DataFrame
)
->
pd
.
DataFrame
:
array_columns
=
[
col
for
col
in
df
.
columns
if
isinstance
(
df
[
col
].
values
[
0
],
np
.
ndarray
)]
# Data is consistent in each row
array_columns
=
[
col
for
col
in
df
.
columns
if
preprocessing_utils
.
is_column_of_type
(
df
[
col
],
np
.
ndarray
)]
for
column
in
array_columns
:
for
column
in
array_columns
:
array_dtype
=
df
[
column
].
iloc
[
0
].
dtype
# First row must have a value
array_dtype
=
df
[
column
].
iloc
[
0
].
dtype
# First row must have a value
stacked_arrays
=
np
.
stack
(
df
[
column
].
values
,
dtype
=
array_dtype
)
# is faster than df[column].apply(lambda vec: pd.Series(vec, dtype=array_dtype))
stacked_arrays
=
np
.
stack
(
df
[
column
].
values
,
dtype
=
array_dtype
)
# is faster than df[column].apply(lambda vec: pd.Series(vec, dtype=array_dtype))
...
@@ -66,6 +66,10 @@ def _split_array_column(df: pd.DataFrame) -> pd.DataFrame:
...
@@ -66,6 +66,10 @@ def _split_array_column(df: pd.DataFrame) -> pd.DataFrame:
return
df
return
df
def _remove_string_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` without its string columns.

    A column counts as a string column when
    ``preprocessing_utils.is_column_of_type(df[col], str)`` is true for it.
    The input frame is not modified; ``DataFrame.drop`` returns a new frame.

    Args:
        df: Frame to filter.

    Returns:
        A frame containing every column of ``df`` that was not classified
        as a string column.
    """
    is_string = preprocessing_utils.is_column_of_type
    columns_to_drop = [
        name
        for name in df.columns
        if is_string(df[name], str)
    ]
    return df.drop(columns=columns_to_drop)
def
_drop_non_shared_columns
(
df
:
pd
.
DataFrame
,
shared_columns
:
Set
[
str
])
->
pd
.
DataFrame
:
def
_drop_non_shared_columns
(
df
:
pd
.
DataFrame
,
shared_columns
:
Set
[
str
])
->
pd
.
DataFrame
:
columns_to_drop
=
[
column
for
column
in
df
.
columns
if
str
(
column
)
not
in
shared_columns
]
columns_to_drop
=
[
column
for
column
in
df
.
columns
if
str
(
column
)
not
in
shared_columns
]
df
=
df
.
drop
(
columns
=
columns_to_drop
)
df
=
df
.
drop
(
columns
=
columns_to_drop
)
...
@@ -102,6 +106,9 @@ def _transform_parquet_file(
...
@@ -102,6 +106,9 @@ def _transform_parquet_file(
# Split arrays
# Split arrays
df
=
_split_array_column
(
df
)
df
=
_split_array_column
(
df
)
# Drop string columns
df
=
_remove_string_columns
(
df
)
print
(
f
'
Saving
{
filename
}
'
)
print
(
f
'
Saving
{
filename
}
'
)
df
.
to_parquet
(
out_dir
/
filename
)
df
.
to_parquet
(
out_dir
/
filename
)
# df.to_csv(out_dir / f'{file.stem}.csv')
# df.to_csv(out_dir / f'{file.stem}.csv')
...
...
This diff is collapsed.
Click to expand it.
app/preprocessing/utils.py
+
5
−
0
View file @
66876851
...
@@ -2,6 +2,8 @@ from typing import List
...
@@ -2,6 +2,8 @@ from typing import List
import
shutil
import
shutil
import
os
import
os
from
pathlib
import
Path
from
pathlib
import
Path
import
pandas
as
pd
from
.file_type
import
FileType
from
.file_type
import
FileType
def
recreate_dir
(
dir
:
Path
)
->
None
:
def
recreate_dir
(
dir
:
Path
)
->
None
:
...
@@ -12,3 +14,6 @@ def recreate_dir(dir: Path) -> None:
...
@@ -12,3 +14,6 @@ def recreate_dir(dir: Path) -> None:
def files_from_dataset(dataset_dir: Path, dataType: FileType):
    """List the files directly inside ``dataset_dir`` that match a data type.

    Args:
        dataset_dir: Directory to search (not searched recursively, since the
            glob pattern contains no directory wildcard).
        dataType: Object whose ``file_extension`` attribute is appended to
            ``*`` to form the glob pattern.

    Returns:
        A list of paths matching the pattern that are regular files, in the
        order produced by ``Path.glob``.
    """
    pattern = f'*{dataType.file_extension}'
    matches = dataset_dir.glob(pattern)
    # Directories whose names happen to match the pattern are excluded.
    return [candidate for candidate in matches if candidate.is_file()]
def is_column_of_type(column: pd.Series, type: type) -> bool:
    """Report whether a column holds values of the given Python type.

    Only the first value of ``column`` is inspected, so the result is only
    meaningful when every row of the column has the same type.

    Args:
        column: Column to classify.
        type: Type (or tuple of types) passed to ``isinstance``. The name
            shadows the builtin but is kept so keyword callers keep working.

    Returns:
        ``True`` if the first value is an instance of ``type``. ``False``
        otherwise, including for an empty column, which has no value to
        inspect.
    """
    # An empty column previously raised IndexError on ``values[0]``.
    if column.empty:
        return False
    return isinstance(column.values[0], type)
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment