Data transformation and outlier detection¶

Gensor provides some simple ways to find and eliminate outliers from the data. It can also be used in combination, for example we can first transform the data and then remove outliers, what can be an effective strategy in some cases. Below we quickly get the dataset created in the previous tutorial.

In [1]:

Copied!





import gensor as gs
from gensor import read_from_csv
from gensor.testdata import all_paths, pb02a_plain

pattern = r"[A-Za-z]{2}\d{2}[A-Za-z]{1}|Barodiver"

ds = read_from_csv(path=all_paths, file_format="vanessen", location_pattern=pattern)


ds2 = read_from_csv(
    path=pb02a_plain, file_format="plain", location="PB02A", sensor="AV336"
)

ds.add(ds2)

baro = ds.filter(stations="Barodiver", variables="pressure")

alts = {"PB01A": 31.48, "PB02A": 31.68}

groundwater_ds = ds.filter(stations=alts.keys(), variables="pressure")

for ts in groundwater_ds:
    ts.sensor_alt = alts.get(ts.location)

# This is the version where we just exclude the fieldwork dates:
# fieldwork_days = {"PB01A": ['2020-08-25', '2020-10-20', '2020-11-18', '2020-12-03', '2020-12-08',
#                   '2021-02-17', '2021-03-10', '2021-04-14', '2021-05-18', '2021-05-27',
#                   '2021-08-17', '2021-09-16'],
#                   "PB02A": ['2020-08-25', '2020-10-20', '2020-11-18', '2020-12-03', '2020-12-08',
#                   '2021-02-17', '2021-03-10', '2021-04-14', '2021-05-18', '2021-05-27',
#                   '2021-08-17', '2021-09-16', '2021-04-26']}

# compensated_ds = gs.compensate(groundwater_ds, baro, fieldwork_dates=fieldwork_days, interpolate_method='linear')
compensated_ds = gs.compensate(groundwater_ds, baro)

compensated_ds.plot()
import gensor as gs
from gensor import read_from_csv
from gensor.testdata import all_paths, pb02a_plain

pattern = r"[A-Za-z]{2}\d{2}[A-Za-z]{1}|Barodiver"

ds = read_from_csv(path=all_paths, file_format="vanessen", location_pattern=pattern)


ds2 = read_from_csv(
    path=pb02a_plain, file_format="plain", location="PB02A", sensor="AV336"
)

ds.add(ds2)

baro = ds.filter(stations="Barodiver", variables="pressure")

alts = {"PB01A": 31.48, "PB02A": 31.68}

groundwater_ds = ds.filter(stations=alts.keys(), variables="pressure")

for ts in groundwater_ds:
    ts.sensor_alt = alts.get(ts.location)

# This is the version where we just exclude the fieldwork dates:
# fieldwork_days = {"PB01A": ['2020-08-25', '2020-10-20', '2020-11-18', '2020-12-03', '2020-12-08',
#                   '2021-02-17', '2021-03-10', '2021-04-14', '2021-05-18', '2021-05-27',
#                   '2021-08-17', '2021-09-16'],
#                   "PB02A": ['2020-08-25', '2020-10-20', '2020-11-18', '2020-12-03', '2020-12-08',
#                   '2021-02-17', '2021-03-10', '2021-04-14', '2021-05-18', '2021-05-27',
#                   '2021-08-17', '2021-09-16', '2021-04-26']}

# compensated_ds = gs.compensate(groundwater_ds, baro, fieldwork_dates=fieldwork_days, interpolate_method='linear')
compensated_ds = gs.compensate(groundwater_ds, baro)

compensated_ds.plot()

INFO: Loading file: /home/runner/work/gensor/gensor/gensor/testdata/PB02A_plain.csv

INFO: Skipping file /home/runner/work/gensor/gensor/gensor/testdata/PB02A_plain.csv due to missing metadata.

INFO: Loading file: /home/runner/work/gensor/gensor/gensor/testdata/Barodiver_220427183008_BY222.csv

INFO: Loading file: /home/runner/work/gensor/gensor/gensor/testdata/PB01A_moni_AV319_220427183019_AV319.csv

INFO: Loading file: /home/runner/work/gensor/gensor/gensor/testdata/PB02A_plain.csv

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[1], line 16
     10 ds2 = read_from_csv(
     11     path=pb02a_plain, file_format="plain", location="PB02A", sensor="AV336"
     12 )
     14 ds.add(ds2)
---> 16 baro = ds.filter(stations="Barodiver", variables="pressure")
     18 alts = {"PB01A": 31.48, "PB02A": 31.68}
     20 groundwater_ds = ds.filter(stations=alts.keys(), variables="pressure")

File ~/work/gensor/gensor/gensor/core/dataset.py:133, in Dataset.filter(self, location, variable, unit, **kwargs)
    130     if isinstance(value, str):
    131         kwargs[key] = [value]
--> 133 matching_timeseries = [
    134     ts
    135     for ts in self.timeseries
    136     if ts is not None
    137     and (location is None or ts.location in location)
    138     and (variable is None or ts.variable in variable)
    139     and (unit is None or ts.unit in unit)
    140     and all(matches(ts, attr, value) for attr, value in kwargs.items())
    141 ]
    143 if not matching_timeseries:
    144     return Dataset()

File ~/work/gensor/gensor/gensor/core/dataset.py:140, in <listcomp>(.0)
    130     if isinstance(value, str):
    131         kwargs[key] = [value]
    133 matching_timeseries = [
    134     ts
    135     for ts in self.timeseries
    136     if ts is not None
    137     and (location is None or ts.location in location)
    138     and (variable is None or ts.variable in variable)
    139     and (unit is None or ts.unit in unit)
--> 140     and all(matches(ts, attr, value) for attr, value in kwargs.items())
    141 ]
    143 if not matching_timeseries:
    144     return Dataset()

File ~/work/gensor/gensor/gensor/core/dataset.py:140, in <genexpr>(.0)
    130     if isinstance(value, str):
    131         kwargs[key] = [value]
    133 matching_timeseries = [
    134     ts
    135     for ts in self.timeseries
    136     if ts is not None
    137     and (location is None or ts.location in location)
    138     and (variable is None or ts.variable in variable)
    139     and (unit is None or ts.unit in unit)
--> 140     and all(matches(ts, attr, value) for attr, value in kwargs.items())
    141 ]
    143 if not matching_timeseries:
    144     return Dataset()

File ~/work/gensor/gensor/gensor/core/dataset.py:120, in Dataset.filter.<locals>.matches(ts, attr, value)
    118 if not hasattr(ts, attr):
    119     message = f"'{ts.__class__.__name__}' object has no attribute '{attr}'"
--> 120     raise AttributeError(message)
    121 return getattr(ts, attr) in value

AttributeError: 'Timeseries' object has no attribute 'stations'

Transformation¶

In Gensor, transformations are implemented to allow flexible data processing and scaling of time series data for normalization, trend removal, variance stabilization, etc.. These transformations are important when working with sensor data, where raw measurements may need to be adjusted to enhance interpretation.

The Transformation class in Gensor handles multiple types of transformations, including:

Difference: Used to remove trends by differencing the data over a specified period.
Logarithmic (Log): Applied to stabilize variance and reduce the impact of large outliers.
Square Root: Another method for stabilizing variance, commonly used for data skewness.
Box-Cox: A powerful transformation that normalizes non-normal data, often used when data contains only positive values.
Scaling Methods (Standard, MinMax, Robust, MaxAbs): Common normalization techniques that adjust data based on its distribution, commonly used to prepare data for machine learning models.

The Timeseries class integrates this functionality, allowing application of transformations to time series data. Subsequently, user can perform operations on that transformed serie, like outlier removal, and then filter the original timeseries.

Below is an example workflow on how to use differencing to enhance outlier detection:

In [2]:

Copied!

ts = compensated_ds[0].model_copy(deep=True)
ts.plot()
ts = compensated_ds[0].model_copy(deep=True)
ts.plot()

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[2], line 1
----> 1 ts = compensated_ds[0].model_copy(deep=True)
      2 ts.plot()

NameError: name 'compensated_ds' is not defined

We will obtain a new Timeseries object containing only the dips in the negative direction. In the case of Kinrooi sensor data, we first are trying to eliminate records which were taken when the diver was out of the water (which means that the water column was 0). It also includes some of the records taken after, when the groudwater level was recovering from pumping.

In [3]:

Copied!





ts_diff = ts.transform("difference", periods=12)
ts_diff_dips = ts_diff.loc[ts_diff.ts < 0]

# we've obtained a timeseries which has identified outliers. We can use those to mask our original series.
ts_identified_outliers = ts_diff_dips.detect_outliers("zscore", threshold=1.0)
ts_identified_outliers.plot(include_outliers=True)
ts_diff = ts.transform("difference", periods=12)
ts_diff_dips = ts_diff.loc[ts_diff.ts < 0]

# we've obtained a timeseries which has identified outliers. We can use those to mask our original series.
ts_identified_outliers = ts_diff_dips.detect_outliers("zscore", threshold=1.0)
ts_identified_outliers.plot(include_outliers=True)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[3], line 1
----> 1 ts_diff = ts.transform("difference", periods=12)
      2 ts_diff_dips = ts_diff.loc[ts_diff.ts < 0]
      4 # we've obtained a timeseries which has identified outliers. We can use those to mask our original series.

NameError: name 'ts' is not defined

In [4]:

Copied!

masked = ts.mask_with(ts_identified_outliers.outliers)
masked = ts.mask_with(ts_identified_outliers.outliers)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[4], line 1
----> 1 masked = ts.mask_with(ts_identified_outliers.outliers)

NameError: name 'ts' is not defined

In [5]:

Copied!

masked.plot()
masked.plot()

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[5], line 1
----> 1 masked.plot()

NameError: name 'masked' is not defined

By tweaking the paramters of each outlier detection method, we can actually quite accurately remove them from the dataset without doing it sevral times in a loop.