From 6f7ecd9cd131b1078ef4122ff0bfd03fa028d863 Mon Sep 17 00:00:00 2001 From: Scott_Simmons23 Date: Wed, 23 Feb 2022 18:16:16 +1300 Subject: [PATCH 01/16] first --- .../scott_features_on_features_tests.py | 17 ++ tsfresh/feature_extraction/scotts_code.py | 199 ++++++++++++++++++ tsfresh/feature_extraction/test_data.csv | 31 +++ 3 files changed, 247 insertions(+) create mode 100644 tsfresh/feature_extraction/scott_features_on_features_tests.py create mode 100644 tsfresh/feature_extraction/scotts_code.py create mode 100644 tsfresh/feature_extraction/test_data.csv diff --git a/tsfresh/feature_extraction/scott_features_on_features_tests.py b/tsfresh/feature_extraction/scott_features_on_features_tests.py new file mode 100644 index 000000000..1f9871a7d --- /dev/null +++ b/tsfresh/feature_extraction/scott_features_on_features_tests.py @@ -0,0 +1,17 @@ +import pandas as pd +from extraction import extract_features_on_sub_features +from tsfresh.feature_extraction.settings import MinimalFCParameters +# Test is going to go here. Should work with Louis code + Scott code + the code that is integrated into tsfresh + +# Read in data +ts = pd.read_csv("test_data.csv") + + +X = extract_features_on_sub_features(timeseries_container = ts, + sub_feature_split = 2, + sub_default_fc_parameters = MinimalFCParameters, + default_fc_parameters = MinimalFCParameters, + column_id = "measurement_id", + column_sort = "t", + column_kind = None, + column_value = None) diff --git a/tsfresh/feature_extraction/scotts_code.py b/tsfresh/feature_extraction/scotts_code.py new file mode 100644 index 000000000..3c0dbc7f2 --- /dev/null +++ b/tsfresh/feature_extraction/scotts_code.py @@ -0,0 +1,199 @@ +# a file that has the algorithm used in my project... + +# main features on features framework used in VSB project +def features_on_features_vsb(ts, + first_fc_params, + second_fc_params, + fc_params_is_kind, + replacement_token): + ''' + main algorithm that uses tsfresh which computes features on features (feature dynamics) for the VSB data + NOTE: The DATA format that this function supports is input option 1 for feature extraction https://tsfresh.readthedocs.io/en/latest/text/data_formats.html + + params: + ts (pd.DataFrame): The VSB measurements that will be processed. + first_fc_params, second_fc_params (dictionary): feature sets for (1) the extraction of feature time series and (2) the extraction of feature dynamics + fc_params_is_kind (bool): if the feature dictionaries maps each separate VSB signal (ts kind) to a different feature set then this value is True, otherwise false. + replacement_token (str): token that replaces double unscore in feature naming convention. This adjustment is required for featue dynamics extraction + returns: + X1 (pd.DataFrame): Feature dynamics matrix + y1 (pd.Series): Response vector. + ''' + + # map the reponse variable to each row in the features on features matrix if the timeseries is test data otherwise do nothing... + try: + y = ts.groupby("measurement_id").last()["response"] + except: + y = None + + + # assign unique pairs of (mes_id, window_id) to each element + ts["column_id"] = ts["measurement_id"].astype(str) + ", " + ts["window_id"].astype(str) + + + # drop the columns which are not relevant to feature extraction + try: + ts = ts.drop(columns = ["measurement_id", "window_id", "response"]) # for labelled data + except: + ts = ts.drop(columns = ["measurement_id", "window_id"]) # non labelled data + + print("TS INPUT {}".format(ts)) + # first round of feature extraction FEATURE TIME SERIES + X0 = (extract_features(ts, column_id = "column_id",column_sort = "time_index", kind_to_fc_parameters = first_fc_params, disable_progressbar = True) if fc_params_is_kind + else extract_features(ts, column_id = "column_id",column_sort = "time_index", default_fc_parameters = first_fc_params, disable_progressbar = True)) + + print("FIRST {}".format(X0.shape)) + + # drop any features that produce any NaNs/NAs + if X0.isnull().values.any(): + # store dropped features + dropped_feature_names = [col_name for col_name in X0.columns[X0.isna().any()].tolist()] + # store the feature calculators that fail in a file that is constantly updated. + with open("dropped_feature_names.txt", "a") as f: + for feature in dropped_feature_names: f.write(feature[feature.index("__") + 2:] + "\n") # 2 is a magic number. It works. But this should be refactored.. + + print("found " + str(len(X0.columns[X0.isna().any()].tolist())) + " features from the set of " + str(len(X0.columns)) + " features which should be dropped before being input into second feat extraction") + X0 = X0.dropna(axis = "columns") + + + # tsfresh cant handle double underscores twice so change this in preparation for the second feature extraction + X0.columns = [str(col_name).replace("__",replacement_token) for col_name in X0.columns] + + # assign windows as the original measurment ID... i.e. extracting "mes_id" from (mes_id, window_id) + X0["column_id"] = X0.index.to_series().str.split(", ", expand = True).iloc[:,0] + + print("FEATURE TS INPUT {}".format(X0)) + + # second round of feature extraction FEATURE DYNAMICS + X1 = (extract_features(X0, column_id = "column_id", kind_to_fc_parameters = second_fc_params, disable_progressbar = True) if fc_params_is_kind + else extract_features(X0, column_id = "column_id", default_fc_parameters = second_fc_params, disable_progressbar = True)) + + X1.index.name = "measurement_id" + + # drop any features which are null or na + if X1.isnull().values.any(): + print("found " + str(len(X1.columns[X1.isna().any()].tolist())) + " features from the set of " + str(len(X1.columns)) + " features which should be dropped before being considered as the final output...") + X1 = X1.dropna(axis = "columns") + + + # sort column names + X1.sort_index(axis="columns", inplace=True) + + print("X1 output {}, {}".format(X1.shape, X1)) + + # returning the feature matrix, the response variable corresponding to each feature matrix window, and optionally the dropped colnames + return (X1, y) + + + + + +### The code written into tsfresh + +class IterableTsData(Iterable[Timeseries], Sized, TsData): + """ + Special class of TsData, which can be partitioned. + Derived classes should implement __iter__ and __len__. + """ + def pivot(self, results): + """ + Helper function to turn an iterable of tuples with three entries into a dataframe. + + The input ``list_of_tuples`` needs to be an iterable with tuples containing three + entries: (a, b, c). + Out of this, a pandas dataframe will be created with all a's as index, + all b's as columns and all c's as values. + + It basically does a pd.pivot(first entry, second entry, third entry), + but optimized for non-pandas input (= python list of tuples). + + This function is called in the end of the extract_features call. + """ + return_df_dict = defaultdict(dict) + for chunk_id, variable, value in results: + # we turn it into a nested mapping `column -> index -> value` + return_df_dict[variable][chunk_id] = value + + # the mapping column -> {index -> value} + # is now a dict of dicts. The pandas dataframe + # constructor will peel this off: + # first, the keys of the outer dict (the column) + # will turn into a column header and the rest into a column + # the rest is {index -> value} which will be turned into a + # column with index. + # All index will be aligned. + return_df = pd.DataFrame(return_df_dict, dtype=float) + + # copy the type of the index + return_df.index = return_df.index.astype(self.df_id_type) + + # Sort by index to be backward compatible + return_df = return_df.sort_index() + + return return_df + + def __len__(self): + """Override in a subclass""" + raise NotImplementedError + + def __iter__(self): + """Override in a subclass""" + raise NotImplementedError + + +class ApplyableTsData(TsData): + """ + TsData base class to use, if an iterable ts data can not be used. + Its only interface is an apply function, which should be applied + to each of the chunks of the data. How this is done + depends on the implementation. + """ + def apply(self, f, **kwargs): + raise NotImplementedError + +def extract_features_on_sub_features(timeseries_container, + sub_feature_split, + sub_default_fc_parameters=None, sub_kind_to_fc_parameters=None, + default_fc_parameters=None, kind_to_fc_parameters=None, + column_id=None, column_sort=None, column_kind=None, column_value=None, + **kwargs): + ts_data = to_tsdata(timeseries_container, column_id=column_id, column_sort=column_sort, column_kind=column_kind, column_value=column_value) + if isinstance(ts_data, Iterable): + split_ts_data = IterableSplitTsData(ts_data, sub_feature_split) + else: + split_ts_data = ApplyableSplitTsData(ts_data, sub_feature_split) + + sub_features = extract_features(split_ts_data, default_fc_parameters=sub_default_fc_parameters, + kind_to_fc_parameters=sub_kind_to_fc_parameters, **kwargs, pivot=False) + + column_kind = column_kind or "variable" + column_id = column_id or "id" + column_sort = column_sort or "sort" + column_value = column_value or "value" + + # The feature names include many "_", which will confuse tsfresh where the sub feature name ends + # and where the real feature name starts. We just remove them. + # Also, we split up the index into the id and the sort + # We need to do this separately for dask dataframes, + # as the return type is not a list, but already a dataframe + if isinstance(sub_features, dd.DataFrame): + sub_features = sub_features.reset_index(drop=True) + + sub_features[column_kind] = sub_features[column_kind].apply(lambda col: col.replace("_", ""), meta=(column_kind, object)) + + sub_features[column_sort] = sub_features[column_id].apply(lambda x: x[1], meta=(column_id, "int64")) + sub_features[column_id] = sub_features[column_id].apply(lambda x: x[0], meta=(column_id, ts_data.df_id_type)) + + else: + sub_features = pd.DataFrame(sub_features, columns=[column_id, column_kind, column_value]) + + sub_features[column_kind] = sub_features[column_kind].apply(lambda col: col.replace("_", "")) + + sub_features[column_sort] = sub_features[column_id].apply(lambda x: x[1]) + sub_features[column_id] = sub_features[column_id].apply(lambda x: x[0]) + + X = extract_features(sub_features, column_id=column_id, column_sort=column_sort, column_kind=column_kind, column_value=column_value, + default_fc_parameters=default_fc_parameters, kind_to_fc_parameters=kind_to_fc_parameters, + **kwargs) + + return X diff --git a/tsfresh/feature_extraction/test_data.csv b/tsfresh/feature_extraction/test_data.csv new file mode 100644 index 000000000..8a036e22c --- /dev/null +++ b/tsfresh/feature_extraction/test_data.csv @@ -0,0 +1,31 @@ +t,y,window_id,measurement_id +1,1,1,1 +2,1,1,1 +3,1,1,1 +4,1,1,1 +5,1,1,1 +6,1,1,1 +7,1,1,1 +8,1,1,1 +9,1,1,1 +10,1,1,1 +11,2,1,1 +12,2,1,1 +13,2,1,1 +14,2,1,1 +15,2,1,1 +16,2,1,1 +17,2,1,1 +18,2,1,1 +19,2,1,1 +20,2,1,1 +21,3,2,1 +22,3,2,1 +23,3,2,1 +24,3,2,1 +25,3,2,1 +26,3,2,1 +27,3,2,1 +28,3,2,1 +29,3,2,1 +30,3,2,1 From 802531b536e142b4b7c428c0a716428e6d03eb9d Mon Sep 17 00:00:00 2001 From: Scott-Simmons Date: Thu, 24 Feb 2022 15:07:58 +1300 Subject: [PATCH 02/16] second --- tsfresh/feature_extraction/extraction.py | 9 +-- .../scott_features_on_features_tests.py | 9 ++- tsfresh/feature_extraction/test_data.csv | 62 +++++++++---------- 3 files changed, 43 insertions(+), 37 deletions(-) diff --git a/tsfresh/feature_extraction/extraction.py b/tsfresh/feature_extraction/extraction.py index 450a91c8c..95cd521f3 100644 --- a/tsfresh/feature_extraction/extraction.py +++ b/tsfresh/feature_extraction/extraction.py @@ -14,8 +14,9 @@ from tsfresh import defaults from tsfresh.feature_extraction import feature_calculators -from tsfresh.feature_extraction.data import to_tsdata, IterableSplitTsData, ApplyableSplitTsData from tsfresh.feature_extraction.settings import ComprehensiveFCParameters +from data import to_tsdata, IterableSplitTsData, ApplyableSplitTsData + from tsfresh.utilities import profiling from tsfresh.utilities.distribution import MapDistributor, MultiprocessingDistributor, \ DistributorBaseClass, ApplyDistributor @@ -148,7 +149,6 @@ def extract_features(timeseries_container, default_fc_parameters=None, warnings.simplefilter("ignore") else: warnings.simplefilter("default") - result = _do_extraction(df=timeseries_container, column_id=column_id, column_value=column_value, column_kind=column_kind, @@ -290,6 +290,7 @@ def _do_extraction_on_chunk(chunk, default_fc_parameters, kind_to_fc_parameters) fc_parameters = default_fc_parameters def _f(): + for function_name, parameter_list in fc_parameters.items(): func = getattr(feature_calculators, function_name) @@ -325,7 +326,6 @@ def _f(): if key: feature_name += "__" + str(key) yield (sample_id, feature_name, item) - return list(_f()) @@ -344,6 +344,7 @@ def extract_features_on_sub_features(timeseries_container, sub_features = extract_features(split_ts_data, default_fc_parameters=sub_default_fc_parameters, kind_to_fc_parameters=sub_kind_to_fc_parameters, **kwargs, pivot=False) + column_kind = column_kind or "variable" column_id = column_id or "id" column_sort = column_sort or "sort" @@ -374,4 +375,4 @@ def extract_features_on_sub_features(timeseries_container, default_fc_parameters=default_fc_parameters, kind_to_fc_parameters=kind_to_fc_parameters, **kwargs) - return X \ No newline at end of file + return X diff --git a/tsfresh/feature_extraction/scott_features_on_features_tests.py b/tsfresh/feature_extraction/scott_features_on_features_tests.py index 1f9871a7d..d13dfae6f 100644 --- a/tsfresh/feature_extraction/scott_features_on_features_tests.py +++ b/tsfresh/feature_extraction/scott_features_on_features_tests.py @@ -6,12 +6,17 @@ # Read in data ts = pd.read_csv("test_data.csv") +print("Minimal: {}".format(MinimalFCParameters())) X = extract_features_on_sub_features(timeseries_container = ts, sub_feature_split = 2, - sub_default_fc_parameters = MinimalFCParameters, - default_fc_parameters = MinimalFCParameters, + sub_default_fc_parameters = MinimalFCParameters(), + default_fc_parameters = MinimalFCParameters(), column_id = "measurement_id", column_sort = "t", column_kind = None, column_value = None) + +print(X) +#for col in X.columns: +# print(col) diff --git a/tsfresh/feature_extraction/test_data.csv b/tsfresh/feature_extraction/test_data.csv index 8a036e22c..a26aaa2c2 100644 --- a/tsfresh/feature_extraction/test_data.csv +++ b/tsfresh/feature_extraction/test_data.csv @@ -1,31 +1,31 @@ -t,y,window_id,measurement_id -1,1,1,1 -2,1,1,1 -3,1,1,1 -4,1,1,1 -5,1,1,1 -6,1,1,1 -7,1,1,1 -8,1,1,1 -9,1,1,1 -10,1,1,1 -11,2,1,1 -12,2,1,1 -13,2,1,1 -14,2,1,1 -15,2,1,1 -16,2,1,1 -17,2,1,1 -18,2,1,1 -19,2,1,1 -20,2,1,1 -21,3,2,1 -22,3,2,1 -23,3,2,1 -24,3,2,1 -25,3,2,1 -26,3,2,1 -27,3,2,1 -28,3,2,1 -29,3,2,1 -30,3,2,1 +t,y,measurement_id +1,1,1 +2,1,1 +3,1,1 +4,1,1 +5,1,1 +6,1,1 +7,1,1 +8,1,1 +9,1,1 +10,1,1 +11,2,1 +12,2,1 +13,2,1 +14,2,1 +15,2,2 +16,2,2 +17,2,2 +18,2,2 +19,2,2 +20,2,2 +21,3,2 +22,3,2 +23,3,2 +24,3,2 +25,3,2 +26,3,2 +27,3,2 +28,3,2 +29,3,2 +30,3,2 From f2e79089db0d9d969960afb818d76e6b7007fb16 Mon Sep 17 00:00:00 2001 From: Scott-Simmons Date: Thu, 24 Feb 2022 16:32:44 +1300 Subject: [PATCH 03/16] third --- ...res_tests.py => feature_dynamics_tests.py} | 0 tsfresh/feature_extraction/scotts_code.py | 199 ------------------ 2 files changed, 199 deletions(-) rename tsfresh/feature_extraction/{scott_features_on_features_tests.py => feature_dynamics_tests.py} (100%) delete mode 100644 tsfresh/feature_extraction/scotts_code.py diff --git a/tsfresh/feature_extraction/scott_features_on_features_tests.py b/tsfresh/feature_extraction/feature_dynamics_tests.py similarity index 100% rename from tsfresh/feature_extraction/scott_features_on_features_tests.py rename to tsfresh/feature_extraction/feature_dynamics_tests.py diff --git a/tsfresh/feature_extraction/scotts_code.py b/tsfresh/feature_extraction/scotts_code.py deleted file mode 100644 index 3c0dbc7f2..000000000 --- a/tsfresh/feature_extraction/scotts_code.py +++ /dev/null @@ -1,199 +0,0 @@ -# a file that has the algorithm used in my project... - -# main features on features framework used in VSB project -def features_on_features_vsb(ts, - first_fc_params, - second_fc_params, - fc_params_is_kind, - replacement_token): - ''' - main algorithm that uses tsfresh which computes features on features (feature dynamics) for the VSB data - NOTE: The DATA format that this function supports is input option 1 for feature extraction https://tsfresh.readthedocs.io/en/latest/text/data_formats.html - - params: - ts (pd.DataFrame): The VSB measurements that will be processed. - first_fc_params, second_fc_params (dictionary): feature sets for (1) the extraction of feature time series and (2) the extraction of feature dynamics - fc_params_is_kind (bool): if the feature dictionaries maps each separate VSB signal (ts kind) to a different feature set then this value is True, otherwise false. - replacement_token (str): token that replaces double unscore in feature naming convention. This adjustment is required for featue dynamics extraction - returns: - X1 (pd.DataFrame): Feature dynamics matrix - y1 (pd.Series): Response vector. - ''' - - # map the reponse variable to each row in the features on features matrix if the timeseries is test data otherwise do nothing... - try: - y = ts.groupby("measurement_id").last()["response"] - except: - y = None - - - # assign unique pairs of (mes_id, window_id) to each element - ts["column_id"] = ts["measurement_id"].astype(str) + ", " + ts["window_id"].astype(str) - - - # drop the columns which are not relevant to feature extraction - try: - ts = ts.drop(columns = ["measurement_id", "window_id", "response"]) # for labelled data - except: - ts = ts.drop(columns = ["measurement_id", "window_id"]) # non labelled data - - print("TS INPUT {}".format(ts)) - # first round of feature extraction FEATURE TIME SERIES - X0 = (extract_features(ts, column_id = "column_id",column_sort = "time_index", kind_to_fc_parameters = first_fc_params, disable_progressbar = True) if fc_params_is_kind - else extract_features(ts, column_id = "column_id",column_sort = "time_index", default_fc_parameters = first_fc_params, disable_progressbar = True)) - - print("FIRST {}".format(X0.shape)) - - # drop any features that produce any NaNs/NAs - if X0.isnull().values.any(): - # store dropped features - dropped_feature_names = [col_name for col_name in X0.columns[X0.isna().any()].tolist()] - # store the feature calculators that fail in a file that is constantly updated. - with open("dropped_feature_names.txt", "a") as f: - for feature in dropped_feature_names: f.write(feature[feature.index("__") + 2:] + "\n") # 2 is a magic number. It works. But this should be refactored.. - - print("found " + str(len(X0.columns[X0.isna().any()].tolist())) + " features from the set of " + str(len(X0.columns)) + " features which should be dropped before being input into second feat extraction") - X0 = X0.dropna(axis = "columns") - - - # tsfresh cant handle double underscores twice so change this in preparation for the second feature extraction - X0.columns = [str(col_name).replace("__",replacement_token) for col_name in X0.columns] - - # assign windows as the original measurment ID... i.e. extracting "mes_id" from (mes_id, window_id) - X0["column_id"] = X0.index.to_series().str.split(", ", expand = True).iloc[:,0] - - print("FEATURE TS INPUT {}".format(X0)) - - # second round of feature extraction FEATURE DYNAMICS - X1 = (extract_features(X0, column_id = "column_id", kind_to_fc_parameters = second_fc_params, disable_progressbar = True) if fc_params_is_kind - else extract_features(X0, column_id = "column_id", default_fc_parameters = second_fc_params, disable_progressbar = True)) - - X1.index.name = "measurement_id" - - # drop any features which are null or na - if X1.isnull().values.any(): - print("found " + str(len(X1.columns[X1.isna().any()].tolist())) + " features from the set of " + str(len(X1.columns)) + " features which should be dropped before being considered as the final output...") - X1 = X1.dropna(axis = "columns") - - - # sort column names - X1.sort_index(axis="columns", inplace=True) - - print("X1 output {}, {}".format(X1.shape, X1)) - - # returning the feature matrix, the response variable corresponding to each feature matrix window, and optionally the dropped colnames - return (X1, y) - - - - - -### The code written into tsfresh - -class IterableTsData(Iterable[Timeseries], Sized, TsData): - """ - Special class of TsData, which can be partitioned. - Derived classes should implement __iter__ and __len__. - """ - def pivot(self, results): - """ - Helper function to turn an iterable of tuples with three entries into a dataframe. - - The input ``list_of_tuples`` needs to be an iterable with tuples containing three - entries: (a, b, c). - Out of this, a pandas dataframe will be created with all a's as index, - all b's as columns and all c's as values. - - It basically does a pd.pivot(first entry, second entry, third entry), - but optimized for non-pandas input (= python list of tuples). - - This function is called in the end of the extract_features call. - """ - return_df_dict = defaultdict(dict) - for chunk_id, variable, value in results: - # we turn it into a nested mapping `column -> index -> value` - return_df_dict[variable][chunk_id] = value - - # the mapping column -> {index -> value} - # is now a dict of dicts. The pandas dataframe - # constructor will peel this off: - # first, the keys of the outer dict (the column) - # will turn into a column header and the rest into a column - # the rest is {index -> value} which will be turned into a - # column with index. - # All index will be aligned. - return_df = pd.DataFrame(return_df_dict, dtype=float) - - # copy the type of the index - return_df.index = return_df.index.astype(self.df_id_type) - - # Sort by index to be backward compatible - return_df = return_df.sort_index() - - return return_df - - def __len__(self): - """Override in a subclass""" - raise NotImplementedError - - def __iter__(self): - """Override in a subclass""" - raise NotImplementedError - - -class ApplyableTsData(TsData): - """ - TsData base class to use, if an iterable ts data can not be used. - Its only interface is an apply function, which should be applied - to each of the chunks of the data. How this is done - depends on the implementation. - """ - def apply(self, f, **kwargs): - raise NotImplementedError - -def extract_features_on_sub_features(timeseries_container, - sub_feature_split, - sub_default_fc_parameters=None, sub_kind_to_fc_parameters=None, - default_fc_parameters=None, kind_to_fc_parameters=None, - column_id=None, column_sort=None, column_kind=None, column_value=None, - **kwargs): - ts_data = to_tsdata(timeseries_container, column_id=column_id, column_sort=column_sort, column_kind=column_kind, column_value=column_value) - if isinstance(ts_data, Iterable): - split_ts_data = IterableSplitTsData(ts_data, sub_feature_split) - else: - split_ts_data = ApplyableSplitTsData(ts_data, sub_feature_split) - - sub_features = extract_features(split_ts_data, default_fc_parameters=sub_default_fc_parameters, - kind_to_fc_parameters=sub_kind_to_fc_parameters, **kwargs, pivot=False) - - column_kind = column_kind or "variable" - column_id = column_id or "id" - column_sort = column_sort or "sort" - column_value = column_value or "value" - - # The feature names include many "_", which will confuse tsfresh where the sub feature name ends - # and where the real feature name starts. We just remove them. - # Also, we split up the index into the id and the sort - # We need to do this separately for dask dataframes, - # as the return type is not a list, but already a dataframe - if isinstance(sub_features, dd.DataFrame): - sub_features = sub_features.reset_index(drop=True) - - sub_features[column_kind] = sub_features[column_kind].apply(lambda col: col.replace("_", ""), meta=(column_kind, object)) - - sub_features[column_sort] = sub_features[column_id].apply(lambda x: x[1], meta=(column_id, "int64")) - sub_features[column_id] = sub_features[column_id].apply(lambda x: x[0], meta=(column_id, ts_data.df_id_type)) - - else: - sub_features = pd.DataFrame(sub_features, columns=[column_id, column_kind, column_value]) - - sub_features[column_kind] = sub_features[column_kind].apply(lambda col: col.replace("_", "")) - - sub_features[column_sort] = sub_features[column_id].apply(lambda x: x[1]) - sub_features[column_id] = sub_features[column_id].apply(lambda x: x[0]) - - X = extract_features(sub_features, column_id=column_id, column_sort=column_sort, column_kind=column_kind, column_value=column_value, - default_fc_parameters=default_fc_parameters, kind_to_fc_parameters=kind_to_fc_parameters, - **kwargs) - - return X From ccfacf0b556dc4237b3a537bda649016d7a9ec4d Mon Sep 17 00:00:00 2001 From: Scott-Simmons Date: Sat, 26 Feb 2022 22:54:03 +1300 Subject: [PATCH 04/16] Added dropping of NaNs and modified the notation of feature naming conventions (underscores) --- tsfresh/feature_extraction/extraction.py | 10 ++-- .../feature_dynamics_tests.py | 49 +++++++++++++------ 2 files changed, 40 insertions(+), 19 deletions(-) diff --git a/tsfresh/feature_extraction/extraction.py b/tsfresh/feature_extraction/extraction.py index 95cd521f3..7af07aea9 100644 --- a/tsfresh/feature_extraction/extraction.py +++ b/tsfresh/feature_extraction/extraction.py @@ -315,7 +315,7 @@ def _f(): if func.fctype == "combiner": result = func(x, param=parameter_list) else: - if parameter_list: + if parameter_list: result = ((convert_to_output_format(param), func(x, **param)) for param in parameter_list) else: @@ -345,6 +345,7 @@ def extract_features_on_sub_features(timeseries_container, kind_to_fc_parameters=sub_kind_to_fc_parameters, **kwargs, pivot=False) + # the some features can produce NaNs which need to be removed before the next round of feature extraction column_kind = column_kind or "variable" column_id = column_id or "id" column_sort = column_sort or "sort" @@ -356,6 +357,7 @@ def extract_features_on_sub_features(timeseries_container, # We need to do this separately for dask dataframes, # as the return type is not a list, but already a dataframe if isinstance(sub_features, dd.DataFrame): + # TODO: dropping NAs for Dask dataframes... write tests sub_features = sub_features.reset_index(drop=True) sub_features[column_kind] = sub_features[column_kind].apply(lambda col: col.replace("_", ""), meta=(column_kind, object)) @@ -364,13 +366,13 @@ def extract_features_on_sub_features(timeseries_container, sub_features[column_id] = sub_features[column_id].apply(lambda x: x[0], meta=(column_id, ts_data.df_id_type)) else: - sub_features = pd.DataFrame(sub_features, columns=[column_id, column_kind, column_value]) + sub_features = pd.DataFrame(sub_features, columns=[column_id, column_kind, column_value]).dropna() + print("First round done: {}".format(sub_features)) - sub_features[column_kind] = sub_features[column_kind].apply(lambda col: col.replace("_", "")) + sub_features[column_kind] = sub_features[column_kind].apply(lambda col: col.replace("__", "||")) sub_features[column_sort] = sub_features[column_id].apply(lambda x: x[1]) sub_features[column_id] = sub_features[column_id].apply(lambda x: x[0]) - X = extract_features(sub_features, column_id=column_id, column_sort=column_sort, column_kind=column_kind, column_value=column_value, default_fc_parameters=default_fc_parameters, kind_to_fc_parameters=kind_to_fc_parameters, **kwargs) diff --git a/tsfresh/feature_extraction/feature_dynamics_tests.py b/tsfresh/feature_extraction/feature_dynamics_tests.py index d13dfae6f..894a0d5a9 100644 --- a/tsfresh/feature_extraction/feature_dynamics_tests.py +++ b/tsfresh/feature_extraction/feature_dynamics_tests.py @@ -1,22 +1,41 @@ import pandas as pd from extraction import extract_features_on_sub_features -from tsfresh.feature_extraction.settings import MinimalFCParameters +from tsfresh.feature_extraction.settings import MinimalFCParameters, EfficientFCParameters # Test is going to go here. Should work with Louis code + Scott code + the code that is integrated into tsfresh -# Read in data -ts = pd.read_csv("test_data.csv") -print("Minimal: {}".format(MinimalFCParameters())) -X = extract_features_on_sub_features(timeseries_container = ts, - sub_feature_split = 2, - sub_default_fc_parameters = MinimalFCParameters(), - default_fc_parameters = MinimalFCParameters(), - column_id = "measurement_id", - column_sort = "t", - column_kind = None, - column_value = None) -print(X) -#for col in X.columns: -# print(col) + + +if __name__ == "__main__": + + # Read in data + ts = pd.read_csv("./feature_extraction/test_data.csv") + print(ts) + + # running on minimal + X = extract_features_on_sub_features(timeseries_container = ts, + sub_feature_split = 1, + sub_default_fc_parameters = MinimalFCParameters(), + default_fc_parameters = MinimalFCParameters(), + column_id = "measurement_id", + column_sort = "t", + column_kind = None, + column_value = None, + show_warnings = True) + print(X) + + + # Running on efficient + X = extract_features_on_sub_features(timeseries_container = ts, + sub_feature_split = 1, + sub_default_fc_parameters = EfficientFCParameters(), + default_fc_parameters = EfficientFCParameters(), + column_id = "measurement_id", + column_sort = "t", + column_kind = None, + column_value = None, + show_warnings = True) + + print(X) \ No newline at end of file From 5bdaa4b2364b45cd8be47aa45b257ddf172d6373 Mon Sep 17 00:00:00 2001 From: Scott-Simmons Date: Sun, 6 Mar 2022 14:31:57 +1300 Subject: [PATCH 05/16] fixed data types for feature time series (for second round of extraction) casting all to floats --- tsfresh/feature_extraction/extraction.py | 22 ++++++-- .../feature_dynamics_tests.py | 53 +++++++++++++------ tsfresh/feature_extraction/test_data.csv | 36 +++---------- 3 files changed, 62 insertions(+), 49 deletions(-) diff --git a/tsfresh/feature_extraction/extraction.py b/tsfresh/feature_extraction/extraction.py index 7af07aea9..cae8b0690 100644 --- a/tsfresh/feature_extraction/extraction.py +++ b/tsfresh/feature_extraction/extraction.py @@ -5,9 +5,11 @@ This module contains the main function to interact with tsfresh: extract features """ +import numpy as np import logging import warnings from collections import Iterable +from numpy import dtype import pandas as pd from dask import dataframe as dd @@ -313,12 +315,20 @@ def _f(): x = data.values if func.fctype == "combiner": + # Casting ndarray with dtype object to dtype float as dtype object is not compatible with some feature calculators + x = np.asarray(x, dtype = float) result = func(x, param=parameter_list) else: - if parameter_list: + if parameter_list: + #if function_name == "binned_entropy": + # print("Stop here") + # Casting ndarray with dtype object to dtype float as dtype object is not compatible with some feature calculators + x = np.asarray(x, dtype = float) result = ((convert_to_output_format(param), func(x, **param)) for param in parameter_list) else: + # Casting ndarray with dtype object to dtype float as dtype object is not compatible with some feature calculators + x = np.asarray(x, dtype = float) result = [("", func(x))] for key, item in result: @@ -366,13 +376,19 @@ def extract_features_on_sub_features(timeseries_container, sub_features[column_id] = sub_features[column_id].apply(lambda x: x[0], meta=(column_id, ts_data.df_id_type)) else: - sub_features = pd.DataFrame(sub_features, columns=[column_id, column_kind, column_value]).dropna() - print("First round done: {}".format(sub_features)) + sub_features = pd.DataFrame(sub_features, columns=[column_id, column_kind, column_value]) + + # Need to drop features for all windows which contain at one NaN + target_list = sub_features[sub_features[column_value].isnull()][column_kind].unique() + sub_features = sub_features[~sub_features[column_kind].isin(target_list)] sub_features[column_kind] = sub_features[column_kind].apply(lambda col: col.replace("__", "||")) sub_features[column_sort] = sub_features[column_id].apply(lambda x: x[1]) sub_features[column_id] = sub_features[column_id].apply(lambda x: x[0]) + + print("Sub features\n{}".format(sub_features)) + X = extract_features(sub_features, column_id=column_id, column_sort=column_sort, column_kind=column_kind, column_value=column_value, default_fc_parameters=default_fc_parameters, kind_to_fc_parameters=kind_to_fc_parameters, **kwargs) diff --git a/tsfresh/feature_extraction/feature_dynamics_tests.py b/tsfresh/feature_extraction/feature_dynamics_tests.py index 894a0d5a9..0d053d027 100644 --- a/tsfresh/feature_extraction/feature_dynamics_tests.py +++ b/tsfresh/feature_extraction/feature_dynamics_tests.py @@ -1,5 +1,5 @@ import pandas as pd -from extraction import extract_features_on_sub_features +from extraction import extract_features, extract_features_on_sub_features from tsfresh.feature_extraction.settings import MinimalFCParameters, EfficientFCParameters # Test is going to go here. Should work with Louis code + Scott code + the code that is integrated into tsfresh @@ -11,25 +11,46 @@ if __name__ == "__main__": # Read in data - ts = pd.read_csv("./feature_extraction/test_data.csv") + ts = pd.read_csv("./test_data.csv") print(ts) - # running on minimal - X = extract_features_on_sub_features(timeseries_container = ts, - sub_feature_split = 1, - sub_default_fc_parameters = MinimalFCParameters(), - default_fc_parameters = MinimalFCParameters(), - column_id = "measurement_id", - column_sort = "t", - column_kind = None, - column_value = None, - show_warnings = True) - print(X) - + # # running on minimal + # X = extract_features(timeseries_container= ts, + # n_jobs = 0, + # default_fc_parameters=MinimalFCParameters(), + # column_id= "measurement_id", + # column_sort = "t", + # show_warnings = True) + + # print(X) + + # X = extract_features_on_sub_features(timeseries_container = ts, + # sub_feature_split = 3, # window size + # n_jobs = 0, + # sub_default_fc_parameters = MinimalFCParameters(), + # default_fc_parameters = MinimalFCParameters(), + # column_id = "measurement_id", + # column_sort = "t", + # column_kind = None, + # column_value = None, + # show_warnings = True) + # print(X) + + # drop feature calculators that are problematic... + + + # # running on efficient + # X = extract_features(timeseries_container= ts, + # n_jobs = 0, + # default_fc_parameters=EfficientFCParameters(), + # column_id= "measurement_id", + # column_sort = "t", + # show_warnings = True) + #print(X) - # Running on efficient X = extract_features_on_sub_features(timeseries_container = ts, - sub_feature_split = 1, + sub_feature_split = 3, + n_jobs = 0, sub_default_fc_parameters = EfficientFCParameters(), default_fc_parameters = EfficientFCParameters(), column_id = "measurement_id", diff --git a/tsfresh/feature_extraction/test_data.csv b/tsfresh/feature_extraction/test_data.csv index a26aaa2c2..801e776dd 100644 --- a/tsfresh/feature_extraction/test_data.csv +++ b/tsfresh/feature_extraction/test_data.csv @@ -1,31 +1,7 @@ t,y,measurement_id -1,1,1 -2,1,1 -3,1,1 -4,1,1 -5,1,1 -6,1,1 -7,1,1 -8,1,1 -9,1,1 -10,1,1 -11,2,1 -12,2,1 -13,2,1 -14,2,1 -15,2,2 -16,2,2 -17,2,2 -18,2,2 -19,2,2 -20,2,2 -21,3,2 -22,3,2 -23,3,2 -24,3,2 -25,3,2 -26,3,2 -27,3,2 -28,3,2 -29,3,2 -30,3,2 +1,334,1 +2,555,1 +3,664,1 +4,345346,1 +5,1356,1 +6,135,1 From 4a0038743761987578d970ef5c144454a29dab63 Mon Sep 17 00:00:00 2001 From: Scott-Simmons Date: Sun, 6 Mar 2022 14:41:31 +1300 Subject: [PATCH 06/16] dropped feature dynamics with NaNs --- tsfresh/feature_extraction/extraction.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tsfresh/feature_extraction/extraction.py b/tsfresh/feature_extraction/extraction.py index cae8b0690..eb5bedcfb 100644 --- a/tsfresh/feature_extraction/extraction.py +++ b/tsfresh/feature_extraction/extraction.py @@ -387,10 +387,10 @@ def extract_features_on_sub_features(timeseries_container, sub_features[column_sort] = sub_features[column_id].apply(lambda x: x[1]) sub_features[column_id] = sub_features[column_id].apply(lambda x: x[0]) - print("Sub features\n{}".format(sub_features)) - X = extract_features(sub_features, column_id=column_id, column_sort=column_sort, column_kind=column_kind, column_value=column_value, default_fc_parameters=default_fc_parameters, kind_to_fc_parameters=kind_to_fc_parameters, **kwargs) + # Drop all feature dynamics that have at least one NaN + X = X.dropna(axis = "columns", how = "any") return X From 10d50bf00a3b5d1123ca3c87ffd8707ec9298836 Mon Sep 17 00:00:00 2001 From: Louis Jarvis Date: Sun, 5 Feb 2023 19:01:11 +1300 Subject: [PATCH 07/16] add test differences within stacked unordered --- .../test_feature_dynamics_utils.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/units/feature_dynamics_extraction/test_feature_dynamics_utils.py b/tests/units/feature_dynamics_extraction/test_feature_dynamics_utils.py index 34f7f0c5b..dc3577a88 100644 --- a/tests/units/feature_dynamics_extraction/test_feature_dynamics_utils.py +++ b/tests/units/feature_dynamics_extraction/test_feature_dynamics_utils.py @@ -647,6 +647,30 @@ def test_differences_within_stacked_dataframe_no_sort(self): stacked_dataframe_timeseries_container, expected_unmodified_data ) + def test_differences_within_stacked_dataframe_unordered(self): + """ Test case where a sort column exists but it is randomised and must be checked""" + stacked_df_timeseries_test_container, (column_id, column_sort, column_kind, column_value) = self.create_simple_test_data_sample_stacked() + stacked_df_timeseries_test_container["sort"] = [15, 14, 3, 8, 1, 5, 7, 0, 4, 12, 13, 9, 6, 2, 17, 16, 11, 10] + + ##TODO a function to check that the sort for stacked columns is suitable, if not throw and error + ##TODO add more arguments + def check_stacked_sort_sort(ts_container, column_id, column_sort): + for id in ts_container[column_id].unique(): + sort_col_vals = ts_container.query("{column_sort}==1")["sort"] + if pd.Series.equals(sort_col_vals.sort(), sort_col_vals): + ##TODO throw a warning + return False + return True + + + engineered_ts_within = diff_within_series( + timeseries_container=stacked_df_timeseries_test_container, + column_sort=column_sort, + column_id=column_id, + column_kind=column_kind, + column_value=column_value, + ) + def test_differences_within_dictionary(self): ( dict_timeseries_container, From a7bc8af410e7874c0462f91dbf5eb3ef8303ba34 Mon Sep 17 00:00:00 2001 From: Louis Jarvis Date: Sun, 5 Feb 2023 19:47:14 +1300 Subject: [PATCH 08/16] rm redundant test differences_bw_series_stacked func --- .../test_feature_dynamics_utils.py | 24 ------------------- 1 file changed, 24 deletions(-) diff --git a/tests/units/feature_dynamics_extraction/test_feature_dynamics_utils.py b/tests/units/feature_dynamics_extraction/test_feature_dynamics_utils.py index 616e91be9..25bceabdd 100644 --- a/tests/units/feature_dynamics_extraction/test_feature_dynamics_utils.py +++ b/tests/units/feature_dynamics_extraction/test_feature_dynamics_utils.py @@ -624,30 +624,6 @@ def test_differences_within_stacked_dataframe_no_sort(self): stacked_dataframe_timeseries_container, expected_unmodified_data ) - def test_differences_within_stacked_dataframe_unordered(self): - """ Test case where a sort column exists but it is randomised and must be checked""" - stacked_df_timeseries_test_container, (column_id, column_sort, column_kind, column_value) = self.create_simple_test_data_sample_stacked() - stacked_df_timeseries_test_container["sort"] = [15, 14, 3, 8, 1, 5, 7, 0, 4, 12, 13, 9, 6, 2, 17, 16, 11, 10] - - ##TODO a function to check that the sort for stacked columns is suitable, if not throw and error - ##TODO add more arguments - def check_stacked_sort_sort(ts_container, column_id, column_sort): - for id in ts_container[column_id].unique(): - sort_col_vals = ts_container.query("{column_sort}==1")["sort"] - if pd.Series.equals(sort_col_vals.sort(), sort_col_vals): - ##TODO throw a warning - return False - return True - - - engineered_ts_within = diff_within_series( - timeseries_container=stacked_df_timeseries_test_container, - column_sort=column_sort, - column_id=column_id, - column_kind=column_kind, - column_value=column_value, - ) - def test_differences_within_dictionary(self): ( dict_timeseries_container, From 22256d54a8b33d3117a6fbdd0a258c76fe1733e0 Mon Sep 17 00:00:00 2001 From: Louis Jarvis Date: Sun, 19 Feb 2023 14:38:37 +1300 Subject: [PATCH 09/16] temp dask changes remove mdf dependency --- tests/fixtures.py | 185 ++++++++++++++++++ .../test_feature_dynamics_data.py | 9 +- .../feature_dynamics_utils.py | 2 +- 3 files changed, 194 insertions(+), 2 deletions(-) diff --git a/tests/fixtures.py b/tests/fixtures.py index 6cd35ffbb..c9de81d1d 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -8,6 +8,7 @@ import numpy as np import pandas as pd +import dask.dataframe as ddf @contextmanager @@ -1272,3 +1273,187 @@ def create_split_up_test_data_expected_tuples_wide(self): ), ] return (wide_test_data_expected_chunked_up_tuples, window_length) + + +class DaskDataTestCase: + def create_simple_test_sample(self): + + #identical to the above + cid = np.repeat([10, 500], 40) + ckind = np.repeat(["a", "b", "a", "b"], 20) + csort = [ + 30, + 53, + 26, + 35, + 42, + 25, + 17, + 67, + 20, + 68, + 46, + 12, + 0, + 74, + 66, + 31, + 32, + 2, + 55, + 59, + 56, + 60, + 34, + 69, + 47, + 15, + 49, + 8, + 50, + 73, + 23, + 62, + 24, + 33, + 22, + 70, + 3, + 38, + 28, + 75, + 39, + 36, + 64, + 13, + 72, + 52, + 40, + 16, + 58, + 29, + 63, + 79, + 61, + 78, + 1, + 10, + 4, + 6, + 65, + 44, + 54, + 48, + 11, + 14, + 19, + 43, + 76, + 7, + 51, + 9, + 27, + 21, + 5, + 71, + 57, + 77, + 41, + 18, + 45, + 37, + ] + cval = [ + 11, + 9, + 67, + 45, + 30, + 58, + 62, + 19, + 56, + 29, + 0, + 27, + 36, + 43, + 33, + 2, + 24, + 71, + 41, + 28, + 50, + 40, + 39, + 7, + 53, + 23, + 16, + 37, + 66, + 38, + 6, + 47, + 3, + 61, + 44, + 42, + 78, + 31, + 21, + 55, + 15, + 35, + 25, + 32, + 69, + 65, + 70, + 64, + 51, + 46, + 5, + 77, + 26, + 73, + 76, + 75, + 72, + 74, + 10, + 57, + 4, + 14, + 68, + 22, + 18, + 52, + 54, + 60, + 79, + 12, + 49, + 63, + 8, + 59, + 1, + 13, + 20, + 17, + 48, + 34, + ] + + df = ddf.from_dict({"id": cid, "kind": ckind, "sort": csort, "val": cval}, npartitions=10) + df = ddf.set_index("id", drop=False) + #df.index.name = None + return ddf + + def create_test_dask_df_long(self): + # cid = np.repeat(["A", "B"], repeats = [4,4]) + # ckind = np.repeat(["A", "B"]) + # csort = + # cval = [] + pass \ No newline at end of file diff --git a/tests/units/feature_dynamics_extraction/test_feature_dynamics_data.py b/tests/units/feature_dynamics_extraction/test_feature_dynamics_data.py index 9f3ec3570..9a3703953 100644 --- a/tests/units/feature_dynamics_extraction/test_feature_dynamics_data.py +++ b/tests/units/feature_dynamics_extraction/test_feature_dynamics_data.py @@ -12,6 +12,7 @@ import pandas as pd import dask.dataframe as dd from tests.units.feature_extraction.test_data import DataAdapterTestCase +from tests.fixtures import DaskDataTestCase class IterableSplitTsDataTestCase( @@ -39,6 +40,9 @@ def test_iter_on_long_data(self): underlying_data_converted_to_tsdata, expected_non_windowed_tuples ) + ##TODO test the dask df test case + df_stacked = self.create + # Test equality of each chunk... self.assert_tsdata(split_ts_data, expected_windowed_tuples) @@ -211,7 +215,8 @@ def test_f(chunk): ) def test_apply_on_long_data_dask(self): - pass + df = DaskDataTestCase().create_simple_test_sample() + return True def test_iter_on_long_data_no_value_column_dask(self): pass @@ -236,3 +241,5 @@ def test_zero_split_size_dask(self): def test_fractional_split_size_dask(self): pass + +ApplyableSplitTsDataTestCase().test_apply_on_long_data_dask() \ No newline at end of file diff --git a/tsfresh/feature_dynamics_extraction/feature_dynamics_utils.py b/tsfresh/feature_dynamics_extraction/feature_dynamics_utils.py index 78e4e088b..0f781b220 100644 --- a/tsfresh/feature_dynamics_extraction/feature_dynamics_utils.py +++ b/tsfresh/feature_dynamics_extraction/feature_dynamics_utils.py @@ -4,7 +4,7 @@ import pandas as pd from pandas.api.types import is_numeric_dtype from typing import List -from md2pdf.core import md2pdf +#from md2pdf.core import md2pdf from tsfresh.feature_extraction import feature_calculators from tsfresh.utilities.string_manipulation import get_config_from_string from tsfresh.feature_extraction.data import ( From ccaa8cc5bc713c612c62356d79d5adb24d31dac0 Mon Sep 17 00:00:00 2001 From: Louis Jarvis Date: Sun, 19 Feb 2023 20:00:38 +1300 Subject: [PATCH 10/16] pycache to gitingnore --- .gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 7266dfdf9..ddf94e8d7 100644 --- a/.gitignore +++ b/.gitignore @@ -63,4 +63,7 @@ feature_dynamics_interpretation.pdf feature_dynamics_interpretation.md # dask dask-worker-space -dask-worker-space/ \ No newline at end of file +dask-worker-space/ + +tests/*/__pycache__/ +tests/*/*/__pycache__/ \ No newline at end of file From 7c867aa5195dc1a75dad42f856d6929c37b728fa Mon Sep 17 00:00:00 2001 From: Louis Jarvis Date: Sun, 26 Feb 2023 16:50:19 +1300 Subject: [PATCH 11/16] test feature_dynamics_extraction on wide format case --- .../test_feature_dynamics_extraction.py | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tests/units/feature_dynamics_extraction/test_feature_dynamics_extraction.py b/tests/units/feature_dynamics_extraction/test_feature_dynamics_extraction.py index d0cb807c9..5454a0e5e 100644 --- a/tests/units/feature_dynamics_extraction/test_feature_dynamics_extraction.py +++ b/tests/units/feature_dynamics_extraction/test_feature_dynamics_extraction.py @@ -515,6 +515,33 @@ def test_extract_feature_dynamics_alphabetically_sorted(self): self.assertEqual(col_name_chunks, list(sorted(col_name_chunks))) + def test_extract_feature_dynamics_wide(self): + ts_wide_df,_,_ = self.create_simple_test_data_sample_wide() + + fc_test_params = {"minimum": None} + window_length = 3 + + extracted_feature_dynamics = extract_feature_dynamics( + timeseries_container=ts_wide_df, + column_id="id", + column_sort="sort", + column_kind=None, #"kind", None since wide format + column_value=None, #"val", + n_jobs= 1,#self.n_jobs, + feature_timeseries_fc_parameters={window_length: fc_test_params}, + feature_dynamics_fc_parameters={window_length: fc_test_params}, + ) + + expected_ans = pd.DataFrame( + data={ + 'y1||minimum@window_3__minimum': {1: 1.0, 2: -34.0}, + 'y2||minimum@window_3__minimum': {1: -10.0, 2: 3.0}, + 'y3||minimum@window_3__minimum': {1: 4.0, 2: 1.0} + } + ) + + pd.testing.assert_frame_equal(extract_feature_dynamics, expected_ans) + class ParallelDynamicsExtractionTestCase(DataTestCase): def setUp(self): From 1be4f16fdd84f42fe84605b674e7b00de2a406cc Mon Sep 17 00:00:00 2001 From: Louis Jarvis Date: Sun, 26 Feb 2023 18:17:15 +1300 Subject: [PATCH 12/16] test extract_feature_dynamics for long --- .../test_feature_dynamics_extraction.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/units/feature_dynamics_extraction/test_feature_dynamics_extraction.py b/tests/units/feature_dynamics_extraction/test_feature_dynamics_extraction.py index 5454a0e5e..76509662d 100644 --- a/tests/units/feature_dynamics_extraction/test_feature_dynamics_extraction.py +++ b/tests/units/feature_dynamics_extraction/test_feature_dynamics_extraction.py @@ -542,6 +542,23 @@ def test_extract_feature_dynamics_wide(self): pd.testing.assert_frame_equal(extract_feature_dynamics, expected_ans) + def test_extract_feature_dynamics_long(self): + + ts_long_df,_,_ = self.create_simple_test_data_sample_stacked() + + fc_test_params = {"minimum": None} + window_length = 3 + + extracted_feature_dynamics = extract_feature_dynamics( + timeseries_container=ts_long_df, + column_id="id", + column_sort="sort", + column_kind="kind", #None since wide format + column_value="val", + n_jobs= 1,#self.n_jobs, + feature_timeseries_fc_parameters={window_length: fc_test_params}, + feature_dynamics_fc_parameters={window_length: fc_test_params}, + ) class ParallelDynamicsExtractionTestCase(DataTestCase): def setUp(self): From 9612b395af936797b6e866792cb66fccb91f80a4 Mon Sep 17 00:00:00 2001 From: Louis Jarvis Date: Sun, 26 Feb 2023 18:22:26 +1300 Subject: [PATCH 13/16] test extract_feature_dynamics for list of dicts --- .../test_feature_dynamics_extraction.py | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/tests/units/feature_dynamics_extraction/test_feature_dynamics_extraction.py b/tests/units/feature_dynamics_extraction/test_feature_dynamics_extraction.py index 76509662d..389de5093 100644 --- a/tests/units/feature_dynamics_extraction/test_feature_dynamics_extraction.py +++ b/tests/units/feature_dynamics_extraction/test_feature_dynamics_extraction.py @@ -560,6 +560,43 @@ def test_extract_feature_dynamics_long(self): feature_dynamics_fc_parameters={window_length: fc_test_params}, ) + expected_ans = pd.DataFrame( + data={ + 'y1||minimum@window_3__minimum': {1: 1.0, 2: -34.0}, + 'y2||minimum@window_3__minimum': {1: -10.0, 2: 3.0}, + 'y3||minimum@window_3__minimum': {1: 4.0, 2: 1.0} + } + ) + + pd.testing.assert_frame_equal(extract_feature_dynamics, expected_ans) + + def test_extract_feature_dynamics_dict(self): + + ts_list_of_dicts = self.create_simple_test_data_sample_dict() + fc_test_params = {"minimum": None} + window_length = 3 + + extracted_feature_dynamics = extract_feature_dynamics( + timeseries_container=ts_long_df, + column_id="id", + column_sort="sort", + column_kind="kind", #None since wide format + column_value="val", + n_jobs= 1,#self.n_jobs, + feature_timeseries_fc_parameters={window_length: fc_test_params}, + feature_dynamics_fc_parameters={window_length: fc_test_params}, + ) + + expected_ans = pd.DataFrame( + data={ + 'y1||minimum@window_3__minimum': {1: 1.0, 2: -34.0}, + 'y2||minimum@window_3__minimum': {1: -10.0, 2: 3.0}, + 'y3||minimum@window_3__minimum': {1: 4.0, 2: 1.0} + } + ) + + pd.testing.assert_frame_equal(extract_feature_dynamics, expected_ans) + class ParallelDynamicsExtractionTestCase(DataTestCase): def setUp(self): self.n_jobs = 2 From 8d63fa95faac4cb03aea3940bb7999a94ebeed05 Mon Sep 17 00:00:00 2001 From: Louis Jarvis Date: Sun, 26 Feb 2023 18:28:01 +1300 Subject: [PATCH 14/16] fix incorrect var name --- .../test_feature_dynamics_extraction.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/units/feature_dynamics_extraction/test_feature_dynamics_extraction.py b/tests/units/feature_dynamics_extraction/test_feature_dynamics_extraction.py index 389de5093..de9cf5365 100644 --- a/tests/units/feature_dynamics_extraction/test_feature_dynamics_extraction.py +++ b/tests/units/feature_dynamics_extraction/test_feature_dynamics_extraction.py @@ -540,7 +540,7 @@ def test_extract_feature_dynamics_wide(self): } ) - pd.testing.assert_frame_equal(extract_feature_dynamics, expected_ans) + pd.testing.assert_frame_equal(extracted_feature_dynamics, expected_ans) def test_extract_feature_dynamics_long(self): @@ -568,7 +568,7 @@ def test_extract_feature_dynamics_long(self): } ) - pd.testing.assert_frame_equal(extract_feature_dynamics, expected_ans) + pd.testing.assert_frame_equal(extracted_feature_dynamics, expected_ans) def test_extract_feature_dynamics_dict(self): @@ -577,10 +577,10 @@ def test_extract_feature_dynamics_dict(self): window_length = 3 extracted_feature_dynamics = extract_feature_dynamics( - timeseries_container=ts_long_df, + timeseries_container=ts_list_of_dicts, column_id="id", column_sort="sort", - column_kind="kind", #None since wide format + column_kind=None, #None since wide format column_value="val", n_jobs= 1,#self.n_jobs, feature_timeseries_fc_parameters={window_length: fc_test_params}, @@ -595,7 +595,7 @@ def test_extract_feature_dynamics_dict(self): } ) - pd.testing.assert_frame_equal(extract_feature_dynamics, expected_ans) + pd.testing.assert_frame_equal(extracted_feature_dynamics, expected_ans) class ParallelDynamicsExtractionTestCase(DataTestCase): def setUp(self): From 9a739a4911ba021716a81c0be32b51948df02f73 Mon Sep 17 00:00:00 2001 From: Louis Jarvis Date: Sun, 26 Feb 2023 19:09:58 +1300 Subject: [PATCH 15/16] fix files overwritten by merge --- tests/fixtures.py | 2 - .../test_feature_dynamics_data.py | 11 +--- .../feature_dynamics_tests.py | 62 ------------------- 3 files changed, 2 insertions(+), 73 deletions(-) delete mode 100644 tsfresh/feature_extraction/feature_dynamics_tests.py diff --git a/tests/fixtures.py b/tests/fixtures.py index c9de81d1d..9c5887c31 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -8,7 +8,6 @@ import numpy as np import pandas as pd -import dask.dataframe as ddf @contextmanager @@ -1275,7 +1274,6 @@ def create_split_up_test_data_expected_tuples_wide(self): return (wide_test_data_expected_chunked_up_tuples, window_length) -class DaskDataTestCase: def create_simple_test_sample(self): #identical to the above diff --git a/tests/units/feature_dynamics_extraction/test_feature_dynamics_data.py b/tests/units/feature_dynamics_extraction/test_feature_dynamics_data.py index 9a3703953..fc31a60db 100644 --- a/tests/units/feature_dynamics_extraction/test_feature_dynamics_data.py +++ b/tests/units/feature_dynamics_extraction/test_feature_dynamics_data.py @@ -12,7 +12,6 @@ import pandas as pd import dask.dataframe as dd from tests.units.feature_extraction.test_data import DataAdapterTestCase -from tests.fixtures import DaskDataTestCase class IterableSplitTsDataTestCase( @@ -40,9 +39,6 @@ def test_iter_on_long_data(self): underlying_data_converted_to_tsdata, expected_non_windowed_tuples ) - ##TODO test the dask df test case - df_stacked = self.create - # Test equality of each chunk... self.assert_tsdata(split_ts_data, expected_windowed_tuples) @@ -215,8 +211,7 @@ def test_f(chunk): ) def test_apply_on_long_data_dask(self): - df = DaskDataTestCase().create_simple_test_sample() - return True + pass def test_iter_on_long_data_no_value_column_dask(self): pass @@ -240,6 +235,4 @@ def test_zero_split_size_dask(self): pass def test_fractional_split_size_dask(self): - pass - -ApplyableSplitTsDataTestCase().test_apply_on_long_data_dask() \ No newline at end of file + pass \ No newline at end of file diff --git a/tsfresh/feature_extraction/feature_dynamics_tests.py b/tsfresh/feature_extraction/feature_dynamics_tests.py deleted file mode 100644 index 0d053d027..000000000 --- a/tsfresh/feature_extraction/feature_dynamics_tests.py +++ /dev/null @@ -1,62 +0,0 @@ -import pandas as pd -from extraction import extract_features, extract_features_on_sub_features -from tsfresh.feature_extraction.settings import MinimalFCParameters, EfficientFCParameters -# Test is going to go here. Should work with Louis code + Scott code + the code that is integrated into tsfresh - - - - - - -if __name__ == "__main__": - - # Read in data - ts = pd.read_csv("./test_data.csv") - print(ts) - - # # running on minimal - # X = extract_features(timeseries_container= ts, - # n_jobs = 0, - # default_fc_parameters=MinimalFCParameters(), - # column_id= "measurement_id", - # column_sort = "t", - # show_warnings = True) - - # print(X) - - # X = extract_features_on_sub_features(timeseries_container = ts, - # sub_feature_split = 3, # window size - # n_jobs = 0, - # sub_default_fc_parameters = MinimalFCParameters(), - # default_fc_parameters = MinimalFCParameters(), - # column_id = "measurement_id", - # column_sort = "t", - # column_kind = None, - # column_value = None, - # show_warnings = True) - # print(X) - - # drop feature calculators that are problematic... - - - # # running on efficient - # X = extract_features(timeseries_container= ts, - # n_jobs = 0, - # default_fc_parameters=EfficientFCParameters(), - # column_id= "measurement_id", - # column_sort = "t", - # show_warnings = True) - #print(X) - - X = extract_features_on_sub_features(timeseries_container = ts, - sub_feature_split = 3, - n_jobs = 0, - sub_default_fc_parameters = EfficientFCParameters(), - default_fc_parameters = EfficientFCParameters(), - column_id = "measurement_id", - column_sort = "t", - column_kind = None, - column_value = None, - show_warnings = True) - - print(X) \ No newline at end of file From ba876eec27d6bc22de070f51063130a97886a06c Mon Sep 17 00:00:00 2001 From: Louis Jarvis Date: Sun, 26 Feb 2023 19:14:24 +1300 Subject: [PATCH 16/16] fix files overwritten by merge --- tests/fixtures.py | 185 +--------------------------------------------- 1 file changed, 1 insertion(+), 184 deletions(-) diff --git a/tests/fixtures.py b/tests/fixtures.py index 9c5887c31..7fe306f3b 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -1271,187 +1271,4 @@ def create_split_up_test_data_expected_tuples_wide(self): ), ), ] - return (wide_test_data_expected_chunked_up_tuples, window_length) - - - def create_simple_test_sample(self): - - #identical to the above - cid = np.repeat([10, 500], 40) - ckind = np.repeat(["a", "b", "a", "b"], 20) - csort = [ - 30, - 53, - 26, - 35, - 42, - 25, - 17, - 67, - 20, - 68, - 46, - 12, - 0, - 74, - 66, - 31, - 32, - 2, - 55, - 59, - 56, - 60, - 34, - 69, - 47, - 15, - 49, - 8, - 50, - 73, - 23, - 62, - 24, - 33, - 22, - 70, - 3, - 38, - 28, - 75, - 39, - 36, - 64, - 13, - 72, - 52, - 40, - 16, - 58, - 29, - 63, - 79, - 61, - 78, - 1, - 10, - 4, - 6, - 65, - 44, - 54, - 48, - 11, - 14, - 19, - 43, - 76, - 7, - 51, - 9, - 27, - 21, - 5, - 71, - 57, - 77, - 41, - 18, - 45, - 37, - ] - cval = [ - 11, - 9, - 67, - 45, - 30, - 58, - 62, - 19, - 56, - 29, - 0, - 27, - 36, - 43, - 33, - 2, - 24, - 71, - 41, - 28, - 50, - 40, - 39, - 7, - 53, - 23, - 16, - 37, - 66, - 38, - 6, - 47, - 3, - 61, - 44, - 42, - 78, - 31, - 21, - 55, - 15, - 35, - 25, - 32, - 69, - 65, - 70, - 64, - 51, - 46, - 5, - 77, - 26, - 73, - 76, - 75, - 72, - 74, - 10, - 57, - 4, - 14, - 68, - 22, - 18, - 52, - 54, - 60, - 79, - 12, - 49, - 63, - 8, - 59, - 1, - 13, - 20, - 17, - 48, - 34, - ] - - df = ddf.from_dict({"id": cid, "kind": ckind, "sort": csort, "val": cval}, npartitions=10) - df = ddf.set_index("id", drop=False) - #df.index.name = None - return ddf - - def create_test_dask_df_long(self): - # cid = np.repeat(["A", "B"], repeats = [4,4]) - # ckind = np.repeat(["A", "B"]) - # csort = - # cval = [] - pass \ No newline at end of file + return (wide_test_data_expected_chunked_up_tuples, window_length) \ No newline at end of file