From 6f7ecd9cd131b1078ef4122ff0bfd03fa028d863 Mon Sep 17 00:00:00 2001
From: Scott_Simmons23 <ssim323@aucklanduni.ac.nz>
Date: Wed, 23 Feb 2022 18:16:16 +1300
Subject: [PATCH 01/16] Add features-on-features prototype, test script and test data

---
 .../scott_features_on_features_tests.py       |  17 ++
 tsfresh/feature_extraction/scotts_code.py     | 199 ++++++++++++++++++
 tsfresh/feature_extraction/test_data.csv      |  31 +++
 3 files changed, 247 insertions(+)
 create mode 100644 tsfresh/feature_extraction/scott_features_on_features_tests.py
 create mode 100644 tsfresh/feature_extraction/scotts_code.py
 create mode 100644 tsfresh/feature_extraction/test_data.csv

diff --git a/tsfresh/feature_extraction/scott_features_on_features_tests.py b/tsfresh/feature_extraction/scott_features_on_features_tests.py
new file mode 100644
index 000000000..1f9871a7d
--- /dev/null
+++ b/tsfresh/feature_extraction/scott_features_on_features_tests.py
@@ -0,0 +1,17 @@
+import pandas as pd
+from extraction import extract_features_on_sub_features
+from tsfresh.feature_extraction.settings import MinimalFCParameters
+# Test is going to go here. Should work with Louis code + Scott code + the code that is integrated into tsfresh
+
+# Read in data
+ts = pd.read_csv("test_data.csv")
+
+
+X = extract_features_on_sub_features(timeseries_container = ts,
+                                     sub_feature_split = 2,
+                                     sub_default_fc_parameters = MinimalFCParameters,
+                                     default_fc_parameters = MinimalFCParameters,
+                                     column_id = "measurement_id",
+                                     column_sort = "t",
+                                     column_kind = None,
+                                     column_value = None)
diff --git a/tsfresh/feature_extraction/scotts_code.py b/tsfresh/feature_extraction/scotts_code.py
new file mode 100644
index 000000000..3c0dbc7f2
--- /dev/null
+++ b/tsfresh/feature_extraction/scotts_code.py
@@ -0,0 +1,199 @@
+# a file that has the algorithm used in my project...
+
+# main features on features framework used in VSB project
+def features_on_features_vsb(ts,
+                             first_fc_params,
+                             second_fc_params,
+                             fc_params_is_kind,
+                             replacement_token):
+    '''
+    main algorithm that uses tsfresh which computes features on features (feature dynamics) for the VSB data
+    NOTE: The DATA format that this function supports is input option 1 for feature extraction https://tsfresh.readthedocs.io/en/latest/text/data_formats.html
+
+        params:
+            ts (pd.DataFrame): The VSB measurements that will be processed.
+            first_fc_params, second_fc_params (dictionary): feature sets for (1) the extraction of feature time series and (2) the extraction of feature dynamics
+            fc_params_is_kind (bool): if the feature dictionaries maps each separate VSB signal (ts kind) to a different feature set then this value is True, otherwise false.
+            replacement_token (str): token that replaces double unscore in feature naming convention. This adjustment is required for featue dynamics extraction
+        returns:
+            X1 (pd.DataFrame): Feature dynamics matrix
+            y1 (pd.Series): Response vector.
+    '''
+
+    # map the reponse variable to each row in the features on features matrix if the timeseries is test data otherwise do nothing...
+    try:
+        y = ts.groupby("measurement_id").last()["response"]
+    except:
+        y = None
+
+
+    # assign unique pairs of (mes_id, window_id) to each element
+    ts["column_id"] = ts["measurement_id"].astype(str) + ", " + ts["window_id"].astype(str)
+
+
+    # drop the columns which are not relevant to feature extraction
+    try:
+        ts = ts.drop(columns = ["measurement_id", "window_id", "response"]) # for labelled data
+    except:
+        ts = ts.drop(columns = ["measurement_id", "window_id"]) # non labelled data
+
+    print("TS INPUT {}".format(ts))
+    # first round of feature extraction FEATURE TIME SERIES
+    X0 = (extract_features(ts, column_id = "column_id",column_sort = "time_index", kind_to_fc_parameters = first_fc_params, disable_progressbar = True) if fc_params_is_kind
+          else extract_features(ts, column_id = "column_id",column_sort = "time_index", default_fc_parameters = first_fc_params, disable_progressbar = True))
+
+    print("FIRST {}".format(X0.shape))
+
+    # drop any features that produce any NaNs/NAs
+    if X0.isnull().values.any():
+        # store dropped features
+        dropped_feature_names = [col_name for col_name in X0.columns[X0.isna().any()].tolist()]
+        # store the feature calculators that fail in a file that is constantly updated.
+        with open("dropped_feature_names.txt", "a") as f:
+            for feature in dropped_feature_names: f.write(feature[feature.index("__") + 2:] + "\n") # 2 is a magic number. It works. But this should be refactored..
+
+        print("found " + str(len(X0.columns[X0.isna().any()].tolist())) + " features from the set of " + str(len(X0.columns)) + " features which should be dropped before being input into second feat extraction")
+        X0 = X0.dropna(axis = "columns")
+
+
+    # tsfresh cant handle double underscores twice so change this in preparation for the second feature extraction
+    X0.columns = [str(col_name).replace("__",replacement_token) for col_name in X0.columns]
+
+    # assign windows as the original measurment ID... i.e. extracting "mes_id" from (mes_id, window_id)
+    X0["column_id"] = X0.index.to_series().str.split(", ", expand = True).iloc[:,0]
+
+    print("FEATURE TS INPUT {}".format(X0))
+
+    # second round of feature extraction FEATURE DYNAMICS
+    X1 = (extract_features(X0, column_id = "column_id", kind_to_fc_parameters = second_fc_params, disable_progressbar = True) if fc_params_is_kind
+          else  extract_features(X0, column_id = "column_id", default_fc_parameters = second_fc_params, disable_progressbar = True))
+
+    X1.index.name = "measurement_id"
+
+    # drop any features which are null or na
+    if X1.isnull().values.any():
+        print("found " + str(len(X1.columns[X1.isna().any()].tolist())) + " features from the set of " + str(len(X1.columns)) + " features which should be dropped before being considered as the final output...")
+        X1 = X1.dropna(axis = "columns")
+
+
+    # sort column names
+    X1.sort_index(axis="columns", inplace=True)
+
+    print("X1 output {}, {}".format(X1.shape, X1))
+
+    # returning the feature matrix, the response variable corresponding to each feature matrix window, and optionally the dropped colnames
+    return (X1, y)
+
+
+
+
+
+### The code written into tsfresh
+
+class IterableTsData(Iterable[Timeseries], Sized, TsData):
+    """
+    Special class of TsData, which can be partitioned.
+    Derived classes should implement __iter__ and __len__.
+    """
+    def pivot(self, results):
+        """
+        Helper function to turn an iterable of tuples with three entries into a dataframe.
+
+        The input ``list_of_tuples`` needs to be an iterable with tuples containing three
+        entries: (a, b, c).
+        Out of this, a pandas dataframe will be created with all a's as index,
+        all b's as columns and all c's as values.
+
+        It basically does a pd.pivot(first entry, second entry, third entry),
+        but optimized for non-pandas input (= python list of tuples).
+
+        This function is called in the end of the extract_features call.
+        """
+        return_df_dict = defaultdict(dict)
+        for chunk_id, variable, value in results:
+            # we turn it into a nested mapping `column -> index -> value`
+            return_df_dict[variable][chunk_id] = value
+
+        # the mapping column -> {index -> value}
+        # is now a dict of dicts. The pandas dataframe
+        # constructor will peel this off:
+        # first, the keys of the outer dict (the column)
+        # will turn into a column header and the rest into a column
+        # the rest is {index -> value} which will be turned into a
+        # column with index.
+        # All index will be aligned.
+        return_df = pd.DataFrame(return_df_dict, dtype=float)
+
+        # copy the type of the index
+        return_df.index = return_df.index.astype(self.df_id_type)
+
+        # Sort by index to be backward compatible
+        return_df = return_df.sort_index()
+
+        return return_df
+
+    def __len__(self):
+        """Override in a subclass"""
+        raise NotImplementedError
+
+    def __iter__(self):
+        """Override in a subclass"""
+        raise NotImplementedError
+
+
+class ApplyableTsData(TsData):
+    """
+    TsData base class to use, if an iterable ts data can not be used.
+    Its only interface is an apply function, which should be applied
+    to each of the chunks of the data. How this is done
+    depends on the implementation.
+    """
+    def apply(self, f, **kwargs):
+        raise NotImplementedError
+
+def extract_features_on_sub_features(timeseries_container,
+                                     sub_feature_split,
+                                     sub_default_fc_parameters=None, sub_kind_to_fc_parameters=None,
+                                     default_fc_parameters=None, kind_to_fc_parameters=None,
+                                     column_id=None, column_sort=None, column_kind=None, column_value=None,
+                                     **kwargs):
+    ts_data = to_tsdata(timeseries_container, column_id=column_id, column_sort=column_sort, column_kind=column_kind, column_value=column_value)
+    if isinstance(ts_data, Iterable):
+        split_ts_data = IterableSplitTsData(ts_data, sub_feature_split)
+    else:
+        split_ts_data = ApplyableSplitTsData(ts_data, sub_feature_split)
+
+    sub_features = extract_features(split_ts_data, default_fc_parameters=sub_default_fc_parameters,
+                                    kind_to_fc_parameters=sub_kind_to_fc_parameters, **kwargs, pivot=False)
+
+    column_kind = column_kind or "variable"
+    column_id = column_id or "id"
+    column_sort = column_sort or "sort"
+    column_value = column_value or "value"
+
+    # The feature names include many "_", which will confuse tsfresh where the sub feature name ends
+    # and where the real feature name starts. We just remove them.
+    # Also, we split up the index into the id and the sort
+    # We need to do this separately for dask dataframes,
+    # as the return type is not a list, but already a dataframe
+    if isinstance(sub_features, dd.DataFrame):
+        sub_features = sub_features.reset_index(drop=True)
+
+        sub_features[column_kind] = sub_features[column_kind].apply(lambda col: col.replace("_", ""), meta=(column_kind, object))
+
+        sub_features[column_sort] = sub_features[column_id].apply(lambda x: x[1], meta=(column_id, "int64"))
+        sub_features[column_id] = sub_features[column_id].apply(lambda x: x[0], meta=(column_id, ts_data.df_id_type))
+
+    else:
+        sub_features = pd.DataFrame(sub_features, columns=[column_id, column_kind, column_value])
+
+        sub_features[column_kind] = sub_features[column_kind].apply(lambda col: col.replace("_", ""))
+
+        sub_features[column_sort] = sub_features[column_id].apply(lambda x: x[1])
+        sub_features[column_id] = sub_features[column_id].apply(lambda x: x[0])
+
+    X = extract_features(sub_features, column_id=column_id, column_sort=column_sort, column_kind=column_kind, column_value=column_value,
+                         default_fc_parameters=default_fc_parameters, kind_to_fc_parameters=kind_to_fc_parameters,
+                         **kwargs)
+
+    return X
diff --git a/tsfresh/feature_extraction/test_data.csv b/tsfresh/feature_extraction/test_data.csv
new file mode 100644
index 000000000..8a036e22c
--- /dev/null
+++ b/tsfresh/feature_extraction/test_data.csv
@@ -0,0 +1,31 @@
+t,y,window_id,measurement_id
+1,1,1,1
+2,1,1,1
+3,1,1,1
+4,1,1,1
+5,1,1,1
+6,1,1,1
+7,1,1,1
+8,1,1,1
+9,1,1,1
+10,1,1,1
+11,2,1,1
+12,2,1,1
+13,2,1,1
+14,2,1,1
+15,2,1,1
+16,2,1,1
+17,2,1,1
+18,2,1,1
+19,2,1,1
+20,2,1,1
+21,3,2,1
+22,3,2,1
+23,3,2,1
+24,3,2,1
+25,3,2,1
+26,3,2,1
+27,3,2,1
+28,3,2,1
+29,3,2,1
+30,3,2,1

From 802531b536e142b4b7c428c0a716428e6d03eb9d Mon Sep 17 00:00:00 2001
From: Scott-Simmons <ssim323@aucklanduni.ac.nz>
Date: Thu, 24 Feb 2022 15:07:58 +1300
Subject: [PATCH 02/16] Instantiate FC parameters in test script; update imports and test data

---
 tsfresh/feature_extraction/extraction.py      |  9 +--
 .../scott_features_on_features_tests.py       |  9 ++-
 tsfresh/feature_extraction/test_data.csv      | 62 +++++++++----------
 3 files changed, 43 insertions(+), 37 deletions(-)

diff --git a/tsfresh/feature_extraction/extraction.py b/tsfresh/feature_extraction/extraction.py
index 450a91c8c..95cd521f3 100644
--- a/tsfresh/feature_extraction/extraction.py
+++ b/tsfresh/feature_extraction/extraction.py
@@ -14,8 +14,9 @@
 
 from tsfresh import defaults
 from tsfresh.feature_extraction import feature_calculators
-from tsfresh.feature_extraction.data import to_tsdata, IterableSplitTsData, ApplyableSplitTsData
 from tsfresh.feature_extraction.settings import ComprehensiveFCParameters
+from data import to_tsdata, IterableSplitTsData, ApplyableSplitTsData
+
 from tsfresh.utilities import profiling
 from tsfresh.utilities.distribution import MapDistributor, MultiprocessingDistributor, \
     DistributorBaseClass, ApplyDistributor
@@ -148,7 +149,6 @@ def extract_features(timeseries_container, default_fc_parameters=None,
             warnings.simplefilter("ignore")
         else:
             warnings.simplefilter("default")
-
         result = _do_extraction(df=timeseries_container,
                                 column_id=column_id, column_value=column_value,
                                 column_kind=column_kind,
@@ -290,6 +290,7 @@ def _do_extraction_on_chunk(chunk, default_fc_parameters, kind_to_fc_parameters)
         fc_parameters = default_fc_parameters
 
     def _f():
+
         for function_name, parameter_list in fc_parameters.items():
             func = getattr(feature_calculators, function_name)
 
@@ -325,7 +326,6 @@ def _f():
                 if key:
                     feature_name += "__" + str(key)
                 yield (sample_id, feature_name, item)
-
     return list(_f())
 
 
@@ -344,6 +344,7 @@ def extract_features_on_sub_features(timeseries_container,
     sub_features = extract_features(split_ts_data, default_fc_parameters=sub_default_fc_parameters,
                                     kind_to_fc_parameters=sub_kind_to_fc_parameters, **kwargs, pivot=False)
 
+
     column_kind = column_kind or "variable"
     column_id = column_id or "id"
     column_sort = column_sort or "sort"
@@ -374,4 +375,4 @@ def extract_features_on_sub_features(timeseries_container,
                          default_fc_parameters=default_fc_parameters, kind_to_fc_parameters=kind_to_fc_parameters,
                          **kwargs)
 
-    return X
\ No newline at end of file
+    return X
diff --git a/tsfresh/feature_extraction/scott_features_on_features_tests.py b/tsfresh/feature_extraction/scott_features_on_features_tests.py
index 1f9871a7d..d13dfae6f 100644
--- a/tsfresh/feature_extraction/scott_features_on_features_tests.py
+++ b/tsfresh/feature_extraction/scott_features_on_features_tests.py
@@ -6,12 +6,17 @@
 # Read in data
 ts = pd.read_csv("test_data.csv")
 
+print("Minimal: {}".format(MinimalFCParameters()))
 
 X = extract_features_on_sub_features(timeseries_container = ts,
                                      sub_feature_split = 2,
-                                     sub_default_fc_parameters = MinimalFCParameters,
-                                     default_fc_parameters = MinimalFCParameters,
+                                     sub_default_fc_parameters = MinimalFCParameters(),
+                                     default_fc_parameters = MinimalFCParameters(),
                                      column_id = "measurement_id",
                                      column_sort = "t",
                                      column_kind = None,
                                      column_value = None)
+
+print(X)
+#for col in X.columns:
+#    print(col)
diff --git a/tsfresh/feature_extraction/test_data.csv b/tsfresh/feature_extraction/test_data.csv
index 8a036e22c..a26aaa2c2 100644
--- a/tsfresh/feature_extraction/test_data.csv
+++ b/tsfresh/feature_extraction/test_data.csv
@@ -1,31 +1,31 @@
-t,y,window_id,measurement_id
-1,1,1,1
-2,1,1,1
-3,1,1,1
-4,1,1,1
-5,1,1,1
-6,1,1,1
-7,1,1,1
-8,1,1,1
-9,1,1,1
-10,1,1,1
-11,2,1,1
-12,2,1,1
-13,2,1,1
-14,2,1,1
-15,2,1,1
-16,2,1,1
-17,2,1,1
-18,2,1,1
-19,2,1,1
-20,2,1,1
-21,3,2,1
-22,3,2,1
-23,3,2,1
-24,3,2,1
-25,3,2,1
-26,3,2,1
-27,3,2,1
-28,3,2,1
-29,3,2,1
-30,3,2,1
+t,y,measurement_id
+1,1,1
+2,1,1
+3,1,1
+4,1,1
+5,1,1
+6,1,1
+7,1,1
+8,1,1
+9,1,1
+10,1,1
+11,2,1
+12,2,1
+13,2,1
+14,2,1
+15,2,2
+16,2,2
+17,2,2
+18,2,2
+19,2,2
+20,2,2
+21,3,2
+22,3,2
+23,3,2
+24,3,2
+25,3,2
+26,3,2
+27,3,2
+28,3,2
+29,3,2
+30,3,2

From f2e79089db0d9d969960afb818d76e6b7007fb16 Mon Sep 17 00:00:00 2001
From: Scott-Simmons <ssim323@aucklanduni.ac.nz>
Date: Thu, 24 Feb 2022 16:32:44 +1300
Subject: [PATCH 03/16] Rename test script to feature_dynamics_tests.py and remove scotts_code.py

---
 ...res_tests.py => feature_dynamics_tests.py} |   0
 tsfresh/feature_extraction/scotts_code.py     | 199 ------------------
 2 files changed, 199 deletions(-)
 rename tsfresh/feature_extraction/{scott_features_on_features_tests.py => feature_dynamics_tests.py} (100%)
 delete mode 100644 tsfresh/feature_extraction/scotts_code.py

diff --git a/tsfresh/feature_extraction/scott_features_on_features_tests.py b/tsfresh/feature_extraction/feature_dynamics_tests.py
similarity index 100%
rename from tsfresh/feature_extraction/scott_features_on_features_tests.py
rename to tsfresh/feature_extraction/feature_dynamics_tests.py
diff --git a/tsfresh/feature_extraction/scotts_code.py b/tsfresh/feature_extraction/scotts_code.py
deleted file mode 100644
index 3c0dbc7f2..000000000
--- a/tsfresh/feature_extraction/scotts_code.py
+++ /dev/null
@@ -1,199 +0,0 @@
-# a file that has the algorithm used in my project...
-
-# main features on features framework used in VSB project
-def features_on_features_vsb(ts,
-                             first_fc_params,
-                             second_fc_params,
-                             fc_params_is_kind,
-                             replacement_token):
-    '''
-    main algorithm that uses tsfresh which computes features on features (feature dynamics) for the VSB data
-    NOTE: The DATA format that this function supports is input option 1 for feature extraction https://tsfresh.readthedocs.io/en/latest/text/data_formats.html
-
-        params:
-            ts (pd.DataFrame): The VSB measurements that will be processed.
-            first_fc_params, second_fc_params (dictionary): feature sets for (1) the extraction of feature time series and (2) the extraction of feature dynamics
-            fc_params_is_kind (bool): if the feature dictionaries maps each separate VSB signal (ts kind) to a different feature set then this value is True, otherwise false.
-            replacement_token (str): token that replaces double unscore in feature naming convention. This adjustment is required for featue dynamics extraction
-        returns:
-            X1 (pd.DataFrame): Feature dynamics matrix
-            y1 (pd.Series): Response vector.
-    '''
-
-    # map the reponse variable to each row in the features on features matrix if the timeseries is test data otherwise do nothing...
-    try:
-        y = ts.groupby("measurement_id").last()["response"]
-    except:
-        y = None
-
-
-    # assign unique pairs of (mes_id, window_id) to each element
-    ts["column_id"] = ts["measurement_id"].astype(str) + ", " + ts["window_id"].astype(str)
-
-
-    # drop the columns which are not relevant to feature extraction
-    try:
-        ts = ts.drop(columns = ["measurement_id", "window_id", "response"]) # for labelled data
-    except:
-        ts = ts.drop(columns = ["measurement_id", "window_id"]) # non labelled data
-
-    print("TS INPUT {}".format(ts))
-    # first round of feature extraction FEATURE TIME SERIES
-    X0 = (extract_features(ts, column_id = "column_id",column_sort = "time_index", kind_to_fc_parameters = first_fc_params, disable_progressbar = True) if fc_params_is_kind
-          else extract_features(ts, column_id = "column_id",column_sort = "time_index", default_fc_parameters = first_fc_params, disable_progressbar = True))
-
-    print("FIRST {}".format(X0.shape))
-
-    # drop any features that produce any NaNs/NAs
-    if X0.isnull().values.any():
-        # store dropped features
-        dropped_feature_names = [col_name for col_name in X0.columns[X0.isna().any()].tolist()]
-        # store the feature calculators that fail in a file that is constantly updated.
-        with open("dropped_feature_names.txt", "a") as f:
-            for feature in dropped_feature_names: f.write(feature[feature.index("__") + 2:] + "\n") # 2 is a magic number. It works. But this should be refactored..
-
-        print("found " + str(len(X0.columns[X0.isna().any()].tolist())) + " features from the set of " + str(len(X0.columns)) + " features which should be dropped before being input into second feat extraction")
-        X0 = X0.dropna(axis = "columns")
-
-
-    # tsfresh cant handle double underscores twice so change this in preparation for the second feature extraction
-    X0.columns = [str(col_name).replace("__",replacement_token) for col_name in X0.columns]
-
-    # assign windows as the original measurment ID... i.e. extracting "mes_id" from (mes_id, window_id)
-    X0["column_id"] = X0.index.to_series().str.split(", ", expand = True).iloc[:,0]
-
-    print("FEATURE TS INPUT {}".format(X0))
-
-    # second round of feature extraction FEATURE DYNAMICS
-    X1 = (extract_features(X0, column_id = "column_id", kind_to_fc_parameters = second_fc_params, disable_progressbar = True) if fc_params_is_kind
-          else  extract_features(X0, column_id = "column_id", default_fc_parameters = second_fc_params, disable_progressbar = True))
-
-    X1.index.name = "measurement_id"
-
-    # drop any features which are null or na
-    if X1.isnull().values.any():
-        print("found " + str(len(X1.columns[X1.isna().any()].tolist())) + " features from the set of " + str(len(X1.columns)) + " features which should be dropped before being considered as the final output...")
-        X1 = X1.dropna(axis = "columns")
-
-
-    # sort column names
-    X1.sort_index(axis="columns", inplace=True)
-
-    print("X1 output {}, {}".format(X1.shape, X1))
-
-    # returning the feature matrix, the response variable corresponding to each feature matrix window, and optionally the dropped colnames
-    return (X1, y)
-
-
-
-
-
-### The code written into tsfresh
-
-class IterableTsData(Iterable[Timeseries], Sized, TsData):
-    """
-    Special class of TsData, which can be partitioned.
-    Derived classes should implement __iter__ and __len__.
-    """
-    def pivot(self, results):
-        """
-        Helper function to turn an iterable of tuples with three entries into a dataframe.
-
-        The input ``list_of_tuples`` needs to be an iterable with tuples containing three
-        entries: (a, b, c).
-        Out of this, a pandas dataframe will be created with all a's as index,
-        all b's as columns and all c's as values.
-
-        It basically does a pd.pivot(first entry, second entry, third entry),
-        but optimized for non-pandas input (= python list of tuples).
-
-        This function is called in the end of the extract_features call.
-        """
-        return_df_dict = defaultdict(dict)
-        for chunk_id, variable, value in results:
-            # we turn it into a nested mapping `column -> index -> value`
-            return_df_dict[variable][chunk_id] = value
-
-        # the mapping column -> {index -> value}
-        # is now a dict of dicts. The pandas dataframe
-        # constructor will peel this off:
-        # first, the keys of the outer dict (the column)
-        # will turn into a column header and the rest into a column
-        # the rest is {index -> value} which will be turned into a
-        # column with index.
-        # All index will be aligned.
-        return_df = pd.DataFrame(return_df_dict, dtype=float)
-
-        # copy the type of the index
-        return_df.index = return_df.index.astype(self.df_id_type)
-
-        # Sort by index to be backward compatible
-        return_df = return_df.sort_index()
-
-        return return_df
-
-    def __len__(self):
-        """Override in a subclass"""
-        raise NotImplementedError
-
-    def __iter__(self):
-        """Override in a subclass"""
-        raise NotImplementedError
-
-
-class ApplyableTsData(TsData):
-    """
-    TsData base class to use, if an iterable ts data can not be used.
-    Its only interface is an apply function, which should be applied
-    to each of the chunks of the data. How this is done
-    depends on the implementation.
-    """
-    def apply(self, f, **kwargs):
-        raise NotImplementedError
-
-def extract_features_on_sub_features(timeseries_container,
-                                     sub_feature_split,
-                                     sub_default_fc_parameters=None, sub_kind_to_fc_parameters=None,
-                                     default_fc_parameters=None, kind_to_fc_parameters=None,
-                                     column_id=None, column_sort=None, column_kind=None, column_value=None,
-                                     **kwargs):
-    ts_data = to_tsdata(timeseries_container, column_id=column_id, column_sort=column_sort, column_kind=column_kind, column_value=column_value)
-    if isinstance(ts_data, Iterable):
-        split_ts_data = IterableSplitTsData(ts_data, sub_feature_split)
-    else:
-        split_ts_data = ApplyableSplitTsData(ts_data, sub_feature_split)
-
-    sub_features = extract_features(split_ts_data, default_fc_parameters=sub_default_fc_parameters,
-                                    kind_to_fc_parameters=sub_kind_to_fc_parameters, **kwargs, pivot=False)
-
-    column_kind = column_kind or "variable"
-    column_id = column_id or "id"
-    column_sort = column_sort or "sort"
-    column_value = column_value or "value"
-
-    # The feature names include many "_", which will confuse tsfresh where the sub feature name ends
-    # and where the real feature name starts. We just remove them.
-    # Also, we split up the index into the id and the sort
-    # We need to do this separately for dask dataframes,
-    # as the return type is not a list, but already a dataframe
-    if isinstance(sub_features, dd.DataFrame):
-        sub_features = sub_features.reset_index(drop=True)
-
-        sub_features[column_kind] = sub_features[column_kind].apply(lambda col: col.replace("_", ""), meta=(column_kind, object))
-
-        sub_features[column_sort] = sub_features[column_id].apply(lambda x: x[1], meta=(column_id, "int64"))
-        sub_features[column_id] = sub_features[column_id].apply(lambda x: x[0], meta=(column_id, ts_data.df_id_type))
-
-    else:
-        sub_features = pd.DataFrame(sub_features, columns=[column_id, column_kind, column_value])
-
-        sub_features[column_kind] = sub_features[column_kind].apply(lambda col: col.replace("_", ""))
-
-        sub_features[column_sort] = sub_features[column_id].apply(lambda x: x[1])
-        sub_features[column_id] = sub_features[column_id].apply(lambda x: x[0])
-
-    X = extract_features(sub_features, column_id=column_id, column_sort=column_sort, column_kind=column_kind, column_value=column_value,
-                         default_fc_parameters=default_fc_parameters, kind_to_fc_parameters=kind_to_fc_parameters,
-                         **kwargs)
-
-    return X

From ccfacf0b556dc4237b3a537bda649016d7a9ec4d Mon Sep 17 00:00:00 2001
From: Scott-Simmons <ssim323@aucklanduni.ac.nz>
Date: Sat, 26 Feb 2022 22:54:03 +1300
Subject: [PATCH 04/16] Added dropping of NaNs and modified the notation of
 feature naming conventions (underscores)

---
 tsfresh/feature_extraction/extraction.py      | 10 ++--
 .../feature_dynamics_tests.py                 | 49 +++++++++++++------
 2 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/tsfresh/feature_extraction/extraction.py b/tsfresh/feature_extraction/extraction.py
index 95cd521f3..7af07aea9 100644
--- a/tsfresh/feature_extraction/extraction.py
+++ b/tsfresh/feature_extraction/extraction.py
@@ -315,7 +315,7 @@ def _f():
             if func.fctype == "combiner":
                 result = func(x, param=parameter_list)
             else:
-                if parameter_list:
+                if parameter_list: 
                     result = ((convert_to_output_format(param), func(x, **param)) for param in
                               parameter_list)
                 else:
@@ -345,6 +345,7 @@ def extract_features_on_sub_features(timeseries_container,
                                     kind_to_fc_parameters=sub_kind_to_fc_parameters, **kwargs, pivot=False)
 
 
+    # the some features can produce NaNs which need to be removed before the next round of feature extraction
     column_kind = column_kind or "variable"
     column_id = column_id or "id"
     column_sort = column_sort or "sort"
@@ -356,6 +357,7 @@ def extract_features_on_sub_features(timeseries_container,
     # We need to do this separately for dask dataframes,
     # as the return type is not a list, but already a dataframe
     if isinstance(sub_features, dd.DataFrame):
+        # TODO: dropping NAs for Dask dataframes... write tests
         sub_features = sub_features.reset_index(drop=True)
 
         sub_features[column_kind] = sub_features[column_kind].apply(lambda col: col.replace("_", ""), meta=(column_kind, object))
@@ -364,13 +366,13 @@ def extract_features_on_sub_features(timeseries_container,
         sub_features[column_id] = sub_features[column_id].apply(lambda x: x[0], meta=(column_id, ts_data.df_id_type))
 
     else:
-        sub_features = pd.DataFrame(sub_features, columns=[column_id, column_kind, column_value])
+        sub_features = pd.DataFrame(sub_features, columns=[column_id, column_kind, column_value]).dropna()
+        print("First round done: {}".format(sub_features))
 
-        sub_features[column_kind] = sub_features[column_kind].apply(lambda col: col.replace("_", ""))
+        sub_features[column_kind] = sub_features[column_kind].apply(lambda col: col.replace("__", "||"))
 
         sub_features[column_sort] = sub_features[column_id].apply(lambda x: x[1])
         sub_features[column_id] = sub_features[column_id].apply(lambda x: x[0])
-
     X = extract_features(sub_features, column_id=column_id, column_sort=column_sort, column_kind=column_kind, column_value=column_value,
                          default_fc_parameters=default_fc_parameters, kind_to_fc_parameters=kind_to_fc_parameters,
                          **kwargs)
diff --git a/tsfresh/feature_extraction/feature_dynamics_tests.py b/tsfresh/feature_extraction/feature_dynamics_tests.py
index d13dfae6f..894a0d5a9 100644
--- a/tsfresh/feature_extraction/feature_dynamics_tests.py
+++ b/tsfresh/feature_extraction/feature_dynamics_tests.py
@@ -1,22 +1,41 @@
 import pandas as pd
 from extraction import extract_features_on_sub_features
-from tsfresh.feature_extraction.settings import MinimalFCParameters
+from tsfresh.feature_extraction.settings import MinimalFCParameters, EfficientFCParameters
 # Test is going to go here. Should work with Louis code + Scott code + the code that is integrated into tsfresh
 
-# Read in data
-ts = pd.read_csv("test_data.csv")
 
-print("Minimal: {}".format(MinimalFCParameters()))
 
-X = extract_features_on_sub_features(timeseries_container = ts,
-                                     sub_feature_split = 2,
-                                     sub_default_fc_parameters = MinimalFCParameters(),
-                                     default_fc_parameters = MinimalFCParameters(),
-                                     column_id = "measurement_id",
-                                     column_sort = "t",
-                                     column_kind = None,
-                                     column_value = None)
 
-print(X)
-#for col in X.columns:
-#    print(col)
+
+
+if __name__ == "__main__":
+    
+    # Read in data
+    ts = pd.read_csv("./feature_extraction/test_data.csv")
+    print(ts)
+
+    # running on minimal
+    X = extract_features_on_sub_features(timeseries_container = ts,
+                                        sub_feature_split = 1,
+                                        sub_default_fc_parameters = MinimalFCParameters(),
+                                        default_fc_parameters = MinimalFCParameters(),
+                                        column_id = "measurement_id",
+                                        column_sort = "t",
+                                        column_kind = None,
+                                        column_value = None,
+                                        show_warnings = True)
+    print(X)
+
+
+    # Running on efficient
+    X = extract_features_on_sub_features(timeseries_container = ts,
+                                        sub_feature_split = 1,
+                                        sub_default_fc_parameters = EfficientFCParameters(),
+                                        default_fc_parameters = EfficientFCParameters(),
+                                        column_id = "measurement_id",
+                                        column_sort = "t",
+                                        column_kind = None,
+                                        column_value = None,
+                                        show_warnings = True)
+
+    print(X)
\ No newline at end of file

From 5bdaa4b2364b45cd8be47aa45b257ddf172d6373 Mon Sep 17 00:00:00 2001
From: Scott-Simmons <ssim323@aucklanduni.ac.nz>
Date: Sun, 6 Mar 2022 14:31:57 +1300
Subject: [PATCH 05/16] fixed data types for feature time series (for second
 round of extraction) casting all to floats

---
 tsfresh/feature_extraction/extraction.py      | 22 ++++++--
 .../feature_dynamics_tests.py                 | 53 +++++++++++++------
 tsfresh/feature_extraction/test_data.csv      | 36 +++----------
 3 files changed, 62 insertions(+), 49 deletions(-)

diff --git a/tsfresh/feature_extraction/extraction.py b/tsfresh/feature_extraction/extraction.py
index 7af07aea9..cae8b0690 100644
--- a/tsfresh/feature_extraction/extraction.py
+++ b/tsfresh/feature_extraction/extraction.py
@@ -5,9 +5,11 @@
 This module contains the main function to interact with tsfresh: extract features
 """
 
+import numpy as np
 import logging
 import warnings
 from collections import Iterable
+from numpy import dtype
 
 import pandas as pd
 from dask import dataframe as dd
@@ -313,12 +315,20 @@ def _f():
                 x = data.values
 
             if func.fctype == "combiner":
+                # Casting ndarray with dtype object to dtype float as dtype object is not compatible with some feature calculators
+                x = np.asarray(x, dtype = float)
                 result = func(x, param=parameter_list)
             else:
-                if parameter_list: 
+                if parameter_list:
+                    #if function_name == "binned_entropy": 
+                    #    print("Stop here")
+                    # Casting ndarray with dtype object to dtype float as dtype object is not compatible with some feature calculators
+                    x = np.asarray(x, dtype = float)
                     result = ((convert_to_output_format(param), func(x, **param)) for param in
                               parameter_list)
                 else:
+                    # Casting ndarray with dtype object to dtype float as dtype object is not compatible with some feature calculators
+                    x = np.asarray(x, dtype = float)
                     result = [("", func(x))]
 
             for key, item in result:
@@ -366,13 +376,19 @@ def extract_features_on_sub_features(timeseries_container,
         sub_features[column_id] = sub_features[column_id].apply(lambda x: x[0], meta=(column_id, ts_data.df_id_type))
 
     else:
-        sub_features = pd.DataFrame(sub_features, columns=[column_id, column_kind, column_value]).dropna()
-        print("First round done: {}".format(sub_features))
+        sub_features = pd.DataFrame(sub_features, columns=[column_id, column_kind, column_value]) 
+
+        # Drop every sub-feature kind (across all windows) that is NaN in at least one window
+        target_list = sub_features[sub_features[column_value].isnull()][column_kind].unique()
+        sub_features = sub_features[~sub_features[column_kind].isin(target_list)]
 
         sub_features[column_kind] = sub_features[column_kind].apply(lambda col: col.replace("__", "||"))
 
         sub_features[column_sort] = sub_features[column_id].apply(lambda x: x[1])
         sub_features[column_id] = sub_features[column_id].apply(lambda x: x[0])
+
+        print("Sub features\n{}".format(sub_features))
+
     X = extract_features(sub_features, column_id=column_id, column_sort=column_sort, column_kind=column_kind, column_value=column_value,
                          default_fc_parameters=default_fc_parameters, kind_to_fc_parameters=kind_to_fc_parameters,
                          **kwargs)
diff --git a/tsfresh/feature_extraction/feature_dynamics_tests.py b/tsfresh/feature_extraction/feature_dynamics_tests.py
index 894a0d5a9..0d053d027 100644
--- a/tsfresh/feature_extraction/feature_dynamics_tests.py
+++ b/tsfresh/feature_extraction/feature_dynamics_tests.py
@@ -1,5 +1,5 @@
 import pandas as pd
-from extraction import extract_features_on_sub_features
+from extraction import extract_features, extract_features_on_sub_features
 from tsfresh.feature_extraction.settings import MinimalFCParameters, EfficientFCParameters
 # Test is going to go here. Should work with Louis code + Scott code + the code that is integrated into tsfresh
 
@@ -11,25 +11,46 @@
 if __name__ == "__main__":
     
     # Read in data
-    ts = pd.read_csv("./feature_extraction/test_data.csv")
+    ts = pd.read_csv("./test_data.csv")
     print(ts)
 
-    # running on minimal
-    X = extract_features_on_sub_features(timeseries_container = ts,
-                                        sub_feature_split = 1,
-                                        sub_default_fc_parameters = MinimalFCParameters(),
-                                        default_fc_parameters = MinimalFCParameters(),
-                                        column_id = "measurement_id",
-                                        column_sort = "t",
-                                        column_kind = None,
-                                        column_value = None,
-                                        show_warnings = True)
-    print(X)
-
+    # # running on minimal
+    # X = extract_features(timeseries_container= ts,
+    #                     n_jobs = 0,
+    #                     default_fc_parameters=MinimalFCParameters(),
+    #                     column_id= "measurement_id",
+    #                     column_sort = "t",
+    #                     show_warnings = True)
+
+    # print(X)
+
+    # X = extract_features_on_sub_features(timeseries_container = ts,
+    #                                     sub_feature_split = 3, # window size
+    #                                     n_jobs = 0,
+    #                                     sub_default_fc_parameters = MinimalFCParameters(),
+    #                                     default_fc_parameters = MinimalFCParameters(),
+    #                                     column_id = "measurement_id",
+    #                                     column_sort = "t",
+    #                                     column_kind = None,
+    #                                     column_value = None,
+    #                                     show_warnings = True)
+    # print(X)
+
+    # drop feature calculators that are problematic...
+
+
+    # # running on efficient
+    # X = extract_features(timeseries_container= ts,
+    #                     n_jobs = 0,
+    #                     default_fc_parameters=EfficientFCParameters(),
+    #                     column_id= "measurement_id",
+    #                     column_sort = "t",
+    #                     show_warnings = True)
+    #print(X)
 
-    # Running on efficient
     X = extract_features_on_sub_features(timeseries_container = ts,
-                                        sub_feature_split = 1,
+                                        sub_feature_split = 3,
+                                        n_jobs = 0,
                                         sub_default_fc_parameters = EfficientFCParameters(),
                                         default_fc_parameters = EfficientFCParameters(),
                                         column_id = "measurement_id",
diff --git a/tsfresh/feature_extraction/test_data.csv b/tsfresh/feature_extraction/test_data.csv
index a26aaa2c2..801e776dd 100644
--- a/tsfresh/feature_extraction/test_data.csv
+++ b/tsfresh/feature_extraction/test_data.csv
@@ -1,31 +1,7 @@
 t,y,measurement_id
-1,1,1
-2,1,1
-3,1,1
-4,1,1
-5,1,1
-6,1,1
-7,1,1
-8,1,1
-9,1,1
-10,1,1
-11,2,1
-12,2,1
-13,2,1
-14,2,1
-15,2,2
-16,2,2
-17,2,2
-18,2,2
-19,2,2
-20,2,2
-21,3,2
-22,3,2
-23,3,2
-24,3,2
-25,3,2
-26,3,2
-27,3,2
-28,3,2
-29,3,2
-30,3,2
+1,334,1
+2,555,1
+3,664,1
+4,345346,1
+5,1356,1
+6,135,1

From 4a0038743761987578d970ef5c144454a29dab63 Mon Sep 17 00:00:00 2001
From: Scott-Simmons <ssim323@aucklanduni.ac.nz>
Date: Sun, 6 Mar 2022 14:41:31 +1300
Subject: [PATCH 06/16] dropped feature dynamics with NaNs

---
 tsfresh/feature_extraction/extraction.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tsfresh/feature_extraction/extraction.py b/tsfresh/feature_extraction/extraction.py
index cae8b0690..eb5bedcfb 100644
--- a/tsfresh/feature_extraction/extraction.py
+++ b/tsfresh/feature_extraction/extraction.py
@@ -387,10 +387,10 @@ def extract_features_on_sub_features(timeseries_container,
         sub_features[column_sort] = sub_features[column_id].apply(lambda x: x[1])
         sub_features[column_id] = sub_features[column_id].apply(lambda x: x[0])
 
-        print("Sub features\n{}".format(sub_features))
-
     X = extract_features(sub_features, column_id=column_id, column_sort=column_sort, column_kind=column_kind, column_value=column_value,
                          default_fc_parameters=default_fc_parameters, kind_to_fc_parameters=kind_to_fc_parameters,
                          **kwargs)
+    # Drop all feature dynamics that have at least one NaN
+    X = X.dropna(axis = "columns", how = "any")
 
     return X

From 10d50bf00a3b5d1123ca3c87ffd8707ec9298836 Mon Sep 17 00:00:00 2001
From: Louis Jarvis <ljar069@aucklanduni.ac.nz>
Date: Sun, 5 Feb 2023 19:01:11 +1300
Subject: [PATCH 07/16] add test differences within stacked unordered

---
 .../test_feature_dynamics_utils.py            | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/tests/units/feature_dynamics_extraction/test_feature_dynamics_utils.py b/tests/units/feature_dynamics_extraction/test_feature_dynamics_utils.py
index 34f7f0c5b..dc3577a88 100644
--- a/tests/units/feature_dynamics_extraction/test_feature_dynamics_utils.py
+++ b/tests/units/feature_dynamics_extraction/test_feature_dynamics_utils.py
@@ -647,6 +647,30 @@ def test_differences_within_stacked_dataframe_no_sort(self):
             stacked_dataframe_timeseries_container, expected_unmodified_data
         )
 
+    def test_differences_within_stacked_dataframe_unordered(self):
+        """ Test case where a sort column exists but it is randomised and must be checked"""
+        stacked_df_timeseries_test_container, (column_id, column_sort, column_kind, column_value) = self.create_simple_test_data_sample_stacked()
+        stacked_df_timeseries_test_container["sort"] = [15, 14, 3, 8, 1, 5, 7, 0, 4, 12, 13, 9, 6, 2, 17, 16, 11, 10]
+
+        ##TODO a function to check that the sort for stacked columns is suitable, if not throw and error
+        ##TODO add more arguments 
+        def check_stacked_sort_sort(ts_container, column_id, column_sort):
+            for id in ts_container[column_id].unique():
+                sort_col_vals = ts_container.query("{column_sort}==1")["sort"]
+                if pd.Series.equals(sort_col_vals.sort(), sort_col_vals):
+                    ##TODO throw a warning
+                    return False
+            return True
+
+
+        engineered_ts_within = diff_within_series(
+            timeseries_container=stacked_df_timeseries_test_container,
+            column_sort=column_sort,
+            column_id=column_id,
+            column_kind=column_kind,
+            column_value=column_value,
+        )
+
     def test_differences_within_dictionary(self):
         (
             dict_timeseries_container,

From a7bc8af410e7874c0462f91dbf5eb3ef8303ba34 Mon Sep 17 00:00:00 2001
From: Louis Jarvis <ljar069@aucklanduni.ac.nz>
Date: Sun, 5 Feb 2023 19:47:14 +1300
Subject: [PATCH 08/16] rm redundant test differences_bw_series_stacked func

---
 .../test_feature_dynamics_utils.py            | 24 -------------------
 1 file changed, 24 deletions(-)

diff --git a/tests/units/feature_dynamics_extraction/test_feature_dynamics_utils.py b/tests/units/feature_dynamics_extraction/test_feature_dynamics_utils.py
index 616e91be9..25bceabdd 100644
--- a/tests/units/feature_dynamics_extraction/test_feature_dynamics_utils.py
+++ b/tests/units/feature_dynamics_extraction/test_feature_dynamics_utils.py
@@ -624,30 +624,6 @@ def test_differences_within_stacked_dataframe_no_sort(self):
             stacked_dataframe_timeseries_container, expected_unmodified_data
         )
 
-    def test_differences_within_stacked_dataframe_unordered(self):
-        """ Test case where a sort column exists but it is randomised and must be checked"""
-        stacked_df_timeseries_test_container, (column_id, column_sort, column_kind, column_value) = self.create_simple_test_data_sample_stacked()
-        stacked_df_timeseries_test_container["sort"] = [15, 14, 3, 8, 1, 5, 7, 0, 4, 12, 13, 9, 6, 2, 17, 16, 11, 10]
-
-        ##TODO a function to check that the sort for stacked columns is suitable, if not throw and error
-        ##TODO add more arguments 
-        def check_stacked_sort_sort(ts_container, column_id, column_sort):
-            for id in ts_container[column_id].unique():
-                sort_col_vals = ts_container.query("{column_sort}==1")["sort"]
-                if pd.Series.equals(sort_col_vals.sort(), sort_col_vals):
-                    ##TODO throw a warning
-                    return False
-            return True
-
-
-        engineered_ts_within = diff_within_series(
-            timeseries_container=stacked_df_timeseries_test_container,
-            column_sort=column_sort,
-            column_id=column_id,
-            column_kind=column_kind,
-            column_value=column_value,
-        )
-
     def test_differences_within_dictionary(self):
         (
             dict_timeseries_container,

From 22256d54a8b33d3117a6fbdd0a258c76fe1733e0 Mon Sep 17 00:00:00 2001
From: Louis Jarvis <ljar069@aucklanduni.ac.nz>
Date: Sun, 19 Feb 2023 14:38:37 +1300
Subject: [PATCH 09/16] temp dask changes remove mdf dependency

---
 tests/fixtures.py                             | 185 ++++++++++++++++++
 .../test_feature_dynamics_data.py             |   9 +-
 .../feature_dynamics_utils.py                 |   2 +-
 3 files changed, 194 insertions(+), 2 deletions(-)

diff --git a/tests/fixtures.py b/tests/fixtures.py
index 6cd35ffbb..c9de81d1d 100644
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -8,6 +8,7 @@
 
 import numpy as np
 import pandas as pd
+import dask.dataframe as ddf
 
 
 @contextmanager
@@ -1272,3 +1273,187 @@ def create_split_up_test_data_expected_tuples_wide(self):
             ),
         ]
         return (wide_test_data_expected_chunked_up_tuples, window_length)
+
+
+class DaskDataTestCase:
+    def create_simple_test_sample(self):
+
+        #identical to the above 
+        cid = np.repeat([10, 500], 40)
+        ckind = np.repeat(["a", "b", "a", "b"], 20)
+        csort = [
+            30,
+            53,
+            26,
+            35,
+            42,
+            25,
+            17,
+            67,
+            20,
+            68,
+            46,
+            12,
+            0,
+            74,
+            66,
+            31,
+            32,
+            2,
+            55,
+            59,
+            56,
+            60,
+            34,
+            69,
+            47,
+            15,
+            49,
+            8,
+            50,
+            73,
+            23,
+            62,
+            24,
+            33,
+            22,
+            70,
+            3,
+            38,
+            28,
+            75,
+            39,
+            36,
+            64,
+            13,
+            72,
+            52,
+            40,
+            16,
+            58,
+            29,
+            63,
+            79,
+            61,
+            78,
+            1,
+            10,
+            4,
+            6,
+            65,
+            44,
+            54,
+            48,
+            11,
+            14,
+            19,
+            43,
+            76,
+            7,
+            51,
+            9,
+            27,
+            21,
+            5,
+            71,
+            57,
+            77,
+            41,
+            18,
+            45,
+            37,
+        ]
+        cval = [
+            11,
+            9,
+            67,
+            45,
+            30,
+            58,
+            62,
+            19,
+            56,
+            29,
+            0,
+            27,
+            36,
+            43,
+            33,
+            2,
+            24,
+            71,
+            41,
+            28,
+            50,
+            40,
+            39,
+            7,
+            53,
+            23,
+            16,
+            37,
+            66,
+            38,
+            6,
+            47,
+            3,
+            61,
+            44,
+            42,
+            78,
+            31,
+            21,
+            55,
+            15,
+            35,
+            25,
+            32,
+            69,
+            65,
+            70,
+            64,
+            51,
+            46,
+            5,
+            77,
+            26,
+            73,
+            76,
+            75,
+            72,
+            74,
+            10,
+            57,
+            4,
+            14,
+            68,
+            22,
+            18,
+            52,
+            54,
+            60,
+            79,
+            12,
+            49,
+            63,
+            8,
+            59,
+            1,
+            13,
+            20,
+            17,
+            48,
+            34,
+        ]
+        # Build the stacked sample as a dask DataFrame split into 10 partitions.
+        df = ddf.from_dict({"id": cid, "kind": ckind, "sort": csort, "val": cval}, npartitions=10)
+        # Index the frame itself by "id" (drop=False keeps "id" as a column too) and return the frame, not the dask module.
+        df = df.set_index("id", drop=False)
+        return df
+    
+    def create_test_dask_df_long(self):
+        # cid = np.repeat(["A", "B"], repeats = [4,4])
+        # ckind = np.repeat(["A", "B"])
+        # csort = 
+        # cval = []
+        pass
\ No newline at end of file
diff --git a/tests/units/feature_dynamics_extraction/test_feature_dynamics_data.py b/tests/units/feature_dynamics_extraction/test_feature_dynamics_data.py
index 9f3ec3570..9a3703953 100644
--- a/tests/units/feature_dynamics_extraction/test_feature_dynamics_data.py
+++ b/tests/units/feature_dynamics_extraction/test_feature_dynamics_data.py
@@ -12,6 +12,7 @@
 import pandas as pd
 import dask.dataframe as dd
 from tests.units.feature_extraction.test_data import DataAdapterTestCase
+from tests.fixtures import DaskDataTestCase
 
 
 class IterableSplitTsDataTestCase(
@@ -39,6 +40,9 @@ def test_iter_on_long_data(self):
             underlying_data_converted_to_tsdata, expected_non_windowed_tuples
         )
 
+        ##TODO test the dask df test case
+        df_stacked = self.create
+
         # Test equality of each chunk...
         self.assert_tsdata(split_ts_data, expected_windowed_tuples)
 
@@ -211,7 +215,8 @@ def test_f(chunk):
         )
 
     def test_apply_on_long_data_dask(self):
-        pass
+        df  = DaskDataTestCase().create_simple_test_sample()
+        return True
 
     def test_iter_on_long_data_no_value_column_dask(self):
         pass
@@ -236,3 +241,5 @@ def test_zero_split_size_dask(self):
 
     def test_fractional_split_size_dask(self):
         pass
+
+ApplyableSplitTsDataTestCase().test_apply_on_long_data_dask()
\ No newline at end of file
diff --git a/tsfresh/feature_dynamics_extraction/feature_dynamics_utils.py b/tsfresh/feature_dynamics_extraction/feature_dynamics_utils.py
index 78e4e088b..0f781b220 100644
--- a/tsfresh/feature_dynamics_extraction/feature_dynamics_utils.py
+++ b/tsfresh/feature_dynamics_extraction/feature_dynamics_utils.py
@@ -4,7 +4,7 @@
 import pandas as pd
 from pandas.api.types import is_numeric_dtype
 from typing import List
-from md2pdf.core import md2pdf
+#from md2pdf.core import md2pdf
 from tsfresh.feature_extraction import feature_calculators
 from tsfresh.utilities.string_manipulation import get_config_from_string
 from tsfresh.feature_extraction.data import (

From ccaa8cc5bc713c612c62356d79d5adb24d31dac0 Mon Sep 17 00:00:00 2001
From: Louis Jarvis <ljar069@aucklanduni.ac.nz>
Date: Sun, 19 Feb 2023 20:00:38 +1300
Subject: [PATCH 10/16] pycache to gitignore

---
 .gitignore | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 7266dfdf9..ddf94e8d7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -63,4 +63,7 @@ feature_dynamics_interpretation.pdf
 feature_dynamics_interpretation.md
 # dask
 dask-worker-space
-dask-worker-space/
\ No newline at end of file
+dask-worker-space/
+
+tests/*/__pycache__/
+tests/*/*/__pycache__/
\ No newline at end of file

From 7c867aa5195dc1a75dad42f856d6929c37b728fa Mon Sep 17 00:00:00 2001
From: Louis Jarvis <ljar069@aucklanduni.ac.nz>
Date: Sun, 26 Feb 2023 16:50:19 +1300
Subject: [PATCH 11/16] test feature_dynamics_extraction on wide format case

---
 .../test_feature_dynamics_extraction.py       | 27 +++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/tests/units/feature_dynamics_extraction/test_feature_dynamics_extraction.py b/tests/units/feature_dynamics_extraction/test_feature_dynamics_extraction.py
index d0cb807c9..5454a0e5e 100644
--- a/tests/units/feature_dynamics_extraction/test_feature_dynamics_extraction.py
+++ b/tests/units/feature_dynamics_extraction/test_feature_dynamics_extraction.py
@@ -515,6 +515,33 @@ def test_extract_feature_dynamics_alphabetically_sorted(self):
 
             self.assertEqual(col_name_chunks, list(sorted(col_name_chunks)))
 
+    def test_extract_feature_dynamics_wide(self):
+        ts_wide_df,_,_ = self.create_simple_test_data_sample_wide()
+
+        fc_test_params = {"minimum": None}
+        window_length = 3
+
+        extracted_feature_dynamics = extract_feature_dynamics(
+            timeseries_container=ts_wide_df,
+            column_id="id",
+            column_sort="sort",
+            column_kind=None, #"kind", None since wide format
+            column_value=None, #"val",
+            n_jobs= 1,#self.n_jobs,
+            feature_timeseries_fc_parameters={window_length: fc_test_params},
+            feature_dynamics_fc_parameters={window_length: fc_test_params},
+        )
+
+        expected_ans = pd.DataFrame(
+            data={
+                'y1||minimum@window_3__minimum': {1: 1.0, 2: -34.0}, 
+                'y2||minimum@window_3__minimum': {1: -10.0, 2: 3.0}, 
+                'y3||minimum@window_3__minimum': {1: 4.0, 2: 1.0}
+                }
+            )
+
+        pd.testing.assert_frame_equal(extract_feature_dynamics, expected_ans)
+    
 
 class ParallelDynamicsExtractionTestCase(DataTestCase):
     def setUp(self):

From 1be4f16fdd84f42fe84605b674e7b00de2a406cc Mon Sep 17 00:00:00 2001
From: Louis Jarvis <ljar069@aucklanduni.ac.nz>
Date: Sun, 26 Feb 2023 18:17:15 +1300
Subject: [PATCH 12/16] test extract_feature_dynamics for long

---
 .../test_feature_dynamics_extraction.py         | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/tests/units/feature_dynamics_extraction/test_feature_dynamics_extraction.py b/tests/units/feature_dynamics_extraction/test_feature_dynamics_extraction.py
index 5454a0e5e..76509662d 100644
--- a/tests/units/feature_dynamics_extraction/test_feature_dynamics_extraction.py
+++ b/tests/units/feature_dynamics_extraction/test_feature_dynamics_extraction.py
@@ -542,6 +542,23 @@ def test_extract_feature_dynamics_wide(self):
 
         pd.testing.assert_frame_equal(extract_feature_dynamics, expected_ans)
     
+    def test_extract_feature_dynamics_long(self):
+
+        ts_long_df,_,_ = self.create_simple_test_data_sample_stacked()
+
+        fc_test_params = {"minimum": None}
+        window_length = 3
+
+        extracted_feature_dynamics = extract_feature_dynamics(
+            timeseries_container=ts_long_df,
+            column_id="id",
+            column_sort="sort",
+            column_kind="kind", #long (stacked) format, so the kind column is given
+            column_value="val",
+            n_jobs= 1,#self.n_jobs,
+            feature_timeseries_fc_parameters={window_length: fc_test_params},
+            feature_dynamics_fc_parameters={window_length: fc_test_params},
+        )
 
 class ParallelDynamicsExtractionTestCase(DataTestCase):
     def setUp(self):

From 9612b395af936797b6e866792cb66fccb91f80a4 Mon Sep 17 00:00:00 2001
From: Louis Jarvis <ljar069@aucklanduni.ac.nz>
Date: Sun, 26 Feb 2023 18:22:26 +1300
Subject: [PATCH 13/16] test extract_feature_dynamics for list of dicts

---
 .../test_feature_dynamics_extraction.py       | 37 +++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/tests/units/feature_dynamics_extraction/test_feature_dynamics_extraction.py b/tests/units/feature_dynamics_extraction/test_feature_dynamics_extraction.py
index 76509662d..389de5093 100644
--- a/tests/units/feature_dynamics_extraction/test_feature_dynamics_extraction.py
+++ b/tests/units/feature_dynamics_extraction/test_feature_dynamics_extraction.py
@@ -560,6 +560,43 @@ def test_extract_feature_dynamics_long(self):
             feature_dynamics_fc_parameters={window_length: fc_test_params},
         )
 
+        expected_ans = pd.DataFrame(
+            data={
+                'y1||minimum@window_3__minimum': {1: 1.0, 2: -34.0}, 
+                'y2||minimum@window_3__minimum': {1: -10.0, 2: 3.0}, 
+                'y3||minimum@window_3__minimum': {1: 4.0, 2: 1.0}
+                }
+            )
+
+        pd.testing.assert_frame_equal(extract_feature_dynamics, expected_ans)
+
+    def test_extract_feature_dynamics_dict(self):
+        
+        ts_list_of_dicts = self.create_simple_test_data_sample_dict()
+        fc_test_params = {"minimum": None}
+        window_length = 3
+
+        extracted_feature_dynamics = extract_feature_dynamics(
+            timeseries_container=ts_long_df,
+            column_id="id",
+            column_sort="sort",
+            column_kind="kind", #None since wide format
+            column_value="val",
+            n_jobs= 1,#self.n_jobs,
+            feature_timeseries_fc_parameters={window_length: fc_test_params},
+            feature_dynamics_fc_parameters={window_length: fc_test_params},
+        )
+
+        expected_ans = pd.DataFrame(
+            data={
+                'y1||minimum@window_3__minimum': {1: 1.0, 2: -34.0}, 
+                'y2||minimum@window_3__minimum': {1: -10.0, 2: 3.0}, 
+                'y3||minimum@window_3__minimum': {1: 4.0, 2: 1.0}
+                }
+            )
+
+        pd.testing.assert_frame_equal(extract_feature_dynamics, expected_ans)
+
 class ParallelDynamicsExtractionTestCase(DataTestCase):
     def setUp(self):
         self.n_jobs = 2

From 8d63fa95faac4cb03aea3940bb7999a94ebeed05 Mon Sep 17 00:00:00 2001
From: Louis Jarvis <ljar069@aucklanduni.ac.nz>
Date: Sun, 26 Feb 2023 18:28:01 +1300
Subject: [PATCH 14/16] fix incorrect var name

---
 .../test_feature_dynamics_extraction.py                | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/units/feature_dynamics_extraction/test_feature_dynamics_extraction.py b/tests/units/feature_dynamics_extraction/test_feature_dynamics_extraction.py
index 389de5093..de9cf5365 100644
--- a/tests/units/feature_dynamics_extraction/test_feature_dynamics_extraction.py
+++ b/tests/units/feature_dynamics_extraction/test_feature_dynamics_extraction.py
@@ -540,7 +540,7 @@ def test_extract_feature_dynamics_wide(self):
                 }
             )
 
-        pd.testing.assert_frame_equal(extract_feature_dynamics, expected_ans)
+        pd.testing.assert_frame_equal(extracted_feature_dynamics, expected_ans)
     
     def test_extract_feature_dynamics_long(self):
 
@@ -568,7 +568,7 @@ def test_extract_feature_dynamics_long(self):
                 }
             )
 
-        pd.testing.assert_frame_equal(extract_feature_dynamics, expected_ans)
+        pd.testing.assert_frame_equal(extracted_feature_dynamics, expected_ans)
 
     def test_extract_feature_dynamics_dict(self):
         
@@ -577,10 +577,10 @@ def test_extract_feature_dynamics_dict(self):
         window_length = 3
 
         extracted_feature_dynamics = extract_feature_dynamics(
-            timeseries_container=ts_long_df,
+            timeseries_container=ts_list_of_dicts,
             column_id="id",
             column_sort="sort",
-            column_kind="kind", #None since wide format
+            column_kind=None, #None since the container is a dict, not a stacked frame
             column_value="val",
             n_jobs= 1,#self.n_jobs,
             feature_timeseries_fc_parameters={window_length: fc_test_params},
@@ -595,7 +595,7 @@ def test_extract_feature_dynamics_dict(self):
                 }
             )
 
-        pd.testing.assert_frame_equal(extract_feature_dynamics, expected_ans)
+        pd.testing.assert_frame_equal(extracted_feature_dynamics, expected_ans)
 
 class ParallelDynamicsExtractionTestCase(DataTestCase):
     def setUp(self):

From 9a739a4911ba021716a81c0be32b51948df02f73 Mon Sep 17 00:00:00 2001
From: Louis Jarvis <ljar069@aucklanduni.ac.nz>
Date: Sun, 26 Feb 2023 19:09:58 +1300
Subject: [PATCH 15/16] fix files overwritten by merge

---
 tests/fixtures.py                             |  2 -
 .../test_feature_dynamics_data.py             | 11 +---
 .../feature_dynamics_tests.py                 | 62 -------------------
 3 files changed, 2 insertions(+), 73 deletions(-)
 delete mode 100644 tsfresh/feature_extraction/feature_dynamics_tests.py

diff --git a/tests/fixtures.py b/tests/fixtures.py
index c9de81d1d..9c5887c31 100644
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -8,7 +8,6 @@
 
 import numpy as np
 import pandas as pd
-import dask.dataframe as ddf
 
 
 @contextmanager
@@ -1275,7 +1274,6 @@ def create_split_up_test_data_expected_tuples_wide(self):
         return (wide_test_data_expected_chunked_up_tuples, window_length)
 
 
-class DaskDataTestCase:
     def create_simple_test_sample(self):
 
         #identical to the above 
diff --git a/tests/units/feature_dynamics_extraction/test_feature_dynamics_data.py b/tests/units/feature_dynamics_extraction/test_feature_dynamics_data.py
index 9a3703953..fc31a60db 100644
--- a/tests/units/feature_dynamics_extraction/test_feature_dynamics_data.py
+++ b/tests/units/feature_dynamics_extraction/test_feature_dynamics_data.py
@@ -12,7 +12,6 @@
 import pandas as pd
 import dask.dataframe as dd
 from tests.units.feature_extraction.test_data import DataAdapterTestCase
-from tests.fixtures import DaskDataTestCase
 
 
 class IterableSplitTsDataTestCase(
@@ -40,9 +39,6 @@ def test_iter_on_long_data(self):
             underlying_data_converted_to_tsdata, expected_non_windowed_tuples
         )
 
-        ##TODO test the dask df test case
-        df_stacked = self.create
-
         # Test equality of each chunk...
         self.assert_tsdata(split_ts_data, expected_windowed_tuples)
 
@@ -215,8 +211,7 @@ def test_f(chunk):
         )
 
     def test_apply_on_long_data_dask(self):
-        df  = DaskDataTestCase().create_simple_test_sample()
-        return True
+        pass
 
     def test_iter_on_long_data_no_value_column_dask(self):
         pass
@@ -240,6 +235,4 @@ def test_zero_split_size_dask(self):
         pass
 
     def test_fractional_split_size_dask(self):
-        pass
-
-ApplyableSplitTsDataTestCase().test_apply_on_long_data_dask()
\ No newline at end of file
+        pass
\ No newline at end of file
diff --git a/tsfresh/feature_extraction/feature_dynamics_tests.py b/tsfresh/feature_extraction/feature_dynamics_tests.py
deleted file mode 100644
index 0d053d027..000000000
--- a/tsfresh/feature_extraction/feature_dynamics_tests.py
+++ /dev/null
@@ -1,62 +0,0 @@
-import pandas as pd
-from extraction import extract_features, extract_features_on_sub_features
-from tsfresh.feature_extraction.settings import MinimalFCParameters, EfficientFCParameters
-# Test is going to go here. Should work with Louis code + Scott code + the code that is integrated into tsfresh
-
-
-
-
-
-
-if __name__ == "__main__":
-    
-    # Read in data
-    ts = pd.read_csv("./test_data.csv")
-    print(ts)
-
-    # # running on minimal
-    # X = extract_features(timeseries_container= ts,
-    #                     n_jobs = 0,
-    #                     default_fc_parameters=MinimalFCParameters(),
-    #                     column_id= "measurement_id",
-    #                     column_sort = "t",
-    #                     show_warnings = True)
-
-    # print(X)
-
-    # X = extract_features_on_sub_features(timeseries_container = ts,
-    #                                     sub_feature_split = 3, # window size
-    #                                     n_jobs = 0,
-    #                                     sub_default_fc_parameters = MinimalFCParameters(),
-    #                                     default_fc_parameters = MinimalFCParameters(),
-    #                                     column_id = "measurement_id",
-    #                                     column_sort = "t",
-    #                                     column_kind = None,
-    #                                     column_value = None,
-    #                                     show_warnings = True)
-    # print(X)
-
-    # drop feature calculators that are problematic...
-
-
-    # # running on efficient
-    # X = extract_features(timeseries_container= ts,
-    #                     n_jobs = 0,
-    #                     default_fc_parameters=EfficientFCParameters(),
-    #                     column_id= "measurement_id",
-    #                     column_sort = "t",
-    #                     show_warnings = True)
-    #print(X)
-
-    X = extract_features_on_sub_features(timeseries_container = ts,
-                                        sub_feature_split = 3,
-                                        n_jobs = 0,
-                                        sub_default_fc_parameters = EfficientFCParameters(),
-                                        default_fc_parameters = EfficientFCParameters(),
-                                        column_id = "measurement_id",
-                                        column_sort = "t",
-                                        column_kind = None,
-                                        column_value = None,
-                                        show_warnings = True)
-
-    print(X)
\ No newline at end of file

From ba876eec27d6bc22de070f51063130a97886a06c Mon Sep 17 00:00:00 2001
From: Louis Jarvis <ljar069@aucklanduni.ac.nz>
Date: Sun, 26 Feb 2023 19:14:24 +1300
Subject: [PATCH 16/16] fix files overwritten by merge (remove leftover dask sample fixtures from tests/fixtures.py)

---
 tests/fixtures.py | 185 +---------------------------------------------
 1 file changed, 1 insertion(+), 184 deletions(-)

diff --git a/tests/fixtures.py b/tests/fixtures.py
index 9c5887c31..7fe306f3b 100644
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -1271,187 +1271,4 @@ def create_split_up_test_data_expected_tuples_wide(self):
                 ),
             ),
         ]
-        return (wide_test_data_expected_chunked_up_tuples, window_length)
-
-
-    def create_simple_test_sample(self):
-
-        #identical to the above 
-        cid = np.repeat([10, 500], 40)
-        ckind = np.repeat(["a", "b", "a", "b"], 20)
-        csort = [
-            30,
-            53,
-            26,
-            35,
-            42,
-            25,
-            17,
-            67,
-            20,
-            68,
-            46,
-            12,
-            0,
-            74,
-            66,
-            31,
-            32,
-            2,
-            55,
-            59,
-            56,
-            60,
-            34,
-            69,
-            47,
-            15,
-            49,
-            8,
-            50,
-            73,
-            23,
-            62,
-            24,
-            33,
-            22,
-            70,
-            3,
-            38,
-            28,
-            75,
-            39,
-            36,
-            64,
-            13,
-            72,
-            52,
-            40,
-            16,
-            58,
-            29,
-            63,
-            79,
-            61,
-            78,
-            1,
-            10,
-            4,
-            6,
-            65,
-            44,
-            54,
-            48,
-            11,
-            14,
-            19,
-            43,
-            76,
-            7,
-            51,
-            9,
-            27,
-            21,
-            5,
-            71,
-            57,
-            77,
-            41,
-            18,
-            45,
-            37,
-        ]
-        cval = [
-            11,
-            9,
-            67,
-            45,
-            30,
-            58,
-            62,
-            19,
-            56,
-            29,
-            0,
-            27,
-            36,
-            43,
-            33,
-            2,
-            24,
-            71,
-            41,
-            28,
-            50,
-            40,
-            39,
-            7,
-            53,
-            23,
-            16,
-            37,
-            66,
-            38,
-            6,
-            47,
-            3,
-            61,
-            44,
-            42,
-            78,
-            31,
-            21,
-            55,
-            15,
-            35,
-            25,
-            32,
-            69,
-            65,
-            70,
-            64,
-            51,
-            46,
-            5,
-            77,
-            26,
-            73,
-            76,
-            75,
-            72,
-            74,
-            10,
-            57,
-            4,
-            14,
-            68,
-            22,
-            18,
-            52,
-            54,
-            60,
-            79,
-            12,
-            49,
-            63,
-            8,
-            59,
-            1,
-            13,
-            20,
-            17,
-            48,
-            34,
-        ]
-        
-        df = ddf.from_dict({"id": cid, "kind": ckind, "sort": csort, "val": cval}, npartitions=10)
-        df = ddf.set_index("id", drop=False)
-        #df.index.name = None
-        return ddf
-    
-    def create_test_dask_df_long(self):
-        # cid = np.repeat(["A", "B"], repeats = [4,4])
-        # ckind = np.repeat(["A", "B"])
-        # csort = 
-        # cval = []
-        pass
\ No newline at end of file
+        return (wide_test_data_expected_chunked_up_tuples, window_length)
\ No newline at end of file