Skip to content

could not convert string to float: 'x' - using FeatureSelector #30

@balgad

Description

@balgad

When trying to use FeatureSelector, I got the error message "could not convert string to float: 'x'".

Commands I used (Python 3.10):

from verstack import FeatureSelector
FS = FeatureSelector(objective = 'classification', auto = True)
selected_feats = FS.fit_transform(X_encoded, y)

Error call stack:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[24], line 3
      1 from verstack import FeatureSelector
      2 FS = FeatureSelector(objective = 'classification', auto = True)
----> 3 selected_feats = FS.fit_transform(X_encoded, y)

File ~\AppData\Roaming\Python\Python310\site-packages\verstack\tools.py:19, in timer.<locals>.wrapped(*args, **kwargs)
     16 @wraps(func)
     17 def wrapped(*args, **kwargs):
     18     start = time.time()
---> 19     result = func(*args, **kwargs)
     20     end = time.time()
     21     elapsed = round(end-start,5)

File ~\AppData\Roaming\Python\Python310\site-packages\verstack\FeatureSelector.py:232, in FeatureSelector.fit_transform(self, X, y, **kwargs)
    230 if self.auto:
    231     self.printer.print(f'Comparing LinearRegression and RandomForest for feature selection', order = 2)
--> 232     self._auto_linear_randomforest_selector(X, y, kwargs)
    233 else:
    234     self.printer.print(f'Running feature selection with {self._model}', order = 2)

File ~\AppData\Roaming\Python\Python310\site-packages\verstack\FeatureSelector.py:294, in FeatureSelector._auto_linear_randomforest_selector(self, X, y, kwargs)
    291 selector_rf = self._get_selector(randomforest_model, y, kwargs)
    293 self.printer.print(f'Running feature selection with {linear_model}', order = 2)
--> 294 feats_lr_flags = self._prepare_data_apply_selector(X, y, selector_lr, scale_data = True)
    296 self.printer.print(f'Running feature selection with {randomforest_model}', order = 2)
    297 feats_rf_flags = self._prepare_data_apply_selector(X, y, selector_rf, scale_data = False)

File ~\AppData\Roaming\Python\Python310\site-packages\verstack\FeatureSelector.py:251, in FeatureSelector._prepare_data_apply_selector(self, X, y, selector, scale_data)
    249 X_subset, y_subset = self._subset_data(X, y)
    250 if scale_data:
--> 251     X_subset = self._scale_data(X_subset)
    252 try:
    253     X_subset, y_subset = self._transform_data_to_float_32(X_subset, y_subset)

File ~\AppData\Roaming\Python\Python310\site-packages\verstack\FeatureSelector.py:499, in FeatureSelector._scale_data(self, X)
    497 from sklearn.preprocessing import StandardScaler
    498 scaler = StandardScaler()
--> 499 X = scaler.fit_transform(X)
    500 return X

File C:\Anaconda3\envs\python_310\lib\site-packages\sklearn\base.py:867, in TransformerMixin.fit_transform(self, X, y, **fit_params)
    863 # non-optimized default implementation; override when a better
    864 # method is possible for a given clustering algorithm
    865 if y is None:
    866     # fit method of arity 1 (unsupervised transformation)
--> 867     return self.fit(X, **fit_params).transform(X)
    868 else:
    869     # fit method of arity 2 (supervised transformation)
    870     return self.fit(X, y, **fit_params).transform(X)

File C:\Anaconda3\envs\python_310\lib\site-packages\sklearn\preprocessing\_data.py:809, in StandardScaler.fit(self, X, y, sample_weight)
    807 # Reset internal state before fitting
    808 self._reset()
--> 809 return self.partial_fit(X, y, sample_weight)

File C:\Anaconda3\envs\python_310\lib\site-packages\sklearn\preprocessing\_data.py:844, in StandardScaler.partial_fit(self, X, y, sample_weight)
    812 """Online computation of mean and std on X for later scaling.
    813 
    814 All of X is processed as a single batch. This is intended for cases
   (...)
    841     Fitted scaler.
    842 """
    843 first_call = not hasattr(self, "n_samples_seen_")
--> 844 X = self._validate_data(
    845     X,
    846     accept_sparse=("csr", "csc"),
    847     dtype=FLOAT_DTYPES,
    848     force_all_finite="allow-nan",
    849     reset=first_call,
    850 )
    851 n_features = X.shape[1]
    853 if sample_weight is not None:

File C:\Anaconda3\envs\python_310\lib\site-packages\sklearn\base.py:577, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params)
    575     raise ValueError("Validation should be done on X, y or both.")
    576 elif not no_val_X and no_val_y:
--> 577     X = check_array(X, input_name="X", **check_params)
    578     out = X
    579 elif no_val_X and not no_val_y:

File C:\Anaconda3\envs\python_310\lib\site-packages\sklearn\utils\validation.py:856, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
    854         array = array.astype(dtype, casting="unsafe", copy=False)
    855     else:
--> 856         array = np.asarray(array, order=order, dtype=dtype)
    857 except ComplexWarning as complex_warning:
    858     raise ValueError(
    859         "Complex data not supported\n{}\n".format(array)
    860     ) from complex_warning

File C:\Anaconda3\envs\python_310\lib\site-packages\pandas\core\generic.py:2070, in NDFrame.__array__(self, dtype)
   2069 def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
-> 2070     return np.asarray(self._values, dtype=dtype)

ValueError: could not convert string to float: 'x'

Could you help me understand what I am doing wrong?

Thanks,
balgad

P.S. Anyway, it's a great package! :)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions