Issue
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
# Columns whose dtype is 'object' are treated as categorical; every other
# column of X is treated as numerical.
categorical = []
for col in X.columns:
    if X[col].dtypes == 'object':
        categorical.append(col)
numerical = list(set(X.columns) - set(categorical))

# Numerical columns: fill missing values with 0, then standardize.
numerical_processes = make_pipeline(
    SimpleImputer(strategy='constant', fill_value=0),
    StandardScaler(),
)

# Categorical columns: fill missing values with the string 'None', then map
# each category to an ordinal code.
categorical_processes = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='None'),
    OrdinalEncoder(),
)

# Fit each pipeline on its own columns and write the result back into X.
numerical_values = numerical_processes.fit_transform(X[numerical])
X[numerical] = numerical_values
categorical_values = categorical_processes.fit_transform(X[categorical])
X[categorical] = categorical_values
Here's the code I used to preprocess my data. For reference, I am trying to calculate Mutual Information for the features in a dataset where the target is the house sale price.
from sklearn.feature_selection import mutual_info_regression
def MI(X, y, categorical):
    """Compute a mutual-information score for every column of X against y.

    The scores returned by mutual_info_regression are wrapped in a pandas
    Series named 'Mutual Info' and indexed by X.columns.

    NOTE(review): `categorical` is forwarded unchanged as
    `discrete_features`. When it holds column names (strings), this call is
    the one that raises the ValueError shown in the traceback below.
    """
    mi_score = mutual_info_regression(X, y, discrete_features = categorical)
    mi_score = pd.Series(mi_score, name = 'Mutual Info', index = X.columns)
    return mi_score
This is the function I used to calculate Mutual Information.
# Score every column of X against y. `categorical` is the list of
# object-dtype column names collected during preprocessing.
mi_scores = MI(X, y, categorical)
print(mi_scores)
I should be getting a pandas series with the mutual information score for each column in the dataframe X, but I get this error instead:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[21], line 1
----> 1 mi_scores = MI(X, y, categorical)
2 print(mi_scores)
Cell In[15], line 3, in MI(X, y, categorical)
2 def MI(X, y, categorical):
----> 3 mi_score = mutual_info_regression(X, y, discrete_features = categorical)
4 mi_score = pd.Series(mi_score, name = 'Mutual Info', index = X.columns)
5 return mi_score
File /opt/conda/lib/python3.10/site-packages/sklearn/feature_selection/_mutual_info.py:388, in mutual_info_regression(X, y, discrete_features, n_neighbors, copy, random_state)
312 def mutual_info_regression(
313 X, y, *, discrete_features="auto", n_neighbors=3, copy=True, random_state=None
314 ):
315 """Estimate mutual information for a continuous target variable.
316
317 Mutual information (MI) [1]_ between two random variables is a non-negative
(...)
386 of a Random Vector", Probl. Peredachi Inf., 23:2 (1987), 9-16
387 """
--> 388 return _estimate_mi(X, y, discrete_features, False, n_neighbors, copy, random_state)
File /opt/conda/lib/python3.10/site-packages/sklearn/feature_selection/_mutual_info.py:267, in _estimate_mi(X, y, discrete_features, discrete_target, n_neighbors, copy, random_state)
265 discrete_mask.fill(discrete_features)
266 else:
--> 267 discrete_features = check_array(discrete_features, ensure_2d=False)
268 if discrete_features.dtype != "bool":
269 discrete_mask = np.zeros(n_features, dtype=bool)
File /opt/conda/lib/python3.10/site-packages/sklearn/utils/validation.py:910, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
902 raise ValueError(
903 "Expected 2D array, got 1D array instead:\narray={}.\n"
904 "Reshape your data either using array.reshape(-1, 1) if "
905 "your data has a single feature or array.reshape(1, -1) "
906 "if it contains a single sample.".format(array)
907 )
909 if dtype_numeric and array.dtype.kind in "USV":
--> 910 raise ValueError(
911 "dtype='numeric' is not compatible with arrays of bytes/strings."
912 "Convert your data to numeric values explicitly instead."
913 )
914 if not allow_nd and array.ndim >= 3:
915 raise ValueError(
916 "Found array with dim %d. %s expected <= 2."
917 % (array.ndim, estimator_name)
918 )
ValueError: dtype='numeric' is not compatible with arrays of bytes/strings.Convert your data to numeric values explicitly instead.
I have tried many times to preprocess my data, and I have double-checked it every time. I encoded all categorical variables and imputed their missing values. I imputed missing values for the numeric variables too and ran them through a standard scaler. How do I fix this error?
Solution
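As specified in the documentation available here,
the discrete_features parameter should be either a boolean mask with shape (n_features,) or an array with the indices of the discrete features.
In your code, categorical is a list of column names (strings), so scikit-learn tries to validate it as a numeric array and raises the ValueError shown above. The names have to be converted to column positions (or to a boolean mask) first.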
This is how it works:
from sklearn.feature_selection import mutual_info_regression
def MI(X, y, categorical):
    """Estimate the mutual information between each column of X and y.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; every column must already be numeric.
    y : pandas.Series or pandas.DataFrame
        Target values. Flattened to a 1D array before scoring.
    categorical : list of str
        Names of the columns of X to treat as discrete features. May be
        empty, in which case every feature is treated as continuous.

    Returns
    -------
    pandas.Series
        Scores named 'Mutual Info', indexed by X.columns.

    Raises
    ------
    KeyError
        If a name in `categorical` is not a column of X.
    """
    # Fail loudly on unknown names: a silent name-to-position lookup would
    # yield -1 for them, and -1 used as a position selects the last column.
    missing = [name for name in categorical if name not in X.columns]
    if missing:
        raise KeyError(f"columns not found in X: {missing}")

    # mutual_info_regression does not accept column names; a boolean mask of
    # shape (n_features,) is one of the documented forms and, unlike an index
    # array, stays well-formed when `categorical` is empty.
    discrete_mask = X.columns.isin(categorical)

    mi_score = mutual_info_regression(
        X,
        y.values.ravel(),
        discrete_features=discrete_mask,
        random_state=42,
    )
    return pd.Series(mi_score, name='Mutual Info', index=X.columns)
- I've also passed
y.values.ravel()
so that the target variable y
is a 1D array (a vector) rather than a DataFrame column.
- Finally, I've added
random_state=42
for reproducible results.
With this, you should have no more worries about estimating your mutual information.
Answered By - yht
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.