Issue
I'm using a scikit-learn `Pipeline` together with `GridSearchCV` (specifically `HalvingGridSearchCV`) for the data preprocessing and hyperparameter tuning of a deep neural network (regression).
For the preprocessing, I need a custom class that drops the highly correlated columns of the dataset. Here is my code (to be improved):
class MyDecorrelator():
    """Drop every column that is highly correlated with an earlier column.

    A column is removed when the absolute value of its correlation (as
    computed by ``X.corr()``) with any column to its left is strictly greater
    than ``threshold``.

    Known limitations of this version:

    * The class does not inherit from scikit-learn's ``BaseEstimator``, so it
      has no ``set_params`` and cannot be used inside a parameter search.
    * ``transform`` calls ``X.corr()``, so ``X`` must be a pandas DataFrame;
      a NumPy array raises ``AttributeError``.
    * The columns to drop are recomputed on every ``transform`` call from the
      data passed to that call; nothing is learned in ``fit``.
    """

    def __init__(self, threshold):
        # Absolute correlation above which the later column is dropped.
        self.threshold = threshold

    def fit(self, X, y=None):
        """Do nothing and return ``self``; all work happens in ``transform``."""
        return self

    def transform(self, X, y=None):
        """Return a new DataFrame without the highly correlated columns.

        Parameters:
            X: pandas DataFrame whose columns are examined.
            y: Ignored.

        Returns:
            A new DataFrame; ``X`` itself is left unmodified.
        """
        abs_corr = X.corr().abs()  # we are interested in absolute coeff value
        # Row i, restricted to the columns before i, is the strict lower
        # triangle: each pair of columns is therefore compared exactly once.
        correlated_features = [
            colname
            for i, colname in enumerate(abs_corr.columns)
            if (abs_corr.iloc[i, :i] > self.threshold).any()
        ]
        # Not in place: ``drop(..., inplace=True)`` returns None, which would
        # make this method return None and silently mutate the caller's data.
        return X.drop(labels=correlated_features, axis=1)
def create_model(
    input_shape=150,
    optimizer='adam',
    learn_rate=0.01,
    activation='relu',
    init='uniform',
    hidden_layers=1,
    dropout=0.5,
    hidden_size=64,
):
    """Build and compile a feed-forward regression network.

    Parameters:
        input_shape: Number of units of the first ``Dense`` layer.
        optimizer: Optimizer passed unchanged to ``compile``.
        learn_rate: Accepted but never referenced in this function.
            NOTE(review): tuning it has no effect as written - confirm intent.
        activation: Activation of the first layer and of every hidden layer.
        init: Kernel initializer of the first layer only.
        hidden_layers: Number of (``Dense``, ``Dropout``) pairs that follow
            the first layer.
        dropout: Rate given to each ``Dropout`` layer.
        hidden_size: Number of units of each hidden ``Dense`` layer.

    Returns:
        The compiled ``Sequential`` model (loss: mean absolute error).
    """
    network = Sequential()

    # First layer: the only one that receives the custom initializer.
    network.add(Dense(input_shape, activation=activation, kernel_initializer=init))

    # Hidden stack: each dense layer is followed by its own dropout layer.
    for _ in range(hidden_layers):
        network.add(Dense(hidden_size, activation=activation))
        network.add(Dropout(dropout))

    # Single linear output unit for regression.
    network.add(Dense(1, activation='linear'))

    network.compile(loss='mean_absolute_error', optimizer=optimizer)
    return network
# Full preprocessing + model pipeline; the steps are applied in the order
# listed: scaling, decorrelation, univariate feature selection, regressor.
estimator = Pipeline(
    steps=[
        ("scaler", MinMaxScaler(feature_range=(0.0, 1.0))),
        ("decorrelation", MyDecorrelator(threshold=0.9)),
        ("feature_selector", SelectKBest()),
        ("kr", KerasRegressor(build_fn=create_model)),
    ],
    verbose=True,
)
# One grid per candidate feature count: the number of features kept by the
# selector and the ``input_shape`` given to the network are always set to the
# same value, so they are varied together rather than crossed.
param_grid = [
    {
        "kr__optimizer": ["RMSprop", "Adam"],
        "kr__epochs": [100, 300],
        # "kr__init": ["uniform", "zeros", "normal"],  # currently disabled
        "kr__batch_size": [32, 128],
        "kr__learn_rate": [0.01, 0.1],
        "kr__activation": ["relu", "sigmoid"],
        "kr__dropout": [0.9, 0.1],
        "kr__hidden_layers": [2, 3],
        "kr__hidden_size": [64, 128],
        "feature_selector__score_func": [mutual_info_regression],
        "feature_selector__k": [n_features],
        "kr__input_shape": [n_features],
    }
    for n_features in (50, 100)
]
# Successive-halving search over ``param_grid`` with 5-fold cross-validation.
grid = HalvingGridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    cv=KFold(n_splits=5),
    n_jobs=-1,
    verbose=10,
)
However, when I run `grid.fit(X, Y)`, it raises the following error:
'MyDecorrelator' object has no attribute 'set_params'
Furthermore, if I change the first line to `class MyDecorrelator(BaseEstimator):`, it raises:
AttributeError: 'numpy.ndarray' object has no attribute 'corr'
How to fix it?
Update:
I have corrected the class using the solution by Comsavvy, but as a result I now get a warning: `UserWarning: One or more of the test scores are non-finite: [nan nan nan ...]`. How can this happen? It worked without the decorrelation step.
Solution
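The code runs correctly when written as below. The class inherits from `BaseEstimator` and `TransformerMixin`, the columns to drop are determined once in `fit` and stored on the estimator, and `transform` only drops those stored columns. The input is wrapped in a `pandas.DataFrame` because inside the pipeline the transformer receives a NumPy array, which has no `.corr()` method: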
class MyDecorrelator(BaseEstimator, TransformerMixin):
    """Transformer that drops columns highly correlated with an earlier column.

    During ``fit`` a column is marked for removal when the absolute value of
    its correlation with any column to its left is strictly greater than
    ``threshold``. ``transform`` then removes exactly those columns, so the
    same columns are dropped from every dataset passed to a fitted instance.

    Parameters:
        threshold: Absolute correlation above which the later column of a
            pair is dropped.

    Attributes:
        correlated_features: Set of column labels selected for removal; it is
            created by ``fit`` and does not exist before fitting.
    """

    def __init__(self, threshold):
        # Only the constructor parameter is stored here; fitted state is
        # created in ``fit``.
        self.threshold = threshold

    def fit(self, X, y=None):
        """Determine which columns of ``X`` to drop.

        Parameters:
            X: DataFrame or array-like accepted by ``pd.DataFrame``. When it
                is not already a DataFrame the column labels are the integer
                positions assigned by pandas.
            y: Ignored.

        Returns:
            ``self``.
        """
        abs_corr = pd.DataFrame(X).corr().abs()  # absolute coeff value
        # Row i, restricted to the columns before i, is the strict lower
        # triangle: each pair of columns is therefore compared exactly once.
        self.correlated_features = {
            colname
            for i, colname in enumerate(abs_corr.columns)
            if (abs_corr.iloc[i, :i] > self.threshold).any()
        }
        return self

    def transform(self, X, y=None, **kwargs):
        """Return ``X`` as a DataFrame without the columns selected in ``fit``.

        Parameters:
            X: DataFrame or array-like with the same column labels as the
                data given to ``fit``.
            y: Ignored.
            **kwargs: Ignored.

        Returns:
            A new DataFrame with the stored columns removed.

        Raises:
            AttributeError: If called before ``fit``.
        """
        return pd.DataFrame(X).drop(labels=list(self.correlated_features), axis=1)
Answered By - Gio
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.