Issue
Could someone please demonstrate how to amend this code (which is a reproducible example):
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.feature_selection import SelectKBest
#from xgboost import XGBClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest, RFECV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score
from sklearn import metrics
from sklearn.datasets import make_classification
from numpy import mean
from sklearn.model_selection import train_test_split
from numpy import std
from sklearn.utils import shuffle
import numpy as np
from sklearn.metrics import roc_curve
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import pandas as pd
# Synthetic classification data used as the reproducible example:
# 500 samples, 20 features (10 informative, 10 redundant), fixed seed.
full_X_train, full_y_train = make_classification(
    n_samples=500,
    n_features=20,
    n_informative=10,
    n_redundant=10,
    random_state=1,
)
def run_model_with_grid_search(
    param_grid={},
    output_plt_file='plt.png',
    model_name=RandomForestClassifier(),
    X_train=full_X_train,
    y_train=full_y_train,
    model_id='random_forest_with_hpo_no_fs_geno_class',
    n_splits=5,
    output_file='random_forest_with_hpo_no_fs_geno_class.txt',
):
    """Attempt nested cross-validation with RFECV feature selection and a grid search.

    For each of 5 shuffled outer folds, an RFECV selector and a GridSearchCV
    (3 shuffled inner folds, ROC-AUC scoring) are chained in a Pipeline that
    is fitted on the outer-training split.

    NOTE: this is the version from the question and it fails as posted: the
    fitted object is a Pipeline, so reading ``best_estimator_`` from it raises
    AttributeError on the first outer fold.

    Args:
        param_grid: Grid handed to GridSearchCV.
        output_plt_file: Not used by the body shown here.
        model_name: Estimator instance used both by RFECV and by the search.
        X_train: Feature matrix, indexed positionally with NumPy-style indexing.
        y_train: Target vector, indexed positionally.
        model_id: Not used by the body shown here.
        n_splits: Not used by the body shown here (fold counts are hard-coded).
        output_file: Not used by the body shown here.

    Returns:
        None.
    """
    # Outer loop: held-out folds intended for evaluating the tuned model.
    cv_outer = KFold(n_splits=5, shuffle=True, random_state=1)
    for train_ix, test_ix in cv_outer.split(X_train):
        split_x_train = X_train[train_ix, :]  # add in .iloc
        split_x_test = X_train[test_ix, :]  # add in .iloc
        split_y_train = y_train[train_ix]  # add in .iloc
        split_y_test = y_train[test_ix]  # add in .iloc

        # Inner loop: folds used by the hyperparameter search.
        cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
        model = model_name
        rfecv = RFECV(estimator=model, step=1, cv=5, scoring='roc_auc')
        search = GridSearchCV(
            model,
            param_grid=param_grid,
            scoring='roc_auc',
            cv=cv_inner,
            refit=True,
        )

        # The grid search is placed *inside* the pipeline as its last step.
        pipeline = Pipeline([('feature_sele', rfecv), ('clf_cv', search)])
        result = pipeline.fit(split_x_train, split_y_train)
        #result = search.fit(split_x_train,split_y_train)

        # `result` is the Pipeline, not the GridSearchCV, so the next line is
        # where the reported AttributeError is raised.
        best_model = result.best_estimator_
        yhat = best_model.predict(split_x_test)
        # `accuracy` is never assigned anywhere in this function.
        print('>acc=%.3f,est=%.3f,cfg=%s' % (accuracy, result.best_score_, result.best_params_))
    return
# Single-parameter grid for the random forest, then run the example.
param_grid = [
    {'min_samples_leaf': [1, 3, 5]},
]
run_model_with_grid_search(param_grid=param_grid)
Generates:
AttributeError: 'Pipeline' object has no attribute 'best_estimator_'
The ultimate aim is to perform nested cross-validation, hyperparameter optimization and feature selection in this function. I was trying to follow this example; if someone could show me how to edit this function to do that correctly, I'd appreciate it.
Solution
Normally, you'd run grid search on the pipeline, not the pipeline on grid search. Is there a certain reason you'd want it the other way round?
# Make the classifier itself the final pipeline step, so feature selection
# and the model form one estimator.
pipeline = Pipeline([('feature_sele',rfecv), ('clf',model)])
# Grid-search the whole pipeline; the keys in param_grid then have to address
# the 'clf' step (e.g. 'clf__min_samples_leaf').
search = GridSearchCV(pipeline, param_grid=param_grid, scoring='roc_auc', cv=cv_inner, refit=True)
result = search.fit(split_x_train, split_y_train)
# `result` is now the fitted GridSearchCV, so best_estimator_ is available.
best_model = result.best_estimator_
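(The hyperparameter names in `param_grid` will then require a `clf__` prefix, i.e. the step name followed by a double underscore, e.g. `clf__min_samples_leaf`, of course.)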
On an unrelated note, `accuracy` seems to be undefined.
Answered By - dx2-66
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.