Issue
Can anyone explain to me why this code:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectKBest
#from xgboost import XGBClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score
from sklearn import metrics
from sklearn.datasets import make_classification
from numpy import mean
from sklearn.model_selection import train_test_split
from numpy import std
from sklearn.utils import shuffle
import numpy as np
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
import pickle
#import neptune.new as neptune
import pandas as pd
# Small in-memory sample: two numeric feature columns and one class-label column.
df = pd.DataFrame(
    {
        'Height': [167, 175, 170, 186, 190, 188, 158, 169, 183, 180],
        'Weight': [65, 70, 72, 80, 86, 94, 50, 58, 78, 85],
        'Team': ['A', 'A', 'B', 'B', 'B', 'B', 'A', 'A', 'B', 'A'],
    }
)

# Features are every column except the last one; the target is the last column.
full_X_train = df[df.columns[:-1]]
full_y_train = df[df.columns[-1]]
def create_model(X_train=full_X_train,y_train=full_y_train,model_name=SVC(kernel='linear'),n_splits=5,file_name='test_ml'):
    """Fit ``model_name`` on each stratified fold and collect per-fold F1 scores.

    For every fold the estimator is re-fitted on the training part, used to
    predict the held-out part, pickled to
    ``<file_name>.<fold number>.fold.json`` and scored with ``f1_score``.

    Args:
        X_train: Feature table; rows are selected with ``.iloc``.
        y_train: Target values aligned with ``X_train``.
        model_name: Estimator object exposing ``fit`` and ``predict``. It is
            fitted in place (no copy is made).
        n_splits: Number of folds passed to ``StratifiedKFold``.
        file_name: Prefix of the per-fold pickle files.

    Returns:
        list: One F1 score per fold, in fold order.
    """
    clf = model_name
    k_fold = StratifiedKFold(n_splits=n_splits,random_state=42,shuffle=True)
    f1 = []

    # split() yields positional indices, so select targets positionally as
    # well whenever y_train offers .iloc; plain arrays are indexed directly.
    y_rows = y_train.iloc if hasattr(y_train, 'iloc') else y_train

    # The original referenced an undefined ``count``; folds are numbered
    # from 1 here to build the per-fold file name.
    for count, (train_index, test_index) in enumerate(k_fold.split(X_train, y_train), start=1):
        x_train_fold, x_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_fold, y_test_fold = y_rows[train_index], y_rows[test_index]

        clf.fit(x_train_fold, y_train_fold)
        y_pred = clf.predict(x_test_fold)

        # The file holds pickle data despite the ".json" suffix.
        save_mod = file_name + '.' + str(count) + '.fold.json'
        with open(save_mod, 'wb') as model_file:
            pickle.dump(clf, model_file)

        # NOTE(review): f1_score is called with its defaults; for string class
        # labels such as the 'A'/'B' sample data in this file it presumably
        # needs an explicit pos_label -- confirm against the f1_score docs.
        f1.append(f1_score(y_test_fold, y_pred))
    return f1
def get_scores(model, output_file='output.txt'):
    """Append a one-line F1 summary (mean, std, count) to ``output_file``.

    Args:
        model: Indexable container whose element at index 2 is the sequence
            of F1 scores to summarise.
        output_file: Path of the text file to append to.

    Returns:
        None. The summary is only written to the file.
    """
    # NOTE(review): create_model in this file returns a flat list of F1
    # scores, so model[2] would pick a single score rather than the list;
    # confirm the expected shape of ``model`` with the caller.
    fold_scores = model[2]
    summary = 'F1: mean=%.2f std=%.2f, n=%d' % (mean(fold_scores)*100, std(fold_scores)*100, len(fold_scores)) + '\n'

    # The context manager guarantees the handle is flushed and closed.
    with open(output_file, 'a') as open_output:
        open_output.write(summary)
    return
def run_model_with_grid_search(model_name=RandomForestClassifier(),X_train=full_X_train,y_train=full_y_train,model_id='test_id', n_splits=5, output_file='', param_grid={}):
    """Run a 3-fold, accuracy-scored grid search and return the fitted search.

    ``model_id``, ``n_splits`` and ``output_file`` are accepted but not used
    by this function body.

    Args:
        model_name: Estimator handed to ``GridSearchCV``.
        X_train: Training features.
        y_train: Training targets.
        param_grid: Parameter grid forwarded to ``GridSearchCV``.

    Returns:
        tuple: ``(fitted_search, best_params_, best_score_)``.
    """
    grid_settings = dict(
        cv=3,
        param_grid=param_grid,
        scoring='accuracy',
        refit=True,
    )
    # fit() hands back the search object itself, so the call can be chained.
    fitted_search = GridSearchCV(model_name, **grid_settings).fit(X_train, y_train)
    return fitted_search, fitted_search.best_params_, fitted_search.best_score_
# Grid-search max_depth over 5..8, relying on the function's default estimator and data.
fit_model,params,best_score = run_model_with_grid_search(param_grid=[{'max_depth':list(range(5,9))}])
# fit_model is passed positionally, so it binds to create_model's first
# parameter (X_train) rather than model_name; this is the call that appears
# in the traceback below.
model = create_model(fit_model) #n_jobs=-1
# get_scores ends with a bare `return`, so this prints None.
print(get_scores(model))
Returns:
File "ml_models.py", line 84, in <module>
model = create_model(fit_model) #n_jobs=-1
File "ml_models.py", line 50, in create_model
for train_index,test_index in k_fold.split(X_train,y_train):
File "/Users/slowat/anaconda/envs/nlp_course/lib/python3.7/site-packages/sklearn/model_selection/_split.py", line 324, in split
X, y, groups = indexable(X, y, groups)
File "/Users/slowat/anaconda/envs/nlp_course/lib/python3.7/site-packages/sklearn/utils/validation.py", line 299, in indexable
check_consistent_length(*result)
File "/Users/slowat/anaconda/envs/nlp_course/lib/python3.7/site-packages/sklearn/utils/validation.py", line 259, in check_consistent_length
lengths = [_num_samples(X) for X in arrays if X is not None]
File "/Users/slowat/anaconda/envs/nlp_course/lib/python3.7/site-packages/sklearn/utils/validation.py", line 259, in <listcomp>
lengths = [_num_samples(X) for X in arrays if X is not None]
File "/Users/slowat/anaconda/envs/nlp_course/lib/python3.7/site-packages/sklearn/utils/validation.py", line 203, in _num_samples
" a valid collection." % x)
TypeError: Singleton array array(GridSearchCV(cv=3, estimator=RandomForestClassifier(),
param_grid=[{'max_depth': [5, 6, 7, 8]}], scoring='accuracy'),
dtype=object) cannot be considered a valid collection.
I have seen this answer, but I don't think this applies to me?
(In case it matters, the overall aim is to implement a grid search with feature selection in a `Pipeline` object, but I haven't figured out how to do that yet because of this error.)
Solution
You are passing `fit_model` as a positional argument to `create_model`. The `create_model` function has this signature:
create_model(X_train=full_X_train,y_train=full_y_train,model_name=SVC(kernel='linear'),n_splits=5,file_name='test_ml')
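So, currently `X_train` takes the value `fit_model` (the fitted `GridSearchCV` object) and is passed to `k_fold.split(X_train, y_train)`, which raises this error because the object is not a valid collection of samples. To fix it, pass the model as a keyword argument: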
model = create_model(model_name=fit_model)
Answered By - user2246849
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.