Issue
My aim is to use SHAP with cross-validation to identify the most important features for my model.
I have this code:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
import shap
import pandas as pd
import numpy as np

# Loading and preparing the data (this is the breast-cancer dataset,
# despite the variable name "iris", kept from the original post).
iris = load_breast_cancer()
X = iris.data
y = iris.target
columns = iris.feature_names

# If you don't shuffle you won't need to keep track of test_index, but it
# is always good practice to shuffle your data.
kf = KFold(n_splits=2, shuffle=True)

list_shap_values = list()
list_test_sets = list()
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    X_train = pd.DataFrame(X_train, columns=columns)
    X_test = pd.DataFrame(X_test, columns=columns)

    # Training model
    clf = RandomForestClassifier(random_state=0)
    clf.fit(X_train, y_train)

    # Explaining model: for a classifier, shap_values is a list with one
    # (n_samples, n_features) array per class.
    explainer = shap.TreeExplainer(clf)
    shap_values = explainer.shap_values(X_test)

    # For each fold we save the test-set indices and the SHAP values.
    list_shap_values.append(shap_values)
    list_test_sets.append(test_index)

# Combining results from all iterations.
# - test_set entries are 1-D index arrays, so new samples are appended
#   along axis=0.
# - np.array(shap_values) has shape (n_classes, n_samples, n_features),
#   so new samples are appended along axis=1 (the sample axis).
test_set = list_test_sets[0]
shap_values = np.array(list_shap_values[0])
for i in range(1, len(list_test_sets)):
    test_set = np.concatenate((test_set, list_test_sets[i]), axis=0)
    shap_values = np.concatenate((shap_values, np.array(list_shap_values[i])), axis=1)

# Bringing back variable names.
X_test_df = pd.DataFrame(X[test_set], columns=columns)
cols = X_test_df.columns

# Mean absolute SHAP value per feature for class 1.
sv = np.abs(shap_values[1, :, :]).mean(0)
importance_df = pd.DataFrame({
    "column_name": cols,
    "shap_values": sv,
})

# sort_values returns a NEW DataFrame; the original discarded the sorted
# result and printed the unsorted frame. Assign it back before printing.
importance_df = importance_df.sort_values("shap_values", ascending=False)
print(importance_df)
Could I please ask, have I implemented this correctly? Specifically, is this line correct?
test_set = np.concatenate((test_set,list_test_sets[i]),axis=0)
shap_values = np.concatenate((shap_values,np.array(list_shap_values[i])),axis=1)
I saw this in sample code here, but I don't understand why I would use axis 0 for the test_set and axis 1 for the shap values. I had asked a question about a bug I had here, and it came up in the comments, but I don't clearly understand the suggestion or how to implement it correctly in code.
Solution
I would do it this way:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
import shap
import pandas as pd
import numpy as np

# Loading and preparing the data (breast-cancer dataset; the name "iris"
# is a leftover from the original post).
iris = load_breast_cancer()
X = iris.data
y = iris.target
columns = iris.feature_names

# If you don't shuffle you won't need to keep track of test_index, but it
# is always good practice to shuffle your data.
kf = KFold(n_splits=2, shuffle=True)

list_shap_values = list()
list_test_sets = list()  # not used below; kept from the original scaffold
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    X_train = pd.DataFrame(X_train, columns=columns)
    X_test = pd.DataFrame(X_test, columns=columns)

    # Training model
    clf = RandomForestClassifier(random_state=0)
    clf.fit(X_train, y_train)

    # Explaining model: shap_values is a list with one
    # (n_samples, n_features) array per class.
    explainer = shap.TreeExplainer(clf)
    shap_values = explainer.shap_values(X_test)

    # For each fold we save the SHAP values.
    list_shap_values.append(shap_values)

# Flatten the list of per-fold results: pick the SHAP values for class 1
# from each fold and stack the (n_samples, n_features) arrays row-wise,
# giving one array covering every out-of-fold sample.
shap_values = np.vstack([sv[1] for sv in list_shap_values])

# Mean absolute SHAP value per feature across all out-of-fold samples.
sv = np.abs(shap_values).mean(0)  # <-- error corrected
importance_df = pd.DataFrame({
    "column_name": columns,
    "shap_values": sv,
})
Answered By - Sergey Bushmanov
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.