Issue
I'm trying to fit a DataFrame with scikit-learn's DecisionTreeClassifier using the following code, but I get the error `Length of feature_names, 9 does not match number of features, 8`.
The decision tree seems to have been fitted only on the categorical features after they were transformed by one-hot encoding, not on the numerical feature. How can I include the numerical feature in the decision tree model?
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import tree
from matplotlib import pyplot as plt
import graphviz
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.linear_model import LinearRegression
# Minimal reproduction data: two string columns, one integer column and a
# binary target. Note that 'NaN' in `brand` is a plain string, not a missing
# value, so it is one-hot encoded as its own category.
df = pd.DataFrame({'brand' : ['aaaa', 'asdfasdf', 'sadfds', 'NaN'],
'category' : ['asdf','asfa','asdfas','as'],
'num1' : [1, 1, 0, 0] ,
'target' : [1,0,0,1]})
df
# Separate the label column from the feature columns.
dtarget=df['target']
dfeatures=df.drop('target', axis=1)
# Split the feature column names by dtype: int64 columns go to `num`,
# object (string) columns go to `cat`.
num = dfeatures.select_dtypes(include=["int64"]).columns.tolist()
cat = dfeatures.select_dtypes(include=["object"]).columns.tolist()
# Only the `cat` columns are listed in the transformer; the `num` columns are
# not assigned to any transformer here.
transformer = ColumnTransformer(
transformers=[
("cat", OneHotEncoder(), cat),
]
)
clf= DecisionTreeClassifier(criterion="entropy", max_depth = 5)
# Chain the column transformer and the classifier; the step names are used
# below to look the fitted transformer up again.
pipe = Pipeline(steps=[
('onehotenc', transformer),
('decisiontree', clf)
])
#Fit the training data to the pipeline
pipe.fit(dfeatures, dtarget)
# Inspect the transformer's output column names. The trailing comma turns this
# expression into a 1-tuple; the value is not stored or used.
pipe.named_steps['onehotenc'].get_feature_names_out().tolist(),
# Export the fitted tree as Graphviz DOT source.
# `feature_names` is `num` concatenated with the transformer's output names.
# Per the reported error this yields 9 names while the tree was fitted on
# 8 features: the transformer above only emits the one-hot columns, so `num1`
# never reaches the classifier.
# NOTE(review): export_graphviz documents `class_names` as being in ascending
# class order; with targets 0 and 1 the list ['1', '0'] looks reversed - confirm.
dot_data= tree.export_graphviz(clf,
out_file=None,
feature_names = num + pipe.named_steps['onehotenc'].get_feature_names_out().tolist(),
class_names= ['1', '0'],
filled = True)
Solution
The numeric feature isn't in your transformer, so ColumnTransformer drops it (its default is remainder='drop'). Since you don't want to make any changes to it, let it pass through. You can either explicitly define the passthrough columns or set remainder='passthrough'. Using remainder is fine if you know that's the only other column that could ever be sent to the model.
# One-hot encode the categorical columns and forward every column that is not
# listed in a transformer unchanged instead of dropping it.
transformer = ColumnTransformer(
    [("cat", OneHotEncoder(), cat)],
    remainder='passthrough',
)
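With this, you will see that your feature names include the num1 column. Because get_feature_names_out() now already covers every column the tree is fitted on, pass that list on its own as feature_names to export_graphviz (without prepending num); otherwise the lengths will still not match.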
# Look up the fitted ColumnTransformer step by name and list its output columns.
fitted_transformer = pipe.named_steps['onehotenc']
fitted_transformer.get_feature_names_out().tolist()
Output
['cat__brand_NaN',
'cat__brand_aaaa',
'cat__brand_asdfasdf',
'cat__brand_sadfds',
'cat__category_as',
'cat__category_asdf',
'cat__category_asdfas',
'cat__category_asfa',
'remainder__num1']
Answered By - Chris
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.