Issue
Everyone
I am trying to create a pipeline using scikit-learn.
Basically, I have a Jupyter notebook that loads data using pandas and splits the dataset into train and test sets for the model.
My problem occurs on the line: clf.fit(X_train, y_train)
You can see the whole code in the Jupyter notebook on my GitHub repo.
log error:
----------------------------------------------------------------------
KeyError Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2656 try:
-> 2657 return self._engine.get_loc(key)
2658 except KeyError:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'survived'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/sklearn/utils/__init__.py in _get_column_indices(X, key)
446 for col in columns:
--> 447 col_idx = all_columns.get_loc(col)
448 if not isinstance(col_idx, numbers.Integral):
~/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2658 except KeyError:
-> 2659 return self._engine.get_loc(self._maybe_cast_indexer(key))
2660 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'survived'
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
<ipython-input-16-17661ab0f723> in <module>
----> 1 clf.fit(X_train, y_train)
2 print("model score: %.3f" % clf.score(X_test, y_test))
~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
328 """
329 fit_params_steps = self._check_fit_params(**fit_params)
--> 330 Xt = self._fit(X, y, **fit_params_steps)
331 with _print_elapsed_time('Pipeline',
332 self._log_message(len(self.steps) - 1)):
~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
294 message_clsname='Pipeline',
295 message=self._log_message(step_idx),
--> 296 **fit_params_steps[name])
297 # Replace the transformer of the step with the fitted
298 # transformer. This is necessary when loading the transformer
~/anaconda3/lib/python3.7/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
350
351 def __call__(self, *args, **kwargs):
--> 352 return self.func(*args, **kwargs)
353
354 def call_and_shelve(self, *args, **kwargs):
~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
738 with _print_elapsed_time(message_clsname, message):
739 if hasattr(transformer, 'fit_transform'):
--> 740 res = transformer.fit_transform(X, y, **fit_params)
741 else:
742 res = transformer.fit(X, y, **fit_params).transform(X)
~/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
527 self._validate_transformers()
528 self._validate_column_callables(X)
--> 529 self._validate_remainder(X)
530
531 result = self._fit_transform(X, y, _fit_transform_one)
~/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in _validate_remainder(self, X)
325 cols = []
326 for columns in self._columns:
--> 327 cols.extend(_get_column_indices(X, columns))
328
329 remaining_idx = sorted(set(range(self._n_features)) - set(cols))
~/anaconda3/lib/python3.7/site-packages/sklearn/utils/__init__.py in _get_column_indices(X, key)
454 raise ValueError(
455 "A given column is not a column of the dataframe"
--> 456 ) from e
457
458 return column_indices
ValueError: A given column is not a column of the dataframe
I checked that the columns exist before passing the dataframe to the train/test split.
Does anyone have an idea of how to solve this issue?
Thanks in advance! Cheers
Solution
The error comes from the fact that you drop the column survived from the very beginning, when defining X. The pipeline's ColumnTransformer still refers to that column, so it cannot find it in X_train. You only checked its presence in y_train.
Simply replace
X= df.drop('survived', axis=1)
by
X= df
and your
clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))
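returns
model score: 1.000
Note: keeping survived in X means the target column is also available to the model as a feature, which is most likely why the score is a perfect 1.000 (target leakage). If survived is only meant to be the label, the cleaner fix is to keep X = df.drop('survived', axis=1) and instead remove 'survived' from the column lists passed to the ColumnTransformer.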
Answered By - Serge de Gosson de Varennes
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.