Issue
I am trying to call predict_proba on a scikit-learn Pipeline with a DataFrame
containing only one row. I wrote my Pipeline in the following way.
def get_email_length(email) -> int:
return len(email.split("@")[0])
def get_domain_length(email) -> int:
parts = email.split("@")
return len(parts[-1]) if len(parts) > 1 else 0
class EmailLengthTransformer(BaseEstimator, TransformerMixin):
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
return X.apply(lambda x: get_email_length(x)).values.reshape(-1, 1)
class DomainLengthTransformer(BaseEstimator, TransformerMixin):
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
return X.apply(lambda x: get_domain_length(x)).values.reshape(-1, 1)
class EmailTransformer(BaseEstimator, TransformerMixin):
def __init__(self):
self.email_transformer = FeatureUnion(
[
("email_length", EmailLengthTransformer()),
("domain_length", DomainLengthTransformer()),
]
)
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
X_tr = X.squeeze() if len(X) > 1 else X
return self.email_transformer.fit_transform(X_tr)
entities_list = ['TH', 'PH']
entities_list = list(np.array(entities_list).reshape(1, len(entities_list)))
preprocess = ColumnTransformer(
transformers=[
("email_text", EmailTransformer(), ["email"]),
("entity_cat", OneHotEncoder(sparse=False, categories=entities_list), ["global_entity_id"]),
]
)
xgb_model = XGBClassifier()
pipe = Pipeline([("preproc", preprocess), ("classifier", xgb_model)])
After this I train it, but whenever I pass a DataFrame like the following into pipe.predict_proba(test),
it fails:
d={
'email': ['[email protected]'],
'global_entity_id': ['TH']
}
test=pd.DataFrame.from_dict(d)
pipe.predict_proba(test)
I get this error:
~/.local/lib/python3.7/site-packages/sklearn/utils/metaestimators.py in <lambda>(*args, **kwargs)
118
119 # lambda, but not partial, allows help() to work with update_wrapper
--> 120 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
121 # update the docstring of the returned function
122 update_wrapper(out, self.fn)
~/.local/lib/python3.7/site-packages/sklearn/pipeline.py in predict_proba(self, X)
472 Xt = X
473 for _, name, transform in self._iter(with_final=False):
--> 474 Xt = transform.transform(Xt)
475 return self.steps[-1][-1].predict_proba(Xt)
476
~/.local/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in transform(self, X)
563 "data given during fit."
564 )
--> 565 Xs = self._fit_transform(X, None, _transform_one, fitted=True)
566 self._validate_output(Xs)
567
~/.local/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in _fit_transform(self, X, y, func, fitted)
442 message=self._log_message(name, idx, len(transformers)))
443 for idx, (name, trans, column, weight) in enumerate(
--> 444 self._iter(fitted=fitted, replace_strings=True), 1))
445 except ValueError as e:
446 if "Expected 2D array, got 1D array instead" in str(e):
~/.local/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
1002 # remaining jobs.
1003 self._iterating = False
-> 1004 if self.dispatch_one_batch(iterator):
1005 self._iterating = self._original_iterator is not None
1006
~/.local/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
833 return False
834 else:
--> 835 self._dispatch(tasks)
836 return True
837
~/.local/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch)
752 with self._lock:
753 job_idx = len(self._jobs)
--> 754 job = self._backend.apply_async(batch, callback=cb)
755 # A job can complete so quickly than its callback is
756 # called before we get here, causing self._jobs to
~/.local/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
207 def apply_async(self, func, callback=None):
208 """Schedule a func to be run"""
--> 209 result = ImmediateResult(func)
210 if callback:
211 callback(result)
~/.local/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
588 # Don't delay the application, to avoid keeping the input
589 # arguments in memory
--> 590 self.results = batch()
591
592 def get(self):
~/.local/lib/python3.7/site-packages/joblib/parallel.py in __call__(self)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~/.local/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~/.local/lib/python3.7/site-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs)
220 def __call__(self, *args, **kwargs):
221 with config_context(**self.config):
--> 222 return self.function(*args, **kwargs)
~/.local/lib/python3.7/site-packages/sklearn/pipeline.py in _transform_one(transformer, X, y, weight, **fit_params)
731
732 def _transform_one(transformer, X, y, weight, **fit_params):
--> 733 res = transformer.transform(X)
734 # if we have a weight for this transformer, multiply output
735 if weight is None:
~/data-fraud-email-susp-model/model_package/src/transformers.py in transform(self, X, y)
150
151 def transform(self, X, y=None):
--> 152 return self.email_transformer.fit_transform(X.squeeze())
~/.local/lib/python3.7/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
978 sum of n_components (output dimension) over transformers.
979 """
--> 980 results = self._parallel_func(X, y, fit_params, _fit_transform_one)
981 if not results:
982 # All transformers are None
~/.local/lib/python3.7/site-packages/sklearn/pipeline.py in _parallel_func(self, X, y, fit_params, func)
1005 message=self._log_message(name, idx, len(transformers)),
1006 **fit_params) for idx, (name, transformer,
-> 1007 weight) in enumerate(transformers, 1))
1008
1009 def transform(self, X):
~/.local/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
1002 # remaining jobs.
1003 self._iterating = False
-> 1004 if self.dispatch_one_batch(iterator):
1005 self._iterating = self._original_iterator is not None
1006
~/.local/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
833 return False
834 else:
--> 835 self._dispatch(tasks)
836 return True
837
~/.local/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch)
752 with self._lock:
753 job_idx = len(self._jobs)
--> 754 job = self._backend.apply_async(batch, callback=cb)
755 # A job can complete so quickly than its callback is
756 # called before we get here, causing self._jobs to
~/.local/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
207 def apply_async(self, func, callback=None):
208 """Schedule a func to be run"""
--> 209 result = ImmediateResult(func)
210 if callback:
211 callback(result)
~/.local/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
588 # Don't delay the application, to avoid keeping the input
589 # arguments in memory
--> 590 self.results = batch()
591
592 def get(self):
~/.local/lib/python3.7/site-packages/joblib/parallel.py in __call__(self)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~/.local/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~/.local/lib/python3.7/site-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs)
220 def __call__(self, *args, **kwargs):
221 with config_context(**self.config):
--> 222 return self.function(*args, **kwargs)
~/.local/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
752 with _print_elapsed_time(message_clsname, message):
753 if hasattr(transformer, 'fit_transform'):
--> 754 res = transformer.fit_transform(X, y, **fit_params)
755 else:
756 res = transformer.fit(X, y, **fit_params).transform(X)
~/.local/lib/python3.7/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
697 if y is None:
698 # fit method of arity 1 (unsupervised transformation)
--> 699 return self.fit(X, **fit_params).transform(X)
700 else:
701 # fit method of arity 2 (supervised transformation)
~/data-fraud-email-susp-model/model_package/src/transformers.py in transform(self, X, y)
57
58 def transform(self, X, y=None):
---> 59 return X.apply(lambda x: get_email_length(x)).values.reshape(-1, 1)
60
61
AttributeError: 'str' object has no attribute 'apply'
Solution
If you specify the column selector as a string instead of a list of strings, the ColumnTransformer will pass a Series to your transformer rather than a DataFrame: ("email_text", EmailTransformer(), "email")
This way you can also get rid of the squeeze()
call in the transform method — on a single-row, single-column DataFrame, squeeze() collapses all the way down to a scalar string, which is why the .apply call fails.
Answered By - Denis Geidman
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.