Issue
I am trying to call predict_proba on a scikit-learn Pipeline with a DataFrame
containing only one row. I wrote my Pipeline in the following way.
def get_email_length(email) -> int:
return len(email.split("@")[0])
def get_domain_length(email) -> int:
parts = email.split("@")
return len(parts[-1]) if len(parts) > 1 else 0
class EmailLengthTransformer(BaseEstimator, TransformerMixin):
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
return X.apply(lambda x: get_email_length(x)).values.reshape(-1, 1)
class DomainLengthTransformer(BaseEstimator, TransformerMixin):
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
return X.apply(lambda x: get_domain_length(x)).values.reshape(-1, 1)
class EmailTransformer(BaseEstimator, TransformerMixin):
def __init__(self):
self.email_transformer = FeatureUnion(
[
("email_length", EmailLengthTransformer()),
("domain_length", DomainLengthTransformer()),
]
)
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
X_tr = X.squeeze() if len(X) > 1 else X
return self.email_transformer.fit_transform(X_tr)
entities_list = ['TH', 'PH']
entities_list = list(np.array(entities_list).reshape(1, len(entities_list)))
preprocess = ColumnTransformer(
transformers=[
("email_text", EmailTransformer(), ["email"]),
("entity_cat", OneHotEncoder(sparse=False, categories=entities_list), ["global_entity_id"]),
]
)
xgb_model = XGBClassifier()
pipe = Pipeline([("preproc", preprocess), ("classifier", xgb_model)])
After this I train it, but whenever I pass a DataFrame like the following into pipe.predict_proba(test),
it fails:
d={
'email': ['[email protected]'],
'global_entity_id': ['TH']
}
test=pd.DataFrame.from_dict(d)
pipe.predict_proba(test)
I get this error:
~/.local/lib/python3.7/site-packages/sklearn/utils/metaestimators.py in <lambda>(*args, **kwargs)
118
119 # lambda, but not partial, allows help() to work with update_wrapper
--> 120 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
121 # update the docstring of the returned function
122 update_wrapper(out, self.fn)
~/.local/lib/python3.7/site-packages/sklearn/pipeline.py in predict_proba(self, X)
472 Xt = X
473 for _, name, transform in self._iter(with_final=False):
--> 474 Xt = transform.transform(Xt)
475 return self.steps[-1][-1].predict_proba(Xt)
476
~/.local/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in transform(self, X)
563 "data given during fit."
564 )
--> 565 Xs = self._fit_transform(X, None, _transform_one, fitted=True)
566 self._validate_output(Xs)
567
~/.local/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in _fit_transform(self, X, y, func, fitted)
442 message=self._log_message(name, idx, len(transformers)))
443 for idx, (name, trans, column, weight) in enumerate(
--> 444 self._iter(fitted=fitted, replace_strings=True), 1))
445 except ValueError as e:
446 if "Expected 2D array, got 1D array instead" in str(e):
~/.local/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
1002 # remaining jobs.
1003 self._iterating = False
-> 1004 if self.dispatch_one_batch(iterator):
1005 self._iterating = self._original_iterator is not None
1006
~/.local/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
833 return False
834 else:
--> 835 self._dispatch(tasks)
836 return True
837
~/.local/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch)
752 with self._lock:
753 job_idx = len(self._jobs)
--> 754 job = self._backend.apply_async(batch, callback=cb)
755 # A job can complete so quickly than its callback is
756 # called before we get here, causing self._jobs to
~/.local/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
207 def apply_async(self, func, callback=None):
208 """Schedule a func to be run"""
--> 209 result = ImmediateResult(func)
210 if callback:
211 callback(result)
~/.local/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
588 # Don't delay the application, to avoid keeping the input
589 # arguments in memory
--> 590 self.results = batch()
591
592 def get(self):
~/.local/lib/python3.7/site-packages/joblib/parallel.py in __call__(self)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~/.local/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~/.local/lib/python3.7/site-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs)
220 def __call__(self, *args, **kwargs):
221 with config_context(**self.config):
--> 222 return self.function(*args, **kwargs)
~/.local/lib/python3.7/site-packages/sklearn/pipeline.py in _transform_one(transformer, X, y, weight, **fit_params)
731
732 def _transform_one(transformer, X, y, weight, **fit_params):
--> 733 res = transformer.transform(X)
734 # if we have a weight for this transformer, multiply output
735 if weight is None:
~/data-fraud-email-susp-model/model_package/src/transformers.py in transform(self, X, y)
150
151 def transform(self, X, y=None):
--> 152 return self.email_transformer.fit_transform(X.squeeze())
~/.local/lib/python3.7/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
978 sum of n_components (output dimension) over transformers.
979 """
--> 980 results = self._parallel_func(X, y, fit_params, _fit_transform_one)
981 if not results:
982 # All transformers are None
~/.local/lib/python3.7/site-packages/sklearn/pipeline.py in _parallel_func(self, X, y, fit_params, func)
1005 message=self._log_message(name, idx, len(transformers)),
1006 **fit_params) for idx, (name, transformer,
-> 1007 weight) in enumerate(transformers, 1))
1008
1009 def transform(self, X):
~/.local/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
1002 # remaining jobs.
1003 self._iterating = False
-> 1004 if self.dispatch_one_batch(iterator):
1005 self._iterating = self._original_iterator is not None
1006
~/.local/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
833 return False
834 else:
--> 835 self._dispatch(tasks)
836 return True
837
~/.local/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch)
752 with self._lock:
753 job_idx = len(self._jobs)
--> 754 job = self._backend.apply_async(batch, callback=cb)
755 # A job can complete so quickly than its callback is
756 # called before we get here, causing self._jobs to
~/.local/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
207 def apply_async(self, func, callback=None):
208 """Schedule a func to be run"""
--> 209 result = ImmediateResult(func)
210 if callback:
211 callback(result)
~/.local/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
588 # Don't delay the application, to avoid keeping the input
589 # arguments in memory
--> 590 self.results = batch()
591
592 def get(self):
~/.local/lib/python3.7/site-packages/joblib/parallel.py in __call__(self)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~/.local/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~/.local/lib/python3.7/site-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs)
220 def __call__(self, *args, **kwargs):
221 with config_context(**self.config):
--> 222 return self.function(*args, **kwargs)
~/.local/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
752 with _print_elapsed_time(message_clsname, message):
753 if hasattr(transformer, 'fit_transform'):
--> 754 res = transformer.fit_transform(X, y, **fit_params)
755 else:
756 res = transformer.fit(X, y, **fit_params).transform(X)
~/.local/lib/python3.7/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
697 if y is None:
698 # fit method of arity 1 (unsupervised transformation)
--> 699 return self.fit(X, **fit_params).transform(X)
700 else:
701 # fit method of arity 2 (supervised transformation)
~/data-fraud-email-susp-model/model_package/src/transformers.py in transform(self, X, y)
57
58 def transform(self, X, y=None):
---> 59 return X.apply(lambda x: get_email_length(x)).values.reshape(-1, 1)
60
61
AttributeError: 'str' object has no attribute 'apply'
Solution
If you specify the column selector as a string instead of a list of strings, the ColumnTransformer will pass a Series to your transformer rather than a DataFrame: ("email_text", EmailTransformer(), "email")
This way you can also get rid of the squeeze()
call in the transform method — on a single-row, single-column DataFrame, squeeze() collapses all the way down to a scalar string, which is why the .apply call fails.
Answered By - Denis Geidman
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.