Issue
I replicated a `TextNormalizer` class from this book like this:
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin
class TextNormalizer(BaseEstimator, TransformerMixin):
    """Split text into lower-cased tokens and drop stopwords.

    NOTE: this is the version from the question. ``__init__`` accepts
    ``language`` but does not store it on the instance, which is what makes
    ``repr()`` of an instance fail with the AttributeError shown below.
    """

    def __init__(self, language='english'):
        # `language` is only used to look up the stopword list; it is NOT
        # saved as `self.language`.
        self.stopwords = set(nltk.corpus.stopwords.words(language))
        # Created here but not used by any method in this snippet.
        self.lemmatizer = WordNetLemmatizer()

    def remove_concat(self, narrative):
        """Return *narrative* with every '-', '_' and '+' replaced by a space."""
        joiners = ''.join(['-', '_', '+'])
        # Escape the characters so they are literal inside the character class.
        pattern = '[{}]'.format(re.escape(joiners))
        return re.sub(pattern, ' ', narrative)

    def process_narrative(self, narrative):
        """Return the lower-cased, non-stopword tokens of *narrative* as a list."""
        words = nltk.word_tokenize(self.remove_concat(narrative))
        lowered = (word.lower() for word in words)
        return [word for word in lowered if word not in self.stopwords]
I wanted to learn the code step by step by testing it like this
tn = TextNormalizer()
# A bare name as the last line of a notebook cell makes IPython display repr(tn).
tn
The following error occurred
AttributeError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\IPython\core\formatters.py in __call__(self, obj, include, exclude)
968
969 if method is not None:
--> 970 return method(include=include, exclude=exclude)
971 return None
972 else:
~\Anaconda3\lib\site-packages\sklearn\base.py in _repr_mimebundle_(self, **kwargs)
462 def _repr_mimebundle_(self, **kwargs):
463 """Mime bundle used by jupyter kernels to display estimator"""
--> 464 output = {"text/plain": repr(self)}
465 if get_config()["display"] == 'diagram':
466 output["text/html"] = estimator_html_repr(self)
~\Anaconda3\lib\site-packages\sklearn\base.py in __repr__(self, N_CHAR_MAX)
258 n_max_elements_to_show=N_MAX_ELEMENTS_TO_SHOW)
259
--> 260 repr_ = pp.pformat(self)
261
262 # Use bruteforce ellipsis when there are a lot of non-blank characters
~\Anaconda3\lib\pprint.py in pformat(self, object)
151 def pformat(self, object):
152 sio = _StringIO()
--> 153 self._format(object, sio, 0, 0, {}, 0)
154 return sio.getvalue()
155
~\Anaconda3\lib\pprint.py in _format(self, object, stream, indent, allowance, context, level)
168 self._readable = False
169 return
--> 170 rep = self._repr(object, context, level)
171 max_width = self._width - indent - allowance
172 if len(rep) > max_width:
~\Anaconda3\lib\pprint.py in _repr(self, object, context, level)
402
403 def _repr(self, object, context, level):
--> 404 repr, readable, recursive = self.format(object, context.copy(),
405 self._depth, level)
406 if not readable:
~\Anaconda3\lib\site-packages\sklearn\utils\_pprint.py in format(self, object, context, maxlevels, level)
178
179 def format(self, object, context, maxlevels, level):
--> 180 return _safe_repr(object, context, maxlevels, level,
181 changed_only=self._changed_only)
182
~\Anaconda3\lib\site-packages\sklearn\utils\_pprint.py in _safe_repr(object, context, maxlevels, level, changed_only)
423 recursive = False
424 if changed_only:
--> 425 params = _changed_params(object)
426 else:
427 params = object.get_params(deep=False)
~\Anaconda3\lib\site-packages\sklearn\utils\_pprint.py in _changed_params(estimator)
89 estimator with non-default values."""
90
---> 91 params = estimator.get_params(deep=False)
92 init_func = getattr(estimator.__init__, 'deprecated_original',
93 estimator.__init__)
~\Anaconda3\lib\site-packages\sklearn\base.py in get_params(self, deep)
193 out = dict()
194 for key in self._get_param_names():
--> 195 value = getattr(self, key)
196 if deep and hasattr(value, 'get_params'):
197 deep_items = value.get_params().items()
AttributeError: 'TextNormalizer' object has no attribute 'language'
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\IPython\core\formatters.py in __call__(self, obj)
700 type_pprinters=self.type_printers,
701 deferred_pprinters=self.deferred_printers)
--> 702 printer.pretty(obj)
703 printer.flush()
704 return stream.getvalue()
~\Anaconda3\lib\site-packages\IPython\lib\pretty.py in pretty(self, obj)
392 if cls is not object \
393 and callable(cls.__dict__.get('__repr__')):
--> 394 return _repr_pprint(obj, self, cycle)
395
396 return _default_pprint(obj, self, cycle)
~\Anaconda3\lib\site-packages\IPython\lib\pretty.py in _repr_pprint(obj, p, cycle)
698 """A pprint that just redirects to the normal repr function."""
699 # Find newlines and replace them with p.break_()
--> 700 output = repr(obj)
701 lines = output.splitlines()
702 with p.group():
~\Anaconda3\lib\site-packages\sklearn\base.py in __repr__(self, N_CHAR_MAX)
258 n_max_elements_to_show=N_MAX_ELEMENTS_TO_SHOW)
259
--> 260 repr_ = pp.pformat(self)
261
262 # Use bruteforce ellipsis when there are a lot of non-blank characters
~\Anaconda3\lib\pprint.py in pformat(self, object)
151 def pformat(self, object):
152 sio = _StringIO()
--> 153 self._format(object, sio, 0, 0, {}, 0)
154 return sio.getvalue()
155
~\Anaconda3\lib\pprint.py in _format(self, object, stream, indent, allowance, context, level)
168 self._readable = False
169 return
--> 170 rep = self._repr(object, context, level)
171 max_width = self._width - indent - allowance
172 if len(rep) > max_width:
~\Anaconda3\lib\pprint.py in _repr(self, object, context, level)
402
403 def _repr(self, object, context, level):
--> 404 repr, readable, recursive = self.format(object, context.copy(),
405 self._depth, level)
406 if not readable:
~\Anaconda3\lib\site-packages\sklearn\utils\_pprint.py in format(self, object, context, maxlevels, level)
178
179 def format(self, object, context, maxlevels, level):
--> 180 return _safe_repr(object, context, maxlevels, level,
181 changed_only=self._changed_only)
182
~\Anaconda3\lib\site-packages\sklearn\utils\_pprint.py in _safe_repr(object, context, maxlevels, level, changed_only)
423 recursive = False
424 if changed_only:
--> 425 params = _changed_params(object)
426 else:
427 params = object.get_params(deep=False)
~\Anaconda3\lib\site-packages\sklearn\utils\_pprint.py in _changed_params(estimator)
89 estimator with non-default values."""
90
---> 91 params = estimator.get_params(deep=False)
92 init_func = getattr(estimator.__init__, 'deprecated_original',
93 estimator.__init__)
~\Anaconda3\lib\site-packages\sklearn\base.py in get_params(self, deep)
193 out = dict()
194 for key in self._get_param_names():
--> 195 value = getattr(self, key)
196 if deep and hasattr(value, 'get_params'):
197 deep_items = value.get_params().items()
AttributeError: 'TextNormalizer' object has no attribute 'language'
Although the class `TextNormalizer` threw the above error when I tried to instantiate it, it worked when applied to text like this:
# Assumes pandas is already imported as pd (the import is not shown in this snippet).
df = pd.DataFrame({'description': ['My order_number is A-08', 'It cost me +$80.00']})
tn = TextNormalizer()
# Only process_narrative is called on tn here; nothing asks for repr(tn).
df['description'].apply(tn.process_narrative)
Which produced this output
0 [order, number, 08]
1 [cost, $, 80.00]
Name: description, dtype: object
Can someone please explain what is happening? I mean, it worked even though it seemed broken. What's the reason for this "phenomenon"?
Solution
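I also had to download the stopwords, but setting `self.language = language` in `__init__` and then using either `language` or `self.language` to retrieve the correct list resolves the error.
Removing the `language` parameter and all lines using it also runs without issue, so the problem is that the parameter is declared in `__init__` but never stored as an attribute of the same name: as the traceback shows, scikit-learn's `get_params` looks up each `__init__` parameter with `getattr(self, key)`.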
import nltk
import pandas as pd
nltk.download('stopwords')
nltk.download('punkt')
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin
class TextNormalizer(BaseEstimator, TransformerMixin):
    """Hold a stopword set and a lemmatizer for the chosen language.

    The ``language`` argument is stored as ``self.language`` so that an
    attribute with the same name as the ``__init__`` parameter exists on
    the instance.
    """

    def __init__(self, language='english'):
        # Keep the constructor argument under its own name.
        self.language = language
        stopword_list = nltk.corpus.stopwords.words(self.language)
        self.stopwords = set(stopword_list)
        self.lemmatizer = WordNetLemmatizer()
test = TextNormalizer()
# Displaying the instance evaluates repr(test); self.language is now set.
test
Addendum
The reason `tn` from the OP raises an error is that `language` is never stored on the `TextNormalizer` object when it is instantiated, and that value is needed to build the object's representation of itself.
When the single line:
tn
is run, this does not instantiate the object `tn`, which must already exist. It evaluates and displays the result of the object's `__repr__` method (which `TextNormalizer` inherits from scikit-learn's `BaseEstimator`, as the `sklearn\base.py` frames in the traceback show). No error arises when the instance is created because `__repr__` is not evaluated until you run `tn` on its own, or call `repr(tn)`.
df['description'].apply(tn.process_narrative)
does not throw an error because tn
already exists, and its __repr__
method is not being used.
You can fix this by assigning self.language
as discussed above, or doing this after the item is created:
tn = TextNormalizer()
# Add the missing attribute by hand so the getattr(self, 'language') lookup succeeds.
tn.language = 'english'
tn
Answered By - baileythegreen
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.