Issue
I have the following code:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
# Based on the following which has more examples:
# http://nbviewer.jupyter.org/github/michelleful/SingaporeRoadnameOrigins/blob/master/notebooks/04%20Adding%20features%20with%20Pipelines.ipynb
# http://michelleful.github.io/code-blog//2015/06/18/classifying-roads/
# http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
# https://stackoverflow.com/questions/49466193/how-to-add-a-feature-to-a-vectorized-data-set/49501769#49501769
# Load ANSI file into pandas dataframe.
df = pd.read_csv(r'e:/work/python/papf.txt', encoding = 'latin1', usecols=['LAST_NAME', 'RACE'])
# Convert last name to lower case.
df['LAST_NAME'] = df['LAST_NAME'].str.lower()
# Remove the last name spaces.
# df['LAST_NAME'] = df['LAST_NAME'].str.replace(' ', '')
# Remove all rows where race is NOT in African, Coloured, White, Indian.
df = df.drop(df[~df['RACE'].isin(['African', 'Coloured', 'White', 'Indian'])].index)
# Returns a column from the dataframe named df as a numpy array of type string.
class TextExtractor(BaseEstimator, TransformerMixin):
"""Adapted from code by @zacstewart
https://github.com/zacstewart/kaggle_seeclickfix/blob/master/estimator.py
Also see Zac Stewart's excellent blogpost on pipelines:
http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
"""
def __init__(self, column_name):
self.column_name = column_name
def transform(self, df):
# Select the relevant column and return it as a numpy array.
# Set the array type to be string.
return np.asarray(df[self.column_name]).astype(str) # This refers to the df passed as a parameter, and not to the global scope one.
def fit(self, *_):
return self
class Apply(BaseEstimator, TransformerMixin):
"""Takes in a function and applies it element-wise to every element in the numpy array it's supplied with."""
def __init__(self, fn):
self.fn = np.vectorize(fn)
def transform(self, data):
# Note: reshaping is necessary because otherwise sklearn
# interprets the 1-d array as a single sample.
return self.fn(data.reshape(data.size, 1))
def fit(self, *_):
return self
class AverageWordLengthExtractor(BaseEstimator, TransformerMixin):
"""Takes in dataframe, extracts last name column, outputs average word length"""
def __init__(self):
pass
def average_word_length(self, name):
"""Helper code to compute average word length of a name"""
return np.mean([len(word) for word in name.split()])
def transform(self, df, y=None):
"""The workhorse of this feature extractor"""
return df['LAST_NAME'].apply(self.average_word_length) # This refers to the df passed as a parameter, and not to the global scope one.
def fit(self, df, y=None):
"""Returns self unless something different happens in train and test"""
return self
# Let's pick the same random 10% of the data to train with.
random.seed(1965)
train_test_set = df.loc[random.sample(list(df.index.values), int(len(df) / 10))]
# X = train_test_set[['road_name', 'has_malay_road_tag']]
X = train_test_set[['LAST_NAME']]
y = train_test_set['RACE']
vect = CountVectorizer(ngram_range=(1,4), analyzer='char')
clf = LinearSVC() # #MultinomialNB() #linear_model.SGDClassifier(max_iter=500)
pipeline = Pipeline([
('name_extractor', TextExtractor('LAST_NAME')), # Extract names from df.
('text_features', FeatureUnion([
('vect', vect), # Extract ngrams from names.
('num_words', Apply(lambda s: len(s.split()))), # Number of words.
('ave_word_length', Apply(lambda s: np.mean([len(w) for w in s.split()]))), # Average word length.
])),
('clf' , clf), # Feed the output through a classifier.
])
def run_experiment(X, y, pipeline, num_expts=100):
scores = list()
for i in range(num_expts):
X_train, X_test, y_train, y_true = train_test_split(X, y)
model = pipeline.fit(X_train, y_train) # Train the classifier.
y_test = model.predict(X_test) # Apply the model to the test data.
#print(X_test)
#print(type(X_test))
score = accuracy_score(y_test, y_true) # Compare the results to the gold standard.
scores.append(score)
print(sum(scores) / num_expts)
# Run x times (num_expts) and get the average accuracy.
run_experiment(X, y, pipeline, 1)
# Train a final model for use in the actual output.
X_train, X_test, y_train, y_true = train_test_split(X, y)
model = pipeline.fit(X_train, y_train) # Train the classifier.
df2 = pd.DataFrame(columns=['LAST_NAME'], data=[['Joemat']]) # Create a test case of one.
print(model.predict(df2))
# Solution to this part might be here: https://stackoverflow.com/questions/49466193/how-to-add-a-feature-to-a-vectorized-data-set/49501769#49501769
pg = {'clf__C': [0.1, 1, 10, 100]}
grid = GridSearchCV(pipeline, param_grid=pg, cv=5)
X_train, X_test, y_train, y_true = train_test_split(X, y)
grid.fit(X_train, y_train)
print(grid.best_params_)
# {'clf__C': 0.1}
print(grid.best_score_)
# 0.702290076336
This code works fine until I add the last part with the GridSearchCV, at which point it throws the following exception:
Traceback (most recent call last):
File "e:\Work\Python\name_train5.py", line 132, in <module>
grid.fit(X_train, y_train)
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 945, in fit
return self._fit(X, y, groups, ParameterGrid(self.param_grid))
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 550, in _fit
base_estimator = clone(self.estimator)
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py", line 69, in clone
new_object_params[name] = clone(param, safe=False)
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py", line 57, in clone
return estimator_type([clone(e, safe=safe) for e in estimator])
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py", line 57, in <listcomp>
return estimator_type([clone(e, safe=safe) for e in estimator])
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py", line 57, in clone
return estimator_type([clone(e, safe=safe) for e in estimator])
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py", line 57, in <listcomp>
return estimator_type([clone(e, safe=safe) for e in estimator])
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py", line 69, in clone
new_object_params[name] = clone(param, safe=False)
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py", line 57, in clone
return estimator_type([clone(e, safe=safe) for e in estimator])
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py", line 57, in <listcomp>
return estimator_type([clone(e, safe=safe) for e in estimator])
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py", line 57, in clone
return estimator_type([clone(e, safe=safe) for e in estimator])
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py", line 57, in <listcomp>
return estimator_type([clone(e, safe=safe) for e in estimator])
File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py", line 126, in clone
(estimator, name))
RuntimeError: Cannot clone object Apply(fn=<numpy.lib.function_base.vectorize object at 0x00000201E64780B8>), as the constructor does not seem to set parameter fn
I have found this similar error on stack overflow, but sadly I don't understand the answer. Could someone shed some light as to what I am doing wrong?
Example CSV data:
LAST_NAME,RACE
Ramaepadi,African
Motsamai,African
Van Rooyen,White
Khan,Asian
Du Plessis,White
Singh,Asian
Madlanga,African
Janse van Rensburg,
Solution
The constructor is supposed to store parameters as attributes and nothing else.
All the action should take place in fit
and transform
.
See below a minimal working example
from sklearn.datasets import make_regression
from sklearn.base import BaseEstimator, TransformerMixin
class Apply(BaseEstimator, TransformerMixin):
def __init__(self, fn):
self.fn = fn
self.fn_vectorized = None
def transform(self, data):
# Note: reshaping is necessary because otherwise sklearn
# interprets the 1-d array as a single sample.
return self.fn_vectorized(data.reshape(data.size, 1))
def fit(self, *_):
self.fn_vectorized = np.vectorize(self.fn)
return self
X, y = make_regression(n_features=1)
model = Apply(lambda x: 0 * x)
model.fit_transform(X)
Outputs
array([[ 0.],
[ 0.],
[ 0.], ...
Btw things work without np.vectorize
too.
Answered By - Jan K
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.