Issue
For a school project I've developed an AdaBoost classifier as follows (I'm using decision stumps as weak learners):
class AdaBoost:
    def __init__(self, boosting_rounds):
        self.boosting_rounds = boosting_rounds  # number of weak learners used
        self.weak_learners = []  # [ (learner, amount of say), ... ]
        self.decision_stumps = []

    def fit(self, x, y):
        n_rows = x.shape[0]  # number of examples in training set
        attributes = x.columns  # features in training set
        # dataset example weights
        d = np.full(n_rows, 1 / n_rows)
        # preparing all decision stumps
        for a in attributes:
            # finding possible thresholds for decision stump
            values = x[a].unique()
            values.sort()
            thresholds = []
            for i in range(1, len(values)):
                thresholds.append((values[i] + values[i - 1]) / 2)
            for threshold in thresholds:
                self.decision_stumps.append(DecisionStump(a, threshold))
        for t in range(0, self.boosting_rounds):
            # choosing decision stump that minimizes weights dependent error
            min_error = float("inf")
            h = None
            for ds in self.decision_stumps:
                ds_error = 0
                for example in range(0, n_rows):
                    example_row = x.iloc[example]
                    if ds.predict(example_row) != y.iloc[example][0]:  # misclassification
                        ds_error += d[example]
                if ds_error < min_error:
                    min_error = ds_error
                    h = ds
            # amount of say
            small = 1e-10
            alpha = 0.5 * np.log((1 - min_error) / (min_error + small))  # small avoids division by 0
            # storing learner with corresponding amount of say
            self.weak_learners.append((h, alpha))
            # updating weights
            for i in range(0, len(d)):
                d[i] = d[i] * np.exp(-alpha * y.iloc[i] * h.predict(x.iloc[i]))
            z = np.sum(d)  # normalisation factor
            for i in range(0, len(d)):
                d[i] /= z

    def predict(self, x):
        s = 0  # sum of predictions
        for i in range(0, len(self.weak_learners)):
            prediction = self.weak_learners[i][0].predict(x)
            s += prediction * self.weak_learners[i][1]  # prediction * amount of say
        return int(np.sign(s))
I would like to plot the learning curve.
When I try to use sklearn.model_selection's LearningCurveDisplay, I get an error saying that my estimator has no scoring function; when I add one, the error becomes that I'm missing something else, and so on.
How should I go about plotting the learning curve of my classifier?
Solution
For an estimator to be compatible with scikit-learn, it needs to implement a few standard features: inherit from BaseEstimator, store its hyperparameters in __init__ without modifying them, return self from fit(), and expose a score() method. I've modified your original code, and it now works with LearningCurveDisplay.from_estimator(). The code block below includes the test data I was using.
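Concretely, the contract that LearningCurveDisplay relies on looks roughly like the sketch below (MyEstimator and n_rounds are placeholder names, not part of the answer): hyperparameters are stored untouched in __init__ so that BaseEstimator can clone the estimator, fit() returns self, and score() is available when no explicit scoring argument is passed.

from sklearn.base import BaseEstimator

class MyEstimator(BaseEstimator):
    def __init__(self, n_rounds=10):
        self.n_rounds = n_rounds  # store hyperparameters only; no validation or renaming

    def fit(self, X, y):
        # learn state here and store it on self, then return self so that
        # clone(estimator).fit(X, y) chains work inside learning_curve
        return self

    def predict(self, X):
        ...  # return predictions for X

    def score(self, X, y):
        ...  # return a single "higher is better" number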
Quick assessment of a single estimator to see if it works: [plot omitted]
Learning curve display: [plot omitted]
As your implementation evaluates every candidate stump (roughly one per sample per feature) against every sample in every boosting round, the fitting time grows roughly quadratically with the number of samples, on the order of boosting_rounds × n_features × n² stump evaluations. I've run it with 100 samples. You may find it prohibitively slow if you have lots of samples.
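For a rough sense of scale: with N = 100 samples and 3 features there are about 3 × 99 ≈ 300 candidate stumps, each evaluated on 100 rows, i.e. roughly 30,000 weighted comparisons per boosting round; at N = 1,000 that becomes roughly 3,000,000 per round, a hundredfold increase for ten times the data.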
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator

# Test data
N = 100
np.random.seed(0)
x = pd.DataFrame({
    'feat0': np.linspace(0, 1, N) + np.random.randn(N) / 5,
    'feat1': np.sin(np.linspace(0, 1, N)) ** 2 + np.random.randn(N) / 5,
    'feat2': np.sin(np.linspace(0, 1, N)) ** 0.5 + np.random.randn(N) / 5
})
y = pd.DataFrame({'label': [1] * (N // 2) + [0] * (N // 2)})

# Define the DecisionStump class
class DecisionStump:
    def __init__(self, a, threshold):
        self.a = a
        self.threshold = threshold

    def predict(self, row):
        return np.where(row[self.a] < self.threshold, 1, 0)

# Modified version of the OP's AdaBoost
class AdaBoost(BaseEstimator):
    def __init__(self, boosting_rounds):
        # only store hyperparameters here, so BaseEstimator can clone the estimator
        self.boosting_rounds = boosting_rounds  # number of weak learners used

    def fit(self, x, y):
        self.feature_names_in_ = np.array(x.columns)
        self.n_features_in_ = x.shape[1]
        self.weak_learners = []  # [ (learner, amount of say), ... ]
        self.decision_stumps = []
        n_rows = x.shape[0]  # number of examples in training set
        attributes = x.columns  # features in training set
        # dataset example weights
        d = np.full(n_rows, 1 / n_rows)
        # preparing all decision stumps
        for a in attributes:
            # finding possible thresholds for decision stump
            values = x[a].unique()
            values.sort()
            thresholds = []
            for i in range(1, len(values)):
                thresholds.append((values[i] + values[i - 1]) / 2)
            for threshold in thresholds:
                self.decision_stumps.append(DecisionStump(a, threshold))
        for t in range(0, self.boosting_rounds):
            # choosing the decision stump that minimizes the weighted error
            min_error = float("inf")
            h = None
            for ds in self.decision_stumps:
                ds_error = 0
                for example in range(0, n_rows):
                    example_row = x.iloc[example]
                    if ds.predict(example_row) != y.iloc[example, 0]:  # misclassification
                        ds_error += d[example]
                if ds_error < min_error:
                    min_error = ds_error
                    h = ds
            # amount of say: alpha = 0.5 * ln((1 - error) / error)
            small = 1e-10
            alpha = 0.5 * np.log((1 - min_error) / (min_error + small))  # small avoids division by 0
            # storing learner with corresponding amount of say
            self.weak_learners.append((h, alpha))
            # updating weights
            for i in range(0, len(d)):
                d[i] = d[i] * np.exp(-alpha * y.iloc[i, 0] * h.predict(x.iloc[i]))
            z = np.sum(d)  # normalisation factor
            for i in range(0, len(d)):
                d[i] /= z
        return self  # fit() must return self for sklearn compatibility

    def predict(self, x):
        s = 0  # sum of predictions
        for i in range(0, len(self.weak_learners)):
            prediction = self.weak_learners[i][0].predict(x)
            s += prediction * self.weak_learners[i][1]  # prediction * amount of say
        return np.sign(s).astype(int)

    def score(self, x, y):
        predictions = self.predict(x)
        accuracy = 100 * (predictions == y.values.ravel()).sum() / len(y)
        return accuracy.item()
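# Optional speed-up (not part of the original answer): the per-row error loop
# in fit() can be vectorized so that one stump is evaluated against every row
# at once; a sketch of an equivalent weighted-error computation:
def weighted_error(ds, x, y, d):
    preds = np.where(x[ds.a].values < ds.threshold, 1, 0)  # stump predictions for all rows
    return d[preds != y.values.ravel()].sum()  # total weight of misclassified rows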
# Test it with a single run
adb = AdaBoost(boosting_rounds=50).fit(x, y)

from matplotlib import pyplot as plt

predictions = adb.predict(x)
accuracy = adb.score(x, y)

# jitter the 0/1 values slightly so overlapping points stay visible
plt.scatter(y.values.ravel() + np.random.uniform(size=N) / 5,
            predictions + np.random.uniform(size=N) / 5,
            marker='.', s=70, alpha=0.6)
plt.xlabel('y label')
plt.ylabel('predicted')
plt.yticks([0, 1])
plt.xticks([0, 1])
plt.title(f'boosting_rounds={adb.boosting_rounds} | accuracy={round(accuracy, 1)}%')
plt.show()
Plot the learning curve:
# Plot the learning curve
from sklearn.model_selection import LearningCurveDisplay

LearningCurveDisplay.from_estimator(
    AdaBoost(boosting_rounds=50),
    x,
    y,
)
plt.show()
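If you want more control over the evaluation, from_estimator() also accepts the usual learning_curve arguments; the values below (cv, train_sizes, scoring) are just illustrative choices, not part of the original answer:

LearningCurveDisplay.from_estimator(
    AdaBoost(boosting_rounds=50),
    x,
    y,
    cv=5,  # 5-fold cross-validation
    train_sizes=np.linspace(0.2, 1.0, 5),  # fractions of the training set to try
    scoring='accuracy',  # sklearn's accuracy (0-1) instead of the custom score()
)
plt.show()

Note that passing scoring='accuracy' plots scores on a 0-1 scale, whereas the custom score() method above returns a percentage.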
Answered By - some3128