Issue
For a school project I've developed an AdaBoost classifier as follows (I'm using decision stumps as weak learners):
class AdaBoost:
    def __init__(self, boosting_rounds):
        self.boosting_rounds = boosting_rounds  # number of weak learners used
        self.weak_learners = []  # [ (learner, amount of say), ... ]
        self.decision_stumps = []

    def fit(self, x, y):
        n_rows = x.shape[0]  # number of examples in training set
        attributes = x.columns  # features in training set
        # dataset example weights
        d = np.full(n_rows, 1 / n_rows)
        # preparing all decision stumps
        for a in attributes:
            # finding possible thresholds for decision stump
            values = x[a].unique()
            values.sort()
            thresholds = []
            for i in range(1, len(values)):
                thresholds.append((values[i] + values[i - 1]) / 2)
            for threshold in thresholds:
                self.decision_stumps.append(DecisionStump(a, threshold))
        for t in range(0, self.boosting_rounds):
            # choosing decision stump that minimizes weights dependent error
            min_error = float("inf")
            h = None
            for ds in self.decision_stumps:
                ds_error = 0
                for example in range(0, n_rows):
                    example_row = x.iloc[example]
                    if ds.predict(example_row) != y.iloc[example][0]:  # misclassification
                        ds_error += d[example]
                if ds_error < min_error:
                    min_error = ds_error
                    h = ds
            # amount of say
            small = 1e-10
            alpha = 0.5 * np.log((1 - min_error) / (min_error + small))  # small avoids division by 0
            # storing learner with corresponding amount of say
            self.weak_learners.append((h, alpha))
            # updating weights
            for i in range(0, len(d)):
                d[i] = d[i] * np.exp(-alpha * y.iloc[i] * h.predict(x.iloc[i]))
            z = np.sum(d)  # normalisation factor
            for i in range(0, len(d)):
                d[i] /= z

    def predict(self, x):
        s = 0  # sum of predictions
        for i in range(0, len(self.weak_learners)):
            prediction = self.weak_learners[i][0].predict(x)
            s += prediction * self.weak_learners[i][1]  # prediction * amount of say
        return int(np.sign(s))
I would like to plot the learning curve.
When I try to use sklearn.model_selection's LearningCurveDisplay, I get an error saying that my estimator has no scoring function; when I add one, the error becomes that I'm missing something else, and so on.
How should I go about plotting the learning curve of my classifier?
Solution
For an estimator to be compatible with scikit-learn, it needs to implement a few standard features: inherit from BaseEstimator, store its hyperparameters in __init__ without modifying them, return self from fit(), and expose a score() method. I've modified your original code, and it now works with LearningCurveDisplay.from_estimator(). The code block below includes the test data I was using.
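Concretely, the contract that LearningCurveDisplay relies on looks roughly like the sketch below (MyEstimator and n_rounds are placeholder names, not part of the answer): hyperparameters are stored untouched in __init__ so that BaseEstimator can clone the estimator, fit() returns self, and score() is available when no explicit scoring argument is passed.

from sklearn.base import BaseEstimator

class MyEstimator(BaseEstimator):
    def __init__(self, n_rounds=10):
        self.n_rounds = n_rounds  # store hyperparameters only; no validation or renaming

    def fit(self, X, y):
        # learn state here and store it on self, then return self so that
        # clone(estimator).fit(X, y) chains work inside learning_curve
        return self

    def predict(self, X):
        ...  # return predictions for X

    def score(self, X, y):
        ...  # return a single "higher is better" number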
Quick assessment of a single estimator to see if it works: [plot omitted]
Learning curve display: [plot omitted]
As your implementation evaluates every candidate stump (roughly one per sample per feature) against every sample in every boosting round, the fitting time grows roughly quadratically with the number of samples, on the order of boosting_rounds × n_features × n² stump evaluations. I've run it with 100 samples. You may find it prohibitively slow if you have lots of samples.
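For a rough sense of scale: with N = 100 samples and 3 features there are about 3 × 99 ≈ 300 candidate stumps, each evaluated on 100 rows, i.e. roughly 30,000 weighted comparisons per boosting round; at N = 1,000 that becomes roughly 3,000,000 per round, a hundredfold increase for ten times the data.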
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator

# Test data
N = 100
np.random.seed(0)
x = pd.DataFrame({
    'feat0': np.linspace(0, 1, N) + np.random.randn(N) / 5,
    'feat1': np.sin(np.linspace(0, 1, N)) ** 2 + np.random.randn(N) / 5,
    'feat2': np.sin(np.linspace(0, 1, N)) ** 0.5 + np.random.randn(N) / 5
})
y = pd.DataFrame({'label': [1] * (N // 2) + [0] * (N // 2)})

# Define the DecisionStump class
class DecisionStump:
    def __init__(self, a, threshold):
        self.a = a
        self.threshold = threshold

    def predict(self, row):
        return np.where(row[self.a] < self.threshold, 1, 0)

# Modified version of the OP's AdaBoost
class AdaBoost(BaseEstimator):
    def __init__(self, boosting_rounds):
        # only store hyperparameters here, so BaseEstimator can clone the estimator
        self.boosting_rounds = boosting_rounds  # number of weak learners used

    def fit(self, x, y):
        self.feature_names_in_ = np.array(x.columns)
        self.n_features_in_ = x.shape[1]
        self.weak_learners = []  # [ (learner, amount of say), ... ]
        self.decision_stumps = []
        n_rows = x.shape[0]  # number of examples in training set
        attributes = x.columns  # features in training set
        # dataset example weights
        d = np.full(n_rows, 1 / n_rows)
        # preparing all decision stumps
        for a in attributes:
            # finding possible thresholds for decision stump
            values = x[a].unique()
            values.sort()
            thresholds = []
            for i in range(1, len(values)):
                thresholds.append((values[i] + values[i - 1]) / 2)
            for threshold in thresholds:
                self.decision_stumps.append(DecisionStump(a, threshold))
        for t in range(0, self.boosting_rounds):
            # choosing the decision stump that minimizes the weighted error
            min_error = float("inf")
            h = None
            for ds in self.decision_stumps:
                ds_error = 0
                for example in range(0, n_rows):
                    example_row = x.iloc[example]
                    if ds.predict(example_row) != y.iloc[example, 0]:  # misclassification
                        ds_error += d[example]
                if ds_error < min_error:
                    min_error = ds_error
                    h = ds
            # amount of say: alpha = 0.5 * ln((1 - error) / error)
            small = 1e-10
            alpha = 0.5 * np.log((1 - min_error) / (min_error + small))  # small avoids division by 0
            # storing learner with corresponding amount of say
            self.weak_learners.append((h, alpha))
            # updating weights
            for i in range(0, len(d)):
                d[i] = d[i] * np.exp(-alpha * y.iloc[i, 0] * h.predict(x.iloc[i]))
            z = np.sum(d)  # normalisation factor
            for i in range(0, len(d)):
                d[i] /= z
        return self  # fit() must return self for sklearn compatibility

    def predict(self, x):
        s = 0  # sum of predictions
        for i in range(0, len(self.weak_learners)):
            prediction = self.weak_learners[i][0].predict(x)
            s += prediction * self.weak_learners[i][1]  # prediction * amount of say
        return np.sign(s).astype(int)

    def score(self, x, y):
        predictions = self.predict(x)
        accuracy = 100 * (predictions == y.values.ravel()).sum() / len(y)
        return accuracy.item()
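# Optional speed-up (not part of the original answer): the per-row error loop
# in fit() can be vectorized so that one stump is evaluated against every row
# at once; a sketch of an equivalent weighted-error computation:
def weighted_error(ds, x, y, d):
    preds = np.where(x[ds.a].values < ds.threshold, 1, 0)  # stump predictions for all rows
    return d[preds != y.values.ravel()].sum()  # total weight of misclassified rows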
# Test it with a single run
adb = AdaBoost(boosting_rounds=50).fit(x, y)

from matplotlib import pyplot as plt

predictions = adb.predict(x)
accuracy = adb.score(x, y)

# jitter the 0/1 values slightly so overlapping points stay visible
plt.scatter(y.values.ravel() + np.random.uniform(size=N) / 5,
            predictions + np.random.uniform(size=N) / 5,
            marker='.', s=70, alpha=0.6)
plt.xlabel('y label')
plt.ylabel('predicted')
plt.yticks([0, 1])
plt.xticks([0, 1])
plt.title(f'boosting_rounds={adb.boosting_rounds} | accuracy={round(accuracy, 1)}%')
plt.show()
Plot the learning curve:
# Plot the learning curve
from sklearn.model_selection import LearningCurveDisplay

LearningCurveDisplay.from_estimator(
    AdaBoost(boosting_rounds=50),
    x,
    y,
)
plt.show()
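If you want more control over the evaluation, from_estimator() also accepts the usual learning_curve arguments; the values below (cv, train_sizes, scoring) are just illustrative choices, not part of the original answer:

LearningCurveDisplay.from_estimator(
    AdaBoost(boosting_rounds=50),
    x,
    y,
    cv=5,  # 5-fold cross-validation
    train_sizes=np.linspace(0.2, 1.0, 5),  # fractions of the training set to try
    scoring='accuracy',  # sklearn's accuracy (0-1) instead of the custom score()
)
plt.show()

Note that passing scoring='accuracy' plots scores on a 0-1 scale, whereas the custom score() method above returns a percentage.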
Answered By - some3128