How to plot learning curve and validation curve while using pipeline

Question

I would appreciate if you could let me know in the following example code:

from collections import Counter
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split,StratifiedKFold,learning_curve,validation_curve,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import numpy as np
import matplotlib.pyplot as plt

def plot_learning_curve(train_sizes, train_scores, test_scores, title, alpha=0.1):
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.plot(train_sizes, train_mean, label='train score', color='blue', marker='o')
    plt.fill_between(train_sizes, train_mean + train_std,
                     train_mean - train_std, color='blue', alpha=alpha)
    plt.plot(train_sizes, test_mean, label='test score', color='red', marker='o')
    plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std, color='red', alpha=alpha)
    plt.title(title)
    plt.xlabel('Number of training points')
    plt.ylabel('F-measure')
    plt.grid(ls='--')
    plt.legend(loc='best')
    plt.show()


def plot_validation_curve(param_range, train_scores, test_scores, title, alpha=0.1):
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.plot(param_range, train_mean, label='train score', color='blue', marker='o')
    plt.fill_between(param_range, train_mean + train_std,
                     train_mean - train_std, color='blue', alpha=alpha)
    plt.plot(param_range, test_mean, label='test score', color='red', marker='o')
    plt.fill_between(param_range, test_mean + test_std, test_mean - test_std, color='red', alpha=alpha)
    plt.title(title)
    plt.grid(ls='--')
    plt.xlabel('Parameter value')
    plt.ylabel('F-measure')
    plt.legend(loc='best')
    plt.show()

X, y = make_classification(n_classes=2, class_sep=2,weights=[0.9, 0.1], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
print('Original dataset shape {}'.format(Counter(y)))

ln = X.shape
names = ["x%s" % i for i in range(1, ln[1] + 1)]

X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=0)
st=StandardScaler()

rg = LogisticRegression(class_weight = { 0:1, 1:6.5 }, random_state = 42, solver = 'saga',max_iter=100,n_jobs=-1)

param_grid = {'clf__C': [0.001,0.01,0.1,0.002,0.02,0.005,0.0007,.0006,0.0005],
              'clf__class_weight':[{ 0:1, 1:6 },{ 0:1, 1:4 },{ 0:1, 1:5.5 },{ 0:1, 1:4.5 },{ 0:1, 1:5 }]
              }

pipeline = Pipeline(steps=[('scaler', st),
                           ('clf', rg )])

cv=StratifiedKFold(n_splits=5,random_state=42)
rg_cv = GridSearchCV(pipeline, param_grid, cv=cv, scoring =  'f1')
rg_cv.fit(X_train, y_train)
print("Tuned rg best params: {}".format(rg_cv.best_params_))

ypred = rg_cv.predict(X_train)
print(classification_report(y_train, ypred))
print('######################')
ypred2 = rg_cv.predict(X_test)
print(classification_report(y_test, ypred2))

plt.figure(figsize=(9,6))
param_range1=[i / 10000.0 for i in range(1, 11)]
param_range2=[{0: 1, 1: 6}, {0: 1, 1: 4}, {0: 1, 1: 5.5}, {0: 1, 1: 4.5}, {0: 1, 1: 5}]

if __name__ == '__main__':
    train_sizes, train_scores, test_scores = learning_curve(
              estimator= rg_cv.best_estimator_ , X= X_train, y = y_train,
                train_sizes=np.arange(0.1,1.1,0.1), cv= cv,  scoring='f1', n_jobs= - 1)

    plot_learning_curve(train_sizes, train_scores, test_scores, title='Learning curve for Logistic Regression')

    train_scores, test_scores = validation_curve(
        estimator=rg_cv.best_estimator_, X=X_train, y=y_train, param_name="clf__C", param_range=param_range1,
        cv=cv, scoring="f1", n_jobs=-1)

    plot_validation_curve(param_range1, train_scores, test_scores, title="Validation Curve for C", alpha=0.1)

    train_scores, test_scores = validation_curve(
        estimator=rg_cv.best_estimator_, X=X_train, y=y_train, param_name="clf__class_weight", param_range=param_range2,
        cv=cv, scoring="f1", n_jobs=-1)

    plot_validation_curve(param_range2, train_scores, test_scores, title="Validation Curve for class_weight", alpha=0.1)

Why when the best estimator of GridSearchCv is passed into the learning curve function, it prints all the previous print lines several times?
How to plot validation curve for class weight? TypeError: float() argument must be a string or a number, not 'dict'

ebrahimi · Accepted Answer · 2018-06-23T07:59:08.430

With respect to the first and second question, the code should change into:

from collections import Counter
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, StratifiedKFold, learning_curve, validation_curve, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import numpy as np
import matplotlib.pyplot as plt


def plot_learning_curve(train_sizes, train_scores, test_scores, title, alpha=0.1):
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.plot(train_sizes, train_mean, label='train score', color='blue', marker='o')
    plt.fill_between(train_sizes, train_mean + train_std,
                     train_mean - train_std, color='blue', alpha=alpha)
    plt.plot(train_sizes, test_mean, label='test score', color='red', marker='o')

    plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std, color='red', alpha=alpha)
    plt.title(title)
    plt.xlabel('Number of training points')
    plt.ylabel('F-measure')
    plt.grid(ls='--')
    plt.legend(loc='best')
    plt.show()


def plot_validation_curve(param_range, train_scores, test_scores, title, alpha=0.1):
    param_range = [x[1] for x in param_range] 
    sort_idx = np.argsort(param_range)
    param_range=np.array(param_range)[sort_idx]
    train_mean = np.mean(train_scores, axis=1)[sort_idx]
    train_std = np.std(train_scores, axis=1)[sort_idx]
    test_mean = np.mean(test_scores, axis=1)[sort_idx]
    test_std = np.std(test_scores, axis=1)[sort_idx]
    plt.plot(param_range, train_mean, label='train score', color='blue', marker='o')
    plt.fill_between(param_range, train_mean + train_std,
                 train_mean - train_std, color='blue', alpha=alpha)
    plt.plot(param_range, test_mean, label='test score', color='red', marker='o')
    plt.fill_between(param_range, test_mean + test_std, test_mean - test_std, color='red', alpha=alpha)
    plt.title(title)
    plt.grid(ls='--')
    plt.xlabel('Weight of class 2')
    plt.ylabel('Average values and standard deviation for F1-Score')
    plt.legend(loc='best')
    plt.show()


if __name__ == '__main__':
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.9, 0.1], n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    print('Original dataset shape {}'.format(Counter(y)))

    ln = X.shape
    names = ["x%s" % i for i in range(1, ln[1] + 1)]

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    st = StandardScaler()

    rg = LogisticRegression(class_weight={0: 1, 1: 6.5}, random_state=42, solver='saga', max_iter=100, n_jobs=-1)

    param_grid = {'clf__C': [0.001, 0.01, 0.1, 0.002, 0.02, 0.005, 0.0007, .0006, 0.0005],
                  'clf__class_weight': [{0: 1, 1: 6}, {0: 1, 1: 4}, {0: 1, 1: 5.5}, {0: 1, 1: 4.5}, {0: 1, 1: 5}]
                  }

    pipeline = Pipeline(steps=[('scaler', st),
                               ('clf', rg)])

    cv = StratifiedKFold(n_splits=5, random_state=42)
    rg_cv = GridSearchCV(pipeline, param_grid, cv=cv, scoring='f1')
    rg_cv.fit(X_train, y_train)
    print("Tuned rg best params: {}".format(rg_cv.best_params_))

    ypred = rg_cv.predict(X_train)
    print(classification_report(y_train, ypred))
    print('######################')
    ypred2 = rg_cv.predict(X_test)
    print(classification_report(y_test, ypred2))

    plt.figure(figsize=(9, 6))
    param_range1 = [i / 10000.0 for i in range(1, 11)]
    param_range2 = [{0: 1, 1: 6}, {0: 1, 1: 4}, {0: 1, 1: 5.5}, {0: 1, 1: 4.5}, {0: 1, 1: 5}]

    train_sizes, train_scores, test_scores = learning_curve(
        estimator=rg_cv.best_estimator_, X=X_train, y=y_train,
        train_sizes=np.arange(0.1, 1.1, 0.1), cv=cv, scoring='f1', n_jobs=- 1)

    plot_learning_curve(train_sizes, train_scores, test_scores, title='Learning curve for Logistic Regression')

    train_scores, test_scores = validation_curve(
        estimator=rg_cv.best_estimator_, X=X_train, y=y_train, param_name="clf__C", param_range=param_range1,
        cv=cv, scoring="f1", n_jobs=-1)

    plot_validation_curve(param_range1, train_scores, test_scores, title="Validation Curve for C", alpha=0.1)

    train_scores, test_scores = validation_curve(
        estimator=rg_cv.best_estimator_, X=X_train, y=y_train, param_name="clf__class_weight", param_range=param_range2,
        cv=cv, scoring="f1", n_jobs=-1)

    plot_validation_curve(param_range2, train_scores, test_scores, title="Validation Curve for class_weight", alpha=0.1)

score 2 · Answer 2 · edited Dec 14 '19 at 23:17

Currently (sklearn 0.22), with the example provided in the question, there's a future warning that sklearn 0.24 will raise an error at the line 72:

cv = StratifiedKFold(n_splits=5, random_state=42)

\lib\site-packages\sklearn\model_selection_split.py:296: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True. FutureWarning

According to the StratifiedKFold documentation:

shuffle : boolean, optional
Whether to shuffle each class’s samples before splitting into batches. random_state: int, RandomState instance or None, optional, default=None
If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by np.random. Only used when shuffle is True. This should be left to None if shuffle is False.
Notes
The implementation is designed to: (...)
Preserve order dependencies in the dataset ordering, when shuffle=False: all samples from class k in some test set were contiguous in y, or separated in y by samples from classes other than k.

Aside the future warning to be addressed, the default shuffle=False ensure reproducibility because it preserves the datasets ordering. So far, so good.

However, without knowing the dataset order is also random, only with StratifiedKFold(..., shuffle=True) one can ensure there won't be any dataset ordering bias affecting StratifiedKFold.

As the dataset generator make_classification was used with its default `shuffle=True' there won't be a dataset ordering bias issue this time:

According to make_classification documentation:

Without shuffling, X horizontally stacks features in the following order: the primary n_informative features, followed by n_redundant linear combinations of the informative features, followed by n_repeated duplicates, drawn randomly with replacement from the informative and redundant features. The remaining features are filled with random noise. Thus, without shuffling, all useful features are contained in the columns X[:, :n_informative + n_redundant + n_repeated].

In order to just solve the future error, remove the useless random_state=42:
cv = StratifiedKFold(n_splits=5)

In order to solve the future error in situations one can't ensure the dataset ordering is random:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) or even
cv = StratifiedShuffleSplit(n_splits=5, random_state=42)

How to plot learning curve and validation curve while using pipeline

2 Answers2

Linked