
I'm training an XGBoost multiclass classifier, but I have doubts about my evaluation metrics.

Here's my code and output:

import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from sklearn import metrics
from matplotlib import pyplot
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from statistics import mean
%matplotlib inline
from sklearn.preprocessing import label_binarize
from itertools import cycle
from sklearn.metrics import roc_curve, auc
def plot_roc_curve(y_test, y_pred):
    n_classes = len(np.unique(y_test))
    y_test = label_binarize(y_test, classes=np.arange(n_classes))
    #y_pred = label_binarize(y_pred, classes=np.arange(n_classes))

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = metrics.roc_curve(y_test[:, i], y_pred[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_pred.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves
    plt.figure(figsize=(10, 8), dpi=300)

    lw = 2
    plt.plot(fpr["micro"], tpr["micro"],
             label="micro-average ROC curve (area = {0:0.2f})".format(roc_auc["micro"]),
             color="pink", linestyle="-.", linewidth=4)

    plt.plot(fpr["macro"], tpr["macro"],
             label="macro-average ROC curve (area = {0:0.2f})".format(roc_auc["macro"]),
             color="purple", linestyle="-.", linewidth=4)

    colors = cycle(["gray", "green", "blue", "yellow", "red", 'black', 'brown', 'goldenrod', 'gold',
                    'aqua', 'violet', 'darkslategray', 'mistyrose', 'darkorange', 'tan'])
    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=lw, linestyle="--",
                 label="ROC curve of class {0} (area = {1:0.2f})".format(i, roc_auc[i]))

    plt.plot([0, 1], [0, 1], "k--", lw=lw)
    plt.xlim([-0.05, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver Operating Characteristic (ROC) curve")
    plt.legend()



def evaluate_model(alg, train, target, predictors, test, target2,
                   early_stopping_rounds=10, n_jobs=-1, useTrainCV=False, cv_folds=5):
    plt.rcParams['figure.figsize'] = [100, 50]
    plt.tick_params(axis='both', which='major', labelsize=50)
    plt.tick_params(axis='both', which='minor', labelsize=50)

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(train[predictors].values, target['CLASS_TARGET'].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc',
                          early_stopping_rounds=early_stopping_rounds, verbose_eval=True)
        print("cvresult---", cvresult.shape[0])
        print(cvresult)
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm to the data
    evaluation = [(train, target), (test, target2)]
    alg.fit(train[predictors], target['CLASS_TARGET'],
            eval_metric=["merror", "mlogloss", "auc"], eval_set=evaluation)

    # Predict on the training set:
    dtrain_predictions = alg.predict(train[predictors])
    dtrain_predprob = alg.predict_proba(train[predictors])  #[:,1]
    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Importance', color='b')
    plt.ylabel('Importance Score')

    # Print model report:
    print("\n Model Report")
    print(feat_imp)
    print("No. of vars : %.4g" % feat_imp.count())
    print("Accuracy : %.4g" % metrics.accuracy_score(target['CLASS_TARGET'].values, dtrain_predictions))
    print("AUC Score (Balanced): %f" % metrics.roc_auc_score(target['CLASS_TARGET'], dtrain_predprob, multi_class='ovr', average='weighted'))

    # Evaluate predictions
    pred = dtrain_predprob
    accuracy = metrics.accuracy_score(target['CLASS_TARGET'].values, dtrain_predictions)
    accuracy_AUC = metrics.roc_auc_score(target['CLASS_TARGET'], dtrain_predprob, multi_class='ovr', average='weighted')
    predictions = [np.round(value) for value in pred]
    #fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=2)

    #print("Accuracy: %.2f%%" % (accuracy * 100.0))
    # Performance metrics recorded during training
    results = alg.evals_result()
    epochs = len(results['validation_0']['merror'])
    x_axis = range(0, epochs)
    plt.style.use('ggplot')

    # Plot log loss
    fig, ax = pyplot.subplots(figsize=(12, 12))
    ax.plot(x_axis, results['validation_0']['mlogloss'], label='Train')
    ax.plot(x_axis, results['validation_1']['mlogloss'], label='Test')
    #ax.plot(x_axis, results['validation_2']['logloss'], label='Val')
    ax.legend()
    pyplot.ylabel('Log Loss')
    pyplot.title('XGBoost Log Loss')
    pyplot.show()

    # Plot classification error
    fig, ax = pyplot.subplots(figsize=(12, 12))
    ax.plot(x_axis, results['validation_0']['merror'], label='Train')
    ax.plot(x_axis, results['validation_1']['merror'], label='Test')
    #ax.plot(x_axis, results['validation_2']['error'], label='Val')
    ax.legend()
    pyplot.ylabel('Classification Error')
    pyplot.title('XGBoost Classification Error')
    pyplot.show()

xgb0 = xgb.XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=40, objective='multi:softprob',
                         gamma=1, min_child_weight=1, max_delta_step=0, subsample=1,
                         colsample_bytree=1, colsample_bylevel=1, colsample_bynode=1,
                         reg_alpha=1, reg_lambda=1, scale_pos_weight=1, base_score=0.5,
                         random_state=0, seed=None, silent=None, missing=999999,
                         verbosity=1, use_label_encoder=False, n_jobs=-1)

Output:

[plot: feature importance]

[plot: XGBoost log loss (train vs. test)]

[plot: XGBoost classification error (train vs. test)]

Now let's evaluate on the validation set:

df = pd.DataFrame(data=y_val)
yv=df.iloc[:,0]

df_va2 = xgb0.predict(X_val[features])
df_va = xgb0.predict_proba(X_val[features])  #[:,1]
#df_val['Y_FAIL'] = np.where(((df_va <= .53)), 0, 1)

#Print model report:
print("Accuracy : %.4g" % metrics.accuracy_score(yv, df_va2))
print("One VS Rest")
print("AUC Score (Val) Macro: %f" % metrics.roc_auc_score(yv, df_va, multi_class='ovr', average='macro'))
print("AUC Score (Val) Weighted: %f" % metrics.roc_auc_score(yv, df_va, multi_class='ovr', average='weighted'))
print("One VS One")
print("AUC Score (Val) Macro: %f" % metrics.roc_auc_score(yv, df_va, multi_class='ovo', average='macro'))
print("AUC Score (Val) Weighted: %f" % metrics.roc_auc_score(yv, df_va, multi_class='ovo', average='weighted'))
plot_roc_curve(yv, df_va)

Output:

Accuracy : 0.8749

One VS Rest
AUC Score (Val) Macro: 0.990113
AUC Score (Val) Weighted: 0.964739

One VS One
AUC Score (Val) Macro: 0.994858
AUC Score (Val) Weighted: 0.983933

[plot: ROC curves (micro-average, macro-average, and per class)]

This looks great. The thing is, when I try to calculate the AUC for each individual class I get this.

Code:

df = pd.DataFrame(data=y_val)
yv=df.iloc[:,0]

#df_va2 = xgb0.predict(X_val[features])
#df_va = xgb0.predict_proba(X_val[features])

d = yv.unique()
class_name = list(d.flatten())

for p in class_name:
    fpr, tpr, thresholds = metrics.roc_curve(yv, xgb0.predict_proba(X_val[features])[:,1], pos_label=p)
    auroc = round(metrics.auc(fpr, tpr), 2)
    print('Xgb', p, '--AUC--->', auroc)

Output:

Xgb 10 --AUC---> 0.36
Xgb 8 --AUC---> 0.15
Xgb 4 --AUC---> 0.45
Xgb 1 --AUC---> 0.97
Xgb 12 --AUC---> 0.34
Xgb 7 --AUC---> 0.08
Xgb 2 --AUC---> 0.58
Xgb 5 --AUC---> 0.44
Xgb 14 --AUC---> 0.37
Xgb 11 --AUC---> 0.0
Xgb 6 --AUC---> 0.41
Xgb 0 --AUC---> 0.0
Xgb 3 --AUC---> 0.5
Xgb 9 --AUC---> 0.12
Xgb 13 --AUC---> 0.28

So which metric is right? Am I doing something wrong? Why are the results so different?

Thanks for your help, and sorry for my bad English.

Chichostyle

1 Answer


In this snippet:

for p in class_name:
    fpr, tpr, thresholds = metrics.roc_curve(
        yv, 
        xgb0.predict_proba(
            X_val[features]
        )[:,1],  # <----
        pos_label = p
    ) 
    auroc = round(metrics.auc(fpr, tpr),2)
    print('Xgb',p,'--AUC--->',auroc)

you always slice the predicted probabilities to column 1, i.e. the predicted probability of class 1, but you want the probability of class p there. If the classes are labeled simply 0, 1, ..., then replacing the 1 with p should do the trick.
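For illustration, here is a minimal sketch of that fix. It reuses xgb0, X_val, features, yv and class_name from the question, and looks the column index up via the fitted classifier's classes_ attribute (standard scikit-learn API), so it also works when the labels are not a contiguous 0..n-1 range:

proba = xgb0.predict_proba(X_val[features])   # shape (n_samples, n_classes)

for p in class_name:
    col = list(xgb0.classes_).index(p)        # column holding P(class == p)
    fpr, tpr, thresholds = metrics.roc_curve(yv, proba[:, col], pos_label=p)
    auroc = round(metrics.auc(fpr, tpr), 2)
    print('Xgb', p, '--AUC--->', auroc)

Computed this way, the per-class AUCs should line up with the macro/weighted roc_auc_score values above instead of collapsing toward (or below) 0.5.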

Ben Reiniger