
I'm training an XGBoost multiclass classifier, but I have doubts about my evaluation metrics.

Here's my code and output:

import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from sklearn import metrics
from matplotlib import pyplot
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from statistics import mean
%matplotlib inline
from sklearn.preprocessing import label_binarize
from itertools import cycle
from sklearn.metrics import roc_curve, auc
def plot_roc_curve(y_test, y_pred):
    n_classes = len(np.unique(y_test))
    y_test = label_binarize(y_test, classes=np.arange(n_classes))
    #y_pred = label_binarize(y_pred, classes=np.arange(n_classes))

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = metrics.roc_curve(y_test[:, i], y_pred[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_pred.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves
    plt.figure(figsize=(10, 8), dpi=300)

    lw = 2
    plt.plot(fpr["micro"], tpr["micro"],
             label="micro-average ROC curve (area = {0:0.2f})".format(roc_auc["micro"]),
             color="pink", linestyle="-.", linewidth=4)

    plt.plot(fpr["macro"], tpr["macro"],
             label="macro-average ROC curve (area = {0:0.2f})".format(roc_auc["macro"]),
             color="purple", linestyle="-.", linewidth=4)

    colors = cycle(["gray", "green", "blue", "yellow", "red", 'black', 'brown', 'goldenrod', 'gold',
                    'aqua', 'violet', 'darkslategray', 'mistyrose', 'darkorange', 'tan'])
    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=lw, linestyle="--",
                 label="ROC curve of class {0} (area = {1:0.2f})".format(i, roc_auc[i]))

    plt.plot([0, 1], [0, 1], "k--", lw=lw)
    plt.xlim([-0.05, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver Operating Characteristic (ROC) curve")
    plt.legend()



def evaluate_model(alg, train, target, predictors, test, target2,
                   early_stopping_rounds=10, n_jobs=-1, useTrainCV=False, cv_folds=5):
    plt.rcParams['figure.figsize'] = [100, 50]
    plt.tick_params(axis='both', which='major', labelsize=50)
    plt.tick_params(axis='both', which='minor', labelsize=50)

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(train[predictors].values, target['CLASS_TARGET'].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc',
                          early_stopping_rounds=early_stopping_rounds, verbose_eval=True)
        print("cvresult---", cvresult.shape[0])
        print(cvresult)
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm to the data
    evaluation = [(train, target), (test, target2)]
    alg.fit(train[predictors], target['CLASS_TARGET'],
            eval_metric=["merror", "mlogloss", "auc"], eval_set=evaluation)

    # Predict on the training set:
    dtrain_predictions = alg.predict(train[predictors])
    dtrain_predprob = alg.predict_proba(train[predictors])  #[:,1]
    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Importance', color='b')
    plt.ylabel('Importance Score')

    # Print model report:
    print("\n Model Report")
    print(feat_imp)
    print("No. of vars : %.4g" % feat_imp.count())
    print("Accuracy : %.4g" % metrics.accuracy_score(target['CLASS_TARGET'].values, dtrain_predictions))
    print("AUC Score (Balanced): %f" % metrics.roc_auc_score(target['CLASS_TARGET'], dtrain_predprob, multi_class='ovr', average='weighted'))

    # Evaluate predictions
    pred = dtrain_predprob
    accuracy = metrics.accuracy_score(target['CLASS_TARGET'].values, dtrain_predictions)
    accuracy_AUC = metrics.roc_auc_score(target['CLASS_TARGET'], dtrain_predprob, multi_class='ovr', average='weighted')
    predictions = [np.round(value) for value in pred]
    #fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=2)

    #print("Accuracy: %.2f%%" % (accuracy * 100.0))
    # Performance metrics recorded during training
    results = alg.evals_result()
    epochs = len(results['validation_0']['merror'])
    x_axis = range(0, epochs)
    plt.style.use('ggplot')

    # Plot log loss
    fig, ax = pyplot.subplots(figsize=(12, 12))
    ax.plot(x_axis, results['validation_0']['mlogloss'], label='Train')
    ax.plot(x_axis, results['validation_1']['mlogloss'], label='Test')
    #ax.plot(x_axis, results['validation_2']['logloss'], label='Val')
    ax.legend()
    pyplot.ylabel('Log Loss')
    pyplot.title('XGBoost Log Loss')
    pyplot.show()

    # Plot classification error
    fig, ax = pyplot.subplots(figsize=(12, 12))
    ax.plot(x_axis, results['validation_0']['merror'], label='Train')
    ax.plot(x_axis, results['validation_1']['merror'], label='Test')
    #ax.plot(x_axis, results['validation_2']['error'], label='Val')
    ax.legend()
    pyplot.ylabel('Classification Error')
    pyplot.title('XGBoost Classification Error')
    pyplot.show()

xgb0 = xgb.XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=40, objective='multi:softprob',
                         gamma=1, min_child_weight=1, max_delta_step=0, subsample=1,
                         colsample_bytree=1, colsample_bylevel=1, colsample_bynode=1,
                         reg_alpha=1, reg_lambda=1, scale_pos_weight=1, base_score=0.5,
                         random_state=0, seed=None, silent=None, missing=999999,
                         verbosity=1, use_label_encoder=False, n_jobs=-1)

Output:

[plot: feature importance]

[plot: XGBoost log loss (train vs. test)]

[plot: XGBoost classification error (train vs. test)]

Now let's evaluate on the validation set:

df = pd.DataFrame(data=y_val)
yv=df.iloc[:,0]

df_va2 = xgb0.predict(X_val[features])
df_va = xgb0.predict_proba(X_val[features])  #[:,1]
#df_val['Y_FAIL'] = np.where(((df_va <= .53)), 0, 1)

#Print model report:
print("Accuracy : %.4g" % metrics.accuracy_score(yv, df_va2))
print("One VS Rest")
print("AUC Score (Val) Macro: %f" % metrics.roc_auc_score(yv, df_va, multi_class='ovr', average='macro'))
print("AUC Score (Val) Weighted: %f" % metrics.roc_auc_score(yv, df_va, multi_class='ovr', average='weighted'))
print("One VS One")
print("AUC Score (Val) Macro: %f" % metrics.roc_auc_score(yv, df_va, multi_class='ovo', average='macro'))
print("AUC Score (Val) Weighted: %f" % metrics.roc_auc_score(yv, df_va, multi_class='ovo', average='weighted'))
plot_roc_curve(yv, df_va)

Output:

Accuracy : 0.8749

One VS Rest
AUC Score (Val) Macro: 0.990113
AUC Score (Val) Weighted: 0.964739

One VS One
AUC Score (Val) Macro: 0.994858
AUC Score (Val) Weighted: 0.983933

[plot: ROC curves (micro-average, macro-average, and per class)]

This looks great. The thing is, when I try to calculate the AUC for each individual class I get this.

Code:

df = pd.DataFrame(data=y_val)
yv=df.iloc[:,0]

#df_va2 = xgb0.predict(X_val[features])
#df_va = xgb0.predict_proba(X_val[features])

d = yv.unique()
class_name = list(d.flatten())

for p in class_name:
    fpr, tpr, thresholds = metrics.roc_curve(yv, xgb0.predict_proba(X_val[features])[:,1], pos_label=p)
    auroc = round(metrics.auc(fpr, tpr), 2)
    print('Xgb', p, '--AUC--->', auroc)

Output:

Xgb 10 --AUC---> 0.36
Xgb 8 --AUC---> 0.15
Xgb 4 --AUC---> 0.45
Xgb 1 --AUC---> 0.97
Xgb 12 --AUC---> 0.34
Xgb 7 --AUC---> 0.08
Xgb 2 --AUC---> 0.58
Xgb 5 --AUC---> 0.44
Xgb 14 --AUC---> 0.37
Xgb 11 --AUC---> 0.0
Xgb 6 --AUC---> 0.41
Xgb 0 --AUC---> 0.0
Xgb 3 --AUC---> 0.5
Xgb 9 --AUC---> 0.12
Xgb 13 --AUC---> 0.28

So which metric is right? Am I doing something wrong? Why are the results so different?

Thanks for your help, and sorry for my bad English.

Chichostyle

1 Answer


In this snippet:

for p in class_name:
    fpr, tpr, thresholds = metrics.roc_curve(
        yv, 
        xgb0.predict_proba(
            X_val[features]
        )[:,1],  # <----
        pos_label = p
    ) 
    auroc = round(metrics.auc(fpr, tpr),2)
    print('Xgb',p,'--AUC--->',auroc)

you always slice the predicted probabilities to column 1, i.e. the predicted probability of class 1, but you want the probability of class p there. If the classes are labeled simply 0, 1, ..., then replacing the 1 with p should do the trick.
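For illustration, here is a minimal sketch of that fix. It reuses xgb0, X_val, features, yv and class_name from the question, and looks the column index up via the fitted classifier's classes_ attribute (standard scikit-learn API), so it also works when the labels are not a contiguous 0..n-1 range:

proba = xgb0.predict_proba(X_val[features])   # shape (n_samples, n_classes)

for p in class_name:
    col = list(xgb0.classes_).index(p)        # column holding P(class == p)
    fpr, tpr, thresholds = metrics.roc_curve(yv, proba[:, col], pos_label=p)
    auroc = round(metrics.auc(fpr, tpr), 2)
    print('Xgb', p, '--AUC--->', auroc)

Computed this way, the per-class AUCs should line up with the macro/weighted roc_auc_score values above instead of collapsing toward (or below) 0.5.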

Ben Reiniger