In this notebook you will find example sklearn
ML models estimated on the BBB data.
from math import sqrt
import graphviz
import matplotlib as mpl
import numpy as np
import pandas as pd
import pyrsm as rsm
import statsmodels.formula.api as smf
import xgboost as xgb
from sklearn import metrics, tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
GridSearchCV,
ShuffleSplit,
StratifiedShuffleSplit,
train_test_split,
)
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from statsmodels.genmod.families import Binomial
from statsmodels.genmod.families.links import logit
mpl.rcParams["figure.dpi"] = 150
rsm.__version__ # should be 0.4.2 or higher
'0.4.5'
Loading the data
bbb = pd.read_pickle("data/bbb.pkl")
bbb
acctnum | gender | state | zip | zip3 | first | last | book | nonbook | total | purch | child | youth | cook | do_it | reference | art | geog | buyer | training | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10001 | M | NY | 10605 | 106 | 49 | 29 | 109 | 248 | 357 | 10 | 3 | 2 | 2 | 0 | 1 | 0 | 2 | no | 1 |
1 | 10002 | M | NY | 10960 | 109 | 39 | 27 | 35 | 103 | 138 | 3 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | no | 1 |
2 | 10003 | F | PA | 19146 | 191 | 19 | 15 | 25 | 147 | 172 | 2 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | no | 0 |
3 | 10004 | F | NJ | 07016 | 070 | 7 | 7 | 15 | 257 | 272 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | no | 0 |
4 | 10005 | F | NY | 10804 | 108 | 15 | 15 | 15 | 134 | 149 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | no | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
49995 | 59996 | F | NY | 11967 | 119 | 9 | 9 | 15 | 12 | 27 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | no | 1 |
49996 | 59997 | F | NJ | 08882 | 088 | 25 | 5 | 79 | 294 | 373 | 7 | 3 | 0 | 1 | 1 | 0 | 1 | 1 | no | 1 |
49997 | 59998 | M | NJ | 07410 | 074 | 3 | 3 | 15 | 178 | 193 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | no | 1 |
49998 | 59999 | M | NJ | 07090 | 070 | 49 | 29 | 98 | 246 | 344 | 8 | 2 | 0 | 1 | 0 | 2 | 1 | 2 | no | 1 |
49999 | 60000 | M | NY | 11355 | 113 | 29 | 1 | 60 | 125 | 185 | 5 | 1 | 0 | 1 | 0 | 0 | 1 | 2 | no | 1 |
50000 rows × 20 columns
Converting categorical variables to 0/1 dummy variables
bbb["buyer_yes"] = pd.get_dummies(bbb["buyer"], drop_first=True)
bbb
acctnum | gender | state | zip | zip3 | first | last | book | nonbook | total | ... | child | youth | cook | do_it | reference | art | geog | buyer | training | buyer_yes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10001 | M | NY | 10605 | 106 | 49 | 29 | 109 | 248 | 357 | ... | 3 | 2 | 2 | 0 | 1 | 0 | 2 | no | 1 | 1 |
1 | 10002 | M | NY | 10960 | 109 | 39 | 27 | 35 | 103 | 138 | ... | 0 | 1 | 0 | 1 | 0 | 0 | 1 | no | 1 | 1 |
2 | 10003 | F | PA | 19146 | 191 | 19 | 15 | 25 | 147 | 172 | ... | 0 | 0 | 2 | 0 | 0 | 0 | 0 | no | 0 | 1 |
3 | 10004 | F | NJ | 07016 | 070 | 7 | 7 | 15 | 257 | 272 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | no | 0 | 1 |
4 | 10005 | F | NY | 10804 | 108 | 15 | 15 | 15 | 134 | 149 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | no | 1 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
49995 | 59996 | F | NY | 11967 | 119 | 9 | 9 | 15 | 12 | 27 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | no | 1 | 1 |
49996 | 59997 | F | NJ | 08882 | 088 | 25 | 5 | 79 | 294 | 373 | ... | 3 | 0 | 1 | 1 | 0 | 1 | 1 | no | 1 | 1 |
49997 | 59998 | M | NJ | 07410 | 074 | 3 | 3 | 15 | 178 | 193 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | no | 1 | 1 |
49998 | 59999 | M | NJ | 07090 | 070 | 49 | 29 | 98 | 246 | 344 | ... | 2 | 0 | 1 | 0 | 2 | 1 | 2 | no | 1 | 1 |
49999 | 60000 | M | NY | 11355 | 113 | 29 | 1 | 60 | 125 | 185 | ... | 1 | 0 | 1 | 0 | 0 | 1 | 2 | no | 1 | 1 |
50000 rows × 21 columns
bbb["b_yes"] = rsm.ifelse(bbb["buyer"] == "no", 0, 1)
bbb
acctnum | gender | state | zip | zip3 | first | last | book | nonbook | total | ... | youth | cook | do_it | reference | art | geog | buyer | training | buyer_yes | b_yes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10001 | M | NY | 10605 | 106 | 49 | 29 | 109 | 248 | 357 | ... | 2 | 2 | 0 | 1 | 0 | 2 | no | 1 | 0 | 0 |
1 | 10002 | M | NY | 10960 | 109 | 39 | 27 | 35 | 103 | 138 | ... | 1 | 0 | 1 | 0 | 0 | 1 | no | 1 | 0 | 0 |
2 | 10003 | F | PA | 19146 | 191 | 19 | 15 | 25 | 147 | 172 | ... | 0 | 2 | 0 | 0 | 0 | 0 | no | 0 | 0 | 0 |
3 | 10004 | F | NJ | 07016 | 070 | 7 | 7 | 15 | 257 | 272 | ... | 0 | 0 | 0 | 1 | 0 | 0 | no | 0 | 0 | 0 |
4 | 10005 | F | NY | 10804 | 108 | 15 | 15 | 15 | 134 | 149 | ... | 0 | 1 | 0 | 0 | 0 | 0 | no | 1 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
49995 | 59996 | F | NY | 11967 | 119 | 9 | 9 | 15 | 12 | 27 | ... | 0 | 0 | 0 | 0 | 0 | 0 | no | 1 | 0 | 0 |
49996 | 59997 | F | NJ | 08882 | 088 | 25 | 5 | 79 | 294 | 373 | ... | 0 | 1 | 1 | 0 | 1 | 1 | no | 1 | 0 | 0 |
49997 | 59998 | M | NJ | 07410 | 074 | 3 | 3 | 15 | 178 | 193 | ... | 0 | 0 | 0 | 1 | 0 | 0 | no | 1 | 0 | 0 |
49998 | 59999 | M | NJ | 07090 | 070 | 49 | 29 | 98 | 246 | 344 | ... | 0 | 1 | 0 | 2 | 1 | 2 | no | 1 | 0 | 0 |
49999 | 60000 | M | NY | 11355 | 113 | 29 | 1 | 60 | 125 | 185 | ... | 0 | 1 | 0 | 0 | 1 | 2 | no | 1 | 0 | 0 |
50000 rows × 22 columns
Did it work as expected? No! The first level in buyer is "yes", so drop_first=True drops that column and we get the reverse of the coding we wanted
bbb[["buyer", "buyer_yes"]]
buyer | buyer_yes | |
---|---|---|
0 | no | 1 |
1 | no | 1 |
2 | no | 1 |
3 | no | 1 |
4 | no | 1 |
... | ... | ... |
49995 | no | 1 |
49996 | no | 1 |
49997 | no | 1 |
49998 | no | 1 |
49999 | no | 1 |
50000 rows × 2 columns
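To see why, here is a small toy illustration (not part of the BBB data): with category order ["yes", "no"], drop_first=True drops the "yes" column and keeps "no", which reverses the coding
s = pd.Series(pd.Categorical(["yes", "no", "no"], categories=["yes", "no"]))
pd.get_dummies(s, drop_first=True)  # keeps only the "no" column - reversed coding
pd.get_dummies(s.cat.reorder_categories(["no", "yes"]), drop_first=True)  # keeps "yes"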
Check that buyer is a categorical variable
bbb["buyer"].dtype
CategoricalDtype(categories=['yes', 'no'], ordered=False)
Show levels
bbb["buyer"].cat.categories
Index(['yes', 'no'], dtype='object')
Change level order - now pd.get_dummies
should work as intended
bbb["buyer"] = bbb["buyer"].cat.reorder_categories(["no", "yes"])
bbb["buyer"].cat.categories
Index(['no', 'yes'], dtype='object')
Converting categorical variables to 0/1 dummy variables
bbb["buyer_yes"] = pd.get_dummies(bbb["buyer"], drop_first=True)
Did it work as expected this time? Yes!
bbb[["buyer", "buyer_yes"]]
buyer | buyer_yes | |
---|---|---|
0 | no | 0 |
1 | no | 0 |
2 | no | 0 |
3 | no | 0 |
4 | no | 0 |
... | ... | ... |
49995 | no | 0 |
49996 | no | 0 |
49997 | no | 0 |
49998 | no | 0 |
49999 | no | 0 |
50000 rows × 2 columns
An alternative approach to converting categorical variables to 0/1 dummy variables offers more control, but requires more work for variables with many categories (see the sketch after the tables below)
bbb["buyer_yes"] = (bbb["buyer"] == "yes").astype(int)
bbb["gender_male"] = (bbb["gender"] == "M").astype(int)
bbb
acctnum | gender | state | zip | zip3 | first | last | book | nonbook | total | ... | youth | cook | do_it | reference | art | geog | buyer | training | buyer_yes | gender_male | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10001 | M | NY | 10605 | 106 | 49 | 29 | 109 | 248 | 357 | ... | 2 | 2 | 0 | 1 | 0 | 2 | no | 1 | 0 | 1 |
1 | 10002 | M | NY | 10960 | 109 | 39 | 27 | 35 | 103 | 138 | ... | 1 | 0 | 1 | 0 | 0 | 1 | no | 1 | 0 | 1 |
2 | 10003 | F | PA | 19146 | 191 | 19 | 15 | 25 | 147 | 172 | ... | 0 | 2 | 0 | 0 | 0 | 0 | no | 0 | 0 | 0 |
3 | 10004 | F | NJ | 07016 | 070 | 7 | 7 | 15 | 257 | 272 | ... | 0 | 0 | 0 | 1 | 0 | 0 | no | 0 | 0 | 0 |
4 | 10005 | F | NY | 10804 | 108 | 15 | 15 | 15 | 134 | 149 | ... | 0 | 1 | 0 | 0 | 0 | 0 | no | 1 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
49995 | 59996 | F | NY | 11967 | 119 | 9 | 9 | 15 | 12 | 27 | ... | 0 | 0 | 0 | 0 | 0 | 0 | no | 1 | 0 | 0 |
49996 | 59997 | F | NJ | 08882 | 088 | 25 | 5 | 79 | 294 | 373 | ... | 0 | 1 | 1 | 0 | 1 | 1 | no | 1 | 0 | 0 |
49997 | 59998 | M | NJ | 07410 | 074 | 3 | 3 | 15 | 178 | 193 | ... | 0 | 0 | 0 | 1 | 0 | 0 | no | 1 | 0 | 1 |
49998 | 59999 | M | NJ | 07090 | 070 | 49 | 29 | 98 | 246 | 344 | ... | 0 | 1 | 0 | 2 | 1 | 2 | no | 1 | 0 | 1 |
49999 | 60000 | M | NY | 11355 | 113 | 29 | 1 | 60 | 125 | 185 | ... | 0 | 1 | 0 | 0 | 1 | 2 | no | 1 | 0 | 1 |
50000 rows × 22 columns
Did it work as expected? Yes!
bbb[["buyer", "buyer_yes", "gender", "gender_male"]]
buyer | buyer_yes | gender | gender_male | |
---|---|---|---|---|
0 | no | 0 | M | 1 |
1 | no | 0 | M | 1 |
2 | no | 0 | F | 0 |
3 | no | 0 | F | 0 |
4 | no | 0 | F | 0 |
... | ... | ... | ... | ... |
49995 | no | 0 | F | 0 |
49996 | no | 0 | F | 0 |
49997 | no | 0 | M | 1 |
49998 | no | 0 | M | 1 |
49999 | no | 0 | M | 1 |
50000 rows × 4 columns
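For a variable with many levels, such as state, writing one comparison per level quickly becomes tedious. A minimal sketch of the one-call alternative with pd.get_dummies (illustration only; these columns are not used in the rest of this notebook):
# one call creates a 0/1 column per state level (minus the dropped first level)
state_dummies = pd.get_dummies(bbb["state"], prefix="state", drop_first=True)
bbb_expanded = pd.concat([bbb, state_dummies], axis=1)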
Adding a random noise variable that should not become an important variable in any model. Setting a random seed makes the results reproducible. Try running the cell below multiple times to confirm that the values do not change
np.random.seed(1234)
bbb["rnd"] = np.random.randn(bbb.shape[0])
bbb["rnd"].head()
0 0.471435 1 -1.190976 2 1.432707 3 -0.312652 4 -0.720589 Name: rnd, dtype: float64
Creating a list of variable names
rvar = "buyer_yes"
evar = [
"gender_male",
"last",
"total",
"child",
"youth",
"cook",
"do_it",
"reference",
"art",
"geog",
"rnd",
]
idvar = "acctnum"
lev = "Yes"
The below requires pyrsm version 0.4.2 or newer
rsm.__version__
'0.4.5'
X = bbb[evar].copy()
y = bbb[rvar]
X
gender_male | last | total | child | youth | cook | do_it | reference | art | geog | rnd | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 29 | 357 | 3 | 2 | 2 | 0 | 1 | 0 | 2 | 0.471435 |
1 | 1 | 27 | 138 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | -1.190976 |
2 | 0 | 15 | 172 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 1.432707 |
3 | 0 | 7 | 272 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | -0.312652 |
4 | 0 | 15 | 149 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | -0.720589 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
49995 | 0 | 9 | 27 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | -0.398704 |
49996 | 0 | 5 | 373 | 3 | 0 | 1 | 1 | 0 | 1 | 1 | -0.405591 |
49997 | 1 | 3 | 193 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1.512301 |
49998 | 1 | 29 | 344 | 2 | 0 | 1 | 0 | 2 | 1 | 2 | -1.149878 |
49999 | 1 | 1 | 185 | 1 | 0 | 1 | 0 | 0 | 1 | 2 | -0.625859 |
50000 rows × 11 columns
y
0 0 1 0 2 0 3 0 4 0 .. 49995 0 49996 0 49997 0 49998 0 49999 0 Name: buyer_yes, Length: 50000, dtype: int64
By passing information about the training split to rsm.scale_df, the variables will be scaled using only information from the training sample and you will retain the column headers
Xs = rsm.scale_df(X, train=bbb.training == 1, excl="gender_male")
Xs
gender_male | last | total | child | youth | cook | do_it | reference | art | geog | rnd | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1.021054 | 0.731271 | 0.960066 | 1.179122 | 0.447961 | -0.302516 | 0.566258 | -0.282477 | 0.861636 | 0.234210 |
1 | 1 | 0.898395 | -0.347599 | -0.382630 | 0.446865 | -0.395135 | 0.352484 | -0.260499 | -0.282477 | 0.269004 | -0.596482 |
2 | 0 | 0.162439 | -0.180103 | -0.382630 | -0.285392 | 0.447961 | -0.302516 | -0.260499 | -0.282477 | -0.323628 | 0.714548 |
3 | 0 | -0.328197 | 0.312531 | -0.382630 | -0.285392 | -0.395135 | -0.302516 | 0.566258 | -0.282477 | -0.323628 | -0.157592 |
4 | 0 | 0.162439 | -0.293409 | -0.382630 | -0.285392 | 0.026413 | -0.302516 | -0.260499 | -0.282477 | -0.323628 | -0.361434 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
49995 | 0 | -0.205538 | -0.894423 | 0.064935 | -0.285392 | -0.395135 | -0.302516 | -0.260499 | -0.282477 | -0.323628 | -0.200591 |
49996 | 0 | -0.450857 | 0.810092 | 0.960066 | -0.285392 | 0.026413 | 0.352484 | -0.260499 | 0.454017 | 0.269004 | -0.204033 |
49997 | 1 | -0.573516 | -0.076650 | -0.382630 | -0.285392 | -0.395135 | -0.302516 | 0.566258 | -0.282477 | -0.323628 | 0.754321 |
49998 | 1 | 1.021054 | 0.667228 | 0.512501 | -0.285392 | 0.026413 | -0.302516 | 1.393015 | 0.454017 | 0.861636 | -0.575946 |
49999 | 1 | -0.696175 | -0.116061 | 0.064935 | -0.285392 | 0.026413 | -0.302516 | -0.260499 | 0.454017 | 0.861636 | -0.314098 |
50000 rows × 11 columns
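For intuition, here is a rough sketch of what scale_df appears to be doing: centering on the training-sample mean and dividing by two training-sample standard deviations (the rnd column above, which is roughly standard normal, comes out at about half its raw value, consistent with that). This is for illustration only; keep using rsm.scale_df
# hypothetical manual equivalent of rsm.scale_df (assumes 2-sd scaling)
train = bbb.training == 1
num_cols = [c for c in X.columns if c != "gender_male"]  # excluded column stays 0/1
Xs_manual = X.copy()
Xs_manual[num_cols] = (X[num_cols] - X.loc[train, num_cols].mean()) / (
    2 * X.loc[train, num_cols].std()
)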
Just as an exercise, we are going to create a new training variable using ShuffleSplit. This function from sklearn returns a Python generator, so we use a loop to extract the relevant values. The training-test split will be 70-30. Note the use of .copy() earlier to ensure changes to the training or test data are not reflected back in the bbb dataframe
ssplit = ShuffleSplit(n_splits=1, test_size=0.3, random_state=1234)
for train_index, test_index in ssplit.split(X):
pass
train_index
array([24515, 21369, 7912, ..., 23924, 34086, 27439])
ssplit.split(X)
<generator object BaseShuffleSplit.split at 0x7fe05e16b6d0>
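Since split() returns a generator, an alternative to the loop above is to pull the single train/test index pair directly with next() (just a sketch; the loop works equally well):
# next() extracts the first (and, with n_splits=1, only) train/test index pair
train_index, test_index = next(ssplit.split(X))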
X
gender_male | last | total | child | youth | cook | do_it | reference | art | geog | rnd | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 29 | 357 | 3 | 2 | 2 | 0 | 1 | 0 | 2 | 0.471435 |
1 | 1 | 27 | 138 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | -1.190976 |
2 | 0 | 15 | 172 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 1.432707 |
3 | 0 | 7 | 272 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | -0.312652 |
4 | 0 | 15 | 149 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | -0.720589 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
49995 | 0 | 9 | 27 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | -0.398704 |
49996 | 0 | 5 | 373 | 3 | 0 | 1 | 1 | 0 | 1 | 1 | -0.405591 |
49997 | 1 | 3 | 193 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1.512301 |
49998 | 1 | 29 | 344 | 2 | 0 | 1 | 0 | 2 | 1 | 2 | -1.149878 |
49999 | 1 | 1 | 185 | 1 | 0 | 1 | 0 | 0 | 1 | 2 | -0.625859 |
50000 rows × 11 columns
Notice that the proportion of 1s in y is not exactly the same in the training and test samples when we use ShuffleSplit
X["training"] = 0
X.iloc[train_index, X.columns.get_loc("training")] = 1
print(
f"Training: {y[X.training == 1].mean().round(4)}, Test: {y[X.training == 0].mean().round(4)}"
)
Training: 0.0898, Test: 0.0919
X
gender_male | last | total | child | youth | cook | do_it | reference | art | geog | rnd | training | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 29 | 357 | 3 | 2 | 2 | 0 | 1 | 0 | 2 | 0.471435 | 1 |
1 | 1 | 27 | 138 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | -1.190976 | 1 |
2 | 0 | 15 | 172 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 1.432707 | 1 |
3 | 0 | 7 | 272 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | -0.312652 | 1 |
4 | 0 | 15 | 149 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | -0.720589 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
49995 | 0 | 9 | 27 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | -0.398704 | 1 |
49996 | 0 | 5 | 373 | 3 | 0 | 1 | 1 | 0 | 1 | 1 | -0.405591 | 1 |
49997 | 1 | 3 | 193 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1.512301 | 0 |
49998 | 1 | 29 | 344 | 2 | 0 | 1 | 0 | 2 | 1 | 2 | -1.149878 | 1 |
49999 | 1 | 1 | 185 | 1 | 0 | 1 | 0 | 0 | 1 | 2 | -0.625859 | 1 |
50000 rows × 12 columns
X.columns.get_loc("training")
11
X.iloc[0,11]
1
StratifiedShuffleSplit is a better option to use when creating a training variable because it ensures the proportions of 1s in the training and test set are as close as possible. To demonstrate, we will first use a 50-50 split. Now the proportions and numbers of 1s in the training and test set are identical
sssplit = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=1234)
for train_index, test_index in sssplit.split(y, y):
pass
X["training"] = 0
X.iloc[train_index, X.columns.get_loc("training")] = 1
print(
f"Training: {y[X.training == 1].mean().round(4)}, Test: {y[X.training == 0].mean().round(4)}",
f"\nTraining: {y[X.training == 1].sum():,}, Test: {y[X.training == 0].sum():,}",
)
Training: 0.0904, Test: 0.0904 Training: 2,261, Test: 2,261
Now re-create the training index so we have a 70-30 split
sssplit = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=1234)
for train_index, test_index in sssplit.split(y, y):
pass
X["training"] = 0
X.iloc[train_index, X.columns.get_loc("training")] = 1
We will use the new training variable from here on out. We don't want the training variable to be used in estimation, so we will remove it from X
training = X.pop("training")
training
0 1 1 0 2 1 3 0 4 1 .. 49995 1 49996 1 49997 1 49998 0 49999 1 Name: training, Length: 50000, dtype: int64
Now we can standardize X and specify the training variable so only information from the training data is used in scaling
Xs = rsm.scale_df(X, train=training == 1, excl="gender_male")
Xs
gender_male | last | total | child | youth | cook | do_it | reference | art | geog | rnd | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1.022903 | 0.733878 | 0.955141 | 1.182377 | 0.445832 | -0.302226 | 0.573723 | -0.282954 | 0.871340 | 0.233766 |
1 | 1 | 0.900253 | -0.345663 | -0.381112 | 0.447799 | -0.395292 | 0.351539 | -0.258621 | -0.282954 | 0.273221 | -0.596093 |
2 | 0 | 0.164354 | -0.178063 | -0.381112 | -0.286779 | 0.445832 | -0.302226 | -0.258621 | -0.282954 | -0.324898 | 0.713624 |
3 | 0 | -0.326245 | 0.314878 | -0.381112 | -0.286779 | -0.395292 | -0.302226 | 0.573723 | -0.282954 | -0.324898 | -0.157642 |
4 | 0 | 0.164354 | -0.291439 | -0.381112 | -0.286779 | 0.025270 | -0.302226 | -0.258621 | -0.282954 | -0.324898 | -0.361280 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
49995 | 0 | -0.203595 | -0.892827 | 0.064306 | -0.286779 | -0.395292 | -0.302226 | -0.258621 | -0.282954 | -0.324898 | -0.200599 |
49996 | 0 | -0.448895 | 0.812748 | 0.955141 | -0.286779 | 0.025270 | 0.351539 | -0.258621 | 0.454014 | 0.273221 | -0.204037 |
49997 | 1 | -0.571545 | -0.074545 | -0.381112 | -0.286779 | -0.395292 | -0.302226 | 0.573723 | -0.282954 | -0.324898 | 0.753356 |
49998 | 1 | 1.022903 | 0.669795 | 0.509723 | -0.286779 | 0.025270 | -0.302226 | 1.406066 | 0.454014 | 0.871340 | -0.575577 |
49999 | 1 | -0.694195 | -0.113981 | 0.064306 | -0.286779 | 0.025270 | -0.302226 | -0.258621 | 0.454014 | 0.871340 | -0.313992 |
50000 rows × 11 columns
Storing results for evaluation of different models
eval_dat = bbb[[idvar, rvar]].copy()
eval_dat["training"] = training
eval_dat["training"]
0 1 1 0 2 1 3 0 4 1 .. 49995 1 49996 1 49997 1 49998 0 49999 1 Name: training, Length: 50000, dtype: int64
eval_dat
acctnum | buyer_yes | training | |
---|---|---|---|
0 | 10001 | 0 | 1 |
1 | 10002 | 0 | 0 |
2 | 10003 | 0 | 1 |
3 | 10004 | 0 | 0 |
4 | 10005 | 0 | 1 |
... | ... | ... | ... |
49995 | 59996 | 0 | 1 |
49996 | 59997 | 0 | 1 |
49997 | 59998 | 0 | 1 |
49998 | 59999 | 0 | 0 |
49999 | 60000 | 0 | 1 |
50000 rows × 3 columns
Xs.assign(buyer_yes=y)[training == 1]
gender_male | last | total | child | youth | cook | do_it | reference | art | geog | rnd | buyer_yes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1.022903 | 0.733878 | 0.955141 | 1.182377 | 0.445832 | -0.302226 | 0.573723 | -0.282954 | 0.871340 | 0.233766 | 0 |
2 | 0 | 0.164354 | -0.178063 | -0.381112 | -0.286779 | 0.445832 | -0.302226 | -0.258621 | -0.282954 | -0.324898 | 0.713624 | 0 |
4 | 0 | 0.164354 | -0.291439 | -0.381112 | -0.286779 | 0.025270 | -0.302226 | -0.258621 | -0.282954 | -0.324898 | -0.361280 | 0 |
5 | 0 | -0.326245 | -0.468898 | -0.381112 | 0.447799 | -0.395292 | -0.302226 | -0.258621 | -0.282954 | -0.324898 | 0.441294 | 1 |
7 | 1 | -0.694195 | 0.147278 | 0.509723 | 0.447799 | 0.445832 | 1.659068 | -0.258621 | -0.282954 | 1.469458 | -0.319316 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
49994 | 0 | 0.041704 | 0.531772 | 0.064306 | 0.447799 | 0.025270 | -0.302226 | -0.258621 | -0.282954 | 0.273221 | -0.008339 | 0 |
49995 | 0 | -0.203595 | -0.892827 | 0.064306 | -0.286779 | -0.395292 | -0.302226 | -0.258621 | -0.282954 | -0.324898 | -0.200599 | 0 |
49996 | 0 | -0.448895 | 0.812748 | 0.955141 | -0.286779 | 0.025270 | 0.351539 | -0.258621 | 0.454014 | 0.273221 | -0.204037 | 0 |
49997 | 1 | -0.571545 | -0.074545 | -0.381112 | -0.286779 | -0.395292 | -0.302226 | 0.573723 | -0.282954 | -0.324898 | 0.753356 | 0 |
49999 | 1 | -0.694195 | -0.113981 | 0.064306 | -0.286779 | 0.025270 | -0.302226 | -0.258621 | 0.454014 | 0.871340 | -0.313992 | 0 |
35000 rows × 12 columns
Let's start with a basic logistic regression model estimated on standardized data
lr = smf.glm(
formula="buyer_yes ~ gender_male + last + total + child + youth + cook + do_it + reference + art + geog + rnd",
family=Binomial(link=logit()),
data=Xs.assign(buyer_yes=y)[training == 1],
).fit()
lr.summary()
Dep. Variable: | buyer_yes | No. Observations: | 35000 |
---|---|---|---|
Model: | GLM | Df Residuals: | 34988 |
Model Family: | Binomial | Df Model: | 11 |
Link Function: | logit | Scale: | 1.0000 |
Method: | IRLS | Log-Likelihood: | -8410.5 |
Date: | Tue, 16 Mar 2021 | Deviance: | 16821. |
Time: | 23:38:05 | Pearson chi2: | 3.48e+04 |
No. Iterations: | 7 | ||
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | -3.1829 | 0.035 | -92.193 | 0.000 | -3.251 | -3.115 |
gender_male | 0.7564 | 0.043 | 17.678 | 0.000 | 0.673 | 0.840 |
last | -1.5878 | 0.055 | -28.918 | 0.000 | -1.695 | -1.480 |
total | 0.1982 | 0.048 | 4.133 | 0.000 | 0.104 | 0.292 |
child | -0.4360 | 0.047 | -9.372 | 0.000 | -0.527 | -0.345 |
youth | -0.1555 | 0.043 | -3.630 | 0.000 | -0.239 | -0.072 |
cook | -0.6248 | 0.049 | -12.848 | 0.000 | -0.720 | -0.529 |
do_it | -0.8167 | 0.049 | -16.601 | 0.000 | -0.913 | -0.720 |
reference | 0.3062 | 0.038 | 7.987 | 0.000 | 0.231 | 0.381 |
art | 1.6123 | 0.036 | 44.434 | 0.000 | 1.541 | 1.683 |
geog | 0.9364 | 0.038 | 24.939 | 0.000 | 0.863 | 1.010 |
rnd | -0.0265 | 0.041 | -0.644 | 0.520 | -0.107 | 0.054 |
training
0 1 1 0 2 1 3 0 4 1 .. 49995 1 49996 1 49997 1 49998 0 49999 1 Name: training, Length: 50000, dtype: int64
y[training ==1]
0 0 2 0 4 0 5 1 7 0 .. 49994 0 49995 0 49996 0 49997 0 49999 0 Name: buyer_yes, Length: 35000, dtype: int64
Logistic regression through sklearn without any regularization gives the same results as smf.glm above
clf = LogisticRegression(random_state=1234, max_iter=1000, penalty="none").fit(
Xs[training == 1], y[training == 1]
)
coef = np.concatenate((clf.intercept_, clf.coef_[0]), axis=0)
coef.round(4)
array([-3.1829, 0.7563, -1.5879, 0.1982, -0.436 , -0.1555, -0.6247, -0.8167, 0.3062, 1.6123, 0.9363, -0.0265])
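To make the comparison with the smf.glm summary above easier to read, the unlabeled array can be wrapped in a small DataFrame (purely a display aid):
# attach labels so the sklearn coefficients line up with the statsmodels table
pd.DataFrame({"labels": ["Intercept"] + evar, "coefficients": coef.round(4)})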
Logistic regression through sklearn with very strong regularization. Note that if you are using regularization you should always standardize your explanatory variables first. The level of regularization is so strong that the coefficients for all explanatory variables have been set to zero
clf = LogisticRegression(
random_state=1234, max_iter=1000, solver="saga", penalty="l1", C=0.0000001
).fit(Xs[training == 1], y[training == 1])
coef = np.concatenate((clf.intercept_, clf.coef_[0]), axis=0)
coef.round(4)
array([-2.2545, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ])
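The note above about standardizing before regularizing is also why StandardScaler was imported at the top. A sketch of how scaling and a regularized fit could be combined in a single sklearn Pipeline, as an alternative to rsm.scale_df (note that, unlike scale_df with excl, this would also scale the gender_male dummy):
from sklearn.pipeline import make_pipeline

# the scaler is fit on the training data only, inside the pipeline
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(random_state=1234, max_iter=1000, solver="saga", penalty="l1", C=0.05),
).fit(X[training == 1], y[training == 1])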
Logistic regression through sklearn with minimal regularization gives the same result as not using any penalty
clf = LogisticRegression(
random_state=1234, max_iter=1000, solver="saga", penalty="l1", C=100000
).fit(Xs[training == 1], y[training == 1])
coef = np.concatenate((clf.intercept_, clf.coef_[0]), axis=0)
coef.round(4)
array([-3.1829, 0.7564, -1.5878, 0.1982, -0.436 , -0.1555, -0.6247, -0.8167, 0.3062, 1.6123, 0.9364, -0.0265])
Logistic regression in sklearn with strong L1 regularization will set the coefficient for the rnd
variable to zero
clf = LogisticRegression(
random_state=1234, max_iter=1000, solver="saga", penalty="l1", C=0.05
).fit(Xs[training == 1], y[training == 1])
coef = np.concatenate((clf.intercept_, clf.coef_[0]), axis=0)
coef = pd.DataFrame({"labels": evar, "coefficients": coef[1:], "OR": np.exp(coef[1:])})
coef
labels | coefficients | OR | |
---|---|---|---|
0 | gender_male | 0.704872 | 2.023588 |
1 | last | -1.511225 | 0.220640 |
2 | total | 0.139106 | 1.149246 |
3 | child | -0.381726 | 0.682682 |
4 | youth | -0.115520 | 0.890902 |
5 | cook | -0.563210 | 0.569378 |
6 | do_it | -0.745222 | 0.474629 |
7 | reference | 0.270664 | 1.310835 |
8 | art | 1.561671 | 4.766781 |
9 | geog | 0.891454 | 2.438672 |
10 | rnd | 0.000000 | 1.000000 |
coef.query("coefficients == 0")
labels | coefficients | OR | |
---|---|---|---|
10 | rnd | 0.0 | 1.0 |
Storing predictions from LogisticRegression (sklearn)
eval_dat["y_lr"] = clf.predict_proba(Xs)[:, 1]
eval_dat
acctnum | buyer_yes | training | y_lr | |
---|---|---|---|---|
0 | 10001 | 0 | 1 | 0.020020 |
1 | 10002 | 0 | 0 | 0.017419 |
2 | 10003 | 0 | 1 | 0.017345 |
3 | 10004 | 0 | 0 | 0.073923 |
4 | 10005 | 0 | 1 | 0.021545 |
... | ... | ... | ... | ... |
49995 | 59996 | 0 | 1 | 0.036383 |
49996 | 59997 | 0 | 1 | 0.114187 |
49997 | 59998 | 0 | 1 | 0.181453 |
49998 | 59999 | 0 | 0 | 0.124892 |
49999 | 60000 | 0 | 1 | 0.564255 |
50000 rows × 4 columns
Tuning logistic regression with L1 penalty (LASSO)
clf = LogisticRegression(random_state=1234, max_iter=1000, solver="saga", penalty="l1")
param_grid = {"C": [1, 0.5, 0.25, 0.1, 0.05, 0.025, 0.01, 0.005, 0.0025]}
scoring = {"AUC": "roc_auc"}
lr_cv = GridSearchCV(
clf, param_grid, scoring=scoring, cv=5, n_jobs=4, refit="AUC", verbose=5
).fit(Xs[training == 1], y[training == 1])
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Display the results from CV for logistic regression with regularization
cv_results = pd.DataFrame(lr_cv.cv_results_).sort_values(by=["rank_test_AUC"])
cv_results
mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_C | params | split0_test_AUC | split1_test_AUC | split2_test_AUC | split3_test_AUC | split4_test_AUC | mean_test_AUC | std_test_AUC | rank_test_AUC | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.206224 | 0.038205 | 0.007334 | 0.002328 | 1 | {'C': 1} | 0.809517 | 0.818689 | 0.810102 | 0.816834 | 0.813282 | 0.813685 | 0.003614 | 1 |
1 | 0.141786 | 0.012482 | 0.006330 | 0.002636 | 0.5 | {'C': 0.5} | 0.809516 | 0.818706 | 0.810029 | 0.816846 | 0.813299 | 0.813679 | 0.003636 | 2 |
2 | 0.165923 | 0.053900 | 0.008248 | 0.002390 | 0.25 | {'C': 0.25} | 0.809504 | 0.818705 | 0.809942 | 0.816846 | 0.813344 | 0.813668 | 0.003655 | 3 |
3 | 0.149491 | 0.010946 | 0.006680 | 0.001846 | 0.1 | {'C': 0.1} | 0.809500 | 0.818732 | 0.809662 | 0.816818 | 0.813517 | 0.813646 | 0.003715 | 4 |
4 | 0.129103 | 0.024938 | 0.005352 | 0.001299 | 0.05 | {'C': 0.05} | 0.809291 | 0.818665 | 0.808997 | 0.816767 | 0.813741 | 0.813492 | 0.003883 | 5 |
5 | 0.131181 | 0.031044 | 0.005313 | 0.002150 | 0.025 | {'C': 0.025} | 0.808437 | 0.818076 | 0.807428 | 0.816256 | 0.813616 | 0.812763 | 0.004203 | 6 |
6 | 0.151240 | 0.047269 | 0.005445 | 0.000967 | 0.01 | {'C': 0.01} | 0.804170 | 0.815265 | 0.802119 | 0.812382 | 0.809766 | 0.808740 | 0.004932 | 7 |
7 | 0.121440 | 0.007415 | 0.005094 | 0.000706 | 0.005 | {'C': 0.005} | 0.786522 | 0.800898 | 0.783626 | 0.796770 | 0.792318 | 0.792027 | 0.006359 | 8 |
8 | 0.120359 | 0.019913 | 0.004889 | 0.001249 | 0.0025 | {'C': 0.0025} | 0.765408 | 0.777231 | 0.761318 | 0.774265 | 0.770765 | 0.769797 | 0.005785 | 9 |
We now have to use iloc
because the df has been sorted and .loc[0, "param_C"] might return the wrong row!
cv_results.iloc[0, cv_results.columns.get_loc("param_C")]
1
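An alternative that avoids the sorted-index pitfall altogether is to read the winning value straight from the fitted GridSearchCV object:
# best_params_ is keyed by parameter name, independent of how cv_results_ is sorted
lr_cv.best_params_["C"]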
Note that the random variable ("rnd") was not removed or set to 0!
coef = lr_cv.best_estimator_.coef_
pd.DataFrame({"labels": evar, "coefficients": coef[0], "OR": np.exp(coef[0])})
labels | coefficients | OR | |
---|---|---|---|
0 | gender_male | 0.753761 | 2.124976 |
1 | last | -1.583870 | 0.205180 |
2 | total | 0.195192 | 1.215545 |
3 | child | -0.433185 | 0.648441 |
4 | youth | -0.153442 | 0.857750 |
5 | cook | -0.621593 | 0.537088 |
6 | do_it | -0.813038 | 0.443509 |
7 | reference | 0.304380 | 1.355784 |
8 | art | 1.609682 | 5.001221 |
9 | geog | 0.934044 | 2.544779 |
10 | rnd | -0.024867 | 0.975440 |
eval_dat["y_lasso"] = lr_cv.predict_proba(Xs)[:, 1]
Estimate a neural network for classification using sklearn
clf = MLPClassifier(
activation="tanh",
solver="lbfgs",
alpha=0.01,
hidden_layer_sizes=(1,),
random_state=1234,
max_iter=10000,
).fit(Xs[training == 1], y[training == 1])
eval_dat["y_nn"] = clf.predict_proba(Xs)[:, 1]
eval_dat
acctnum | buyer_yes | training | y_lr | y_lasso | y_nn | |
---|---|---|---|---|---|---|
0 | 10001 | 0 | 1 | 0.020020 | 0.017953 | 0.017703 |
1 | 10002 | 0 | 0 | 0.017419 | 0.015877 | 0.015819 |
2 | 10003 | 0 | 1 | 0.017345 | 0.015487 | 0.015429 |
3 | 10004 | 0 | 0 | 0.073923 | 0.077197 | 0.076124 |
4 | 10005 | 0 | 1 | 0.021545 | 0.020111 | 0.019782 |
... | ... | ... | ... | ... | ... | ... |
49995 | 59996 | 0 | 1 | 0.036383 | 0.033690 | 0.032638 |
49996 | 59997 | 0 | 1 | 0.114187 | 0.112181 | 0.112373 |
49997 | 59998 | 0 | 1 | 0.181453 | 0.191942 | 0.195078 |
49998 | 59999 | 0 | 0 | 0.124892 | 0.132982 | 0.134921 |
49999 | 60000 | 0 | 1 | 0.564255 | 0.591839 | 0.586895 |
50000 rows × 6 columns
Use CV to tune the NN. Below we tune both the size of the NN and the level of regularization
nr_hnodes = range(1, 5)
hls = list(zip(nr_hnodes)) + list(zip(nr_hnodes, nr_hnodes))
hls
[(1,), (2,), (3,), (4,), (1, 1), (2, 2), (3, 3), (4, 4)]
param_grid = {"hidden_layer_sizes": hls, "alpha": [0.001, 0.01, 0.05]}
scoring = {"AUC": "roc_auc"}
clf_cv = GridSearchCV(
clf, param_grid, scoring=scoring, cv=5, n_jobs=4, refit="AUC", verbose=5
).fit(Xs[training == 1], y[training == 1])
Fitting 5 folds for each of 24 candidates, totalling 120 fits
clf_cv.best_params_
{'alpha': 0.05, 'hidden_layer_sizes': (3, 3)}
clf_cv.best_score_
0.8183066269575724
cv_results = pd.DataFrame(clf_cv.cv_results_).sort_values(by="rank_test_AUC")
cv_results.iloc[
0,
[
cv_results.columns.get_loc(c)
for c in ["param_alpha", "param_hidden_layer_sizes"]
],
]
param_alpha 0.05 param_hidden_layer_sizes (3, 3) Name: 22, dtype: object
eval_dat["y_nn_cv"] = clf_cv.predict_proba(Xs)[:, 1]
eval_dat
acctnum | buyer_yes | training | y_lr | y_lasso | y_nn | y_nn_cv | |
---|---|---|---|---|---|---|---|
0 | 10001 | 0 | 1 | 0.020020 | 0.017953 | 0.017703 | 0.013515 |
1 | 10002 | 0 | 0 | 0.017419 | 0.015877 | 0.015819 | 0.016918 |
2 | 10003 | 0 | 1 | 0.017345 | 0.015487 | 0.015429 | 0.012960 |
3 | 10004 | 0 | 0 | 0.073923 | 0.077197 | 0.076124 | 0.076250 |
4 | 10005 | 0 | 1 | 0.021545 | 0.020111 | 0.019782 | 0.018872 |
... | ... | ... | ... | ... | ... | ... | ... |
49995 | 59996 | 0 | 1 | 0.036383 | 0.033690 | 0.032638 | 0.032575 |
49996 | 59997 | 0 | 1 | 0.114187 | 0.112181 | 0.112373 | 0.113510 |
49997 | 59998 | 0 | 1 | 0.181453 | 0.191942 | 0.195078 | 0.208586 |
49998 | 59999 | 0 | 0 | 0.124892 | 0.132982 | 0.134921 | 0.129601 |
49999 | 60000 | 0 | 1 | 0.564255 | 0.591839 | 0.586895 | 0.628055 |
50000 rows × 7 columns
The evalbin command below requires pyrsm >= 0.4.2. Install from a terminal in Jupyter using the command below, and then restart the notebook kernel:
pip3 install --user 'pyrsm>=0.4.2'
dct = {
"training": eval_dat[eval_dat.training == 1],
"test": eval_dat[eval_dat.training == 0],
}
rsm.evalbin(
dct, "buyer_yes", 1, ["y_lr", "y_lasso", "y_nn", "y_nn_cv"], cost=0.5, margin=6
)
Type | predictor | TP | FP | TN | FN | total | TPR | TNR | precision | Fscore | accuracy | kappa | profit | index | ROME | contact | AUC | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | training | y_lr | 2356 | 8624 | 23211 | 809 | 35000 | 0.744 | 0.729 | 0.215 | 0.333 | 0.730 | 0.224 | 8646.0 | 0.983 | 1.575 | 0.314 | 0.814 |
1 | training | y_lasso | 2342 | 8501 | 23334 | 823 | 35000 | 0.740 | 0.733 | 0.216 | 0.334 | 0.734 | 0.226 | 8630.5 | 0.981 | 1.592 | 0.310 | 0.814 |
2 | training | y_nn | 2327 | 8373 | 23462 | 838 | 35000 | 0.735 | 0.737 | 0.217 | 0.336 | 0.737 | 0.228 | 8612.0 | 0.979 | 1.610 | 0.306 | 0.814 |
3 | training | y_nn_cv | 2341 | 8160 | 23675 | 824 | 35000 | 0.740 | 0.744 | 0.223 | 0.343 | 0.743 | 0.236 | 8795.5 | 1.000 | 1.675 | 0.300 | 0.822 |
4 | test | y_lr | 984 | 3741 | 9902 | 373 | 15000 | 0.725 | 0.726 | 0.208 | 0.324 | 0.726 | 0.213 | 3541.5 | 0.960 | 1.499 | 0.315 | 0.806 |
5 | test | y_lasso | 980 | 3678 | 9965 | 377 | 15000 | 0.722 | 0.730 | 0.210 | 0.326 | 0.730 | 0.216 | 3551.0 | 0.963 | 1.525 | 0.311 | 0.806 |
6 | test | y_nn | 979 | 3631 | 10012 | 378 | 15000 | 0.721 | 0.734 | 0.212 | 0.328 | 0.733 | 0.219 | 3569.0 | 0.968 | 1.548 | 0.307 | 0.806 |
7 | test | y_nn_cv | 994 | 3559 | 10084 | 363 | 15000 | 0.732 | 0.739 | 0.218 | 0.336 | 0.739 | 0.229 | 3687.5 | 1.000 | 1.620 | 0.304 | 0.813 |
Estimate a basic decision tree model
clf = tree.DecisionTreeClassifier(max_depth=2).fit(X[training == 1], y[training == 1])
# class_names must follow the order of clf.classes_ ([0, 1]), i.e., ["no", "yes"]
fig = tree.plot_tree(clf, feature_names=evar, class_names=["no", "yes"], filled=True)
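graphviz was imported at the top but is not used above. A sketch of how the same fitted tree could be rendered through it as an alternative to tree.plot_tree:
# alternative rendering of the same fitted tree via graphviz
dot = tree.export_graphviz(
    clf, out_file=None, feature_names=evar, class_names=["no", "yes"], filled=True
)
graphviz.Source(dot)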
Estimate a larger tree that is likely to overfit the data
clf = tree.DecisionTreeClassifier(max_depth=20).fit(X[training == 1], y[training == 1])
Predict for the entire dataset
eval_dat["y_tree"] = clf.predict_proba(X)[:, 1]
param_grid = {"max_depth": list(range(2, 20))}
scoring = {"AUC": "roc_auc"}
clf_cv = GridSearchCV(
clf, param_grid, scoring=scoring, cv=5, n_jobs=4, refit="AUC", verbose=5
).fit(X[training == 1], y[training == 1])
Fitting 5 folds for each of 18 candidates, totalling 90 fits
clf_cv.best_params_
{'max_depth': 7}
clf_cv.best_score_
0.7725642016211652
cv_results = pd.DataFrame(clf_cv.cv_results_).sort_values(by="rank_test_AUC")
cv_results.iloc[0, cv_results.columns.get_loc("param_max_depth")]
7
eval_dat["y_tree_cv"] = clf_cv.predict_proba(X)[:, 1]
How many features to consider at each node split? Use sqrt(nr_columns)
as an approximation
sqrt(X.shape[1])
3.3166247903554
clf = RandomForestClassifier(
n_estimators=100, max_features=3, oob_score=True, random_state=1234
).fit(X[training == 1], y[training == 1])
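As an aside, sklearn can compute this for you: passing max_features="sqrt" to RandomForestClassifier uses the square root of the number of features at each split, equivalent in spirit to the hard-coded max_features=3 above (a sketch, not used further below):
clf_sqrt = RandomForestClassifier(
    n_estimators=100, max_features="sqrt", oob_score=True, random_state=1234
).fit(X[training == 1], y[training == 1])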
If we do not use the OOB values for the training-sample predictions, check out the (inflated) AUC value in the training sample!
eval_dat["y_rf"] = clf.predict_proba(X)[:, 1]
eval_dat["y_rf_oob"] = eval_dat.y_rf
eval_dat.loc[training == 1, "y_rf_oob"] = clf.oob_decision_function_[:, 1]
dct = {
"training": eval_dat[eval_dat.training == 1],
"test": eval_dat[eval_dat.training == 0],
}
rsm.evalbin(dct, "buyer_yes", 1, ["y_rf", "y_rf_oob"], cost=0.5, margin=6)
Type | predictor | TP | FP | TN | FN | total | TPR | TNR | precision | Fscore | accuracy | kappa | profit | index | ROME | contact | AUC | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | training | y_rf | 3165 | 3112 | 28723 | 0 | 35000 | 1.000 | 0.902 | 0.504 | 0.670 | 0.911 | 0.625 | 15851.5 | 1.000 | 5.051 | 0.179 | 1.000 |
1 | training | y_rf_oob | 2182 | 9691 | 22144 | 983 | 35000 | 0.689 | 0.696 | 0.184 | 0.290 | 0.695 | 0.172 | 7155.5 | 0.451 | 1.205 | 0.339 | 0.750 |
2 | test | y_rf | 945 | 4093 | 9550 | 412 | 15000 | 0.696 | 0.700 | 0.188 | 0.296 | 0.700 | 0.178 | 3151.0 | 1.000 | 1.251 | 0.336 | 0.761 |
3 | test | y_rf_oob | 945 | 4093 | 9550 | 412 | 15000 | 0.696 | 0.700 | 0.188 | 0.296 | 0.700 | 0.178 | 3151.0 | 1.000 | 1.251 | 0.336 | 0.761 |
Random Forest with cross validation and grid search
clf = RandomForestClassifier()
param_grid = {
"n_estimators": list(range(100, 501, 100)),
"max_features": range(3, 5),
}
scoring = {"AUC": "roc_auc"}
clf_cv = GridSearchCV(
clf, param_grid, scoring=scoring, cv=5, n_jobs=4, refit="AUC", verbose=5
).fit(X[training == 1], y[training == 1])
Fitting 5 folds for each of 10 candidates, totalling 50 fits
clf_cv.best_params_
{'max_features': 3, 'n_estimators': 400}
clf_cv.best_score_
0.7651445756915533
If we do not use the OOB values, again, note the (inflated) AUC value!
pred = clf_cv.predict_proba(X[training == 1])
fpr, tpr, thresholds = metrics.roc_curve(y[training == 1].values, pred[:, 1])
auc_rf = metrics.auc(fpr, tpr)
auc_rf
1.0
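For contrast, a quick sketch of the same AUC calculation on the held-out test data, where the in-sample inflation disappears:
# AUC on the 30% test split gives a fairer read on generalization than the 1.0 above
pred_test = clf_cv.predict_proba(X[training == 0])
fpr, tpr, thresholds = metrics.roc_curve(y[training == 0], pred_test[:, 1])
metrics.auc(fpr, tpr)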
If we want to use the OOB scores for the training data instead, we have to re-estimate because it is not possible to pass the 'oob_score' option when using GridSearchCV
clf = RandomForestClassifier(
n_estimators=clf_cv.best_params_["n_estimators"],
max_features=clf_cv.best_params_["max_features"],
oob_score=True,
random_state=1234,
).fit(X[training == 1], y[training == 1])
pred = clf.oob_decision_function_
fpr, tpr, thresholds = metrics.roc_curve(y[training == 1], pred[:, 1])
auc_rf = metrics.auc(fpr, tpr)
auc_rf
0.7626719774230822
# using the oob predictions for the training sample
eval_dat["y_rf_cv"] = 0 # set some initial value
eval_dat.loc[training == 1, "y_rf_cv"] = clf.oob_decision_function_[:, 1]
eval_dat.loc[training == 0, "y_rf_cv"] = clf.predict_proba(X[training == 0])[:, 1]
dct = {
"training": eval_dat[eval_dat.training == 1],
"test": eval_dat[eval_dat.training == 0],
}
rsm.evalbin(dct, "buyer_yes", 1, ["y_rf", "y_rf_oob", "y_rf_cv"], cost=0.5, margin=6)
Type | predictor | TP | FP | TN | FN | total | TPR | TNR | precision | Fscore | accuracy | kappa | profit | index | ROME | contact | AUC | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | training | y_rf | 3165 | 3112 | 28723 | 0 | 35000 | 1.000 | 0.902 | 0.504 | 0.670 | 0.911 | 0.625 | 15851.5 | 1.000 | 5.051 | 0.179 | 1.000 |
1 | training | y_rf_oob | 2182 | 9691 | 22144 | 983 | 35000 | 0.689 | 0.696 | 0.184 | 0.290 | 0.695 | 0.172 | 7155.5 | 0.451 | 1.205 | 0.339 | 0.750 |
2 | training | y_rf_cv | 2220 | 9610 | 22225 | 945 | 35000 | 0.701 | 0.698 | 0.188 | 0.296 | 0.698 | 0.179 | 7405.0 | 0.467 | 1.252 | 0.338 | 0.763 |
3 | test | y_rf | 945 | 4093 | 9550 | 412 | 15000 | 0.696 | 0.700 | 0.188 | 0.296 | 0.700 | 0.178 | 3151.0 | 0.996 | 1.251 | 0.336 | 0.761 |
4 | test | y_rf_oob | 945 | 4093 | 9550 | 412 | 15000 | 0.696 | 0.700 | 0.188 | 0.296 | 0.700 | 0.178 | 3151.0 | 0.996 | 1.251 | 0.336 | 0.761 |
5 | test | y_rf_cv | 948 | 4100 | 9543 | 409 | 15000 | 0.699 | 0.699 | 0.188 | 0.296 | 0.699 | 0.179 | 3164.0 | 1.000 | 1.254 | 0.337 | 0.766 |
clf = xgb.XGBClassifier(
max_depth=2,
n_estimators=500,
objective="binary:logistic",
use_label_encoder=False,
# early_stopping_rounds=10,
eval_metric="auc",
random_state=1234,
).fit(X[training == 1], y[training == 1], verbose=True)
# predict for the entire dataset
eval_dat["y_xgb"] = clf.predict_proba(X)[:, 1]
# XGBoost with cross-validation and grid search
clf = xgb.XGBClassifier()
param_grid = {
"max_depth": list(range(1, 3)),
"n_estimators": list(range(100, 301, 100)),
}
scoring = {"AUC": "roc_auc"}
clf_cv = GridSearchCV(
clf, param_grid, scoring=scoring, cv=5, n_jobs=4, refit="AUC", verbose=5
).fit(X[training == 1], y[training == 1])
Fitting 5 folds for each of 6 candidates, totalling 30 fits
/usr/local/lib/python3.8/dist-packages/xgboost/sklearn.py:888: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1]. warnings.warn(label_encoder_deprecation_msg, UserWarning)
[14:19:21] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
print(clf_cv.best_params_)
print(clf_cv.best_score_)
{'max_depth': 1, 'n_estimators': 200} 0.8112101026446842
# predict for the entire dataset
eval_dat["y_xgb_cv"] = clf_cv.predict_proba(X)[:, 1]
# performance evaluations
models = list(eval_dat.columns.values[3:])
models
['y_lr', 'y_lasso', 'y_nn', 'y_nn_cv', 'y_tree', 'y_tree_cv', 'y_rf', 'y_rf_oob', 'y_rf_cv', 'y_xgb', 'y_xgb_cv']
# no meaningful difference in performance visible
fig = rsm.gains_plot(eval_dat.query("training == 0"), "buyer_yes", 1, models[0:4])
fig = rsm.gains_plot(eval_dat.query("training == 0"), "buyer_yes", 1, models[4:])
Summarize performance across all models in both training and test
dct = {
"training": eval_dat[eval_dat.training == 1],
"test": eval_dat[eval_dat.training == 0],
}
perf = rsm.evalbin(dct, "buyer_yes", 1, models, cost=0.5, margin=6)
perf
Type | predictor | TP | FP | TN | FN | total | TPR | TNR | precision | Fscore | accuracy | kappa | profit | index | ROME | contact | AUC | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | training | y_lr | 2356 | 8624 | 23211 | 809 | 35000 | 0.744 | 0.729 | 0.215 | 0.333 | 0.730 | 0.224 | 8646.0 | 0.545 | 1.575 | 0.314 | 0.814 |
1 | training | y_lasso | 2342 | 8501 | 23334 | 823 | 35000 | 0.740 | 0.733 | 0.216 | 0.334 | 0.734 | 0.226 | 8630.5 | 0.544 | 1.592 | 0.310 | 0.814 |
2 | training | y_nn | 2327 | 8373 | 23462 | 838 | 35000 | 0.735 | 0.737 | 0.217 | 0.336 | 0.737 | 0.228 | 8612.0 | 0.543 | 1.610 | 0.306 | 0.814 |
3 | training | y_nn_cv | 2341 | 8160 | 23675 | 824 | 35000 | 0.740 | 0.744 | 0.223 | 0.343 | 0.743 | 0.236 | 8795.5 | 0.555 | 1.675 | 0.300 | 0.822 |
4 | training | y_tree | 3006 | 1528 | 30307 | 159 | 35000 | 0.950 | 0.952 | 0.663 | 0.781 | 0.952 | 0.755 | 15769.0 | 0.995 | 6.956 | 0.130 | 0.994 |
5 | training | y_tree_cv | 2214 | 7419 | 24416 | 951 | 35000 | 0.700 | 0.767 | 0.230 | 0.346 | 0.761 | 0.243 | 8467.5 | 0.534 | 1.758 | 0.275 | 0.804 |
6 | training | y_rf | 3165 | 3112 | 28723 | 0 | 35000 | 1.000 | 0.902 | 0.504 | 0.670 | 0.911 | 0.625 | 15851.5 | 1.000 | 5.051 | 0.179 | 1.000 |
7 | training | y_rf_oob | 2182 | 9691 | 22144 | 983 | 35000 | 0.689 | 0.696 | 0.184 | 0.290 | 0.695 | 0.172 | 7155.5 | 0.451 | 1.205 | 0.339 | 0.750 |
8 | training | y_rf_cv | 2220 | 9610 | 22225 | 945 | 35000 | 0.701 | 0.698 | 0.188 | 0.296 | 0.698 | 0.179 | 7405.0 | 0.467 | 1.252 | 0.338 | 0.763 |
9 | training | y_xgb | 2506 | 8052 | 23783 | 659 | 35000 | 0.792 | 0.747 | 0.237 | 0.365 | 0.751 | 0.263 | 9757.0 | 0.616 | 1.848 | 0.302 | 0.853 |
10 | training | y_xgb_cv | 2388 | 8707 | 23128 | 777 | 35000 | 0.755 | 0.726 | 0.215 | 0.335 | 0.729 | 0.226 | 8780.5 | 0.554 | 1.583 | 0.317 | 0.816 |
11 | test | y_lr | 984 | 3741 | 9902 | 373 | 15000 | 0.725 | 0.726 | 0.208 | 0.324 | 0.726 | 0.213 | 3541.5 | 0.960 | 1.499 | 0.315 | 0.806 |
12 | test | y_lasso | 980 | 3678 | 9965 | 377 | 15000 | 0.722 | 0.730 | 0.210 | 0.326 | 0.730 | 0.216 | 3551.0 | 0.963 | 1.525 | 0.311 | 0.806 |
13 | test | y_nn | 979 | 3631 | 10012 | 378 | 15000 | 0.721 | 0.734 | 0.212 | 0.328 | 0.733 | 0.219 | 3569.0 | 0.968 | 1.548 | 0.307 | 0.806 |
14 | test | y_nn_cv | 994 | 3559 | 10084 | 363 | 15000 | 0.732 | 0.739 | 0.218 | 0.336 | 0.739 | 0.229 | 3687.5 | 1.000 | 1.620 | 0.304 | 0.813 |
15 | test | y_tree | 434 | 1605 | 12038 | 923 | 15000 | 0.320 | 0.882 | 0.213 | 0.256 | 0.831 | 0.165 | 1584.5 | 0.430 | 1.554 | 0.136 | 0.587 |
16 | test | y_tree_cv | 881 | 3222 | 10421 | 476 | 15000 | 0.649 | 0.764 | 0.215 | 0.323 | 0.753 | 0.216 | 3234.5 | 0.877 | 1.577 | 0.274 | 0.773 |
17 | test | y_rf | 945 | 4093 | 9550 | 412 | 15000 | 0.696 | 0.700 | 0.188 | 0.296 | 0.700 | 0.178 | 3151.0 | 0.855 | 1.251 | 0.336 | 0.761 |
18 | test | y_rf_oob | 945 | 4093 | 9550 | 412 | 15000 | 0.696 | 0.700 | 0.188 | 0.296 | 0.700 | 0.178 | 3151.0 | 0.855 | 1.251 | 0.336 | 0.761 |
19 | test | y_rf_cv | 948 | 4100 | 9543 | 409 | 15000 | 0.699 | 0.699 | 0.188 | 0.296 | 0.699 | 0.179 | 3164.0 | 0.858 | 1.254 | 0.337 | 0.766 |
20 | test | y_xgb | 975 | 3576 | 10067 | 382 | 15000 | 0.718 | 0.738 | 0.214 | 0.330 | 0.736 | 0.222 | 3574.5 | 0.969 | 1.571 | 0.303 | 0.799 |
21 | test | y_xgb_cv | 993 | 3786 | 9857 | 364 | 15000 | 0.732 | 0.722 | 0.208 | 0.324 | 0.723 | 0.213 | 3568.5 | 0.968 | 1.493 | 0.319 | 0.805 |
# sort on any metric you like to get the "best" model
perf[perf.Type == "test"].sort_values(by="profit", ascending=False).reset_index(
drop=True
)
Type | predictor | TP | FP | TN | FN | total | TPR | TNR | precision | Fscore | accuracy | kappa | profit | index | ROME | contact | AUC | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | test | y_nn_cv | 994 | 3559 | 10084 | 363 | 15000 | 0.732 | 0.739 | 0.218 | 0.336 | 0.739 | 0.229 | 3687.5 | 1.000 | 1.620 | 0.304 | 0.813 |
1 | test | y_xgb | 975 | 3576 | 10067 | 382 | 15000 | 0.718 | 0.738 | 0.214 | 0.330 | 0.736 | 0.222 | 3574.5 | 0.969 | 1.571 | 0.303 | 0.799 |
2 | test | y_nn | 979 | 3631 | 10012 | 378 | 15000 | 0.721 | 0.734 | 0.212 | 0.328 | 0.733 | 0.219 | 3569.0 | 0.968 | 1.548 | 0.307 | 0.806 |
3 | test | y_xgb_cv | 993 | 3786 | 9857 | 364 | 15000 | 0.732 | 0.722 | 0.208 | 0.324 | 0.723 | 0.213 | 3568.5 | 0.968 | 1.493 | 0.319 | 0.805 |
4 | test | y_lasso | 980 | 3678 | 9965 | 377 | 15000 | 0.722 | 0.730 | 0.210 | 0.326 | 0.730 | 0.216 | 3551.0 | 0.963 | 1.525 | 0.311 | 0.806 |
5 | test | y_lr | 984 | 3741 | 9902 | 373 | 15000 | 0.725 | 0.726 | 0.208 | 0.324 | 0.726 | 0.213 | 3541.5 | 0.960 | 1.499 | 0.315 | 0.806 |
6 | test | y_tree_cv | 881 | 3222 | 10421 | 476 | 15000 | 0.649 | 0.764 | 0.215 | 0.323 | 0.753 | 0.216 | 3234.5 | 0.877 | 1.577 | 0.274 | 0.773 |
7 | test | y_rf_cv | 948 | 4100 | 9543 | 409 | 15000 | 0.699 | 0.699 | 0.188 | 0.296 | 0.699 | 0.179 | 3164.0 | 0.858 | 1.254 | 0.337 | 0.766 |
8 | test | y_rf | 945 | 4093 | 9550 | 412 | 15000 | 0.696 | 0.700 | 0.188 | 0.296 | 0.700 | 0.178 | 3151.0 | 0.855 | 1.251 | 0.336 | 0.761 |
9 | test | y_rf_oob | 945 | 4093 | 9550 | 412 | 15000 | 0.696 | 0.700 | 0.188 | 0.296 | 0.700 | 0.178 | 3151.0 | 0.855 | 1.251 | 0.336 | 0.761 |
10 | test | y_tree | 434 | 1605 | 12038 | 923 | 15000 | 0.320 | 0.882 | 0.213 | 0.256 | 0.831 | 0.165 | 1584.5 | 0.430 | 1.554 | 0.136 | 0.587 |
# compare gains from decision tree in training vs test
dct = {
"training": eval_dat[eval_dat.training == 1],
"test": eval_dat[eval_dat.training == 0],
}
fig = rsm.gains_plot(dct, "buyer_yes", 1, "y_tree")
# compare gains for RandomForest in training vs test without OOB
dct = {
"training": eval_dat[eval_dat.training == 1],
"test": eval_dat[eval_dat.training == 0],
}
fig = rsm.gains_plot(dct, "buyer_yes", 1, "y_rf")
# compare gains for RandomForest (OOB) in training vs test with OOB
dct = {
"training": eval_dat[eval_dat.training == 1],
"test": eval_dat[eval_dat.training == 0],
}
fig = rsm.gains_plot(dct, "buyer_yes", 1, "y_rf_oob")
When using any machine learning model you should always create plots of (1) the relative importance of variables and (2) the direction of the effect. Use permutation importance for the importance plot and partial dependence plots to get a sense of the direction of the effect (positive, negative, non-linear)
clf = xgb.XGBClassifier(
max_depth=2,
n_estimators=500,
objective="binary:logistic",
use_label_encoder=False,
# early_stopping_rounds=10,
eval_metric="auc",
random_state=1234,
).fit(X[training == 1], y[training == 1], verbose=True)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.inspection import permutation_importance, plot_partial_dependence
def importance(clf, X, y, cn):
imp = permutation_importance(
clf, X, y, scoring="roc_auc", n_repeats=10, random_state=1234
)
data = pd.DataFrame(imp.importances.T)
data.columns = cn
order = data.agg("mean").sort_values(ascending=False).index
fig = sns.barplot(
x="value", y="variable", color="slateblue", data=pd.melt(data[order])
)
fig.set(title="Permutation Importances", xlabel=None, ylabel=None)
return fig
fig = importance(clf, X, y, X.columns)
fig, ax = plt.subplots(figsize=(8, 14))
ax.set_title("Partial Dependence Plots")
fig = plot_partial_dependence(clf, X, X.columns, ax=ax)
The first plot below can be used to indicate possible interaction effects
fig, ax = plt.subplots(figsize=(8, 4))
ax.set_title("Partial Dependence Plots")
fig = plot_partial_dependence(
clf, X, ["art", "gender_male", ("art", "gender_male")], ax=ax
)
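Note that plot_partial_dependence was deprecated in later sklearn releases. On a newer installation the same plots can be produced with PartialDependenceDisplay (a sketch, assuming sklearn >= 1.0):
from sklearn.inspection import PartialDependenceDisplay

# newer-sklearn equivalent of plot_partial_dependence, including the interaction panel
fig, ax = plt.subplots(figsize=(8, 4))
ax.set_title("Partial Dependence Plots")
PartialDependenceDisplay.from_estimator(
    clf, X, ["art", "gender_male", ("art", "gender_male")], ax=ax
)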