import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyrsm as rsm
import seaborn as sns
import statsmodels.formula.api as smf
from statsmodels.genmod.families import Binomial
from statsmodels.genmod.families.links import logit

# increase plot resolution
mpl.rcParams["figure.dpi"] = 150


bbb = pd.read_pickle("data/bbb_pre.pkl")
bbb


bbb[np.isin(bbb.buyer, ["yes", "no"])].shape

(50000, 20)


bbb["buyer_yes"] = rsm.ifelse(
    bbb.buyer == "yes", 1, rsm.ifelse(bbb.buyer == "no", 0, np.NaN)
)
rsm.format_nr(np.isnan(bbb.buyer_yes).sum())

'500,000'


# testing against the bbb example from week 3
# bbb = bbb.query("training == 1")
# cost = 0.5
# margin = 6


cost = 0.99
margin = 9
breakeven = cost / margin
breakeven

0.11


train = bbb.training == 1
pd.crosstab(
    index=bbb.loc[train, "buyer"], columns="Response rate", normalize="columns"
).transform(lambda x: (100 * x).round(2).astype(str) + "%")


bbb = bbb.assign(
    rec_iq=rsm.xtile(bbb["last"], 5),
    freq_iq=rsm.xtile(bbb["purch"], 5, rev=True),
    mon_iq=rsm.xtile(bbb["total"], 5, rev=True),
)


fig = rsm.prop_plot(bbb[train], "rec_iq", "buyer", "yes")


fig = rsm.prop_plot(bbb[train], "freq_iq", "buyer", "yes")


fig = rsm.prop_plot(bbb[train], "mon_iq", "buyer", "yes")


bbb["rfm_iq"] = (
    bbb.rec_iq.astype(str) + bbb.freq_iq.astype(str) + bbb.mon_iq.astype(str)
)


bbb["freq_sq"] = bbb.groupby("rec_iq")["purch"].transform(rsm.xtile, 5, rev=True)
bbb["mon_sq"] = bbb.groupby(["rec_iq", "freq_sq"])["total"].transform(
    rsm.xtile, 5, rev=True
)


bbb["rfm_sq"] = (
    bbb.rec_iq.astype(str) + bbb.freq_sq.astype(str) + bbb.mon_sq.astype(str)
)


plt.figure(figsize=(16, 7))
fig = rsm.prop_plot(
    bbb.query("training == 1"), "rfm_iq", "buyer", "yes", breakeven=breakeven
)
fig.set_xticklabels(fig.get_xticklabels(), rotation=90)
fig = fig.set(xlabel="Independent RFM index (rsm_iq)")


plt.figure(figsize=(16, 7))
fig = rsm.prop_plot(
    bbb.query("training == 1"), "rfm_sq", "buyer", "yes", breakeven=breakeven
)
fig.set_xticklabels(fig.get_xticklabels(), rotation=90)
fig = fig.set(xlabel="Sequential RFM index (rsm_sq)")


lr = smf.glm(
    formula="buyer_yes ~ rfm_sq",
    family=Binomial(link=logit()),
    data=bbb[train],
).fit()


rsm.or_ci(lr)


bbb["resp_logit"] = lr.predict(bbb)


rfm_logit = bbb.groupby("rfm_sq")["resp_logit"].agg("mean").reset_index()
plt.figure(figsize=(16, 7))
fig = sns.barplot(x="rfm_sq", y="resp_logit", color="slateblue", data=rfm_logit)
fig.axhline(breakeven, linestyle="dashed", linewidth=1)
fig.set_xticklabels(fig.get_xticklabels(), rotation=90)
fig = fig.set(
    xlabel="Sequential RFM index (rsm_sq)",
    ylabel="Predicted probability of buyer equal to 'yes'",
)


def perf_calc(
    df, intro="", mail=None, perf="buyer", lev="yes", cost=cost, margin=margin, prn=True
):
    """
    A function to calculate performance stats for different targeting methods

    Parameters
    ----------
    df : Pandas dataframe
    intro: str
        Text introduction to use for printed performance output
    mail: str
        Column name of the mailing variable to use
    perf: str
        Column name of the performance variable to use
    lev : str
        Level in performance variable to use
    prn : bool
        Print performance output (True or False)

    Returns
    -------
    profit : float
        Estimate of the profit from the selected targeting strategy applied to the full customer database
    ROME : float
        Estimate of theh return on marketing investment (ROME) from the selected targeting strategy applied to the full customer database
    """

    rep_rate = np.nanmean(df.loc[df[mail] & (df.training == 1), perf] == lev)
    nr_mail = df[df[mail] & (df.training == 0)].shape[0]
    perc_mail = nr_mail / (df.training == 0).sum()
    
    nr_resp = nr_mail * rep_rate
    mail_cost = cost * nr_mail
    profit = margin * nr_resp - mail_cost
    ROME = profit / mail_cost

    if prn:
        prn_output = f"""{intro} the number of customers BBB should mail is {int(nr_mail):,} ({round((100 * perc_mail), 2)}%). \
The response rate for the selected customers is predicted to be {round((100 * rep_rate), 2)}% or {int(nr_resp):,} \
buyers. The expected profit is ${int(profit):,}. The mailing \
cost is estimated to be ${int(mail_cost):,} with a ROME of {round((100 * ROME), 2)}%"""

        print(prn_output)

    return profit, ROME


profit_nt, ROME_nt = perf_calc(
    bbb.assign(mailto_nt=True), mail="mailto_nt", intro="If bbb does not apply targeting"
)

If bbb does not apply targeting the number of customers BBB should mail is 500,000 (100.0%). The response rate for the selected customers is predicted to be 9.04% or 45,220 buyers. The expected profit is $-88,020. The mailing cost is estimated to be $495,000 with a ROME of -17.78%


def mailto(x, breakeven):
    return np.nanmean(x) > breakeven


bbb["mailto_iq"] = bbb.groupby("rfm_iq").buyer_yes.transform(
    mailto, breakeven=breakeven
)

profit_iq, ROME_iq = perf_calc(bbb, mail="mailto_iq", intro="Based on independent RFM")

Based on independent RFM the number of customers BBB should mail is 169,753 (33.95%). The response rate for the selected customers is predicted to be 15.71% or 26,663 buyers. The expected profit is $71,914. The mailing cost is estimated to be $168,055 with a ROME of 42.79%


bbb["mailto_sq"] = bbb.groupby("rfm_sq").buyer_yes.transform(
    mailto, breakeven=breakeven
)
profit_sq, ROME_sq = perf_calc(bbb, mail="mailto_sq", intro="Based on sequential RFM")

Based on sequential RFM the number of customers BBB should mail is 172,252 (34.45%). The response rate for the selected customers is predicted to be 15.72% or 27,076 buyers. The expected profit is $73,154. The mailing cost is estimated to be $170,529 with a ROME of 42.9%


bbb["mailto_logit"] = bbb.resp_logit > breakeven
profit_lr, ROME_lr = perf_calc(
    bbb, mail="mailto_logit", intro="Based on logistic regression"
)

Based on logistic regression the number of customers BBB should mail is 172,252 (34.45%). The response rate for the selected customers is predicted to be 15.72% or 27,076 buyers. The expected profit is $73,154. The mailing cost is estimated to be $170,529 with a ROME of 42.9%


# bbb[(bbb.mailto_sq != bbb.mailto_logit)]


bbb["resp_rfm_iq"] = bbb.groupby("rfm_iq")["buyer_yes"].transform(np.nanmean)
bbb["resp_rfm_sq"] = bbb.groupby("rfm_sq")["buyer_yes"].transform(np.nanmean)
bbb["breakeven"] = breakeven


dat = pd.DataFrame(
    {
        "name": ["No targeting", "Indep. RFM", "Seq. RFM", "Logit"],
        "Profit": [profit_nt, profit_iq, profit_sq, profit_lr],
        "ROME": [ROME_nt, ROME_iq, ROME_sq, ROME_lr],
    }
)


plt.figure(figsize=(8, 5))
fig = sns.barplot(x="name", y="Profit", color="slateblue", data=dat)
fig.set(xlabel="", ylabel="Profit", title="Campaign profit")
for index, row in dat.iterrows():
    fig.text(
        row.name, row.Profit * 0.8, f"{row.Profit:,.0f}", ha="center", color="white"
    )


plt.figure(figsize=(8, 5))
fig = sns.barplot(x="name", y="ROME", color="slateblue", data=dat)
fig.set(xlabel="", ylabel="ROME", title="Return on Marketing Expenditures (ROME)")
for index, row in dat.iterrows():
    fig.text(
        row.name,
        row.ROME * 0.8,
        f"{round((100*row.ROME), 2):,}%",
        ha="center",
        color="white",
    )


bbb_train = bbb[train].copy()


bbb_train["resp_rfm_iq"] = bbb_train.groupby("rfm_iq")["buyer_yes"].transform(
    np.nanmean
)
bbb_train["resp_rfm_sq"] = bbb_train.groupby("rfm_sq")["buyer_yes"].transform(
    np.nanmean
)


rsm.calc_qnt(bbb_train, "buyer", "yes", "resp_rfm_sq")


fig = rsm.lift_plot(bbb_train, "buyer", "yes", ["resp_rfm_iq", "resp_rfm_sq"])


fig = rsm.gains_plot(bbb_train, "buyer", "yes", ["resp_rfm_iq", "resp_rfm_sq"])


fig = rsm.profit_plot(
    bbb_train, "buyer", "yes", ["resp_rfm_iq", "resp_rfm_sq"], cost=cost, margin=margin
)


fig = rsm.ROME_plot(
    bbb_train, "buyer", "yes", ["resp_rfm_iq", "resp_rfm_sq"], cost=cost, margin=margin
)


TP, FP, TN, FN, contact = rsm.confusion(
    bbb_train, "buyer", "yes", "resp_rfm_sq", cost=cost, margin=margin
)
print(f"TP: {TP}\nFP: {FP}\nTN: {TN}\nFN: {FN}")

TP: 2706
FP: 14509
TN: 30969
FN: 1816


evb = rsm.evalbin(
    bbb_train, "buyer", "yes", ["resp_rfm_iq", "resp_rfm_sq"], cost=cost, margin=margin
)
evb


to_contact = bbb.loc[
    (bbb.training == 0) & bbb.mailto_sq,
    ["acctnum", "buyer_yes", "resp_rfm_sq", "breakeven", "mailto_sq"],
]
to_contact.to_csv("rfm_sq_to_contact.csv")
to_contact

	acctnum	gender	state	zip	zip3	first	last	book	nonbook	total	purch	child	youth	cook	do_it	reference	art	geog	buyer	training
0	U60071689	M	NY	10605	106	49	29	109	248	357	10	3	2	2	0	1	0	2	no	1.0
1	U36246702	M	NY	10960	109	39	27	35	103	138	3	0	1	0	1	0	0	1	no	1.0
2	U68717972	F	PA	19146	191	19	15	25	147	172	2	0	0	2	0	0	0	0	no	1.0
3	U19435243	F	NJ	07016	070	7	7	15	257	272	1	0	0	0	0	1	0	0	no	1.0
4	U10648559	F	NY	10804	108	15	15	15	134	149	1	0	0	1	0	0	0	0	no	1.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
549995	U69845166	F	NY	11354	113	17	17	15	246	261	1	0	0	1	0	0	0	0	NaN	0.0
549996	U77937363	F	NJ	07052	070	37	11	75	328	403	7	2	3	0	0	0	2	0	NaN	0.0
549997	U68727813	F	NY	11215	112	5	3	28	69	97	2	0	0	0	0	1	1	0	NaN	0.0
549998	U41521063	F	NY	12801	128	33	13	46	235	281	4	1	1	1	0	0	0	1	NaN	0.0
549999	U48174176	F	NY	10314	103	19	15	28	171	199	2	0	0	1	0	0	1	0	NaN	0.0

	index	OR	OR%	2.5%	97.5%	p.values
1	rfm_sq[T.112]	0.948	-5.2%	0.674	1.335	0.761
2	rfm_sq[T.113]	0.953	-4.7%	0.679	1.340	0.784
3	rfm_sq[T.114]	0.925	-7.5%	0.657	1.301	0.654
4	rfm_sq[T.115]	0.893	-10.7%	0.631	1.262	0.52
5	rfm_sq[T.121]	0.622	-37.8%	0.446	0.866	0.005	**
...	...	...	...	...	...	...	...
100	rfm_sq[T.551]	0.066	-93.4%	0.035	0.126	< .001	***
101	rfm_sq[T.552]	0.091	-90.9%	0.052	0.160	< .001	***
102	rfm_sq[T.553]	0.041	-95.9%	0.019	0.089	< .001	***
103	rfm_sq[T.554]	0.029	-97.1%	0.012	0.072	< .001	***
104	rfm_sq[T.555]	0.046	-95.4%	0.022	0.095	< .001	***

	bins	nr_obs	nr_resp	cum_obs	cum_prop	cum_resp
9	10	4961	1035	4961	0.09922	1035
8	9	4793	745	9754	0.19508	1780
7	8	5231	674	14985	0.29970	2454
6	7	4827	512	19812	0.39624	2966
5	6	5179	448	24991	0.49982	3414
4	5	4958	344	29949	0.59898	3758
3	4	4749	264	34698	0.69396	4022
2	3	4998	243	39696	0.79392	4265
1	2	4979	173	44675	0.89350	4438
0	1	5325	84	50000	1.00000	4522

	Type	predictor	TP	FP	TN	FN	total	TPR	TNR	precision	Fscore	accuracy	kappa	profit	index	ROME	contact	AUC
0	All	resp_rfm_iq	2660	14275	31203	1862	50000	0.588	0.686	0.157	0.248	0.677	0.123	7174.35	0.981	0.428	0.339	0.693
1	All	resp_rfm_sq	2706	14509	30969	1816	50000	0.598	0.681	0.157	0.249	0.674	0.123	7311.15	1.000	0.429	0.344	0.694

	acctnum	buyer_yes	resp_rfm_sq	breakeven	mailto_sq
50003	U12745070	NaN	0.230978	0.11	True
50007	U48503714	NaN	0.235457	0.11	True
50015	U83794078	NaN	0.137109	0.11	True
50016	U34212211	NaN	0.125364	0.11	True
50018	U93468194	NaN	0.209476	0.11	True
...	...	...	...	...	...
549976	U44928259	NaN	0.156156	0.11	True
549977	U62952648	NaN	0.156156	0.11	True
549985	U86074295	NaN	0.198980	0.11	True
549993	U26301571	NaN	0.190566	0.11	True
549997	U68727813	NaN	0.118976	0.11	True

What percentage of customers bought?¶

Create quintile variables for recency, frequency and monetary.¶

Create bar charts showing the response rate¶

Create a plot with all profit and ROME numbers¶

Evaluate model performance¶

col_0	Response rate
buyer
yes	9.04%
no	90.96%