import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyrsm as rsm
import seaborn as sns
import statsmodels.formula.api as smf
from statsmodels.genmod.families import Binomial
from statsmodels.genmod.families.links import logit
# increase plot resolution
mpl.rcParams["figure.dpi"] = 150
bbb = pd.read_pickle("data/bbb_pre.pkl")
bbb
acctnum | gender | state | zip | zip3 | first | last | book | nonbook | total | purch | child | youth | cook | do_it | reference | art | geog | buyer | training | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | U60071689 | M | NY | 10605 | 106 | 49 | 29 | 109 | 248 | 357 | 10 | 3 | 2 | 2 | 0 | 1 | 0 | 2 | no | 1.0 |
1 | U36246702 | M | NY | 10960 | 109 | 39 | 27 | 35 | 103 | 138 | 3 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | no | 1.0 |
2 | U68717972 | F | PA | 19146 | 191 | 19 | 15 | 25 | 147 | 172 | 2 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | no | 1.0 |
3 | U19435243 | F | NJ | 07016 | 070 | 7 | 7 | 15 | 257 | 272 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | no | 1.0 |
4 | U10648559 | F | NY | 10804 | 108 | 15 | 15 | 15 | 134 | 149 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | no | 1.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
549995 | U69845166 | F | NY | 11354 | 113 | 17 | 17 | 15 | 246 | 261 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | NaN | 0.0 |
549996 | U77937363 | F | NJ | 07052 | 070 | 37 | 11 | 75 | 328 | 403 | 7 | 2 | 3 | 0 | 0 | 0 | 2 | 0 | NaN | 0.0 |
549997 | U68727813 | F | NY | 11215 | 112 | 5 | 3 | 28 | 69 | 97 | 2 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | NaN | 0.0 |
549998 | U41521063 | F | NY | 12801 | 128 | 33 | 13 | 46 | 235 | 281 | 4 | 1 | 1 | 1 | 0 | 0 | 0 | 1 | NaN | 0.0 |
549999 | U48174176 | F | NY | 10314 | 103 | 19 | 15 | 28 | 171 | 199 | 2 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | NaN | 0.0 |
550000 rows × 20 columns
bbb[np.isin(bbb.buyer, ["yes", "no"])].shape
(50000, 20)
bbb["buyer_yes"] = rsm.ifelse(
bbb.buyer == "yes", 1, rsm.ifelse(bbb.buyer == "no", 0, np.NaN)
)
rsm.format_nr(np.isnan(bbb.buyer_yes).sum())
'500,000'
# testing against the bbb example from week 3
# bbb = bbb.query("training == 1")
# cost = 0.5
# margin = 6
cost = 0.99
margin = 9
breakeven = cost / margin
breakeven
0.11
train = bbb.training == 1
pd.crosstab(
index=bbb.loc[train, "buyer"], columns="Response rate", normalize="columns"
).transform(lambda x: (100 * x).round(2).astype(str) + "%")
col_0 | Response rate |
---|---|
buyer | |
yes | 9.04% |
no | 90.96% |
Hint: Review the rfm-bbb.py python code file that walks through the calculations for the Bookbinders RFM analysis in detail. Use the xtile
function from the pyrsm
package to create the recency, frequency, and monetary value quantiles. Note the use of rev=True
for freq_iq
and mon_iq
to ensure the best customers are in the 1st quantile
bbb = bbb.assign(
rec_iq=rsm.xtile(bbb["last"], 5),
freq_iq=rsm.xtile(bbb["purch"], 5, rev=True),
mon_iq=rsm.xtile(bbb["total"], 5, rev=True),
)
The proportion of customers who bought AHOF per recency, frequency, and monetary quintile (i.e., 3 plots)
fig = rsm.prop_plot(bbb[train], "rec_iq", "buyer", "yes")
fig = rsm.prop_plot(bbb[train], "freq_iq", "buyer", "yes")
fig = rsm.prop_plot(bbb[train], "mon_iq", "buyer", "yes")
Create the rfm_iq
index.
bbb["rfm_iq"] = (
bbb.rec_iq.astype(str) + bbb.freq_iq.astype(str) + bbb.mon_iq.astype(str)
)
Next, create the sequential RFM variables using select rec_iq
as the variable to group by for freq_sq
. Select both rec_iq
and freq_sq
as the variables to group by for mon_sq
. Note that we do not need to create a rec_sq
variable as this is equivalent to rec_iq
.
bbb["freq_sq"] = bbb.groupby("rec_iq")["purch"].transform(rsm.xtile, 5, rev=True)
bbb["mon_sq"] = bbb.groupby(["rec_iq", "freq_sq"])["total"].transform(
rsm.xtile, 5, rev=True
)
Next create the rfm_sq
index.
bbb["rfm_sq"] = (
bbb.rec_iq.astype(str) + bbb.freq_sq.astype(str) + bbb.mon_sq.astype(str)
)
The line in the plot below shows the break-even point. Cells with a response rate above 0.0247 are predicted to be profitable.
plt.figure(figsize=(16, 7))
fig = rsm.prop_plot(
bbb.query("training == 1"), "rfm_iq", "buyer", "yes", breakeven=breakeven
)
fig.set_xticklabels(fig.get_xticklabels(), rotation=90)
fig = fig.set(xlabel="Independent RFM index (rsm_iq)")
plt.figure(figsize=(16, 7))
fig = rsm.prop_plot(
bbb.query("training == 1"), "rfm_sq", "buyer", "yes", breakeven=breakeven
)
fig.set_xticklabels(fig.get_xticklabels(), rotation=90)
fig = fig.set(xlabel="Sequential RFM index (rsm_sq)")
lr = smf.glm(
formula="buyer_yes ~ rfm_sq",
family=Binomial(link=logit()),
data=bbb[train],
).fit()
rsm.or_ci(lr)
index | OR | OR% | 2.5% | 97.5% | p.values | ||
---|---|---|---|---|---|---|---|
1 | rfm_sq[T.112] | 0.948 | -5.2% | 0.674 | 1.335 | 0.761 | |
2 | rfm_sq[T.113] | 0.953 | -4.7% | 0.679 | 1.340 | 0.784 | |
3 | rfm_sq[T.114] | 0.925 | -7.5% | 0.657 | 1.301 | 0.654 | |
4 | rfm_sq[T.115] | 0.893 | -10.7% | 0.631 | 1.262 | 0.52 | |
5 | rfm_sq[T.121] | 0.622 | -37.8% | 0.446 | 0.866 | 0.005 | ** |
... | ... | ... | ... | ... | ... | ... | ... |
100 | rfm_sq[T.551] | 0.066 | -93.4% | 0.035 | 0.126 | < .001 | *** |
101 | rfm_sq[T.552] | 0.091 | -90.9% | 0.052 | 0.160 | < .001 | *** |
102 | rfm_sq[T.553] | 0.041 | -95.9% | 0.019 | 0.089 | < .001 | *** |
103 | rfm_sq[T.554] | 0.029 | -97.1% | 0.012 | 0.072 | < .001 | *** |
104 | rfm_sq[T.555] | 0.046 | -95.4% | 0.022 | 0.095 | < .001 | *** |
104 rows × 7 columns
bbb["resp_logit"] = lr.predict(bbb)
rfm_logit = bbb.groupby("rfm_sq")["resp_logit"].agg("mean").reset_index()
plt.figure(figsize=(16, 7))
fig = sns.barplot(x="rfm_sq", y="resp_logit", color="slateblue", data=rfm_logit)
fig.axhline(breakeven, linestyle="dashed", linewidth=1)
fig.set_xticklabels(fig.get_xticklabels(), rotation=90)
fig = fig.set(
xlabel="Sequential RFM index (rsm_sq)",
ylabel="Predicted probability of buyer equal to 'yes'",
)
def perf_calc(
df, intro="", mail=None, perf="buyer", lev="yes", cost=cost, margin=margin, prn=True
):
"""
A function to calculate performance stats for different targeting methods
Parameters
----------
df : Pandas dataframe
intro: str
Text introduction to use for printed performance output
mail: str
Column name of the mailing variable to use
perf: str
Column name of the performance variable to use
lev : str
Level in performance variable to use
prn : bool
Print performance output (True or False)
Returns
-------
profit : float
Estimate of the profit from the selected targeting strategy applied to the full customer database
ROME : float
Estimate of theh return on marketing investment (ROME) from the selected targeting strategy applied to the full customer database
"""
rep_rate = np.nanmean(df.loc[df[mail] & (df.training == 1), perf] == lev)
nr_mail = df[df[mail] & (df.training == 0)].shape[0]
perc_mail = nr_mail / (df.training == 0).sum()
nr_resp = nr_mail * rep_rate
mail_cost = cost * nr_mail
profit = margin * nr_resp - mail_cost
ROME = profit / mail_cost
if prn:
prn_output = f"""{intro} the number of customers BBB should mail is {int(nr_mail):,} ({round((100 * perc_mail), 2)}%). \
The response rate for the selected customers is predicted to be {round((100 * rep_rate), 2)}% or {int(nr_resp):,} \
buyers. The expected profit is ${int(profit):,}. The mailing \
cost is estimated to be ${int(mail_cost):,} with a ROME of {round((100 * ROME), 2)}%"""
print(prn_output)
return profit, ROME
profit_nt, ROME_nt = perf_calc(
bbb.assign(mailto_nt=True), mail="mailto_nt", intro="If bbb does not apply targeting"
)
If bbb does not apply targeting the number of customers BBB should mail is 500,000 (100.0%). The response rate for the selected customers is predicted to be 9.04% or 45,220 buyers. The expected profit is $-88,020. The mailing cost is estimated to be $495,000 with a ROME of -17.78%
Offering the deal only to those the customers in (sequential)
RFM cells with a response rate that is greater than the breakeven response rate. Specifically, follow these steps:
def mailto(x, breakeven):
return np.nanmean(x) > breakeven
bbb["mailto_iq"] = bbb.groupby("rfm_iq").buyer_yes.transform(
mailto, breakeven=breakeven
)
profit_iq, ROME_iq = perf_calc(bbb, mail="mailto_iq", intro="Based on independent RFM")
Based on independent RFM the number of customers BBB should mail is 169,753 (33.95%). The response rate for the selected customers is predicted to be 15.71% or 26,663 buyers. The expected profit is $71,914. The mailing cost is estimated to be $168,055 with a ROME of 42.79%
bbb["mailto_sq"] = bbb.groupby("rfm_sq").buyer_yes.transform(
mailto, breakeven=breakeven
)
profit_sq, ROME_sq = perf_calc(bbb, mail="mailto_sq", intro="Based on sequential RFM")
Based on sequential RFM the number of customers BBB should mail is 172,252 (34.45%). The response rate for the selected customers is predicted to be 15.72% or 27,076 buyers. The expected profit is $73,154. The mailing cost is estimated to be $170,529 with a ROME of 42.9%
bbb["mailto_logit"] = bbb.resp_logit > breakeven
profit_lr, ROME_lr = perf_calc(
bbb, mail="mailto_logit", intro="Based on logistic regression"
)
Based on logistic regression the number of customers BBB should mail is 172,252 (34.45%). The response rate for the selected customers is predicted to be 15.72% or 27,076 buyers. The expected profit is $73,154. The mailing cost is estimated to be $170,529 with a ROME of 42.9%
# bbb[(bbb.mailto_sq != bbb.mailto_logit)]
bbb["resp_rfm_iq"] = bbb.groupby("rfm_iq")["buyer_yes"].transform(np.nanmean)
bbb["resp_rfm_sq"] = bbb.groupby("rfm_sq")["buyer_yes"].transform(np.nanmean)
bbb["breakeven"] = breakeven
dat = pd.DataFrame(
{
"name": ["No targeting", "Indep. RFM", "Seq. RFM", "Logit"],
"Profit": [profit_nt, profit_iq, profit_sq, profit_lr],
"ROME": [ROME_nt, ROME_iq, ROME_sq, ROME_lr],
}
)
plt.figure(figsize=(8, 5))
fig = sns.barplot(x="name", y="Profit", color="slateblue", data=dat)
fig.set(xlabel="", ylabel="Profit", title="Campaign profit")
for index, row in dat.iterrows():
fig.text(
row.name, row.Profit * 0.8, f"{row.Profit:,.0f}", ha="center", color="white"
)
plt.figure(figsize=(8, 5))
fig = sns.barplot(x="name", y="ROME", color="slateblue", data=dat)
fig.set(xlabel="", ylabel="ROME", title="Return on Marketing Expenditures (ROME)")
for index, row in dat.iterrows():
fig.text(
row.name,
row.ROME * 0.8,
f"{round((100*row.ROME), 2):,}%",
ha="center",
color="white",
)
bbb_train = bbb[train].copy()
bbb_train["resp_rfm_iq"] = bbb_train.groupby("rfm_iq")["buyer_yes"].transform(
np.nanmean
)
bbb_train["resp_rfm_sq"] = bbb_train.groupby("rfm_sq")["buyer_yes"].transform(
np.nanmean
)
rsm.calc_qnt(bbb_train, "buyer", "yes", "resp_rfm_sq")
bins | nr_obs | nr_resp | cum_obs | cum_prop | cum_resp | |
---|---|---|---|---|---|---|
9 | 10 | 4961 | 1035 | 4961 | 0.09922 | 1035 |
8 | 9 | 4793 | 745 | 9754 | 0.19508 | 1780 |
7 | 8 | 5231 | 674 | 14985 | 0.29970 | 2454 |
6 | 7 | 4827 | 512 | 19812 | 0.39624 | 2966 |
5 | 6 | 5179 | 448 | 24991 | 0.49982 | 3414 |
4 | 5 | 4958 | 344 | 29949 | 0.59898 | 3758 |
3 | 4 | 4749 | 264 | 34698 | 0.69396 | 4022 |
2 | 3 | 4998 | 243 | 39696 | 0.79392 | 4265 |
1 | 2 | 4979 | 173 | 44675 | 0.89350 | 4438 |
0 | 1 | 5325 | 84 | 50000 | 1.00000 | 4522 |
fig = rsm.lift_plot(bbb_train, "buyer", "yes", ["resp_rfm_iq", "resp_rfm_sq"])
fig = rsm.gains_plot(bbb_train, "buyer", "yes", ["resp_rfm_iq", "resp_rfm_sq"])
fig = rsm.profit_plot(
bbb_train, "buyer", "yes", ["resp_rfm_iq", "resp_rfm_sq"], cost=cost, margin=margin
)
fig = rsm.ROME_plot(
bbb_train, "buyer", "yes", ["resp_rfm_iq", "resp_rfm_sq"], cost=cost, margin=margin
)
TP, FP, TN, FN, contact = rsm.confusion(
bbb_train, "buyer", "yes", "resp_rfm_sq", cost=cost, margin=margin
)
print(f"TP: {TP}\nFP: {FP}\nTN: {TN}\nFN: {FN}")
TP: 2706 FP: 14509 TN: 30969 FN: 1816
evb = rsm.evalbin(
bbb_train, "buyer", "yes", ["resp_rfm_iq", "resp_rfm_sq"], cost=cost, margin=margin
)
evb
Type | predictor | TP | FP | TN | FN | total | TPR | TNR | precision | Fscore | accuracy | kappa | profit | index | ROME | contact | AUC | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | All | resp_rfm_iq | 2660 | 14275 | 31203 | 1862 | 50000 | 0.588 | 0.686 | 0.157 | 0.248 | 0.677 | 0.123 | 7174.35 | 0.981 | 0.428 | 0.339 | 0.693 |
1 | All | resp_rfm_sq | 2706 | 14509 | 30969 | 1816 | 50000 | 0.598 | 0.681 | 0.157 | 0.249 | 0.674 | 0.123 | 7311.15 | 1.000 | 0.429 | 0.344 | 0.694 |
Get the list of prospect to contact from the rest of the customer database
to_contact = bbb.loc[
(bbb.training == 0) & bbb.mailto_sq,
["acctnum", "buyer_yes", "resp_rfm_sq", "breakeven", "mailto_sq"],
]
to_contact.to_csv("rfm_sq_to_contact.csv")
to_contact
acctnum | buyer_yes | resp_rfm_sq | breakeven | mailto_sq | |
---|---|---|---|---|---|
50003 | U12745070 | NaN | 0.230978 | 0.11 | True |
50007 | U48503714 | NaN | 0.235457 | 0.11 | True |
50015 | U83794078 | NaN | 0.137109 | 0.11 | True |
50016 | U34212211 | NaN | 0.125364 | 0.11 | True |
50018 | U93468194 | NaN | 0.209476 | 0.11 | True |
... | ... | ... | ... | ... | ... |
549976 | U44928259 | NaN | 0.156156 | 0.11 | True |
549977 | U62952648 | NaN | 0.156156 | 0.11 | True |
549985 | U86074295 | NaN | 0.198980 | 0.11 | True |
549993 | U26301571 | NaN | 0.190566 | 0.11 | True |
549997 | U68727813 | NaN | 0.118976 | 0.11 | True |
172252 rows × 5 columns