In [26]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# Setting to make numbers easier to read on display
pd.options.display.float_format = '{:20.2f}'.format

# Show all columns on output
pd.set_option('display.max_columns', 999)
In [27]:
# Load CSVs
df = pd.read_csv(r"C:\Users\USER\Downloads\Maven_Cafe\Data\maven_cafe.csv", low_memory=False)
In [28]:
maven_cafe = df.copy()
In [29]:
print("Duplicates in df:", maven_cafe.duplicated().sum())
Duplicates in df: 0
In [30]:
maven_cafe
Out[30]:
customer_id event value Amount became_member_on gender age income offer_id offer_type difficulty reward duration channels time_in_days web email mobile social age_group income_group
0 0009655768c64bdeb2e877511632db8f offer viewed fafdcd668e3743c1bb461111dcafc2a4 NaN 2017-04-21 M 33.00 72000.00 fafdcd668e3743c1bb461111dcafc2a4 discount 10.00 2.00 10.00 ['web', 'email', 'mobile', 'social'] 22.50 1 1 1 1 Early Career Professionals Middle Income
1 0009655768c64bdeb2e877511632db8f offer received 5a8bc65990b245e5a138643cd4eb9837 NaN 2017-04-21 M 33.00 72000.00 5a8bc65990b245e5a138643cd4eb9837 informational 0.00 0.00 3.00 ['email', 'mobile', 'social'] 7.00 0 1 1 1 Early Career Professionals Middle Income
2 0009655768c64bdeb2e877511632db8f offer received f19421c1d4aa40978ebb69ca19b0e20d NaN 2017-04-21 M 33.00 72000.00 f19421c1d4aa40978ebb69ca19b0e20d bogo 5.00 5.00 5.00 ['web', 'email', 'mobile', 'social'] 17.00 1 1 1 1 Early Career Professionals Middle Income
3 0009655768c64bdeb2e877511632db8f transaction NaN 8.57 2017-04-21 M 33.00 72000.00 NaN NaN NaN NaN NaN NaN 17.25 0 0 0 0 Early Career Professionals Middle Income
4 0009655768c64bdeb2e877511632db8f offer viewed f19421c1d4aa40978ebb69ca19b0e20d NaN 2017-04-21 M 33.00 72000.00 f19421c1d4aa40978ebb69ca19b0e20d bogo 5.00 5.00 5.00 ['web', 'email', 'mobile', 'social'] 19.00 1 1 1 1 Early Career Professionals Middle Income
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
272383 ffff82501cea40309d5fdd7edcca4a07 offer completed fafdcd668e3743c1bb461111dcafc2a4 NaN 2016-11-25 F 45.00 62000.00 fafdcd668e3743c1bb461111dcafc2a4 discount 10.00 2.00 10.00 ['web', 'email', 'mobile', 'social'] 2.50 1 1 1 1 Mature Professionals Middle Income
272384 ffff82501cea40309d5fdd7edcca4a07 offer completed 9b98b8c7a33c4b65b9aebfe6a799e6d9 NaN 2016-11-25 F 45.00 62000.00 9b98b8c7a33c4b65b9aebfe6a799e6d9 bogo 5.00 5.00 7.00 ['web', 'email', 'mobile'] 21.00 1 1 1 0 Mature Professionals Middle Income
272385 ffff82501cea40309d5fdd7edcca4a07 offer viewed fafdcd668e3743c1bb461111dcafc2a4 NaN 2016-11-25 F 45.00 62000.00 fafdcd668e3743c1bb461111dcafc2a4 discount 10.00 2.00 10.00 ['web', 'email', 'mobile', 'social'] 0.25 1 1 1 1 Mature Professionals Middle Income
272386 ffff82501cea40309d5fdd7edcca4a07 offer completed 0b1e1539f2cc45b7b9fa7c272da2e1d7 NaN 2016-11-25 F 45.00 62000.00 0b1e1539f2cc45b7b9fa7c272da2e1d7 discount 20.00 5.00 10.00 ['web', 'email'] 8.25 1 1 0 0 Mature Professionals Middle Income
272387 ffff82501cea40309d5fdd7edcca4a07 transaction NaN 15.57 2016-11-25 F 45.00 62000.00 NaN NaN NaN NaN NaN NaN 16.00 0 0 0 0 Mature Professionals Middle Income

272388 rows × 21 columns

In [31]:
# Filter only transaction events
df_transactions = maven_cafe[maven_cafe['event'] == 'transaction']

# Get latest day in the dataset
latest_day = df_transactions['time_in_days'].max()

# Aggregate RFM
rfm_df = df_transactions.groupby('customer_id', as_index=False).agg(
    MonetaryValue=('Amount', 'sum'),
    Frequency=('time_in_days', 'count'),  # or .size()
    LastPurchase=('time_in_days', 'max')
)

# Calculate Recency
rfm_df['Recency'] = latest_day - rfm_df['LastPurchase']

# Optional: Drop LastPurchase if you don’t need it
rfm_df.drop(columns='LastPurchase', inplace=True)

rfm_df.head()
Out[31]:
customer_id MonetaryValue Frequency Recency
0 0009655768c64bdeb2e877511632db8f 127.60 8 0.75
1 0011e0d4e6b944f998e987f904e8c1e5 79.46 5 2.50
2 0020c2b971eb4e9188eac86d93036a77 196.86 8 0.25
3 0020ccbbb6d84e358d3414a3ff76cffd 154.05 12 1.75
4 003d66b6608740288d6cc97a6903f4f0 48.34 18 0.75
In [32]:
plt.figure(figsize=(15, 5))

plt.subplot(1, 3, 1)
plt.hist(rfm_df['MonetaryValue'], bins=10, color='skyblue', edgecolor='black')
plt.title('Monetary Value Distribution')
plt.xlabel('Monetary Value')
plt.ylabel('Count')

plt.subplot(1, 3, 2)
plt.hist(rfm_df['Frequency'], bins=10, color='lightgreen', edgecolor='black')
plt.title('Frequency Distribution')
plt.xlabel('Frequency')
plt.ylabel('Count')

plt.subplot(1, 3, 3)
plt.hist(rfm_df['Recency'], bins=20, color='salmon', edgecolor='black')
plt.title('Recency Distribution')
plt.xlabel('Recency')
plt.ylabel('Count')

plt.tight_layout()
plt.show()
No description has been provided for this image
In [33]:
plt.figure(figsize=(15, 5))

plt.subplot(1, 3, 1)
sns.boxplot(data=rfm_df['MonetaryValue'], color='skyblue')
plt.title('Monetary Value Boxplot')
plt.xlabel('Monetary Value')

plt.subplot(1, 3, 2)
sns.boxplot(data=rfm_df['Frequency'], color='lightgreen')
plt.title('Frequency Boxplot')
plt.xlabel('Frequency')

plt.subplot(1, 3, 3)
sns.boxplot(data=rfm_df['Recency'], color='salmon')
plt.title('Recency Boxplot')
plt.xlabel('Recency')

plt.tight_layout()
plt.show()
No description has been provided for this image
In [34]:
M_Q1 = rfm_df["MonetaryValue"].quantile(0.25)
M_Q3 = rfm_df["MonetaryValue"].quantile(0.75)
M_IQR = M_Q3 - M_Q1

monetary_outliers_df = rfm_df[(rfm_df["MonetaryValue"] > (M_Q3 + 1.5 * M_IQR)) | (rfm_df["MonetaryValue"] < (M_Q1 - 1.5 * M_IQR))].copy()

monetary_outliers_df.describe()
Out[34]:
MonetaryValue Frequency Recency
count 470.00 470.00 470.00
mean 630.92 12.52 2.47
std 250.01 5.29 2.30
min 355.68 2.00 0.00
25% 405.56 9.00 0.75
50% 560.86 12.00 1.75
75% 808.53 16.00 3.50
max 1608.69 31.00 12.75
In [35]:
F_Q1 = rfm_df['Frequency'].quantile(0.25)
F_Q3 = rfm_df['Frequency'].quantile(0.75)
F_IQR = F_Q3 - F_Q1

frequency_outliers_df = rfm_df[(rfm_df['Frequency'] > (F_Q3 + 1.5 * F_IQR)) | (rfm_df['Frequency'] < (F_Q1 - 1.5 * F_IQR))].copy()

frequency_outliers_df.describe()
Out[35]:
MonetaryValue Frequency Recency
count 399.00 399.00 399.00
mean 157.54 23.13 1.44
std 145.61 2.47 1.40
min 38.34 21.00 0.00
25% 70.39 21.00 0.50
50% 89.21 22.00 1.00
75% 189.12 24.00 2.00
max 1040.98 36.00 8.25
In [36]:
F_Q1 = rfm_df['Recency'].quantile(0.25)
F_Q3 = rfm_df['Recency'].quantile(0.75)
F_IQR = F_Q3 - F_Q1

recency_outliers_df = rfm_df[(rfm_df['Recency'] > (F_Q3 + 1.5 * F_IQR)) | (rfm_df['Recency'] < (F_Q1 - 1.5 * F_IQR))].copy()

recency_outliers_df.describe()
Out[36]:
MonetaryValue Frequency Recency
count 578.00 578.00 578.00
mean 55.84 3.31 14.27
std 72.32 2.07 2.99
min 0.28 1.00 11.25
25% 17.95 2.00 12.25
50% 40.70 3.00 13.50
75% 72.05 4.00 15.44
max 946.98 15.00 29.25
In [37]:
non_outliers_df = rfm_df[
    (~rfm_df.index.isin(monetary_outliers_df.index)) &
    (~rfm_df.index.isin(frequency_outliers_df.index)) &
    (~rfm_df.index.isin(recency_outliers_df.index))
]
non_outliers_df.describe()
Out[37]:
MonetaryValue Frequency Recency
count 13084.00 13084.00 13084.00
mean 104.25 8.24 3.13
std 80.27 4.42 2.59
min 0.15 1.00 0.00
25% 33.14 5.00 1.00
50% 89.34 7.00 2.50
75% 157.29 11.00 4.50
max 355.49 20.00 11.00

🟢 MonetaryValue Typical spending:

IQR (25th–75th percentile): $33.14–$157.29 (most customers fall here).

Median (50th percentile): $89.34, close to the mean ($104.25), suggesting a mild right skew (higher spenders pull the mean up slightly).

Max ($355.49) is only ~2.2x the 75th percentile, so no extreme outliers.

Interpretation: Healthy spending distribution. The skew is manageable; no need for log transformation unless modeling requires normality.

🔵 Frequency Typical behavior:

Median: 7 purchases, mean 8.24 (slight right skew due to high-frequency buyers).

IQR: 5–11 purchases (50% of customers).

Max (20) is high but reasonable—likely loyal/VIP customers (e.g., top 25% buy 11+ times).

Interpretation: Strong repeat purchase behavior. Segment customers by frequency (e.g., 5–7 = regular, 11+ = VIP).

🔴 Recency Key metrics:

Median: 2.5 days, mean 3.13 days (most purchased very recently).

IQR: 1–4.5 days (75% purchased within the last 4.5 days).

Max (11 days): Even the least recent customer is highly active.

Interpretation: Exceptional engagement. Likely a high-frequency business (e.g., grocery, subscription). Monitor for upward shifts in median recency (could signal churn risk).

Actionable Insights MonetaryValue:

The mild skew is acceptable, but consider segmenting high spenders (top 25%: >$157) for targeted promotions.

Frequency:

Reward VIPs (11+ purchases) to retain them.

Encourage 5–7x buyers to reach VIP tiers (e.g., loyalty programs).

Recency:

Re-engage the 75th percentile (4.5+ days since last purchase)—they’re still active but may need reminders.

Investigate 0-day recency (min=0): Are these same-day purchases or data artifacts?

To be reqorous on outlier detection i used seaborn boxplots

In [38]:
plt.figure(figsize=(15, 5))

plt.subplot(1, 3, 1)
sns.boxplot(data=non_outliers_df['MonetaryValue'], color='skyblue')
plt.title('Monetary Value Boxplot')
plt.xlabel('Monetary Value')

plt.subplot(1, 3, 2)
sns.boxplot(data=non_outliers_df['Frequency'], color='lightgreen')
plt.title('Frequency Boxplot')
plt.xlabel('Frequency')

plt.subplot(1, 3, 3)
sns.boxplot(data=non_outliers_df['Recency'], color='salmon')
plt.title('Recency Boxplot')
plt.xlabel('Recency')

plt.tight_layout()
plt.show()
No description has been provided for this image

📦 Boxplot Summary (Post-Outlier Filtering) 💰 Monetary Value Outliers above ~350 still exist, but far fewer — looks like you've successfully filtered the extreme values (like that 946.98 earlier).

The box is centered around 90, and the spread looks reasonable.

✅ Much improved — this distribution is now balanced.

🔁 Frequency No visible outliers (✔️), max value is about 20, and most data is between 5–12.

Very clean distribution.

✅ Good job — outliers properly handled.

⏱️ Recency A few outliers still appear above 10.

Majority of customers made purchases within 0–5 days.

You might want to tighten the outlier boundary, depending on how strict you want your cleaning to be.

In [39]:
fig = plt.figure(figsize=(8, 8))

ax = fig.add_subplot(projection="3d")

scatter = ax.scatter(non_outliers_df["MonetaryValue"], non_outliers_df["Frequency"], non_outliers_df["Recency"])

ax.set_xlabel('Monetary Value')
ax.set_ylabel('Frequency')
ax.set_zlabel('Recency')

ax.set_title('3D Scatter Plot of Customer Data')

plt.show()
No description has been provided for this image

Standard scaling transforms the features of your data to have a mean of 0 and a standard deviation of 1, ensuring that each feature contributes equally to the analysis.

$$ z = \frac{x - \mu}{\sigma} $$

Where:

  • $z$ is the standardized value,
  • $x$ is the original value,
  • $\mu$ is the mean of the feature,
  • $\sigma$ is the standard deviation of the feature.

As a caveat this assumes that our data is normally distribution but it's a fair enough assumption

In [40]:
scaler = StandardScaler()

scaled_data = scaler.fit_transform(non_outliers_df[["MonetaryValue", "Frequency", "Recency"]])

scaled_data
Out[40]:
array([[ 0.29094268, -0.05371739, -0.9156766 ],
       [-0.30883151, -0.73322599, -0.24098381],
       [ 1.15385004, -0.05371739, -1.10844597],
       ...,
       [-1.1474439 , -0.73322599,  1.01201709],
       [-0.19209108,  0.85229407, -0.62652255],
       [ 1.51777617,  1.53180267, -0.14459913]], shape=(13084, 3))

This is in numpy array but im going to make data easy to deal by converting to data frame

In [41]:
scaled_data_df = pd.DataFrame(scaled_data, index=non_outliers_df.index, columns=("MonetaryValue", "Frequency", "Recency"))

scaled_data_df
Out[41]:
MonetaryValue Frequency Recency
0 0.29 -0.05 -0.92
1 -0.31 -0.73 -0.24
2 1.15 -0.05 -1.11
3 0.62 0.85 -0.53
4 -0.70 2.21 -0.92
... ... ... ...
14486 2.33 0.63 -0.72
14488 -0.93 -0.51 -0.92
14489 -1.15 -0.73 1.01
14490 -0.19 0.85 -0.63
14491 1.52 1.53 -0.14

13084 rows × 3 columns

In [42]:
fig = plt.figure(figsize=(12, 12))

ax = fig.add_subplot(projection="3d")

scatter = ax.scatter(scaled_data_df["MonetaryValue"], scaled_data_df["Frequency"], scaled_data_df["Recency"])

ax.set_xlabel('Monetary Value')
ax.set_ylabel('Frequency')
ax.set_zlabel('Recency')

ax.set_title('3D Scatter Plot of Customer Data')

plt.show()
No description has been provided for this image

K means clustering

to determine how many clusters we're going to use elbow mwthod to determine no. of clusters

In [43]:
max_k = 12

inertia = []
silhoutte_scores = []
k_values = range(2, max_k + 1)

for k in k_values:

    kmeans = KMeans(n_clusters=k, random_state=42, max_iter=1000)

    cluster_labels = kmeans.fit_predict(scaled_data_df)

    sil_score = silhouette_score(scaled_data_df, cluster_labels)

    silhoutte_scores.append(sil_score)

    inertia.append(kmeans.inertia_)

plt.figure(figsize=(14, 6))

plt.subplot(1, 2, 1)
plt.plot(k_values, inertia, marker='o')
plt.title('KMeans Inertia for Different Values of k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.xticks(k_values)
plt.grid(True)

plt.subplot(1, 2, 2)
plt.plot(k_values, silhoutte_scores, marker='o', color='orange')
plt.title('Silhouette Scores for Different Values of k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Silhouette Score')
plt.xticks(k_values)
plt.grid(True)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [44]:
import numpy as np

# Prepare a summary DataFrame for the optimal k selection metrics
summary_df = pd.DataFrame({
    'k': list(k_values),
    'Inertia': [int(round(val)) for val in inertia],
    'Silhouette': [round(val, 2) for val in silhoutte_scores]
})

# Format numbers for display
summary_df['Inertia'] = summary_df['Inertia'].apply(lambda x: f"{x:,}")

# Pivot for pretty printing
summary_df_display = summary_df.set_index('k').T
summary_df_display.index = ['Inertia', 'Silhouette']
summary_df_display.columns = [f'k={k}' for k in summary_df_display.columns]

print("Example Output (Optimal k):")
display(summary_df_display)
Example Output (Optimal k):
k=2 k=3 k=4 k=5 k=6 k=7 k=8 k=9 k=10 k=11 k=12
Inertia 24,493 17,874 13,842 11,655 10,262 9,628 8,872 7,839 7,307 6,838 6,398
Silhouette 0.33 0.32 0.34 0.32 0.32 0.30 0.28 0.28 0.28 0.27 0.28

Silhouette Score¶

$$ s(i) = \frac{b(i) - a(i)}{\max(a(i), b(i))} $$

Where:

  • $s(i)$ is the silhouette score for a single sample $i$
  • $a(i)$ is the average distance between $i$ and all other points in the same cluster,
  • $b(i)$ is the minimum average distance between $i$ and all points in the nearest cluster to which $i$ does not belong.

The silhoutte score ranges between $[-1, 1]$, a higher value indicates more distinct clusters.

  1. Inertia Analysis (Elbow Method)

Inertia measures the sum of squared distances of data points to their nearest cluster center. Lower inertia indicates tighter clusters.

Trend:

Inertia decreases sharply from k=2 (24,493) to k=4 (13,842), then more gradually afterward.

The "elbow point" (where the rate of decrease slows) occurs around k=4 or k=5.

Key Insight: The elbow suggests diminishing returns beyond k=4 or k=5.

  1. Silhouette Score Analysis

Silhouette Score quantifies cluster separation (higher = better-defined clusters). Scores range from -1 to +1.

Trend:

The highest score is 0.34 at k=4, followed by 0.33 at k=2 and 0.32 at k=3/k=5/k=6.

Scores decline steadily after k=4.

Key Insight: k=4 maximizes cluster quality according to silhouette scores.

  1. Combined Decision: Optimal k

Metric k=2 k=3 k=4 k=5 k=6 ... Inertia 24,493 17,874 13,842 11,655 10,262 ... Silhouette 0.33 0.32 0.34 0.32 0.32 ... Recommendation: k=4 is optimal because:

It has the highest silhouette score (0.34).

The elbow point in inertia occurs here, indicating a good trade-off between compactness and complexity.

Beyond k=4, gains in inertia reduction are marginal, and silhouette scores degrade

performing K-Means clustering on your dataset.

In [45]:
kmeans = KMeans(n_clusters=4, random_state=42, max_iter=1000)

cluster_labels = kmeans.fit_predict(scaled_data_df)

cluster_labels
Out[45]:
array([3, 3, 2, ..., 1, 0, 2], shape=(13084,), dtype=int32)
In [46]:
non_outliers_df["Cluster"] = cluster_labels

non_outliers_df
C:\Users\USER\AppData\Local\Temp\ipykernel_20060\3577770544.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Out[46]:
customer_id MonetaryValue Frequency Recency Cluster
0 0009655768c64bdeb2e877511632db8f 127.60 8 0.75 3
1 0011e0d4e6b944f998e987f904e8c1e5 79.46 5 2.50 3
2 0020c2b971eb4e9188eac86d93036a77 196.86 8 0.25 2
3 0020ccbbb6d84e358d3414a3ff76cffd 154.05 12 1.75 2
4 003d66b6608740288d6cc97a6903f4f0 48.34 18 0.75 0
... ... ... ... ... ...
14486 fff29fb549084123bd046dbc5ceb4faa 291.47 11 1.25 2
14488 fff7576017104bcc8677a8d63322b5e1 29.94 6 0.75 3
14489 fff8957ea8b240a6b5e634b6ee8eafcf 12.15 5 5.75 1
14490 fffad4f4828548d1b5583907f2e9906b 88.83 12 1.50 0
14491 ffff82501cea40309d5fdd7edcca4a07 226.07 15 2.75 2

13084 rows × 5 columns

In [47]:
cluster_colors = {0: '#6f1d1b', 
                  1: '#99582a', 
                  2: '#432818',  
                  3: '#bb9457'}  

colors = non_outliers_df['Cluster'].map(cluster_colors)

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(projection='3d')

scatter = ax.scatter(non_outliers_df['MonetaryValue'], 
                     non_outliers_df['Frequency'], 
                     non_outliers_df['Recency'], 
                     c=colors,  # Use mapped solid colors
                     marker='o')

ax.set_xlabel('Monetary Value')
ax.set_ylabel('Frequency')
ax.set_zlabel('Recency')

ax.set_title('3D Scatter Plot of Customer Data by Cluster')

plt.show()
No description has been provided for this image
In [48]:
import plotly.express as px

# Convert cluster_colors keys to strings for plotly compatibility
color_discrete_map = {str(k): v for k, v in cluster_colors.items()}

fig = px.scatter_3d(
    non_outliers_df,
    x="MonetaryValue",
    y="Frequency",
    z="Recency",
    color="Cluster",
    color_discrete_map=color_discrete_map,
    hover_data=["customer_id", "MonetaryValue", "Frequency", "Recency"],
    title="Interactive 3D Scatter Plot of Customer Data by Cluster"
)
fig.update_traces(marker=dict(size=5))
fig.show()
In [49]:
plt.figure(figsize=(12, 18))

plt.subplot(3, 1, 1)
sns.violinplot(x=non_outliers_df['Cluster'], y=non_outliers_df['MonetaryValue'], palette=cluster_colors, hue=non_outliers_df["Cluster"])
sns.violinplot(y=non_outliers_df['MonetaryValue'], color='gray', linewidth=1.0)
plt.title('Monetary Value by Cluster')
plt.ylabel('Monetary Value')

plt.subplot(3, 1, 2)
sns.violinplot(x=non_outliers_df['Cluster'], y=non_outliers_df['Frequency'], palette=cluster_colors, hue=non_outliers_df["Cluster"])
sns.violinplot(y=non_outliers_df['Frequency'], color='gray', linewidth=1.0)
plt.title('Frequency by Cluster')
plt.ylabel('Frequency')


plt.subplot(3, 1, 3)
sns.violinplot(x=non_outliers_df['Cluster'], y=non_outliers_df['Recency'], palette=cluster_colors, hue=non_outliers_df["Cluster"])
sns.violinplot(y=non_outliers_df['Recency'], color='gray', linewidth=1.0)
plt.title('Recency by Cluster')
plt.ylabel('Recency')

plt.tight_layout()
plt.show()
No description has been provided for this image

🔵 Cluster 0 – "Champions" High frequency, high monetary, low recency Interpretation: These customers purchase frequently, spend a lot, and have bought recently. They’re your top-tier loyal customers.

📌 Action:

Prioritize them for exclusive offers, early access to new products, and VIP programs.

Keep them engaged through personalized emails and thank-you gestures.

🟠 Cluster 1 – "Slipping Away" Medium-low frequency, low monetary, high recency Interpretation: These customers haven’t purchased in a while. Their spending and frequency are low compared to others. They're at risk of churning.

📌 Action:

Win-back campaigns, reminders, or time-sensitive discounts.

Ask for feedback: “We miss you!” emails to understand their drop-off.

🟢 Cluster 2 – "Potential Loyalists" Mid-high monetary, decent frequency, low recency Interpretation: These customers made recent purchases, spend a good amount, and could become your next champions if nurtured properly.

📌 Action:

Encourage repeat purchases through loyalty points, upselling, and next-best offers.

Offer personalized experiences to build strong habits.

🔴 Cluster 3 – "New & Low-Engaged" Low frequency, low monetary, recent transactions Interpretation: New or low-engagement users. They’ve recently interacted but not consistently.

📌 Action:

Provide onboarding offers and a warm welcome.

Use email flows, introductory discounts, or “How to get the best from us” content.

Customer Metrics Distribution by Cluster¶

The violin plots above visualize the distribution of three key customer metrics—Monetary Value, Frequency, and Recency—across the identified customer clusters.

  • Monetary Value: Shows how much customers in each cluster spend on average. Wider sections indicate more customers at that spending level. Some clusters have higher average spenders (e.g., "DELIGHT", "PAMPER"), while others are lower.

  • Frequency: Indicates how often customers in each cluster make purchases. Clusters like "DELIGHT" and "UPSELL" show higher purchase frequencies, suggesting highly engaged customers.

  • Recency: Measures how recently customers made a purchase. Lower values mean more recent activity. Clusters with low recency (e.g., "RETAIN", "DELIGHT") represent customers who have purchased very recently.

Interpretation:

  • Each cluster has a distinct profile, combining spending, engagement, and recency.
  • Outlier clusters (e.g., "DELIGHT", "PAMPER", "UPSELL") are separated and show extreme values in one or more metrics.
  • The distribution shapes help identify which clusters are high-value, frequent, or recently active, supporting targeted marketing strategies.

Yes, there is overlap between monetary and frequency outliers.

  • The variable overlap_indices contains the indices of customers who are outliers in both monetary value and frequency.
  • The DataFrame monetary_and_frequency_outliers shows these overlapping outliers.

There are 35 such customers, as seen in monetary_and_frequency_outliers.

Are the overlaps significant?

  • Monetary & Frequency overlap: 35 customers (2.48% of all outliers)
  • Monetary & Recency overlap: 4 customers (0.28% of all outliers)
  • Frequency & Recency overlap: 0 customers (0.00% of all outliers)
  • Total outliers: 1,412

Interpretation:

  • The overlaps between outlier groups are very small relative to the total number of outliers.
  • Only a small fraction of customers are extreme in more than one dimension.
  • This suggests that most outliers are unique to a single metric, and multi-metric outliers are not a major driver in the dataset.
  • The low overlap means that segmentation and interventions can be tailored to specific behaviors (spending, frequency, or recency) rather than broad, multi-dimensional outlier profiles.

This code is labeling outliers into distinct custom groups (clusters) based on whether they are outliers in Monetary, Frequency, or Recency, or some combination — and combining them all into a single DataFrame

In [50]:
overlap_indices = monetary_outliers_df.index.intersection(frequency_outliers_df.index)
recency_overlap_indices = recency_outliers_df.index.intersection(monetary_outliers_df.index.union(frequency_outliers_df.index))

monetary_only_outliers = monetary_outliers_df.drop(overlap_indices)
frequency_only_outliers = frequency_outliers_df.drop(overlap_indices)
recency_only_outliers = recency_outliers_df.drop(recency_overlap_indices)
monetary_and_frequency_outliers = monetary_outliers_df.loc[overlap_indices]

monetary_only_outliers["Cluster"] = -1
frequency_only_outliers["Cluster"] = -2
recency_only_outliers["Cluster"] = -3
monetary_and_frequency_outliers["Cluster"] = -4

outlier_clusters_df = pd.concat([monetary_only_outliers, frequency_only_outliers,recency_only_outliers, monetary_and_frequency_outliers])

outlier_clusters_df
Out[50]:
customer_id MonetaryValue Frequency Recency Cluster
10 00715b6e55c3431cb56ff7307eb19675 375.12 15 0.50 -1
43 00e9f403afa641889cd034ee7c7ca6e9 614.59 17 0.25 -1
60 01443a2afce54939a323c978f467c540 417.93 16 1.75 -1
76 017febbe52e64ac19cf28cf0d44386e4 1003.41 10 7.00 -1
87 019ed95d987446e3947da0246cdab831 515.09 8 6.25 -1
... ... ... ... ... ...
10972 c06cbc22ae8d4ce39d8c2cdee4272ace 460.52 22 0.75 -4
11925 d237d3afa2cf4463a07e06b9a8fcba0c 400.87 26 2.75 -4
13668 f130699fa7934e2ebbfa0a3f976d645c 405.54 23 2.75 -4
13881 f5205b328b564bf0a55952a8dc19ce6a 395.59 24 0.00 -4
14430 fee2d799672d4f81bfa3237207290f79 361.49 21 4.00 -4

1408 rows × 5 columns

In [51]:
# Total number of outliers (from outlier_clusters_df)
total_outliers = len(outlier_clusters_df)

# Overlap counts
monetary_frequency_overlap = 35
monetary_recency_overlap = 4
frequency_recency_overlap = 0

# Calculate percentages
mf_pct = (monetary_frequency_overlap / total_outliers) * 100
mr_pct = (monetary_recency_overlap / total_outliers) * 100
fr_pct = (frequency_recency_overlap / total_outliers) * 100

print(f"Monetary & Frequency overlap: {monetary_frequency_overlap} ({mf_pct:.2f}%)")
print(f"Monetary & Recency overlap: {monetary_recency_overlap} ({mr_pct:.2f}%)")
print(f"Frequency & Recency overlap: {frequency_recency_overlap} ({fr_pct:.2f}%)")
print(f"Total outliers: {total_outliers}")
Monetary & Frequency overlap: 35 (2.49%)
Monetary & Recency overlap: 4 (0.28%)
Frequency & Recency overlap: 0 (0.00%)
Total outliers: 1408
In [52]:
print("Total outlier records:", outlier_clusters_df.shape[0])
print("Unique records:", outlier_clusters_df.index.nunique())
Total outlier records: 1408
Unique records: 1408
In [53]:
outlier_clusters_df["Cluster"].value_counts()
Out[53]:
Cluster
-3    574
-1    435
-2    364
-4     35
Name: count, dtype: int64
In [54]:
cluster_colors = {-1: '#9467bd',
                  -2: '#8c564b',
                  -3: '#e377c2',
                  -4: '#6f1d1b'}

plt.figure(figsize=(12, 18))

plt.subplot(3, 1, 1)
sns.violinplot(x=outlier_clusters_df['Cluster'], y=outlier_clusters_df['MonetaryValue'], palette=cluster_colors, hue=outlier_clusters_df["Cluster"])
sns.violinplot(y=outlier_clusters_df['MonetaryValue'], color='gray', linewidth=1.0)
plt.title('Monetary Value by Cluster')
plt.ylabel('Monetary Value')

plt.subplot(3, 1, 2)
sns.violinplot(x=outlier_clusters_df['Cluster'], y=outlier_clusters_df['Frequency'], palette=cluster_colors, hue=outlier_clusters_df["Cluster"])
sns.violinplot(y=outlier_clusters_df['Frequency'], color='gray', linewidth=1.0)
plt.title('Frequency by Cluster')
plt.ylabel('Frequency')

plt.subplot(3, 1, 3)
sns.violinplot(x=outlier_clusters_df['Cluster'], y=outlier_clusters_df['Recency'], palette=cluster_colors, hue=outlier_clusters_df["Cluster"])
sns.violinplot(y=outlier_clusters_df['Recency'], color='gray', linewidth=1.0)
plt.title('Recency by Cluster')
plt.ylabel('Recency')

plt.tight_layout()
plt.show()
No description has been provided for this image

🔴 Cluster -4: DELIGHT 🧠 Characteristics:

Extremely high Monetary value — the highest of all clusters.

Very high Frequency — they buy often.

Very low Recency — meaning they purchased very recently. These are your best customers: frequent, recent, and high-spending.

💡 Strategy – “Delight”

VIP Programs – Invite-only clubs or premium memberships.

Surprise Gifts – Unannounced freebies or bonus points.

First Access – Let them preview new products or features.

Personal Concierge – If applicable, assign relationship managers.

🌸 Cluster -3: RE-ENGAGE 🧠 Characteristics:

Low Monetary, Low Frequency, and high Recency.

They barely buy, don’t spend much, and haven't purchased recently. These users are the least engaged and at high churn risk.

💡 Strategy – “Re-Engage”

Win-back Campaigns – “We miss you” emails or SMS with strong incentives.

Reactivation Offers – Deep discounts or free shipping to lure them back.

Feedback Loop – Ask them why they left or what they need.

☕ Cluster -2: UPSELL 🧠 Characteristics:

Low Monetary but high Frequency.

Recency is also very low (they purchase frequently and recently). These are habitual but low-value spenders.

💡 Strategy – “Upsell”

Bundling – Promote combos and packages to raise spend per visit.

Tiered Loyalty – Unlock rewards as they spend more.

Premium Add-ons – Offer product upgrades or extras at checkout.

🟣 Cluster -1: PAMPER 🧠 Characteristics:

High Monetary, medium Frequency, low Recency.

They don’t buy that often but when they do, they spend big, and they’ve done so recently. These are valuable infrequent buyers.

💡 Strategy – “Pamper”

Personalized Offers – Based on their preferences or past purchases.

Luxury Treatment – Priority support, curated experiences.

Special Event Invites – Like tasting events, webinars, or launches.

In [55]:
cluster_labels = {
    0: "Loyal Core",             
    1: "Sleeping Buyers",        
    2: "First-Time Buyers",      
    3: "Frequent Shoppers",      
    -1: "High Spenders",         
    -2: "Potential Upgraders",   
    -3: "VIP Customers",         
    -4: "Win-Back Targets"       
}
In [56]:
full_clustering_df = pd.concat([non_outliers_df, outlier_clusters_df])

full_clustering_df
Out[56]:
customer_id MonetaryValue Frequency Recency Cluster
0 0009655768c64bdeb2e877511632db8f 127.60 8 0.75 3
1 0011e0d4e6b944f998e987f904e8c1e5 79.46 5 2.50 3
2 0020c2b971eb4e9188eac86d93036a77 196.86 8 0.25 2
3 0020ccbbb6d84e358d3414a3ff76cffd 154.05 12 1.75 2
4 003d66b6608740288d6cc97a6903f4f0 48.34 18 0.75 0
... ... ... ... ... ...
10972 c06cbc22ae8d4ce39d8c2cdee4272ace 460.52 22 0.75 -4
11925 d237d3afa2cf4463a07e06b9a8fcba0c 400.87 26 2.75 -4
13668 f130699fa7934e2ebbfa0a3f976d645c 405.54 23 2.75 -4
13881 f5205b328b564bf0a55952a8dc19ce6a 395.59 24 0.00 -4
14430 fee2d799672d4f81bfa3237207290f79 361.49 21 4.00 -4

14492 rows × 5 columns

In [57]:
full_clustering_df["ClusterLabel"] = full_clustering_df["Cluster"].map(cluster_labels)

full_clustering_df
Out[57]:
customer_id MonetaryValue Frequency Recency Cluster ClusterLabel
0 0009655768c64bdeb2e877511632db8f 127.60 8 0.75 3 Frequent Shoppers
1 0011e0d4e6b944f998e987f904e8c1e5 79.46 5 2.50 3 Frequent Shoppers
2 0020c2b971eb4e9188eac86d93036a77 196.86 8 0.25 2 First-Time Buyers
3 0020ccbbb6d84e358d3414a3ff76cffd 154.05 12 1.75 2 First-Time Buyers
4 003d66b6608740288d6cc97a6903f4f0 48.34 18 0.75 0 Loyal Core
... ... ... ... ... ... ...
10972 c06cbc22ae8d4ce39d8c2cdee4272ace 460.52 22 0.75 -4 Win-Back Targets
11925 d237d3afa2cf4463a07e06b9a8fcba0c 400.87 26 2.75 -4 Win-Back Targets
13668 f130699fa7934e2ebbfa0a3f976d645c 405.54 23 2.75 -4 Win-Back Targets
13881 f5205b328b564bf0a55952a8dc19ce6a 395.59 24 0.00 -4 Win-Back Targets
14430 fee2d799672d4f81bfa3237207290f79 361.49 21 4.00 -4 Win-Back Targets

14492 rows × 6 columns

In [58]:
cluster_counts = full_clustering_df['ClusterLabel'].value_counts()
full_clustering_df["MonetaryValue per 100 pounds"] = full_clustering_df["MonetaryValue"] / 100.00
feature_means = full_clustering_df.groupby('ClusterLabel')[['Recency', 'Frequency', 'MonetaryValue per 100 pounds']].mean()

fig, ax1 = plt.subplots(figsize=(15, 8))

sns.barplot(x=cluster_counts.index, y=cluster_counts.values, ax=ax1, palette='viridis', hue=cluster_counts.index)
ax1.set_ylabel('Number of Customers', color='b')
ax1.set_title('Cluster Distribution with Average Feature Values')

ax2 = ax1.twinx()

sns.lineplot(data=feature_means, ax=ax2, palette='Set2', marker='o')
ax2.set_ylabel('Average Value', color='g')

plt.show()
No description has been provided for this image

To join full_clustering_df with maven_cafe, use a left merge on customer_id. This will add the cluster labels to every transaction/event in maven_cafe.

In [59]:
full_export_df = maven_cafe.merge(
    full_clustering_df[['customer_id', 'Cluster', 'ClusterLabel']],
    on='customer_id',
    how='left'
)
full_export_df.to_csv('maven_cafe_with_clusters.csv', index=False)
In [60]:
full_export_df
Out[60]:
customer_id event value Amount became_member_on gender age income offer_id offer_type difficulty reward duration channels time_in_days web email mobile social age_group income_group Cluster ClusterLabel
0 0009655768c64bdeb2e877511632db8f offer viewed fafdcd668e3743c1bb461111dcafc2a4 NaN 2017-04-21 M 33.00 72000.00 fafdcd668e3743c1bb461111dcafc2a4 discount 10.00 2.00 10.00 ['web', 'email', 'mobile', 'social'] 22.50 1 1 1 1 Early Career Professionals Middle Income 3.00 Frequent Shoppers
1 0009655768c64bdeb2e877511632db8f offer received 5a8bc65990b245e5a138643cd4eb9837 NaN 2017-04-21 M 33.00 72000.00 5a8bc65990b245e5a138643cd4eb9837 informational 0.00 0.00 3.00 ['email', 'mobile', 'social'] 7.00 0 1 1 1 Early Career Professionals Middle Income 3.00 Frequent Shoppers
2 0009655768c64bdeb2e877511632db8f offer received f19421c1d4aa40978ebb69ca19b0e20d NaN 2017-04-21 M 33.00 72000.00 f19421c1d4aa40978ebb69ca19b0e20d bogo 5.00 5.00 5.00 ['web', 'email', 'mobile', 'social'] 17.00 1 1 1 1 Early Career Professionals Middle Income 3.00 Frequent Shoppers
3 0009655768c64bdeb2e877511632db8f transaction NaN 8.57 2017-04-21 M 33.00 72000.00 NaN NaN NaN NaN NaN NaN 17.25 0 0 0 0 Early Career Professionals Middle Income 3.00 Frequent Shoppers
4 0009655768c64bdeb2e877511632db8f offer viewed f19421c1d4aa40978ebb69ca19b0e20d NaN 2017-04-21 M 33.00 72000.00 f19421c1d4aa40978ebb69ca19b0e20d bogo 5.00 5.00 5.00 ['web', 'email', 'mobile', 'social'] 19.00 1 1 1 1 Early Career Professionals Middle Income 3.00 Frequent Shoppers
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
272383 ffff82501cea40309d5fdd7edcca4a07 offer completed fafdcd668e3743c1bb461111dcafc2a4 NaN 2016-11-25 F 45.00 62000.00 fafdcd668e3743c1bb461111dcafc2a4 discount 10.00 2.00 10.00 ['web', 'email', 'mobile', 'social'] 2.50 1 1 1 1 Mature Professionals Middle Income 2.00 First-Time Buyers
272384 ffff82501cea40309d5fdd7edcca4a07 offer completed 9b98b8c7a33c4b65b9aebfe6a799e6d9 NaN 2016-11-25 F 45.00 62000.00 9b98b8c7a33c4b65b9aebfe6a799e6d9 bogo 5.00 5.00 7.00 ['web', 'email', 'mobile'] 21.00 1 1 1 0 Mature Professionals Middle Income 2.00 First-Time Buyers
272385 ffff82501cea40309d5fdd7edcca4a07 offer viewed fafdcd668e3743c1bb461111dcafc2a4 NaN 2016-11-25 F 45.00 62000.00 fafdcd668e3743c1bb461111dcafc2a4 discount 10.00 2.00 10.00 ['web', 'email', 'mobile', 'social'] 0.25 1 1 1 1 Mature Professionals Middle Income 2.00 First-Time Buyers
272386 ffff82501cea40309d5fdd7edcca4a07 offer completed 0b1e1539f2cc45b7b9fa7c272da2e1d7 NaN 2016-11-25 F 45.00 62000.00 0b1e1539f2cc45b7b9fa7c272da2e1d7 discount 20.00 5.00 10.00 ['web', 'email'] 8.25 1 1 0 0 Mature Professionals Middle Income 2.00 First-Time Buyers
272387 ffff82501cea40309d5fdd7edcca4a07 transaction NaN 15.57 2016-11-25 F 45.00 62000.00 NaN NaN NaN NaN NaN NaN 16.00 0 0 0 0 Mature Professionals Middle Income 2.00 First-Time Buyers

272388 rows × 23 columns

In [61]:
maven_cafe
Out[61]:
customer_id event value Amount became_member_on gender age income offer_id offer_type difficulty reward duration channels time_in_days web email mobile social age_group income_group
0 0009655768c64bdeb2e877511632db8f offer viewed fafdcd668e3743c1bb461111dcafc2a4 NaN 2017-04-21 M 33.00 72000.00 fafdcd668e3743c1bb461111dcafc2a4 discount 10.00 2.00 10.00 ['web', 'email', 'mobile', 'social'] 22.50 1 1 1 1 Early Career Professionals Middle Income
1 0009655768c64bdeb2e877511632db8f offer received 5a8bc65990b245e5a138643cd4eb9837 NaN 2017-04-21 M 33.00 72000.00 5a8bc65990b245e5a138643cd4eb9837 informational 0.00 0.00 3.00 ['email', 'mobile', 'social'] 7.00 0 1 1 1 Early Career Professionals Middle Income
2 0009655768c64bdeb2e877511632db8f offer received f19421c1d4aa40978ebb69ca19b0e20d NaN 2017-04-21 M 33.00 72000.00 f19421c1d4aa40978ebb69ca19b0e20d bogo 5.00 5.00 5.00 ['web', 'email', 'mobile', 'social'] 17.00 1 1 1 1 Early Career Professionals Middle Income
3 0009655768c64bdeb2e877511632db8f transaction NaN 8.57 2017-04-21 M 33.00 72000.00 NaN NaN NaN NaN NaN NaN 17.25 0 0 0 0 Early Career Professionals Middle Income
4 0009655768c64bdeb2e877511632db8f offer viewed f19421c1d4aa40978ebb69ca19b0e20d NaN 2017-04-21 M 33.00 72000.00 f19421c1d4aa40978ebb69ca19b0e20d bogo 5.00 5.00 5.00 ['web', 'email', 'mobile', 'social'] 19.00 1 1 1 1 Early Career Professionals Middle Income
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
272383 ffff82501cea40309d5fdd7edcca4a07 offer completed fafdcd668e3743c1bb461111dcafc2a4 NaN 2016-11-25 F 45.00 62000.00 fafdcd668e3743c1bb461111dcafc2a4 discount 10.00 2.00 10.00 ['web', 'email', 'mobile', 'social'] 2.50 1 1 1 1 Mature Professionals Middle Income
272384 ffff82501cea40309d5fdd7edcca4a07 offer completed 9b98b8c7a33c4b65b9aebfe6a799e6d9 NaN 2016-11-25 F 45.00 62000.00 9b98b8c7a33c4b65b9aebfe6a799e6d9 bogo 5.00 5.00 7.00 ['web', 'email', 'mobile'] 21.00 1 1 1 0 Mature Professionals Middle Income
272385 ffff82501cea40309d5fdd7edcca4a07 offer viewed fafdcd668e3743c1bb461111dcafc2a4 NaN 2016-11-25 F 45.00 62000.00 fafdcd668e3743c1bb461111dcafc2a4 discount 10.00 2.00 10.00 ['web', 'email', 'mobile', 'social'] 0.25 1 1 1 1 Mature Professionals Middle Income
272386 ffff82501cea40309d5fdd7edcca4a07 offer completed 0b1e1539f2cc45b7b9fa7c272da2e1d7 NaN 2016-11-25 F 45.00 62000.00 0b1e1539f2cc45b7b9fa7c272da2e1d7 discount 20.00 5.00 10.00 ['web', 'email'] 8.25 1 1 0 0 Mature Professionals Middle Income
272387 ffff82501cea40309d5fdd7edcca4a07 transaction NaN 15.57 2016-11-25 F 45.00 62000.00 NaN NaN NaN NaN NaN NaN 16.00 0 0 0 0 Mature Professionals Middle Income

272388 rows × 21 columns