import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
# Setting to make numbers easier to read on display
pd.options.display.float_format = '{:20.2f}'.format
# Show all columns on output
pd.set_option('display.max_columns', 999)
# Load CSVs
df = pd.read_csv(r"C:\Users\USER\Downloads\Maven_Cafe\Data\maven_cafe.csv", low_memory=False)
maven_cafe = df.copy()
print("Duplicates in df:", maven_cafe.duplicated().sum())
Duplicates in df: 0
maven_cafe
customer_id | event | value | Amount | became_member_on | gender | age | income | offer_id | offer_type | difficulty | reward | duration | channels | time_in_days | web | mobile | social | age_group | income_group | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0009655768c64bdeb2e877511632db8f | offer viewed | fafdcd668e3743c1bb461111dcafc2a4 | NaN | 2017-04-21 | M | 33.00 | 72000.00 | fafdcd668e3743c1bb461111dcafc2a4 | discount | 10.00 | 2.00 | 10.00 | ['web', 'email', 'mobile', 'social'] | 22.50 | 1 | 1 | 1 | 1 | Early Career Professionals | Middle Income |
1 | 0009655768c64bdeb2e877511632db8f | offer received | 5a8bc65990b245e5a138643cd4eb9837 | NaN | 2017-04-21 | M | 33.00 | 72000.00 | 5a8bc65990b245e5a138643cd4eb9837 | informational | 0.00 | 0.00 | 3.00 | ['email', 'mobile', 'social'] | 7.00 | 0 | 1 | 1 | 1 | Early Career Professionals | Middle Income |
2 | 0009655768c64bdeb2e877511632db8f | offer received | f19421c1d4aa40978ebb69ca19b0e20d | NaN | 2017-04-21 | M | 33.00 | 72000.00 | f19421c1d4aa40978ebb69ca19b0e20d | bogo | 5.00 | 5.00 | 5.00 | ['web', 'email', 'mobile', 'social'] | 17.00 | 1 | 1 | 1 | 1 | Early Career Professionals | Middle Income |
3 | 0009655768c64bdeb2e877511632db8f | transaction | NaN | 8.57 | 2017-04-21 | M | 33.00 | 72000.00 | NaN | NaN | NaN | NaN | NaN | NaN | 17.25 | 0 | 0 | 0 | 0 | Early Career Professionals | Middle Income |
4 | 0009655768c64bdeb2e877511632db8f | offer viewed | f19421c1d4aa40978ebb69ca19b0e20d | NaN | 2017-04-21 | M | 33.00 | 72000.00 | f19421c1d4aa40978ebb69ca19b0e20d | bogo | 5.00 | 5.00 | 5.00 | ['web', 'email', 'mobile', 'social'] | 19.00 | 1 | 1 | 1 | 1 | Early Career Professionals | Middle Income |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
272383 | ffff82501cea40309d5fdd7edcca4a07 | offer completed | fafdcd668e3743c1bb461111dcafc2a4 | NaN | 2016-11-25 | F | 45.00 | 62000.00 | fafdcd668e3743c1bb461111dcafc2a4 | discount | 10.00 | 2.00 | 10.00 | ['web', 'email', 'mobile', 'social'] | 2.50 | 1 | 1 | 1 | 1 | Mature Professionals | Middle Income |
272384 | ffff82501cea40309d5fdd7edcca4a07 | offer completed | 9b98b8c7a33c4b65b9aebfe6a799e6d9 | NaN | 2016-11-25 | F | 45.00 | 62000.00 | 9b98b8c7a33c4b65b9aebfe6a799e6d9 | bogo | 5.00 | 5.00 | 7.00 | ['web', 'email', 'mobile'] | 21.00 | 1 | 1 | 1 | 0 | Mature Professionals | Middle Income |
272385 | ffff82501cea40309d5fdd7edcca4a07 | offer viewed | fafdcd668e3743c1bb461111dcafc2a4 | NaN | 2016-11-25 | F | 45.00 | 62000.00 | fafdcd668e3743c1bb461111dcafc2a4 | discount | 10.00 | 2.00 | 10.00 | ['web', 'email', 'mobile', 'social'] | 0.25 | 1 | 1 | 1 | 1 | Mature Professionals | Middle Income |
272386 | ffff82501cea40309d5fdd7edcca4a07 | offer completed | 0b1e1539f2cc45b7b9fa7c272da2e1d7 | NaN | 2016-11-25 | F | 45.00 | 62000.00 | 0b1e1539f2cc45b7b9fa7c272da2e1d7 | discount | 20.00 | 5.00 | 10.00 | ['web', 'email'] | 8.25 | 1 | 1 | 0 | 0 | Mature Professionals | Middle Income |
272387 | ffff82501cea40309d5fdd7edcca4a07 | transaction | NaN | 15.57 | 2016-11-25 | F | 45.00 | 62000.00 | NaN | NaN | NaN | NaN | NaN | NaN | 16.00 | 0 | 0 | 0 | 0 | Mature Professionals | Middle Income |
272388 rows × 21 columns
# Filter only transaction events
df_transactions = maven_cafe[maven_cafe['event'] == 'transaction']
# Get latest day in the dataset
latest_day = df_transactions['time_in_days'].max()
# Aggregate RFM
rfm_df = df_transactions.groupby('customer_id', as_index=False).agg(
MonetaryValue=('Amount', 'sum'),
Frequency=('time_in_days', 'count'), # or .size()
LastPurchase=('time_in_days', 'max')
)
# Calculate Recency
rfm_df['Recency'] = latest_day - rfm_df['LastPurchase']
# Optional: Drop LastPurchase if you don’t need it
rfm_df.drop(columns='LastPurchase', inplace=True)
rfm_df.head()
customer_id | MonetaryValue | Frequency | Recency | |
---|---|---|---|---|
0 | 0009655768c64bdeb2e877511632db8f | 127.60 | 8 | 0.75 |
1 | 0011e0d4e6b944f998e987f904e8c1e5 | 79.46 | 5 | 2.50 |
2 | 0020c2b971eb4e9188eac86d93036a77 | 196.86 | 8 | 0.25 |
3 | 0020ccbbb6d84e358d3414a3ff76cffd | 154.05 | 12 | 1.75 |
4 | 003d66b6608740288d6cc97a6903f4f0 | 48.34 | 18 | 0.75 |
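As a quick spot-check of the aggregation (a hedged sketch; the customer id is taken from the head() output above), one row can be recomputed by hand:
# Recompute RFM for a single customer and compare against the rfm_df row above
cid = '0009655768c64bdeb2e877511632db8f'
one = df_transactions[df_transactions['customer_id'] == cid]
print(one['Amount'].sum(), len(one), latest_day - one['time_in_days'].max())
# expected: 127.60, 8, 0.75 per the first row of rfm_df.head()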
plt.figure(figsize=(15, 5))
plt.subplot(1, 3, 1)
plt.hist(rfm_df['MonetaryValue'], bins=10, color='skyblue', edgecolor='black')
plt.title('Monetary Value Distribution')
plt.xlabel('Monetary Value')
plt.ylabel('Count')
plt.subplot(1, 3, 2)
plt.hist(rfm_df['Frequency'], bins=10, color='lightgreen', edgecolor='black')
plt.title('Frequency Distribution')
plt.xlabel('Frequency')
plt.ylabel('Count')
plt.subplot(1, 3, 3)
plt.hist(rfm_df['Recency'], bins=20, color='salmon', edgecolor='black')
plt.title('Recency Distribution')
plt.xlabel('Recency')
plt.ylabel('Count')
plt.tight_layout()
plt.show()
plt.figure(figsize=(15, 5))
plt.subplot(1, 3, 1)
sns.boxplot(data=rfm_df['MonetaryValue'], color='skyblue')
plt.title('Monetary Value Boxplot')
plt.xlabel('Monetary Value')
plt.subplot(1, 3, 2)
sns.boxplot(data=rfm_df['Frequency'], color='lightgreen')
plt.title('Frequency Boxplot')
plt.xlabel('Frequency')
plt.subplot(1, 3, 3)
sns.boxplot(data=rfm_df['Recency'], color='salmon')
plt.title('Recency Boxplot')
plt.xlabel('Recency')
plt.tight_layout()
plt.show()
M_Q1 = rfm_df["MonetaryValue"].quantile(0.25)
M_Q3 = rfm_df["MonetaryValue"].quantile(0.75)
M_IQR = M_Q3 - M_Q1
monetary_outliers_df = rfm_df[(rfm_df["MonetaryValue"] > (M_Q3 + 1.5 * M_IQR)) | (rfm_df["MonetaryValue"] < (M_Q1 - 1.5 * M_IQR))].copy()
monetary_outliers_df.describe()
MonetaryValue | Frequency | Recency | |
---|---|---|---|
count | 470.00 | 470.00 | 470.00 |
mean | 630.92 | 12.52 | 2.47 |
std | 250.01 | 5.29 | 2.30 |
min | 355.68 | 2.00 | 0.00 |
25% | 405.56 | 9.00 | 0.75 |
50% | 560.86 | 12.00 | 1.75 |
75% | 808.53 | 16.00 | 3.50 |
max | 1608.69 | 31.00 | 12.75 |
F_Q1 = rfm_df['Frequency'].quantile(0.25)
F_Q3 = rfm_df['Frequency'].quantile(0.75)
F_IQR = F_Q3 - F_Q1
frequency_outliers_df = rfm_df[(rfm_df['Frequency'] > (F_Q3 + 1.5 * F_IQR)) | (rfm_df['Frequency'] < (F_Q1 - 1.5 * F_IQR))].copy()
frequency_outliers_df.describe()
MonetaryValue | Frequency | Recency | |
---|---|---|---|
count | 399.00 | 399.00 | 399.00 |
mean | 157.54 | 23.13 | 1.44 |
std | 145.61 | 2.47 | 1.40 |
min | 38.34 | 21.00 | 0.00 |
25% | 70.39 | 21.00 | 0.50 |
50% | 89.21 | 22.00 | 1.00 |
75% | 189.12 | 24.00 | 2.00 |
max | 1040.98 | 36.00 | 8.25 |
R_Q1 = rfm_df['Recency'].quantile(0.25)
R_Q3 = rfm_df['Recency'].quantile(0.75)
R_IQR = R_Q3 - R_Q1
recency_outliers_df = rfm_df[(rfm_df['Recency'] > (R_Q3 + 1.5 * R_IQR)) | (rfm_df['Recency'] < (R_Q1 - 1.5 * R_IQR))].copy()
recency_outliers_df.describe()
MonetaryValue | Frequency | Recency | |
---|---|---|---|
count | 578.00 | 578.00 | 578.00 |
mean | 55.84 | 3.31 | 14.27 |
std | 72.32 | 2.07 | 2.99 |
min | 0.28 | 1.00 | 11.25 |
25% | 17.95 | 2.00 | 12.25 |
50% | 40.70 | 3.00 | 13.50 |
75% | 72.05 | 4.00 | 15.44 |
max | 946.98 | 15.00 | 29.25 |
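The three IQR blocks above repeat the same pattern. As a hedged refactor sketch (the helper name iqr_outliers is hypothetical, not part of the original pipeline), the logic can be captured once:
def iqr_outliers(df, col, k=1.5):
    # Rows of df whose col value lies outside the k * IQR fences
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    mask = (df[col] > q3 + k * iqr) | (df[col] < q1 - k * iqr)
    return df[mask].copy()
# Usage mirroring the three cells above:
# monetary_outliers_df = iqr_outliers(rfm_df, 'MonetaryValue')
# frequency_outliers_df = iqr_outliers(rfm_df, 'Frequency')
# recency_outliers_df = iqr_outliers(rfm_df, 'Recency')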
non_outliers_df = rfm_df[
    (~rfm_df.index.isin(monetary_outliers_df.index)) &
    (~rfm_df.index.isin(frequency_outliers_df.index)) &
    (~rfm_df.index.isin(recency_outliers_df.index))
].copy()  # copy so later column assignments don't trigger SettingWithCopyWarning
non_outliers_df.describe()
MonetaryValue | Frequency | Recency | |
---|---|---|---|
count | 13084.00 | 13084.00 | 13084.00 |
mean | 104.25 | 8.24 | 3.13 |
std | 80.27 | 4.42 | 2.59 |
min | 0.15 | 1.00 | 0.00 |
25% | 33.14 | 5.00 | 1.00 |
50% | 89.34 | 7.00 | 2.50 |
75% | 157.29 | 11.00 | 4.50 |
max | 355.49 | 20.00 | 11.00 |
🟢 MonetaryValue
Typical spending:
IQR (25th–75th percentile): $33.14–$157.29 (most customers fall here).
Median (50th percentile): $89.34, close to the mean ($104.25), suggesting a mild right skew (higher spenders pull the mean up slightly).
Max ($355.49) is only ~2.2x the 75th percentile, so no extreme outliers.
Interpretation: Healthy spending distribution. The skew is manageable; no need for log transformation unless modeling requires normality.
🔵 Frequency
Typical behavior:
Median: 7 purchases, mean 8.24 (slight right skew due to high-frequency buyers).
IQR: 5–11 purchases (50% of customers).
Max (20) is high but reasonable—likely loyal/VIP customers (e.g., top 25% buy 11+ times).
Interpretation: Strong repeat purchase behavior. Segment customers by frequency (e.g., 5–7 = regular, 11+ = VIP).
🔴 Recency
Key metrics:
Median: 2.5 days, mean 3.13 days (most purchased very recently).
IQR: 1–4.5 days (75% purchased within the last 4.5 days).
Max (11 days): Even the least recent customer is highly active.
Interpretation: Exceptional engagement. Likely a high-frequency business (e.g., grocery, subscription). Monitor for upward shifts in median recency (could signal churn risk).
Actionable Insights
MonetaryValue:
The mild skew is acceptable, but consider segmenting high spenders (top 25%: >$157) for targeted promotions.
Frequency:
Reward VIPs (11+ purchases) to retain them.
Encourage 5–7x buyers to reach VIP tiers (e.g., loyalty programs).
Recency:
Re-engage the 75th percentile (4.5+ days since last purchase)—they’re still active but may need reminders.
Investigate 0-day recency (min=0): Are these same-day purchases or data artifacts?
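As a quick numeric check on the skew claims above (a hedged sketch; the cutoff variable names are illustrative):
# Positive skew values confirm the mild right skew described above
print(non_outliers_df[['MonetaryValue', 'Frequency', 'Recency']].skew())
# Flag the top-25% spenders mentioned in the insights (threshold ~157 per the table)
high_spender_cutoff = non_outliers_df['MonetaryValue'].quantile(0.75)
high_spenders = non_outliers_df[non_outliers_df['MonetaryValue'] > high_spender_cutoff]
print(f"High spenders (> {high_spender_cutoff:.2f}): {len(high_spenders)} customers")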
To be rigorous about outlier detection, I used seaborn boxplots on the filtered data:
plt.figure(figsize=(15, 5))
plt.subplot(1, 3, 1)
sns.boxplot(data=non_outliers_df['MonetaryValue'], color='skyblue')
plt.title('Monetary Value Boxplot')
plt.xlabel('Monetary Value')
plt.subplot(1, 3, 2)
sns.boxplot(data=non_outliers_df['Frequency'], color='lightgreen')
plt.title('Frequency Boxplot')
plt.xlabel('Frequency')
plt.subplot(1, 3, 3)
sns.boxplot(data=non_outliers_df['Recency'], color='salmon')
plt.title('Recency Boxplot')
plt.xlabel('Recency')
plt.tight_layout()
plt.show()
📦 Boxplot Summary (Post-Outlier Filtering)
💰 Monetary Value: A few points above ~350 remain, but far fewer; the extreme values (such as the 946.98 seen earlier) have been filtered out. The box is centered around 90 and the spread looks reasonable. ✅ Much improved; this distribution is now balanced.
🔁 Frequency: No visible outliers. The max value is about 20 and most data falls between 5 and 12. ✅ A very clean distribution; outliers properly handled.
⏱️ Recency: A few outliers still appear above 10, while the majority of customers made purchases within the last 0–5 days. The outlier boundary could be tightened depending on how strict the cleaning should be; a sketch follows below.
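A hedged sketch of that stricter cut, using a 1.0 x IQR upper fence for Recency (the multiplier is an illustrative choice, not part of the original cleaning):
q1, q3 = non_outliers_df['Recency'].quantile([0.25, 0.75])
iqr = q3 - q1
# Keep only rows under the tighter upper fence
strict_df = non_outliers_df[non_outliers_df['Recency'] <= q3 + 1.0 * iqr]
print(f"Rows kept under the 1.0 x IQR fence: {len(strict_df)} of {len(non_outliers_df)}")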
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(projection="3d")
scatter = ax.scatter(non_outliers_df["MonetaryValue"], non_outliers_df["Frequency"], non_outliers_df["Recency"])
ax.set_xlabel('Monetary Value')
ax.set_ylabel('Frequency')
ax.set_zlabel('Recency')
ax.set_title('3D Scatter Plot of Customer Data')
plt.show()
Standard scaling transforms the features of your data to have a mean of 0 and a standard deviation of 1, ensuring that each feature contributes equally to the analysis.
$$ z = \frac{x - \mu}{\sigma} $$
Where:
- $z$ is the standardized value,
- $x$ is the original value,
- $\mu$ is the mean of the feature,
- $\sigma$ is the standard deviation of the feature.
As a caveat, z-scoring is most meaningful when each feature is roughly normally distributed; given the mild skew observed above, that is a fair enough assumption here.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(non_outliers_df[["MonetaryValue", "Frequency", "Recency"]])
scaled_data
array([[ 0.29094268, -0.05371739, -0.9156766 ], [-0.30883151, -0.73322599, -0.24098381], [ 1.15385004, -0.05371739, -1.10844597], ..., [-1.1474439 , -0.73322599, 1.01201709], [-0.19209108, 0.85229407, -0.62652255], [ 1.51777617, 1.53180267, -0.14459913]], shape=(13084, 3))
The result is a NumPy array; converting it to a DataFrame (keeping the original index) makes it easier to work with:
scaled_data_df = pd.DataFrame(scaled_data, index=non_outliers_df.index, columns=("MonetaryValue", "Frequency", "Recency"))
scaled_data_df
MonetaryValue | Frequency | Recency | |
---|---|---|---|
0 | 0.29 | -0.05 | -0.92 |
1 | -0.31 | -0.73 | -0.24 |
2 | 1.15 | -0.05 | -1.11 |
3 | 0.62 | 0.85 | -0.53 |
4 | -0.70 | 2.21 | -0.92 |
... | ... | ... | ... |
14486 | 2.33 | 0.63 | -0.72 |
14488 | -0.93 | -0.51 | -0.92 |
14489 | -1.15 | -0.73 | 1.01 |
14490 | -0.19 | 0.85 | -0.63 |
14491 | 1.52 | 1.53 | -0.14 |
13084 rows × 3 columns
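A quick sanity check (hedged sketch): after StandardScaler, each column should have mean ≈ 0 and standard deviation ≈ 1:
print(scaled_data_df.mean().round(6))  # expected ~0 for all three columns
print(scaled_data_df.std().round(6))   # expected ~1 (pandas uses ddof=1, so very close to 1)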
fig = plt.figure(figsize=(12, 12))
ax = fig.add_subplot(projection="3d")
scatter = ax.scatter(scaled_data_df["MonetaryValue"], scaled_data_df["Frequency"], scaled_data_df["Recency"])
ax.set_xlabel('Monetary Value')
ax.set_ylabel('Frequency')
ax.set_zlabel('Recency')
ax.set_title('3D Scatter Plot of Customer Data')
plt.show()
K-Means Clustering
To determine the number of clusters, we use the elbow method (inertia) together with silhouette scores:
max_k = 12
inertia = []
silhouette_scores = []
k_values = range(2, max_k + 1)
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, max_iter=1000)
    cluster_labels = kmeans.fit_predict(scaled_data_df)
    sil_score = silhouette_score(scaled_data_df, cluster_labels)
    silhouette_scores.append(sil_score)
    inertia.append(kmeans.inertia_)
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
plt.plot(k_values, inertia, marker='o')
plt.title('KMeans Inertia for Different Values of k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.xticks(k_values)
plt.grid(True)
plt.subplot(1, 2, 2)
plt.plot(k_values, silhouette_scores, marker='o', color='orange')
plt.title('Silhouette Scores for Different Values of k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Silhouette Score')
plt.xticks(k_values)
plt.grid(True)
plt.tight_layout()
plt.show()
# Prepare a summary DataFrame for the optimal k selection metrics
summary_df = pd.DataFrame({
    'k': list(k_values),
    'Inertia': [int(round(val)) for val in inertia],
    'Silhouette': [round(val, 2) for val in silhouette_scores]
})
# Format numbers for display
summary_df['Inertia'] = summary_df['Inertia'].apply(lambda x: f"{x:,}")
# Pivot for pretty printing
summary_df_display = summary_df.set_index('k').T
summary_df_display.index = ['Inertia', 'Silhouette']
summary_df_display.columns = [f'k={k}' for k in summary_df_display.columns]
print("Example Output (Optimal k):")
display(summary_df_display)
Example Output (Optimal k):
k=2 | k=3 | k=4 | k=5 | k=6 | k=7 | k=8 | k=9 | k=10 | k=11 | k=12 | |
---|---|---|---|---|---|---|---|---|---|---|---|
Inertia | 24,493 | 17,874 | 13,842 | 11,655 | 10,262 | 9,628 | 8,872 | 7,839 | 7,307 | 6,838 | 6,398 |
Silhouette | 0.33 | 0.32 | 0.34 | 0.32 | 0.32 | 0.30 | 0.28 | 0.28 | 0.28 | 0.27 | 0.28 |
Silhouette Score
$$ s(i) = \frac{b(i) - a(i)}{\max(a(i), b(i))} $$
Where:
- $s(i)$ is the silhouette score for a single sample $i$
- $a(i)$ is the average distance between $i$ and all other points in the same cluster,
- $b(i)$ is the minimum average distance between $i$ and all points in the nearest cluster to which $i$ does not belong.
The silhouette score ranges over $[-1, 1]$; a higher value indicates more distinct clusters.
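For intuition, sklearn's silhouette_samples exposes the per-sample s(i) values that silhouette_score averages. A hedged sketch, fitting a throwaway k=4 model since the final model is only fit further below:
from sklearn.metrics import silhouette_samples
km4 = KMeans(n_clusters=4, random_state=42, max_iter=1000)
labels4 = km4.fit_predict(scaled_data_df)
s_i = silhouette_samples(scaled_data_df, labels4)
print(s_i.mean())  # equals silhouette_score(scaled_data_df, labels4)
print((s_i < 0).sum(), "samples sit closer to a neighbouring cluster than their own")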
- Inertia Analysis (Elbow Method)
Inertia measures the sum of squared distances of data points to their nearest cluster center. Lower inertia indicates tighter clusters.
Trend:
Inertia decreases sharply from k=2 (24,493) to k=4 (13,842), then more gradually afterward.
The "elbow point" (where the rate of decrease slows) occurs around k=4 or k=5.
Key Insight: The elbow suggests diminishing returns beyond k=4 or k=5.
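To make the elbow reading less eyeball-dependent, a hedged sketch that prints the percentage drop in inertia between successive k values (computed from the inertia list built in the loop above):
import numpy as np
drops = -np.diff(inertia) / np.array(inertia[:-1]) * 100
for k, d in zip(list(k_values)[1:], drops):
    print(f"k={k}: inertia falls {d:.1f}% vs k={k-1}")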
- Silhouette Score Analysis
Silhouette Score quantifies cluster separation (higher = better-defined clusters). Scores range from -1 to +1.
Trend:
The highest score is 0.34 at k=4, followed by 0.33 at k=2 and 0.32 at k=3/k=5/k=6.
Scores decline steadily after k=4.
Key Insight: k=4 maximizes cluster quality according to silhouette scores.
- Combined Decision: Optimal k
Metric | k=2 | k=3 | k=4 | k=5 | k=6 | ...
---|---|---|---|---|---|---
Inertia | 24,493 | 17,874 | 13,842 | 11,655 | 10,262 | ...
Silhouette | 0.33 | 0.32 | 0.34 | 0.32 | 0.32 | ...
Recommendation: k=4 is optimal because:
It has the highest silhouette score (0.34).
The elbow point in inertia occurs here, indicating a good trade-off between compactness and complexity.
Beyond k=4, gains in inertia reduction are marginal and silhouette scores degrade.
Performing K-Means clustering on the dataset with the chosen k=4:
kmeans = KMeans(n_clusters=4, random_state=42, max_iter=1000)
cluster_labels = kmeans.fit_predict(scaled_data_df)
cluster_labels
array([3, 3, 2, ..., 1, 0, 2], shape=(13084,), dtype=int32)
non_outliers_df["Cluster"] = cluster_labels
non_outliers_df
customer_id | MonetaryValue | Frequency | Recency | Cluster | |
---|---|---|---|---|---|
0 | 0009655768c64bdeb2e877511632db8f | 127.60 | 8 | 0.75 | 3 |
1 | 0011e0d4e6b944f998e987f904e8c1e5 | 79.46 | 5 | 2.50 | 3 |
2 | 0020c2b971eb4e9188eac86d93036a77 | 196.86 | 8 | 0.25 | 2 |
3 | 0020ccbbb6d84e358d3414a3ff76cffd | 154.05 | 12 | 1.75 | 2 |
4 | 003d66b6608740288d6cc97a6903f4f0 | 48.34 | 18 | 0.75 | 0 |
... | ... | ... | ... | ... | ... |
14486 | fff29fb549084123bd046dbc5ceb4faa | 291.47 | 11 | 1.25 | 2 |
14488 | fff7576017104bcc8677a8d63322b5e1 | 29.94 | 6 | 0.75 | 3 |
14489 | fff8957ea8b240a6b5e634b6ee8eafcf | 12.15 | 5 | 5.75 | 1 |
14490 | fffad4f4828548d1b5583907f2e9906b | 88.83 | 12 | 1.50 | 0 |
14491 | ffff82501cea40309d5fdd7edcca4a07 | 226.07 | 15 | 2.75 | 2 |
13084 rows × 5 columns
cluster_colors = {0: '#6f1d1b',
1: '#99582a',
2: '#432818',
3: '#bb9457'}
colors = non_outliers_df['Cluster'].map(cluster_colors)
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(projection='3d')
scatter = ax.scatter(non_outliers_df['MonetaryValue'],
non_outliers_df['Frequency'],
non_outliers_df['Recency'],
c=colors, # Use mapped solid colors
marker='o')
ax.set_xlabel('Monetary Value')
ax.set_ylabel('Frequency')
ax.set_zlabel('Recency')
ax.set_title('3D Scatter Plot of Customer Data by Cluster')
plt.show()
import plotly.express as px
# Plotly treats an integer color column as continuous, so cast Cluster to string
# to get a discrete legend; convert cluster_colors keys to match
plot_df = non_outliers_df.assign(Cluster=non_outliers_df["Cluster"].astype(str))
color_discrete_map = {str(k): v for k, v in cluster_colors.items()}
fig = px.scatter_3d(
plot_df,
x="MonetaryValue",
y="Frequency",
z="Recency",
color="Cluster",
color_discrete_map=color_discrete_map,
hover_data=["customer_id", "MonetaryValue", "Frequency", "Recency"],
title="Interactive 3D Scatter Plot of Customer Data by Cluster"
)
fig.update_traces(marker=dict(size=5))
fig.show()
plt.figure(figsize=(12, 18))
plt.subplot(3, 1, 1)
sns.violinplot(x=non_outliers_df['Cluster'], y=non_outliers_df['MonetaryValue'], palette=cluster_colors, hue=non_outliers_df["Cluster"])
sns.violinplot(y=non_outliers_df['MonetaryValue'], color='gray', linewidth=1.0)
plt.title('Monetary Value by Cluster')
plt.ylabel('Monetary Value')
plt.subplot(3, 1, 2)
sns.violinplot(x=non_outliers_df['Cluster'], y=non_outliers_df['Frequency'], palette=cluster_colors, hue=non_outliers_df["Cluster"])
sns.violinplot(y=non_outliers_df['Frequency'], color='gray', linewidth=1.0)
plt.title('Frequency by Cluster')
plt.ylabel('Frequency')
plt.subplot(3, 1, 3)
sns.violinplot(x=non_outliers_df['Cluster'], y=non_outliers_df['Recency'], palette=cluster_colors, hue=non_outliers_df["Cluster"])
sns.violinplot(y=non_outliers_df['Recency'], color='gray', linewidth=1.0)
plt.title('Recency by Cluster')
plt.ylabel('Recency')
plt.tight_layout()
plt.show()
🔵 Cluster 0 – "Champions" High frequency, high monetary, low recency Interpretation: These customers purchase frequently, spend a lot, and have bought recently. They’re your top-tier loyal customers.
📌 Action:
Prioritize them for exclusive offers, early access to new products, and VIP programs.
Keep them engaged through personalized emails and thank-you gestures.
🟠 Cluster 1 – "Slipping Away" Medium-low frequency, low monetary, high recency Interpretation: These customers haven’t purchased in a while. Their spending and frequency are low compared to others. They're at risk of churning.
📌 Action:
Win-back campaigns, reminders, or time-sensitive discounts.
Ask for feedback: “We miss you!” emails to understand their drop-off.
🟢 Cluster 2 – "Potential Loyalists" Mid-high monetary, decent frequency, low recency Interpretation: These customers made recent purchases, spend a good amount, and could become your next champions if nurtured properly.
📌 Action:
Encourage repeat purchases through loyalty points, upselling, and next-best offers.
Offer personalized experiences to build strong habits.
🔴 Cluster 3 – "New & Low-Engaged" Low frequency, low monetary, recent transactions Interpretation: New or low-engagement users. They’ve recently interacted but not consistently.
📌 Action:
Provide onboarding offers and a warm welcome.
Use email flows, introductory discounts, or “How to get the best from us” content.
Customer Metrics Distribution by Cluster
The violin plots visualize the distribution of three key customer metrics (Monetary Value, Frequency, and Recency) across the identified customer clusters, including the outlier segments ("DELIGHT", "PAMPER", "UPSELL", "RE-ENGAGE") profiled further below.
Monetary Value: Shows how much customers in each cluster spend on average. Wider sections indicate more customers at that spending level. Some clusters contain higher average spenders (e.g., "DELIGHT", "PAMPER"), while others are lower.
Frequency: Indicates how often customers in each cluster make purchases. Clusters like "DELIGHT" and "UPSELL" show higher purchase frequencies, suggesting highly engaged customers.
Recency: Measures how recently customers made a purchase. Lower values mean more recent activity. Clusters with low recency (e.g., "DELIGHT") represent customers who have purchased very recently.
Interpretation:
- Each cluster has a distinct profile, combining spending, engagement, and recency.
- The outlier clusters (e.g., "DELIGHT", "PAMPER", "UPSELL") are separated out and show extreme values in one or more metrics.
- The distribution shapes help identify which clusters are high-value, frequent, or recently active, supporting targeted marketing strategies.
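As a hedged numeric companion to the violin plots, per-cluster means of the three metrics (computed on non_outliers_df, the frame behind the plots above):
print(non_outliers_df.groupby('Cluster')[['MonetaryValue', 'Frequency', 'Recency']].mean().round(2))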
Yes, there is overlap between monetary and frequency outliers.
- The variable `overlap_indices` contains the indices of customers who are outliers in both monetary value and frequency.
- The DataFrame `monetary_and_frequency_outliers` shows these overlapping outliers.
There are 35 such customers, as seen in `monetary_and_frequency_outliers`.
Are the overlaps significant?
- Monetary & Frequency overlap: 35 customers (2.48% of all outliers)
- Monetary & Recency overlap: 4 customers (0.28% of all outliers)
- Frequency & Recency overlap: 0 customers (0.00% of all outliers)
- Total outliers: 1,408
Interpretation:
- The overlaps between outlier groups are very small relative to the total number of outliers.
- Only a small fraction of customers are extreme in more than one dimension.
- This suggests that most outliers are unique to a single metric, and multi-metric outliers are not a major driver in the dataset.
- The low overlap means that segmentation and interventions can be tailored to specific behaviors (spending, frequency, or recency) rather than broad, multi-dimensional outlier profiles.
The code below labels outliers into distinct custom groups (clusters) based on whether they are outliers in Monetary, Frequency, or Recency, or some combination, and combines them all into a single DataFrame:
overlap_indices = monetary_outliers_df.index.intersection(frequency_outliers_df.index)
recency_overlap_indices = recency_outliers_df.index.intersection(monetary_outliers_df.index.union(frequency_outliers_df.index))
monetary_only_outliers = monetary_outliers_df.drop(overlap_indices)
frequency_only_outliers = frequency_outliers_df.drop(overlap_indices)
recency_only_outliers = recency_outliers_df.drop(recency_overlap_indices)
monetary_and_frequency_outliers = monetary_outliers_df.loc[overlap_indices]
monetary_only_outliers["Cluster"] = -1
frequency_only_outliers["Cluster"] = -2
recency_only_outliers["Cluster"] = -3
monetary_and_frequency_outliers["Cluster"] = -4
outlier_clusters_df = pd.concat([monetary_only_outliers, frequency_only_outliers, recency_only_outliers, monetary_and_frequency_outliers])
outlier_clusters_df
customer_id | MonetaryValue | Frequency | Recency | Cluster | |
---|---|---|---|---|---|
10 | 00715b6e55c3431cb56ff7307eb19675 | 375.12 | 15 | 0.50 | -1 |
43 | 00e9f403afa641889cd034ee7c7ca6e9 | 614.59 | 17 | 0.25 | -1 |
60 | 01443a2afce54939a323c978f467c540 | 417.93 | 16 | 1.75 | -1 |
76 | 017febbe52e64ac19cf28cf0d44386e4 | 1003.41 | 10 | 7.00 | -1 |
87 | 019ed95d987446e3947da0246cdab831 | 515.09 | 8 | 6.25 | -1 |
... | ... | ... | ... | ... | ... |
10972 | c06cbc22ae8d4ce39d8c2cdee4272ace | 460.52 | 22 | 0.75 | -4 |
11925 | d237d3afa2cf4463a07e06b9a8fcba0c | 400.87 | 26 | 2.75 | -4 |
13668 | f130699fa7934e2ebbfa0a3f976d645c | 405.54 | 23 | 2.75 | -4 |
13881 | f5205b328b564bf0a55952a8dc19ce6a | 395.59 | 24 | 0.00 | -4 |
14430 | fee2d799672d4f81bfa3237207290f79 | 361.49 | 21 | 4.00 | -4 |
1408 rows × 5 columns
# Total number of outliers (from outlier_clusters_df)
total_outliers = len(outlier_clusters_df)
# Overlap counts, computed from the index intersections rather than hardcoded
monetary_frequency_overlap = len(monetary_outliers_df.index.intersection(frequency_outliers_df.index))
monetary_recency_overlap = len(monetary_outliers_df.index.intersection(recency_outliers_df.index))
frequency_recency_overlap = len(frequency_outliers_df.index.intersection(recency_outliers_df.index))
# Calculate percentages
mf_pct = (monetary_frequency_overlap / total_outliers) * 100
mr_pct = (monetary_recency_overlap / total_outliers) * 100
fr_pct = (frequency_recency_overlap / total_outliers) * 100
print(f"Monetary & Frequency overlap: {monetary_frequency_overlap} ({mf_pct:.2f}%)")
print(f"Monetary & Recency overlap: {monetary_recency_overlap} ({mr_pct:.2f}%)")
print(f"Frequency & Recency overlap: {frequency_recency_overlap} ({fr_pct:.2f}%)")
print(f"Total outliers: {total_outliers}")
Monetary & Frequency overlap: 35 (2.49%) Monetary & Recency overlap: 4 (0.28%) Frequency & Recency overlap: 0 (0.00%) Total outliers: 1408
print("Total outlier records:", outlier_clusters_df.shape[0])
print("Unique records:", outlier_clusters_df.index.nunique())
Total outlier records: 1408
Unique records: 1408
outlier_clusters_df["Cluster"].value_counts()
Cluster
-3    574
-1    435
-2    364
-4     35
Name: count, dtype: int64
cluster_colors = {-1: '#9467bd',
-2: '#8c564b',
-3: '#e377c2',
-4: '#6f1d1b'}
plt.figure(figsize=(12, 18))
plt.subplot(3, 1, 1)
sns.violinplot(x=outlier_clusters_df['Cluster'], y=outlier_clusters_df['MonetaryValue'], palette=cluster_colors, hue=outlier_clusters_df["Cluster"])
sns.violinplot(y=outlier_clusters_df['MonetaryValue'], color='gray', linewidth=1.0)
plt.title('Monetary Value by Cluster')
plt.ylabel('Monetary Value')
plt.subplot(3, 1, 2)
sns.violinplot(x=outlier_clusters_df['Cluster'], y=outlier_clusters_df['Frequency'], palette=cluster_colors, hue=outlier_clusters_df["Cluster"])
sns.violinplot(y=outlier_clusters_df['Frequency'], color='gray', linewidth=1.0)
plt.title('Frequency by Cluster')
plt.ylabel('Frequency')
plt.subplot(3, 1, 3)
sns.violinplot(x=outlier_clusters_df['Cluster'], y=outlier_clusters_df['Recency'], palette=cluster_colors, hue=outlier_clusters_df["Cluster"])
sns.violinplot(y=outlier_clusters_df['Recency'], color='gray', linewidth=1.0)
plt.title('Recency by Cluster')
plt.ylabel('Recency')
plt.tight_layout()
plt.show()
🔴 Cluster -4: DELIGHT
🧠 Characteristics:
Extremely high Monetary value — the highest of all clusters.
Very high Frequency — they buy often.
Very low Recency — meaning they purchased very recently. These are your best customers: frequent, recent, and high-spending.
💡 Strategy – “Delight”
VIP Programs – Invite-only clubs or premium memberships.
Surprise Gifts – Unannounced freebies or bonus points.
First Access – Let them preview new products or features.
Personal Concierge – If applicable, assign relationship managers.
🌸 Cluster -3: RE-ENGAGE
🧠 Characteristics:
Low Monetary, Low Frequency, and high Recency.
They barely buy, don’t spend much, and haven't purchased recently. These users are the least engaged and at high churn risk.
💡 Strategy – “Re-Engage”
Win-back Campaigns – “We miss you” emails or SMS with strong incentives.
Reactivation Offers – Deep discounts or free shipping to lure them back.
Feedback Loop – Ask them why they left or what they need.
☕ Cluster -2: UPSELL
🧠 Characteristics:
Low Monetary but high Frequency.
Recency is also very low (they purchase frequently and recently). These are habitual but low-value spenders.
💡 Strategy – “Upsell”
Bundling – Promote combos and packages to raise spend per visit.
Tiered Loyalty – Unlock rewards as they spend more.
Premium Add-ons – Offer product upgrades or extras at checkout.
🟣 Cluster -1: PAMPER
🧠 Characteristics:
High Monetary, medium Frequency, low Recency.
They don’t buy that often but when they do, they spend big, and they’ve done so recently. These are valuable infrequent buyers.
💡 Strategy – “Pamper”
Personalized Offers – Based on their preferences or past purchases.
Luxury Treatment – Priority support, curated experiences.
Special Event Invites – Like tasting events, webinars, or launches.
cluster_labels = {
0: "Loyal Core",
1: "Sleeping Buyers",
2: "First-Time Buyers",
3: "Frequent Shoppers",
-1: "High Spenders",
-2: "Potential Upgraders",
-3: "VIP Customers",
-4: "Win-Back Targets"
}
full_clustering_df = pd.concat([non_outliers_df, outlier_clusters_df])
full_clustering_df
customer_id | MonetaryValue | Frequency | Recency | Cluster | |
---|---|---|---|---|---|
0 | 0009655768c64bdeb2e877511632db8f | 127.60 | 8 | 0.75 | 3 |
1 | 0011e0d4e6b944f998e987f904e8c1e5 | 79.46 | 5 | 2.50 | 3 |
2 | 0020c2b971eb4e9188eac86d93036a77 | 196.86 | 8 | 0.25 | 2 |
3 | 0020ccbbb6d84e358d3414a3ff76cffd | 154.05 | 12 | 1.75 | 2 |
4 | 003d66b6608740288d6cc97a6903f4f0 | 48.34 | 18 | 0.75 | 0 |
... | ... | ... | ... | ... | ... |
10972 | c06cbc22ae8d4ce39d8c2cdee4272ace | 460.52 | 22 | 0.75 | -4 |
11925 | d237d3afa2cf4463a07e06b9a8fcba0c | 400.87 | 26 | 2.75 | -4 |
13668 | f130699fa7934e2ebbfa0a3f976d645c | 405.54 | 23 | 2.75 | -4 |
13881 | f5205b328b564bf0a55952a8dc19ce6a | 395.59 | 24 | 0.00 | -4 |
14430 | fee2d799672d4f81bfa3237207290f79 | 361.49 | 21 | 4.00 | -4 |
14492 rows × 5 columns
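A hedged sanity check that the non-outlier and outlier frames are disjoint before labeling (each customer row should appear exactly once):
assert full_clustering_df.index.is_unique, "a customer appears in both frames"
print(full_clustering_df['Cluster'].value_counts().sort_index())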
full_clustering_df["ClusterLabel"] = full_clustering_df["Cluster"].map(cluster_labels)
full_clustering_df
customer_id | MonetaryValue | Frequency | Recency | Cluster | ClusterLabel | |
---|---|---|---|---|---|---|
0 | 0009655768c64bdeb2e877511632db8f | 127.60 | 8 | 0.75 | 3 | Frequent Shoppers |
1 | 0011e0d4e6b944f998e987f904e8c1e5 | 79.46 | 5 | 2.50 | 3 | Frequent Shoppers |
2 | 0020c2b971eb4e9188eac86d93036a77 | 196.86 | 8 | 0.25 | 2 | First-Time Buyers |
3 | 0020ccbbb6d84e358d3414a3ff76cffd | 154.05 | 12 | 1.75 | 2 | First-Time Buyers |
4 | 003d66b6608740288d6cc97a6903f4f0 | 48.34 | 18 | 0.75 | 0 | Loyal Core |
... | ... | ... | ... | ... | ... | ... |
10972 | c06cbc22ae8d4ce39d8c2cdee4272ace | 460.52 | 22 | 0.75 | -4 | VIP Customers
11925 | d237d3afa2cf4463a07e06b9a8fcba0c | 400.87 | 26 | 2.75 | -4 | VIP Customers
13668 | f130699fa7934e2ebbfa0a3f976d645c | 405.54 | 23 | 2.75 | -4 | VIP Customers
13881 | f5205b328b564bf0a55952a8dc19ce6a | 395.59 | 24 | 0.00 | -4 | VIP Customers
14430 | fee2d799672d4f81bfa3237207290f79 | 361.49 | 21 | 4.00 | -4 | VIP Customers
14492 rows × 6 columns
cluster_counts = full_clustering_df['ClusterLabel'].value_counts()
full_clustering_df["MonetaryValue per 100 pounds"] = full_clustering_df["MonetaryValue"] / 100.00
feature_means = full_clustering_df.groupby('ClusterLabel')[['Recency', 'Frequency', 'MonetaryValue per 100 pounds']].mean()
fig, ax1 = plt.subplots(figsize=(15, 8))
sns.barplot(x=cluster_counts.index, y=cluster_counts.values, ax=ax1, palette='viridis', hue=cluster_counts.index)
ax1.set_ylabel('Number of Customers', color='b')
ax1.set_title('Cluster Distribution with Average Feature Values')
ax2 = ax1.twinx()
sns.lineplot(data=feature_means, ax=ax2, palette='Set2', marker='o')
ax2.set_ylabel('Average Value', color='g')
plt.show()
To join `full_clustering_df` with `maven_cafe`, use a left merge on `customer_id`. This adds the cluster labels to every transaction/event in `maven_cafe`:
full_export_df = maven_cafe.merge(
full_clustering_df[['customer_id', 'Cluster', 'ClusterLabel']],
on='customer_id',
how='left'
)
full_export_df.to_csv('maven_cafe_with_clusters.csv', index=False)
full_export_df
customer_id | event | value | Amount | became_member_on | gender | age | income | offer_id | offer_type | difficulty | reward | duration | channels | time_in_days | web | mobile | social | age_group | income_group | Cluster | ClusterLabel | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0009655768c64bdeb2e877511632db8f | offer viewed | fafdcd668e3743c1bb461111dcafc2a4 | NaN | 2017-04-21 | M | 33.00 | 72000.00 | fafdcd668e3743c1bb461111dcafc2a4 | discount | 10.00 | 2.00 | 10.00 | ['web', 'email', 'mobile', 'social'] | 22.50 | 1 | 1 | 1 | 1 | Early Career Professionals | Middle Income | 3.00 | Frequent Shoppers |
1 | 0009655768c64bdeb2e877511632db8f | offer received | 5a8bc65990b245e5a138643cd4eb9837 | NaN | 2017-04-21 | M | 33.00 | 72000.00 | 5a8bc65990b245e5a138643cd4eb9837 | informational | 0.00 | 0.00 | 3.00 | ['email', 'mobile', 'social'] | 7.00 | 0 | 1 | 1 | 1 | Early Career Professionals | Middle Income | 3.00 | Frequent Shoppers |
2 | 0009655768c64bdeb2e877511632db8f | offer received | f19421c1d4aa40978ebb69ca19b0e20d | NaN | 2017-04-21 | M | 33.00 | 72000.00 | f19421c1d4aa40978ebb69ca19b0e20d | bogo | 5.00 | 5.00 | 5.00 | ['web', 'email', 'mobile', 'social'] | 17.00 | 1 | 1 | 1 | 1 | Early Career Professionals | Middle Income | 3.00 | Frequent Shoppers |
3 | 0009655768c64bdeb2e877511632db8f | transaction | NaN | 8.57 | 2017-04-21 | M | 33.00 | 72000.00 | NaN | NaN | NaN | NaN | NaN | NaN | 17.25 | 0 | 0 | 0 | 0 | Early Career Professionals | Middle Income | 3.00 | Frequent Shoppers |
4 | 0009655768c64bdeb2e877511632db8f | offer viewed | f19421c1d4aa40978ebb69ca19b0e20d | NaN | 2017-04-21 | M | 33.00 | 72000.00 | f19421c1d4aa40978ebb69ca19b0e20d | bogo | 5.00 | 5.00 | 5.00 | ['web', 'email', 'mobile', 'social'] | 19.00 | 1 | 1 | 1 | 1 | Early Career Professionals | Middle Income | 3.00 | Frequent Shoppers |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
272383 | ffff82501cea40309d5fdd7edcca4a07 | offer completed | fafdcd668e3743c1bb461111dcafc2a4 | NaN | 2016-11-25 | F | 45.00 | 62000.00 | fafdcd668e3743c1bb461111dcafc2a4 | discount | 10.00 | 2.00 | 10.00 | ['web', 'email', 'mobile', 'social'] | 2.50 | 1 | 1 | 1 | 1 | Mature Professionals | Middle Income | 2.00 | First-Time Buyers |
272384 | ffff82501cea40309d5fdd7edcca4a07 | offer completed | 9b98b8c7a33c4b65b9aebfe6a799e6d9 | NaN | 2016-11-25 | F | 45.00 | 62000.00 | 9b98b8c7a33c4b65b9aebfe6a799e6d9 | bogo | 5.00 | 5.00 | 7.00 | ['web', 'email', 'mobile'] | 21.00 | 1 | 1 | 1 | 0 | Mature Professionals | Middle Income | 2.00 | First-Time Buyers |
272385 | ffff82501cea40309d5fdd7edcca4a07 | offer viewed | fafdcd668e3743c1bb461111dcafc2a4 | NaN | 2016-11-25 | F | 45.00 | 62000.00 | fafdcd668e3743c1bb461111dcafc2a4 | discount | 10.00 | 2.00 | 10.00 | ['web', 'email', 'mobile', 'social'] | 0.25 | 1 | 1 | 1 | 1 | Mature Professionals | Middle Income | 2.00 | First-Time Buyers |
272386 | ffff82501cea40309d5fdd7edcca4a07 | offer completed | 0b1e1539f2cc45b7b9fa7c272da2e1d7 | NaN | 2016-11-25 | F | 45.00 | 62000.00 | 0b1e1539f2cc45b7b9fa7c272da2e1d7 | discount | 20.00 | 5.00 | 10.00 | ['web', 'email'] | 8.25 | 1 | 1 | 0 | 0 | Mature Professionals | Middle Income | 2.00 | First-Time Buyers |
272387 | ffff82501cea40309d5fdd7edcca4a07 | transaction | NaN | 15.57 | 2016-11-25 | F | 45.00 | 62000.00 | NaN | NaN | NaN | NaN | NaN | NaN | 16.00 | 0 | 0 | 0 | 0 | Mature Professionals | Middle Income | 2.00 | First-Time Buyers |
272388 rows × 23 columns
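A quick hedged check that the left merge preserved the event count, plus a look at how many events belong to customers without an RFM cluster (e.g., customers with no transactions):
assert len(full_export_df) == len(maven_cafe)  # a left merge must not drop rows
print("Events without a cluster label:", full_export_df['Cluster'].isna().sum())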