A/B Testing for a Conservation Campaign¶

Differential messaging variants were run, and the engagement rates of subscribed users (the target population) were analyzed to statistically evaluate the association between the type of messaging (the variants) and user engagement.

There are two test groups for the A/B test:

  1. Personalized Group: Subscribed users who were messaged personally via email, in the form of a newsletter customized to their interests.
  2. Generic Group: Users who were exposed to online advertisements posted on various social channels. These messages are generalized and sent at the same time and day to all subscribed users.

Objective¶

The objective of the analysis is to determine whether personalized messaging was a successful strategy, to assess whether the difference between the test groups is statistically significant, and, if so, to identify which factors boost engagement.

Imports¶

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import scipy.stats as stats
from statsmodels.stats.power import NormalIndPower

%matplotlib inline 
In [3]:
df = pd.read_csv('conservation_dataset.csv')
df.head()
Out[3]:
Unnamed: 0 user id message_type engaged total_messages_seen most engagement day most engagement hour
0 0 1069124 Personalized False 130 Monday 20
1 1 1119715 Personalized False 93 Tuesday 22
2 2 1144181 Personalized False 21 Tuesday 18
3 3 1435133 Personalized False 355 Tuesday 10
4 4 1015700 Personalized False 276 Friday 14

EDA¶

In [6]:
df.columns
Out[6]:
Index(['Unnamed: 0', 'user id', 'message_type', 'engaged',
       'total_messages_seen', 'most engagement day', 'most engagement hour'],
      dtype='object')
In [7]:
df.shape
Out[7]:
(588101, 7)
In [8]:
# Check for missing values and data types
print(df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 588101 entries, 0 to 588100
Data columns (total 7 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   Unnamed: 0            588101 non-null  int64 
 1   user id               588101 non-null  int64 
 2   message_type          588101 non-null  object
 3   engaged               588101 non-null  bool  
 4   total_messages_seen   588101 non-null  int64 
 5   most engagement day   588101 non-null  object
 6   most engagement hour  588101 non-null  int64 
dtypes: bool(1), int64(4), object(2)
memory usage: 27.5+ MB
None
In [9]:
df['user id'].nunique() ## Check whether there are any duplicate user IDs
Out[9]:
588101
In [10]:
df.drop(['Unnamed: 0', 'user id'],axis=1,inplace=True)
df.head()
Out[10]:
message_type engaged total_messages_seen most engagement day most engagement hour
0 Personalized False 130 Monday 20
1 Personalized False 93 Tuesday 22
2 Personalized False 21 Tuesday 18
3 Personalized False 355 Tuesday 10
4 Personalized False 276 Friday 14
In [11]:
df['message_type'].value_counts(normalize=True) ## To check for imbalance in our data
Out[11]:
message_type
Personalized    0.96
Generic         0.04
Name: proportion, dtype: float64

Since our data is highly imbalanced, we first perform a Chi-Square test rather than a z-test or t-test, whose results can be misleading given how heavily the data skews toward the Personalized group (giving a false impression and exaggerating its success). Unlike parametric tests (like the T-Test), the Chi-Square test does not assume normality, which makes it useful for categorical data comparisons even when groups are imbalanced.

First, we check whether a Chi-Square test can be used on our data. For this, we need to calculate the expected frequency for each category (each cell in our contingency table) and make sure these numbers are at least 5. If even one expected frequency is less than 5, the Chi-Square test won't work reliably.¶
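For reference, the expected count for each cell under the null hypothesis of independence is

expected(i, j) = (row i total × column j total) / grand total

which is exactly what the np.outer broadcasting step in the next cell computes.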

In [14]:
# Step 1: Create a Contingency Table
observed_frequencies = pd.crosstab(df['message_type'], df['engaged'])

# Step 2: Calculate Expected Frequencies
row_totals = observed_frequencies.sum(axis=1)
col_totals = observed_frequencies.sum(axis=0)
grand_total = observed_frequencies.values.sum()

# Calculate expected frequencies using broadcasting
expected = np.outer(row_totals, col_totals) / grand_total
expected_frequencies = pd.DataFrame(expected, index=observed_frequencies.index, columns=observed_frequencies.columns)

# Step 3: Check if all expected frequencies are ≥ 5
min_expected_frequency = expected_frequencies.min().min()

print("Observed Frequencies:")
print(observed_frequencies)
print("\nExpected Frequencies:")
print(expected_frequencies)
print(f"\nMinimum Expected Frequency: {min_expected_frequency}")

# Decision: Can Chi-Square Test be performed?
if min_expected_frequency >= 5:
    print("Chi-Square Test can be performed.")
else:
    print("Chi-Square Test cannot be performed. Consider Fisher's Exact Test.")
Observed Frequencies:
engaged        False  True 
message_type               
Generic        23104    420
Personalized  550154  14423

Expected Frequencies:
engaged              False        True 
message_type                           
Generic        22930.28101    593.71899
Personalized  550327.71899  14249.28101

Minimum Expected Frequency: 593.7189904455187
Chi-Square Test can be performed.
In [15]:
# Make sure to import the library
from scipy.stats import chi2_contingency

observed = [
    [23104, 420],      # Generic group
    [550154, 14423]    # Personalized group
]

# Perform the Chi-Square Test
chi2, p_value, dof, expected = chi2_contingency(observed)

print("Chi-Square Statistic:", chi2)
print("P-Value:", p_value)
print("Degrees of Freedom:", dof)
print("\nExpected Frequencies (calculated by SciPy):")
print(expected)

# Interpretation
if p_value < 0.05:
    print("Result: Significant difference detected (reject null hypothesis).")
else:
    print("Result: No significant difference detected (fail to reject null hypothesis).")
Chi-Square Statistic: 54.00582388368525
P-Value: 1.998962306339e-13
Degrees of Freedom: 1

Expected Frequencies (calculated by SciPy):
[[ 22930.28100955    593.71899045]
 [550327.71899045  14249.28100955]]
Result: Significant difference detected (reject null hypothesis).
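With nearly 600,000 observations, even a tiny difference yields a vanishingly small p-value, so it is worth pairing the test with an effect-size measure. Below is a minimal sketch (an addition, not part of the original test output) computing Cramér's V from the chi2 statistic and table already in scope:

# Effect size companion to the chi-square test: Cramér's V
# V = sqrt(chi2 / (n * (min(rows, cols) - 1))); for a 2x2 table this is the phi coefficient
n = sum(sum(row) for row in observed)      # total observations (588,101)
k = min(len(observed), len(observed[0]))   # smaller table dimension (2)
cramers_v = (chi2 / (n * (k - 1))) ** 0.5
print(f"Cramér's V: {cramers_v:.4f}")

Here V comes out to roughly 0.01: the association is statistically reliable but small per user, which separates statistical significance from practical magnitude when samples are this large.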

Bootstrap sampling¶

Bootstrap sampling makes the T-Test more reliable when dealing with small samples, imbalance, or unknown distributions, and it ensures the results better reflect the real variability of engagement across Generic vs. Personalized messages. Since the dataset is highly imbalanced between the two groups, we'll use the bootstrap method, which repeatedly resamples the data with replacement to create new samples. By applying this approach, we can approximate the sampling distribution of a statistic and derive insights about the overall population.

In [17]:
# Subset of test groups
generic_group = df[df['message_type']=='Generic']['engaged']
personalized_group = df[df['message_type']=='Personalized']['engaged']

boot_personalized = []
boot_generic = []

# Draw 1,000 bootstrap resamples of each group (same size as the group,
# sampled with replacement) and record each resample's mean engagement rate
for i in range(1000):
    boot_personalized.append(personalized_group.sample(frac=1, replace=True).mean())
    boot_generic.append(generic_group.sample(frac=1, replace=True).mean())

boot_personalized = pd.DataFrame(boot_personalized)
boot_generic = pd.DataFrame(boot_generic)

boot_personalized.plot(kind='density')
boot_generic.plot(kind='density')
Out[17]:
<Axes: ylabel='Density'>

Interpretation of the First Image (Density Plots)¶

  1. Higher Peak for Personalized Messages

    • The top density plot (Personalized messages) has a higher peak, suggesting less variability in engagement rates.
    • Engagement with Personalized messaging tends to be consistent and predictable.
  2. Lower Peak & Wider Spread for Generic Messages

    • The bottom density plot (Generic messages) has a flatter shape, meaning engagement rates fluctuate more.
    • Some Generic messages perform well, but engagement is less stable across bootstrapped samples.
  3. Overall Comparison

    • Personalized messaging produces a more reliable engagement rate, with fewer extreme variations.
    • Generic messaging has unpredictable fluctuations, making it harder to optimize based on expected engagement.
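A common companion to these density plots is a percentile confidence interval computed from the same bootstrap draws. A minimal sketch (an addition), reusing boot_personalized and boot_generic from the cell above:

# 95% percentile confidence interval for the difference in engagement rates
diff = boot_personalized[0] - boot_generic[0]
ci_low, ci_high = np.percentile(diff, [2.5, 97.5])
print(f"Mean difference (Personalized - Generic): {diff.mean():.4f}")
print(f"95% bootstrap CI: [{ci_low:.4f}, {ci_high:.4f}]")

If the interval excludes zero, it supports the same conclusion as the formal test below.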

T-Test¶

In [20]:
from scipy.stats import ttest_ind

# Perform Welch's T-Test (assuming unequal variance)
t_stat, p_value = ttest_ind(boot_personalized[0], boot_generic[0], equal_var=False)

# Display results
print("Welch's T-Test on Bootstrap Samples:")
print("T-Statistic:", t_stat)
print("P-Value:", p_value)

# Interpretation
if p_value < 0.05:
    print("Result: Significant difference detected (reject null hypothesis).")
else:
    print("Result: No significant difference detected (fail to reject null hypothesis).")
Welch's T-Test on Bootstrap Samples:
T-Statistic: 280.33372908894046
P-Value: 0.0
Result: Significant difference detected (reject null hypothesis).

What This Means¶

  • Personalized messaging is significantly more effective → Users engage at a much higher rate compared to generic messaging.
  • Generic messaging underperforms drastically → Engagement is statistically much lower, suggesting it may not be an effective strategy.
  • The difference is highly reliable → The large T-Statistic (280.33) confirms that the gap is not just due to randomness; Personalized messages truly outperform Generic ones.

Campaign Implications¶

  • Prioritize Personalized Messaging → Since engagement is statistically higher, shift more resources toward targeted, customized outreach.
  • Optimize Message Personalization Further → Test variations in personalized content to maximize impact.
  • Reconsider Generic Messaging Strategy → Since it underperforms, either phase it out or refine its structure to make it more engaging.
  • Experiment with Timing & Frequency → Now that we know personalization works, optimize when and how often messages are sent (see the power-analysis sketch below for sizing such follow-up tests).
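As an aside, the NormalIndPower import from the setup cell is never exercised in this notebook. Below is a hedged sketch of how it could size the follow-up experiments suggested above; the 5% significance level and 80% power are conventional assumptions, and the input rates approximate those implied by the observed contingency table (Generic ≈ 1.8% engaged, Personalized ≈ 2.6%).

from statsmodels.stats.power import NormalIndPower
from statsmodels.stats.proportion import proportion_effectsize

# Assumed rates: baseline ~1.8% (Generic) vs. target ~2.6% (Personalized)
effect_size = proportion_effectsize(0.026, 0.018)
n_per_group = NormalIndPower().solve_power(effect_size=effect_size,
                                           alpha=0.05, power=0.8, ratio=1.0)
print(f"Required sample size per group: {n_per_group:.0f}")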

Analyzing the data further to see which specific factors drive engagement with personalized messages.¶

How does the number of messages seen drive engagement among our target population?¶

In [24]:
# Create figure and axes
fig, ax = plt.subplots(1, 2, figsize=(12, 6))

# Observed Frequencies Plot
sns.barplot(x=observed_frequencies.index, y=observed_frequencies[False], ax=ax[0], color="red", label="Not Engaged")
sns.barplot(x=observed_frequencies.index, y=observed_frequencies[True], ax=ax[0], color="blue", label="Engaged")
ax[0].set_title("Observed Engagement Frequencies")
ax[0].set_xlabel("Message Type")
ax[0].set_ylabel("Count")
ax[0].legend()

# Expected Frequencies Plot
sns.barplot(x=expected_frequencies.index, y=expected_frequencies[False], ax=ax[1], color="red", label="Not Engaged")
sns.barplot(x=expected_frequencies.index, y=expected_frequencies[True], ax=ax[1], color="blue", label="Engaged")
ax[1].set_title("Expected Engagement Frequencies")
ax[1].set_xlabel("Message Type")
ax[1].set_ylabel("Count")
ax[1].legend()

# Adjust layout and display
plt.tight_layout()
plt.show()
[Figure: side-by-side bar charts of observed vs. expected engagement frequencies by message type]
In [25]:
# Box plot to compare total_messages_seen for engaged vs. not engaged
plt.figure(figsize=(16, 12))  # Increase figure size for better clarity

# Create the box plot with mean markers
sns.boxplot(x='engaged', y='total_messages_seen', data=df, showmeans=True, meanprops={"marker":"o", "markerfacecolor":"red", "markeredgecolor":"black"})

# Adjust y-axis limits if needed
plt.ylim(0, 2000)  # Modify this based on the data range

plt.title("Comparison of Total Messages Seen by Engagement Status", fontsize=14)
plt.xlabel("Engaged", fontsize=12)
plt.ylabel("Total Messages Seen", fontsize=12)

plt.show()
[Figure: box plot comparing total messages seen by engagement status]

Key Observations:¶

  1. Median (the line inside the box):

    • Both groups, engaged and not engaged, have similar medians, and both are very low (close to zero).
    • This indicates that for most users, the number of messages they see is quite limited.
  2. Boxes (middle range of data):

    • The height of each box shows the IQR, i.e., where the middle 50% of the group's data falls.
    • Both boxes have roughly the same height, meaning the spread of messages seen is similar for engaged and not engaged users.
    • Both IQRs are also small, meaning that most users have limited exposure to messages.
  3. Outliers (dots outside the box):

    • There are many outliers—these are people who saw way more messages than most others.
    • For example, some people saw hundreds or even thousands of messages! These could be super rare cases or situations where someone was bombarded by campaign messages.
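To back the visual reading with numbers, a one-line summary (a small addition to the original notebook) prints the quartiles the box plot is drawn from:

# Numeric summary behind the box plot: quartiles per engagement status
print(df.groupby('engaged')['total_messages_seen'].describe())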

What Does This Mean?¶

  • The box plot suggests that the number of messages seen doesn’t have a big impact on whether someone engages or not. Both engaged and not engaged groups look pretty similar in terms of how many messages they’ve seen.

What Can We Do Next?¶

  • This tells us that spamming people with more messages might not be enough to get them to engage. Instead, focus on how those messages are crafted (e.g., personalized vs. generic), or look at other factors like the time of day they saw the messages.

Analyzing Engagement trends across different days of the week for our conservation campaign.¶

In [28]:
# Create a contingency table for 'engaged' and 'most engagement day'
contingency_table = pd.crosstab(df['engaged'], df['most engagement day'])

# Perform Chi-Square Test
chi2, p, dof, expected = stats.chi2_contingency(contingency_table)

# Display results
print(f"Chi-Square Test Statistic: {chi2}")
print(f"P-value: {p}")
print(f"Degrees of Freedom: {dof}")

# Check if we can reject the null hypothesis
if p < 0.05:
    print("Conclusion: Reject the null hypothesis. There is an association between 'engaged' and 'most engagement day'.")
else:
    print("Conclusion: Fail to reject the null hypothesis. No significant association between 'engaged' and 'most engagement day'.")
Chi-Square Test Statistic: 410.0478857936585
P-value: 1.932184379244731e-85
Degrees of Freedom: 6
Conclusion: Reject the null hypothesis. There is an association between 'engaged' and 'most engagement day'.
In [29]:
# Plotting the contingency table with annotations
plt.figure(figsize=(10, 6))
sns.heatmap(contingency_table, annot=True, fmt="d", cmap="coolwarm", cbar=True)
plt.title("Contingency Table of 'engaged' and 'most engagement day'")
plt.xlabel("Most Engagement Day")
plt.ylabel("Engagement")
plt.savefig('heatmap_most_engagement_day.png',dpi=300)
plt.show()
[Figure: heatmap of the contingency table of 'engaged' vs. 'most engagement day']

The heatmap provides a clear view of engagement trends across the days of the week for our conservation campaign.


Key Takeaways from the Data:¶

Highest Engagement (Monday & Tuesday)

  • Monday has the highest True engagements (2,857), followed by Tuesday (2,312).
  • If we are running a conservation campaign, Monday and Tuesday seem to be the best days to send messages because engagement is at its peak.
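One caveat: raw counts favor days that simply have more users assigned to them. As a quick rate-based cross-check (a sketch, not part of the original cells), a row-normalized crosstab gives the per-day engagement rate:

# Engagement rate per day (row-normalized), complementing the raw counts
day_rates = pd.crosstab(df['most engagement day'], df['engaged'], normalize='index')
print(day_rates[True].sort_values(ascending=False))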

How This Affects Your A/B Test (Personalized vs. Generic Messages)¶

  1. Personalized Messaging Advantage

    • If Personalized messages dominate engagement on peak days (Monday, Tuesday), it reinforces the idea that tailoring messages improves response.
    • You should focus on refining message personalization strategies on these days.
  2. Generic Messaging Struggles

    • If Generic messaging engagement remains low across all days, it may indicate that generic conservation appeals aren’t resonating with users.
    • Consider tweaking the messaging format or testing different content strategies.
  3. Strategic Adjustments

    • Prioritize high-engagement days (Monday, Tuesday) for personalized outreach.
    • Reduce focus on Saturday, or experiment with a different approach (maybe urgency-driven messaging).
    • If Generic messaging shows spikes on specific days, analyze those cases to determine what worked.

Since 'most engagement day' affects the engagement of our target population, it makes sense to analyze 'most engagement hour' next. If the time of day also plays a role, we can target users precisely when they're most receptive.

Best time in a day for messaging our target population for boosting engagement¶

In [33]:
# Create a contingency table
contingency_table = pd.crosstab([df['most engagement hour'], df['message_type']], df['engaged'])

# Perform Chi-Square Test
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

# Display results
print("Chi-Square Statistic:", chi2)
print("P-Value:", p_value)
print("Degrees of Freedom:", dof)
print("\nExpected Frequencies:")
print(expected)

# Interpretation
if p_value < 0.05:
    print("Result: Significant difference detected (reject null hypothesis).")
else:
    print("Result: No significant difference detected (fail to reject null hypothesis).")
Chi-Square Statistic: 496.7428872096697
P-Value: 2.163815581745976e-76
Degrees of Freedom: 47

Expected Frequencies:
[[2.21270778e+02 5.72922168e+00]
 [5.17500688e+03 1.33993119e+02]
 [1.82280333e+02 4.71966720e+00]
 [4.49852265e+03 1.16477348e+02]
 [1.76431766e+02 4.56823403e+00]
 [5.02196938e+03 1.30030617e+02]
 [8.67537413e+01 2.24625872e+00]
 [2.52463135e+03 6.53686527e+01]
 [2.72933119e+01 7.06688137e-01]
 [6.76484230e+02 1.75157703e+01]
 [2.24195062e+01 5.80493827e-01]
 [7.23272764e+02 1.87272356e+01]
 [8.09051745e+01 2.09482555e+00]
 [1.93490086e+03 5.00991411e+01]
 [2.31018390e+02 5.98161030e+00]
 [6.01232670e+03 1.55673301e+02]
 [6.42367590e+02 1.66324101e+01]
 [1.65397470e+04 4.28253011e+02]
 [1.17166289e+03 3.03371122e+01]
 [2.90498314e+04 7.52168566e+02]
 [1.44752029e+03 3.74797101e+01]
 [3.65087037e+04 9.45296339e+02]
 [2.00898271e+03 5.20172946e+01]
 [4.30347295e+04 1.11427052e+03]
 [2.00800794e+03 5.19920558e+01]
 [4.40962444e+04 1.14175564e+03]
 [2.11523167e+03 5.47683306e+01]
 [4.43370104e+04 1.14798964e+03]
 [1.82182857e+03 4.71714331e+01]
 [4.26740679e+04 1.10493214e+03]
 [1.78186336e+03 4.61366398e+01]
 [4.17733886e+04 1.08161143e+03]
 [1.56351687e+03 4.04831347e+01]
 [3.50553348e+04 9.07665195e+02]
 [1.34809465e+03 3.49053462e+01]
 [3.27568480e+04 8.48151959e+02]
 [1.23892141e+03 3.20785936e+01]
 [3.02682829e+04 7.83717144e+02]
 [1.15314243e+03 2.98575738e+01]
 [2.84328076e+04 7.36192367e+02]
 [1.04981775e+03 2.71822544e+01]
 [2.71431986e+04 7.02801352e+02]
 [1.05371679e+03 2.72832099e+01]
 [2.81657231e+04 7.29276918e+02]
 [8.93855964e+02 2.31440365e+01]
 [2.48710304e+04 6.43969565e+02]
 [6.03377144e+02 1.56228556e+01]
 [1.90536560e+04 4.93344036e+02]]
Result: Significant difference detected (reject null hypothesis).
In [34]:
# Pivot table for heatmap
heatmap_data = df.pivot_table(values='engaged', index='message_type', columns='most engagement hour', aggfunc='sum')

# Create heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(heatmap_data, annot=True, fmt="d", cmap="coolwarm", cbar=True)
plt.title("Peak Engagement Hours by Message Type")
plt.xlabel("Hour of the Day")
plt.ylabel("Message Type")
plt.savefig('heatmap_most_engagement_hours.png')
plt.show()
[Figure: heatmap of peak engagement hours by message type]

The heatmap visually confirms that engagement varies significantly by hour, and that Personalized messages consistently outperform Generic messages throughout the day.

Key Observations:¶

  1. Peak Engagement Hours (9 AM – 6 PM)

    • Personalized messages drive the highest engagement during mid-morning to evening hours.
    • Engagement peaks around 2 PM – 4 PM, making it the most effective time to send messages.
    • Generic messages have minimal engagement even during peak hours.
  2. Low Engagement Hours (Early Morning & Late Night)

    • Engagement is lowest between 12 AM – 8 AM, meaning sending conservation messages overnight is not effective.
    • Both Personalized and Generic messages show weak engagement in very late-night hours (10 PM – 12 AM).
  3. Personalized vs. Generic Performance

    • Personalized messaging consistently outperforms Generic messaging across all hours.
    • At peak engagement times (2 PM – 4 PM), Personalized messages have over 1,000 engagements, while Generic messages have fewer than 50.
    • Generic messages struggle to gain engagement, no matter the hour.

For the Conservation Campaign¶

  • Prioritize sending messages between 9 AM – 6 PM, with a focus on peak hours (2–4 PM).
  • Avoid sending messages overnight (12 AM – 8 AM), as engagement is minimal.
  • Invest more in Personalized messaging, since Generic messages receive very little engagement at any hour.
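The heatmap and list above are based on raw engagement counts, which favor hours with more traffic. A rate-based cross-check (a sketch, not part of the original cells) takes the mean of the boolean engaged column per hour instead:

# Mean engagement rate per hour and message type (a rate, not a raw count)
hourly_rates = df.pivot_table(values='engaged', index='message_type',
                              columns='most engagement hour', aggfunc='mean')
print(hourly_rates.round(3))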

Logistic Regression for Engagement Analysis¶

What is Logistic Regression?¶

Logistic regression is a statistical model used to predict the probability of a binary outcome, such as whether a user engages (True) or does not engage (False). Unlike linear regression, which predicts continuous values, logistic regression estimates the likelihood of an event occurring.

What We Are Testing¶

In this model, we aim to predict whether a user engages with conservation messaging, based on:

  • Total Messages Seen (total_messages_seen) → The number of messages a user has received.
  • Most Engagement Hour (most engagement hour) → The time of day when engagement is recorded.

The dependent variable (Y) is:

  • engaged → The target outcome (whether a user engaged or not).

The independent variables (X) are:

  • total_messages_seen → The exposure level to messages.
  • most engagement hour → The time-based influence on engagement.

Why We Add a Constant (sm.add_constant(X))¶

Adding a constant ensures the model includes an intercept term, which allows it to account for baseline probabilities when no other factors are influencing engagement.
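Concretely, the model being fit is

log(p / (1 - p)) = β₀ + β₁ · total_messages_seen + β₂ · (most engagement hour)

where p is the probability that a user engages and β₀ is the intercept term supplied by sm.add_constant.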

Interpreting the Logistic Regression Output¶

  • Coefficients (coef) → Show how much each factor influences engagement.
    • Positive values suggest an increase in engagement probability.
    • Negative values suggest a decrease in engagement probability.
  • P-Value (P>|z|) → Determines statistical significance.
    • If P < 0.05, the factor has a significant effect on engagement.
  • Pseudo R-Squared (Pseudo R-squ.) → Measures how well the model explains variations in engagement.
  • Log-Likelihood (LL-Null vs. LL-Model) → Compares the model’s performance against a baseline with no predictors.

How This Helps Your Conservation Campaign¶

  • Optimize Messaging Frequency → Find the optimal number of messages before engagement plateaus.
  • Time-Based Strategy → Identify the best hours to send messages for maximum engagement.
  • Improve Personalization → Understand how different exposure levels influence engagement probability.
  • Make Data-Driven Decisions → Use statistical evidence to refine your outreach strategy.

By leveraging logistic regression, we gain a data-driven approach to optimizing conservation messaging for maximum engagement.

Regression Analysis¶

Hypotheses for Logistic Regression¶

Null Hypothesis (H₀)¶

β₁ = 0 → There is no relationship between total messages and engagement rate.

Alternative Hypothesis (H₁)¶

β₁ ≠ 0 → There is a significant relationship between total messages seen and engagement rate.

In [38]:
# Selecting the independent (X) and dependent (Y) variables
X = df[['total_messages_seen', 'most engagement hour']]  # Independent variables
Y = df['engaged']  # Dependent variable (engagement outcome)

# Adding a constant for the intercept term
X = sm.add_constant(X)

# Fit the logistic regression model
logit_model = sm.Logit(Y, X)
result = logit_model.fit()

# Print the summary
print(result.summary())
Optimization terminated successfully.
         Current function value: 0.108765
         Iterations 8
                           Logit Regression Results                           
==============================================================================
Dep. Variable:                engaged   No. Observations:               588101
Model:                          Logit   Df Residuals:                   588098
Method:                           MLE   Df Model:                            2
Date:                Tue, 22 Jul 2025   Pseudo R-squ.:                 0.07655
Time:                        15:58:03   Log-Likelihood:                -63965.
converged:                       True   LL-Null:                       -69267.
Covariance Type:            nonrobust   LLR p-value:                     0.000
========================================================================================
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   -4.5402      0.030   -153.098      0.000      -4.598      -4.482
total_messages_seen      0.0102   9.87e-05    103.359      0.000       0.010       0.010
most engagement hour     0.0327      0.002     17.860      0.000       0.029       0.036
========================================================================================

Interpreting the Logistic Regression Results:¶

Key Findings¶

  • Total Messages Seen (coef = 0.0102) → Positive effect. More messages seen means a higher chance of engagement; even though the effect is small, it is statistically significant.
  • Most Engagement Hour (coef = 0.0327) → Positive effect. Some hours are better for engagement than others (as we found out in the heatmap); a later hour (e.g., late afternoon) increases engagement likelihood.
  • Constant (coef = -4.5402) → Baseline probability. Before considering messages seen or engagement hour, engagement is very low.
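Log-odds coefficients are easier to read as odds ratios. A minimal sketch (an addition, reusing the fitted result from the regression cell above):

# Convert log-odds coefficients to odds ratios with 95% confidence intervals
conf = result.conf_int()
odds = pd.DataFrame({'odds_ratio': np.exp(result.params),
                     'ci_lower': np.exp(conf[0]),
                     'ci_upper': np.exp(conf[1])})
print(odds)

For example, exp(0.0102) ≈ 1.010, so each additional message seen multiplies the odds of engagement by about 1%.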

What This Means for Our Conservation Campaign¶

  • Sending more messages increases engagement, but only slightly. We might need more message personalization instead of just increasing message volume.
  • Timing matters → Certain hours increase engagement probability, so focus on high-engagement times (e.g., afternoons/evenings).
  • Engagement is naturally low → Without intervention, users are unlikely to engage (const = -4.54). This means you need strong messaging strategies to improve user interactions.
  • Pseudo R-Squared (0.07655) → The model explains about 7.7% of the variability in engagement. While this is modest, it suggests other factors (like message content or audience targeting) play a role.

In [40]:
# Generate predicted probabilities
predictions = result.predict(X)  
# Sort values for a smooth line
sorted_idx = np.argsort(df['total_messages_seen'])
sorted_messages = df['total_messages_seen'][sorted_idx]
sorted_predictions = predictions[sorted_idx]

# Line plot
plt.figure(figsize=(10, 6))
sns.lineplot(x=sorted_messages, y=sorted_predictions, color='blue')
plt.title("Smoothed Predicted Probability of Engagement vs. Total Messages Seen")
plt.xlabel("Total Messages Seen")
plt.ylabel("Predicted Probability of Engagement")
plt.grid(True)
plt.savefig('predictplot.png',dpi=300)
plt.show()
[Figure: smoothed predicted probability of engagement vs. total messages seen]

Key Interpretations¶

  1. Steep Increase at Low Message Counts (0–500 messages seen)

    • Engagement probability starts near 0 when users have seen very few messages.
    • As users see more messages (up to around 500 messages), their predicted probability of engagement rises sharply.
    • This suggests that early exposure to messages plays a critical role in engagement.
  2. Flattening Beyond ~500 Messages

    • After reaching around 500 messages seen, the probability stabilizes and approaches close to 1.0.
    • This indicates a saturation point where additional message exposure no longer significantly increases engagement.
    • Sending too many messages beyond this point may not improve engagement further—you might hit diminishing returns.
  3. High Engagement Probability at Upper Limits

    • For users who see 1000+ messages, the probability of engagement is already very high (~0.9 to 1.0).
    • This means that highly exposed users are already likely to engage, and extra messaging may not be necessary.
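As a sanity check on the ~500-message saturation point: the logistic curve crosses p = 0.5 where the linear predictor is zero, i.e. where total_messages_seen = (4.5402 - 0.0327 × hour) / 0.0102. For typical engagement hours this works out to roughly 400–445 messages, consistent with the steep rise flattening out before 500.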

Implications for Your Conservation Campaign¶

  • Target users who have seen fewer than 500 messages and focus efforts there, rather than over-saturating high-exposure users.
  • Since engagement probability is low at the start, early personalized outreach may help boost engagement faster.
  • Avoid excessive messaging beyond roughly 500 messages: users past that point already have a high engagement probability, so resources might be better allocated elsewhere.
  • Other factors, such as diminishing returns (i.e., how engagement rates behave with successive personalized messages), should be tested with follow-up A/B tests. We also need to consider ROI and maintenance costs, and whether a novelty effect is at play.