#https://www.kaggle.com/datasets/faviovaz/marketing-ab-testing


# Data inspection: summarise the raw dataset before running any test, so that
# problems in the data (if any) can be fixed first.

# data_manipulation is provided by the project-local AB_experiment module.
from AB_experiment import data_manipulation

# Call data_manipulation() once and keep the result as dm for the call below.
dm = data_manipulation()

data = "marketing_AB.csv"
column1 = "test group"
column2 = ["converted"]
# NOTE(review): presumably the lower/upper quantiles used for outlier
# detection -- confirm against data_manipulation.data_info.
quartile1 = 0.25
quartile3 = 0.75
# NOTE(review): info / download_df / filename look like reporting and export
# switches; their exact meaning is defined in AB_experiment -- confirm there.
info = True
download_df = False
filename = "new"

dm.data_info(
    data,
    column1,
    column2,
    quartile1,
    quartile3,
    info,
    download_df,
    filename,
)

{'1': ['dataframe_shape', {'Observations': 588101, 'Column': 7}],
 '2': ['missing_data_info', {'No missing values'}],
 '3': ['outliers_info', []],
 '4': ['data_types',
  [{'object_values': "['test group', 'most ads day']"},
   {'float_values': '[]'},
   {'int_values': ['Unnamed: 0', 'user id', 'total ads', 'most ads hour']},
   {'bool_val': ['converted']}]],
 '5': ['numerical_Variables',
  ['Unnamed: 0', 'user id', 'total ads', 'most ads hour']],
 '6': ['Categorical_variables', ['test group', 'converted', 'most ads day']],
 '7': [{'Unique values count for variable':      test group
   ad       564577
   psa       23524},
  {'Unique values count for variable':        converted
   False     573258
   True       14843},
  {'Unique values count for variable':            most ads day
   Friday            92608
   Monday            87073
   Sunday            85391
   Thursday          82982
   Saturday          81660
   Wednesday         80908
   Tuesday           77479}],
 '8': [['Descriptive statistics-numerical_Variables',
             Unnamed: 0       user id      total ads  most ads hour
   count  588101.000000  5.881010e+05  588101.000000  588101.000000
   mean   294050.000000  1.310692e+06      24.820876      14.469061
   std    169770.279667  2.022260e+05      43.715181       4.834634
   min         0.000000  9.000000e+05       1.000000       0.000000
   25%    147025.000000  1.143190e+06       4.000000      11.000000
   50%    294050.000000  1.313725e+06      13.000000      14.000000
   75%    441075.000000  1.484088e+06      27.000000      18.000000
   max    588100.000000  1.654483e+06    2065.000000      23.000000,
   '********************'],
  ['Descriptive statistics-Categorical_variables',
          test group converted most ads day
   count      588101    588101       588101
   unique          2         2            7
   top            ad     False       Friday
   freq       564577    573258        92608,
   '********************']],
 '9': ['category_stats', []],
 '10': ['Dataframe',
     Unnamed: 0  user id test group  converted  total ads most ads day  /
  0           0  1069124         ad      False        130       Monday   
  1           1  1119715         ad      False         93      Tuesday   
  2           2  1144181         ad      False         21      Tuesday   
  3           3  1435133         ad      False        355      Tuesday   
  4           4  1015700         ad      False        276       Friday   
  
     most ads hour  
  0             20  
  1             22  
  2             18  
  3             10  
  4             14  ]}


# From the output above we can see that the data contains no outliers and no missing values,
# and that the data types of all variables are correct.
# Next, we determine the required sample size.


# First, find the baseline conversion rate.
# stats_test is provided by the project-local AB_experiment module.
from AB_experiment import stats_test

# Call stats_test() once and keep the result as st for the calls that follow.
st = stats_test()

data = "marketing_AB.csv"
column1 = "test group"
column1_value = "psa"  # the group whose conversion rate is reported as the baseline
column2 = "converted"

st.baseline_conversion_rate(data, column1, column1_value, column2)

{'Baseline conversion rate(p1) of test group psa': 0.0179}


# Inputs for the sample-size calculation.
p1 = 0.0179  # baseline conversion rate of the psa group, copied from the previous result
mde = 0.004  # NOTE(review): presumably the minimum detectable effect -- confirm
alpha = 0.05
power = 0.8
n_side = 2  # NOTE(review): presumably selects a two-sided test -- confirm

st.sample_size(p1, mde, alpha, power, n_side)

{'Sample size': 17804}


# Assumption check: ask stats_test which statistical test to use for this data.
# stats_test is provided by the project-local AB_experiment module.
from AB_experiment import stats_test

# Call stats_test() and keep the result as st.
st = stats_test()

data = "marketing_AB.csv"
sample_size = 17804  # value reported by the sample-size step
group = "test group"
group1_val = "psa"
group2_val = "ad"
target = "converted"
alpha = 0.05
paired_data = False

st.AB_Test_assumption(
    data,
    sample_size,
    group,
    group1_val,
    group2_val,
    target,
    alpha,
    paired_data,
)

({'Target variable is boolean data type': 'Use Chi-Squared Test'},
 {'Note': 'If our data involve time-to-event or survival analysis (e.g., time until a user completes a task), we can use methods such as the log-rank test'})


# Run the chi-squared test suggested by the assumption check.
data = "marketing_AB.csv"
sample_size = 17804
column1 = "test group"
column1_value1 = "psa"  # reported as the control group in the result
column1_value2 = "ad"  # reported as the treatment group in the result
column2 = "converted"
alpha = 0.05
reverse_experiment = False

st.chi_squared_test(
    data,
    sample_size,
    column1,
    column1_value1,
    column1_value2,
    column2,
    alpha,
    reverse_experiment,
)

{'Test name': 'Chi-square test',
 'Control group': 'psa',
 'Treatment group': 'ad',
 'Timestamp': '2023-08-29 19:38:05',
 'Sample size': 17804,
 'Status': 'We can reject H0 => test group ad is more successful',
 'P-value': 9e-05,
 'alpha': 0.05,
 'Test Statistic': 15.30793569801566,
 'Proportion of group psa': 0.0188,
 'Proportion of group ad': 0.0249,
 'Confidence interval of group psa': (0.01682, 0.02081),
 'Confidence interval of group ad': (-0.00916, -0.00308),
 'Confidence interval of difference in groups': (0.02265, 0.02723)}

marketing-data-ab-testing

The companies are interested in answering two questions:

Data dictionary:

After checking the assumptions, we perform the non-parametric Mann-Whitney U test for A/B testing

Conclusion

Ads

Related Articles