# Data manipulation, step 1: inspect the raw data first so that any problems
# it reveals can be fixed before the analysis.

# Import data_manipulation from the AB_experiment module.
from AB_experiment import data_manipulation

# Instance through which the data-manipulation helpers are called.
dm = data_manipulation()

# Arguments for data_info; they are passed positionally in this order.
data = 'app_data.csv'
column1 = "group"
column2 = ["downloaded_app", "time_spent(min)"]
quartile1, quartile3 = 0.25, 0.75
info = True
download_df = False
filename = 'new'

# Left as a bare expression so the notebook displays the returned report.
dm.data_info(
    data,
    column1,
    column2,
    quartile1,
    quartile3,
    info,
    download_df,
    filename,
)

{'1': ['dataframe_shape', {'Observations': 30000, 'Column': 4}],
 '2': ['missing_data_info', {'No missing values'}],
 '3': ['outliers_info',
  [{'variable_name time_spent(min)': 'No outliers present'}]],
 '4': ['data_types',
  [{'object_values': "['group', 'downloaded_app']"},
   {'float_values': '[]'},
   {'int_values': ['user_id', 'time_spent(min)']},
   {'bool_val': []}]],
 '5': ['numerical_Variables', ['user_id', 'time_spent(min)']],
 '6': ['Categorical_variables', ['group', 'downloaded_app']],
 '7': [{'Unique values count for variable':           group
   ad        17903
   referral  12097},
  {'Unique values count for variable':      downloaded_app
   Yes           18393
   No            11607},
  {'Unique values count for variable':     time_spent(min)
   20             2351
   18             2271
   19             2241
   12             2213
   13             2212
   17             2209
   14             2205
   10             2196
   15             2188
   11             2186
   16             2151
   8              1150
   6              1121
   7              1119
   5              1108
   9              1079}],
 '8': ['Descriptive statistics-numerical_Variables',
               user_id  time_spent(min)
  count   30000.000000     30000.000000
  mean   497244.479467        13.548800
  std    289220.271868         4.290116
  min        41.000000         5.000000
  25%    246691.000000        10.000000
  50%    495162.000000        14.000000
  75%    747418.250000        17.000000
  max    999979.000000        20.000000,
  '********************',
  'Descriptive statistics-Categorical_variables',
          group downloaded_app
  count   30000          30000
  unique      2              2
  top        ad            Yes
  freq    17903          18393,
  '********************'],
 '9': {'category_stats': [         time_spent(min)                                    
                      count median       mean       std min max
   group                                                       
   ad                 17903   13.0  12.533654  4.633040   5  20
   referral           12097   15.0  15.051170  3.177318  10  20]},
 '10': ['Dataframe',
     user_id     group downloaded_app  time_spent(min)
  0   784598        ad            Yes               13
  1   699052  referral            Yes               11
  2   218829        ad             No                7
  3   627414        ad            Yes                7
  4   190259  referral             No               10]}


# The data contains categorical variables, so 'downloaded_app' is converted
# into a numeric column with categorical_encoding.

# Import data_manipulation from the AB_experiment module.
from AB_experiment import data_manipulation

# Instance through which the data-manipulation helpers are called.
dm = data_manipulation()

data, filename = 'app_data.csv', 'new'
variables = ['downloaded_app']
# NOTE(review): with download_df=True the result is presumably written to
# '<filename>.csv' (the next step reads 'new.csv') - confirm in AB_experiment.
download_df = True

# Left as a bare expression so the notebook displays the before/after summary.
dm.categorical_encoding(data, variables, download_df, filename)

[{'Before encoding': {'Variable_name': 'downloaded_app',
   'unique_values': array(['Yes', 'No'], dtype=object)},
  'After encoding': {'Variable_name': 'downloaded_app_coded',
   'unique_values': array([1, 0])}}]


# The previous step produced the numeric column 'downloaded_app_coded'; its
# data type is additionally changed to bool for better analysis.
# NOTE: this cell reuses the `dm` instance created in an earlier cell.

data = 'new.csv'
filename = 'new'
download_df = True
change_variables, dtype = ['downloaded_app_coded'], ['bool']
drop_variables = []  # nothing is dropped in this step

# Left as a bare expression so the notebook displays the resulting dtypes.
dm.change_variables(
    data,
    change_variables,
    dtype,
    drop_variables,
    download_df,
    filename,
)

[{'Variable1': ['downloaded_app_coded', dtype('bool')]}]


# After changing the data types, check data_info again on the updated file.
# NOTE: this cell reuses the `dm` instance created in an earlier cell.

# Arguments for data_info; they are passed positionally in this order.
data = 'new.csv'
column1 = "group"
column2 = ["downloaded_app", "time_spent(min)"]
quartile1, quartile3 = 0.25, 0.75
info = True
download_df = False
filename = 'new'

# Left as a bare expression so the notebook displays the returned report.
dm.data_info(
    data,
    column1,
    column2,
    quartile1,
    quartile3,
    info,
    download_df,
    filename,
)

{'1': ['dataframe_shape', {'Observations': 30000, 'Column': 5}],
 '2': ['missing_data_info', {'No missing values'}],
 '3': ['outliers_info',
  [{'variable_name time_spent(min)': 'No outliers present'}]],
 '4': ['data_types',
  [{'object_values': "['group', 'downloaded_app']"},
   {'float_values': '[]'},
   {'int_values': ['user_id', 'time_spent(min)']},
   {'bool_val': ['downloaded_app_coded']}]],
 '5': ['numerical_Variables', ['user_id', 'time_spent(min)']],
 '6': ['Categorical_variables',
  ['group', 'downloaded_app', 'downloaded_app_coded']],
 '7': [{'Unique values count for variable':           group
   ad        17903
   referral  12097},
  {'Unique values count for variable':      downloaded_app
   Yes           18393
   No            11607},
  {'Unique values count for variable':     time_spent(min)
   20             2351
   18             2271
   19             2241
   12             2213
   13             2212
   17             2209
   14             2205
   10             2196
   15             2188
   11             2186
   16             2151
   8              1150
   6              1121
   7              1119
   5              1108
   9              1079},
  {'Unique values count for variable':        downloaded_app_coded
   True                  18393
   False                 11607}],
 '8': ['Descriptive statistics-numerical_Variables',
               user_id  time_spent(min)
  count   30000.000000     30000.000000
  mean   497244.479467        13.548800
  std    289220.271868         4.290116
  min        41.000000         5.000000
  25%    246691.000000        10.000000
  50%    495162.000000        14.000000
  75%    747418.250000        17.000000
  max    999979.000000        20.000000,
  '********************',
  'Descriptive statistics-Categorical_variables',
          group downloaded_app downloaded_app_coded
  count   30000          30000                30000
  unique      2              2                    2
  top        ad            Yes                 True
  freq    17903          18393                18393,
  '********************'],
 '9': {'category_stats': [         time_spent(min)                                    
                      count median       mean       std min max
   group                                                       
   ad                 17903   13.0  12.533654  4.633040   5  20
   referral           12097   15.0  15.051170  3.177318  10  20]},
 '10': ['Dataframe',
     user_id     group downloaded_app  time_spent(min)  downloaded_app_coded
  0   784598        ad            Yes               13                  True
  1   699052  referral            Yes               11                  True
  2   218829        ad             No                7                 False
  3   627414        ad            Yes                7                  True
  4   190259  referral             No               10                 False]}


# From the output above we can say that the data contains no outliers and no
# missing values, and that the data types of all variables are correct.
# Now we find the required sample size.


# First we find the baseline conversion rate of the 'referral' group.

# Import stats_test from the AB_experiment module.
from AB_experiment import stats_test

# Instance through which the statistical-test helpers are called.
st = stats_test()

data = 'new.csv'
column1 = "group"
column1_value = 'referral'

# Baseline rate for the boolean column downloaded_app_coded.
a = st.baseline_conversion_rate(
    data, column1, column1_value, column2='downloaded_app_coded'
)

# Baseline rate for the non-boolean column time_spent(min), using a
# threshold of 13.5 (bool_var=False).
b = st.baseline_conversion_rate(
    data,
    column1,
    column1_value,
    column2='time_spent(min)',
    bool_var=False,
    threshold=13.5,
)

# Use a real newline escape ('\n'); the previous '/n' printed the two
# characters '/' and 'n' literally instead of breaking the line. The first
# label names the column that was actually analysed.
print('downloaded_app_coded', a, '\ntime_spent(min)', b)

downloaded_app_coded {'Baseline conversion rate(p1) of group referral': 0.4877} 
time_spent(min) {'Baseline conversion rate(p1) of group referral for greater than or equal to threshold value 13.5': 0.6419}


# Sample size computed from the baseline conversion rates found above.
# NOTE: this cell reuses the `st` instance created in an earlier cell.

mde = 0.02
alpha = 0.05
power = 0.8
n_side = 2

# For variable downloaded_app_coded (baseline rate 0.4877).
p1 = 0.4877
a = st.sample_size(p1, mde, alpha, power, n_side)

# For variable time_spent(min) (baseline rate 0.6419); only p1 changes.
p1 = 0.6419
b = st.sample_size(p1, mde, alpha, power, n_side)

# Use a real newline escape ('\n'); the previous '/n' printed the two
# characters '/' and 'n' literally instead of breaking the line. The first
# label names the column that was actually analysed.
print('downloaded_app_coded', a, '\ntime_spent(min)', b)

downloaded_app_coded {'Sample size': 9806} 
time_spent(min) {'Sample size': 8985}


# Now we check the assumptions for each target variable to decide which
# statistical test to use for the A/B test.

# Import stats_test from the AB_experiment module.
from AB_experiment import stats_test

# Instance through which the statistical-test helpers are called.
st = stats_test()

data = 'new.csv'
column1 = "group"
column1_value1 = 'referral'
column1_value2 = 'ad'
alpha = 0.05
paired_data = False

# For variable downloaded_app_coded (sample size computed earlier: 9806).
sample_size = 9806
column2 = "downloaded_app_coded"
a = st.AB_Test_assumption(
    data, sample_size, column1, column1_value1, column1_value2,
    column2, alpha, paired_data,
)

# For variable time_spent(min) (sample size computed earlier: 8985).
sample_size = 8985
column2 = "time_spent(min)"
b = st.AB_Test_assumption(
    data, sample_size, column1, column1_value1, column1_value2,
    column2, alpha, paired_data,
)

# Use real newline escapes ('\n'); the previous '/n' printed the two
# characters '/' and 'n' literally instead of breaking the lines. The first
# label names the column that was actually analysed.
print(
    'For downloaded_app_coded variable\n',
    a,
    '\n',
    40 * '*',
    '\n For time_spent(min) variable\n',
    b,
)

For downloaded_app_coded variable
 ({'Target variable is boolean data type': 'Use Chi-Squared Test'}, {'Note': 'If our data involve time-to-event or survival analysis (e.g., time until a user completes a task), we can use methods such as the log-rank test'}) 
 **************************************** 
 For time_spent(min) variable
 ({'Assumption of Normality is not satisfied': 'Non-parametric test => Use Mann-Whitney U test.'}, {'Note': 'If we are comparing more than two groups, such as in an A/B/C testing scenario, we can use Kruskal-Wallis test.'})

C:/Users/VINAYAK/anaconda3/lib/site-packages/scipy/stats/morestats.py:1760: UserWarning: p-value may not be accurate for N > 5000.
  warnings.warn("p-value may not be accurate for N > 5000.")


# Run the tests selected by the assumption checks: the chi-squared test for
# downloaded_app_coded and the Mann-Whitney U test for time_spent(min).

# Import stats_test from the AB_experiment module.
from AB_experiment import stats_test

# Instance through which the statistical-test helpers are called.
st = stats_test()

data = 'new.csv'
column1 = 'group'
column1_value1 = 'referral'
column1_value2 = 'ad'
alpha = 0.05
reverse_experiment = False
# Defined here (same value as in the assumption-check step) so this cell does
# not raise NameError when it is run without the previous cell.
paired_data = False

# For variable downloaded_app_coded: perform the chi-squared test.
sample_size = 9806
column2 = 'downloaded_app_coded'
a = st.chi_squared_test(
    data, sample_size, column1, column1_value1, column1_value2,
    column2, alpha, reverse_experiment,
)

# For variable time_spent(min): perform the Mann-Whitney U test.
sample_size = 8985
column2 = "time_spent(min)"
b = st.mann_whitney_U_test(
    data, sample_size, column1, column1_value1, column1_value2,
    column2, alpha, paired_data,
)

# Left as a bare tuple expression so the notebook displays both results.
('For downloaded_app variable', a, 40 * '*', 'For time_spent(min) variable', b)

('For downloaded_app variable',
 [{'Test name': 'Chi-square test',
   'Timestamp': '2023-08-11 13:45:42',
   'Sample size': 9806,
   'Status': 'We can reject H0 => group ad is more successful',
   'P-value': 1.602395622342239e-193,
   'alpha': 0.05,
   'Test Statistic': 880.6203723014223,
   'Confidence Interval': (-0.2217828294823734, -0.19490083358105723)},
  {'proportion1': 0.4884, 'proportion2': 0.6967}],
 '****************************************',
 'For time_spent(min) variable',
 {'Test name': 'Mann whitney U test',
  'Timestamp': '2023-08-11 13:45:45',
  'Sample size': 8985,
  'Status': 'We can reject H0 => group referral performs better',
  'P-value': 1.2118957304952622e-289,
  'alpha': 0.05,
  'Test Statistic': 52979345.0,
  'Confidence Interval': (2.0, 3.0)})

mobile-app-ab-experiment

Goal:

Based on the assumption checks, we use the Chi-squared test for the variable downloaded_app.

Based on the assumption checks, we perform a non-parametric test, the Mann-Whitney U test, for the variable time_spent(min).

Conclusion

Links

Related Articles