Goal :¶
In [ ]:
In [1]:
# Data manipulation, step 1: inspect the raw data so that any problems can be
# fixed before the A/B analysis.
from AB_experiment import data_manipulation

# Shared helper object; later cells reuse `dm`.
dm = data_manipulation()

# Arguments for the data report.
data = "app_data.csv"
filename = "new"
column1 = "group"
column2 = ["downloaded_app", "time_spent(min)"]
quartile1, quartile3 = 0.25, 0.75
info = True
download_df = False

# Left as the last expression so the notebook displays the returned report.
dm.data_info(
    data, column1, column2, quartile1, quartile3, info, download_df, filename
)
Out[1]:
{'1': ['dataframe_shape', {'Observations': 30000, 'Column': 4}], '2': ['missing_data_info', {'No missing values'}], '3': ['outliers_info', [{'variable_name time_spent(min)': 'No outliers present'}]], '4': ['data_types', [{'object_values': "['group', 'downloaded_app']"}, {'float_values': '[]'}, {'int_values': ['user_id', 'time_spent(min)']}, {'bool_val': []}]], '5': ['numerical_Variables', ['user_id', 'time_spent(min)']], '6': ['Categorical_variables', ['group', 'downloaded_app']], '7': [{'Unique values count for variable': group ad 17903 referral 12097}, {'Unique values count for variable': downloaded_app Yes 18393 No 11607}, {'Unique values count for variable': time_spent(min) 20 2351 18 2271 19 2241 12 2213 13 2212 17 2209 14 2205 10 2196 15 2188 11 2186 16 2151 8 1150 6 1121 7 1119 5 1108 9 1079}], '8': ['Descriptive statistics-numerical_Variables', user_id time_spent(min) count 30000.000000 30000.000000 mean 497244.479467 13.548800 std 289220.271868 4.290116 min 41.000000 5.000000 25% 246691.000000 10.000000 50% 495162.000000 14.000000 75% 747418.250000 17.000000 max 999979.000000 20.000000, '********************', 'Descriptive statistics-Categorical_variables', group downloaded_app count 30000 30000 unique 2 2 top ad Yes freq 17903 18393, '********************'], '9': {'category_stats': [ time_spent(min) count median mean std min max group ad 17903 13.0 12.533654 4.633040 5 20 referral 12097 15.0 15.051170 3.177318 10 20]}, '10': ['Dataframe', user_id group downloaded_app time_spent(min) 0 784598 ad Yes 13 1 699052 referral Yes 11 2 218829 ad No 7 3 627414 ad Yes 7 4 190259 referral No 10]} In [ ]:
In [4]:
# The data contains a categorical variable, so convert it to a numerical one
# with categorical_encoding.
from AB_experiment import data_manipulation

# Helper object used to call the data-manipulation routines.
dm = data_manipulation()

# Arguments for the encoding step.
data, filename = "app_data.csv", "new"
variables = ["downloaded_app"]
download_df = True

# Left as the last expression so the before/after summary is displayed.
dm.categorical_encoding(data, variables, download_df, filename)
Out[4]:
[{'Before encoding': {'Variable_name': 'downloaded_app', 'unique_values': array(['Yes', 'No'], dtype=object)}, 'After encoding': {'Variable_name': 'downloaded_app_coded', 'unique_values': array([1, 0])}}] In [12]:
# The encoding step above produced a numeric column; cast it to bool as well
# for better analysis.
data = "new.csv"
filename = "new"
download_df = True

# Columns to cast, the target dtype for each, and columns to drop (none here).
change_variables = ["downloaded_app_coded"]
dtype = ["bool"]
drop_variables = []

# Relies on `dm` created in an earlier cell.
dm.change_variables(
    data, change_variables, dtype, drop_variables, download_df, filename
)
Out[12]:
[{'Variable1': ['downloaded_app_coded', dtype('bool')]}] In [13]:
# After changing the data types, run data_info again on the updated file.
# Relies on `dm` created in an earlier cell.
data = "new.csv"
filename = "new"
column1 = "group"
column2 = ["downloaded_app", "time_spent(min)"]
quartile1, quartile3 = 0.25, 0.75
info = True
download_df = False

# Left as the last expression so the notebook displays the returned report.
dm.data_info(
    data, column1, column2, quartile1, quartile3, info, download_df, filename
)
Out[13]:
{'1': ['dataframe_shape', {'Observations': 30000, 'Column': 5}], '2': ['missing_data_info', {'No missing values'}], '3': ['outliers_info', [{'variable_name time_spent(min)': 'No outliers present'}]], '4': ['data_types', [{'object_values': "['group', 'downloaded_app']"}, {'float_values': '[]'}, {'int_values': ['user_id', 'time_spent(min)']}, {'bool_val': ['downloaded_app_coded']}]], '5': ['numerical_Variables', ['user_id', 'time_spent(min)']], '6': ['Categorical_variables', ['group', 'downloaded_app', 'downloaded_app_coded']], '7': [{'Unique values count for variable': group ad 17903 referral 12097}, {'Unique values count for variable': downloaded_app Yes 18393 No 11607}, {'Unique values count for variable': time_spent(min) 20 2351 18 2271 19 2241 12 2213 13 2212 17 2209 14 2205 10 2196 15 2188 11 2186 16 2151 8 1150 6 1121 7 1119 5 1108 9 1079}, {'Unique values count for variable': downloaded_app_coded True 18393 False 11607}], '8': ['Descriptive statistics-numerical_Variables', user_id time_spent(min) count 30000.000000 30000.000000 mean 497244.479467 13.548800 std 289220.271868 4.290116 min 41.000000 5.000000 25% 246691.000000 10.000000 50% 495162.000000 14.000000 75% 747418.250000 17.000000 max 999979.000000 20.000000, '********************', 'Descriptive statistics-Categorical_variables', group downloaded_app downloaded_app_coded count 30000 30000 30000 unique 2 2 2 top ad Yes True freq 17903 18393 18393, '********************'], '9': {'category_stats': [ time_spent(min) count median mean std min max group ad 17903 13.0 12.533654 4.633040 5 20 referral 12097 15.0 15.051170 3.177318 10 20]}, '10': ['Dataframe', user_id group downloaded_app time_spent(min) downloaded_app_coded 0 784598 ad Yes 13 True 1 699052 referral Yes 11 True 2 218829 ad No 7 False 3 627414 ad Yes 7 True 4 190259 referral No 10 False]} In [ ]:
In [ ]:
# From the data_info output above, the data has no outliers and no missing values,
# and the data types of all variables are correct.
# Next, we find the required sample size.
In [6]:
# First, find the baseline conversion rate of the 'referral' group for each
# target variable.
from AB_experiment import stats_test

# Helper object used to call the statistical routines; later cells reuse `st`.
st = stats_test()

data = 'new.csv'
column1 = "group"
column1_value = 'referral'

# Boolean target: downloaded_app_coded.
a = st.baseline_conversion_rate(data, column1, column1_value, column2='downloaded_app_coded')
# Non-boolean target: time_spent(min), evaluated against a threshold of 13.5.
b = st.baseline_conversion_rate(data, column1, column1_value, column2='time_spent(min)', bool_var=False, threshold=13.5)

# Use the newline escape '\n' (the original '/n' is printed literally) so each
# variable's result starts on its own line.
print('downloaded_app', a, '\ntime_spent(min)', b)
downloaded_app_coded {'Baseline conversion rate(p1) of group referral': 0.4877} time_spent(min) {'Baseline conversion rate(p1) of group referral for greater than or equal to threshold value 13.5': 0.6419} In [ ]:
In [16]:
# Sample size from the baseline conversion rates printed by the previous cell.
# Relies on `st` created in an earlier cell.
mde = 0.02
alpha = 0.05
power = 0.8
n_side = 2

# For variable downloaded_app_coded
p1 = 0.4877
a = st.sample_size(p1, mde, alpha, power, n_side)

# For variable time_spent(min)
p1 = 0.6419
b = st.sample_size(p1, mde, alpha, power, n_side)

# Use the newline escape '\n' (the original '/n' is printed literally) so each
# variable's result starts on its own line.
print('downloaded_app', a, '\ntime_spent(min)', b)
downloaded_app_coded {'Sample size': 9806} time_spent(min) {'Sample size': 8985} In [ ]:
In [29]:
# Check the assumptions for each target variable to decide which statistical
# test to use for the A/B test.
from AB_experiment import stats_test

# Helper object used to call the statistical routines.
st = stats_test()

data = 'new.csv'
column1 = "group"
column1_value1 = 'referral'
column1_value2 = 'ad'
alpha = 0.05
paired_data = False

# For variable downloaded_app_coded
sample_size = 9806
column2 = "downloaded_app_coded"
a = st.AB_Test_assumption(data, sample_size, column1, column1_value1, column1_value2, column2, alpha, paired_data)

# For variable time_spent(min)
sample_size = 8985
column2 = "time_spent(min)"
b = st.AB_Test_assumption(data, sample_size, column1, column1_value1, column1_value2, column2, alpha, paired_data)

# Use the newline escape '\n' (the original '/n' is printed literally) so the
# two reports and the separator each start on their own line.
print('For downloaded_app variable\n', a, '\n', 40*'*', '\n For time_spent(min) variable\n', b)
For downloaded_app_coded variable ({'Target variable is boolean data type': 'Use Chi-Squared Test'}, {'Note': 'If our data involve time-to-event or survival analysis (e.g., time until a user completes a task), we can use methods such as the log-rank test'}) **************************************** For time_spent(min) variable ({'Assumption of Normality is not satisfied': 'Non-parametric test => Use Mann-Whitney U test.'}, {'Note': 'If we are comparing more than two groups, such as in an A/B/C testing scenario, we can use Kruskal-Wallis test.'}) C:/Users/VINAYAK/anaconda3/lib/site-packages/scipy/stats/morestats.py:1760: UserWarning: p-value may not be accurate for N > 5000. warnings.warn("p-value may not be accurate for N > 5000.") In [ ]:
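Based on the assumption checks, we use the Chi-squared test for the variable downloaded_app.¶Define the null and alternative hypotheses: H0 — the download rate is the same for the referral and ad groups; H1 — the download rate differs between the two groups.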
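Based on the assumption checks, we use the non-parametric Mann-Whitney U test for the variable time_spent(min).¶Define the null and alternative hypotheses: H0 — the distribution of time spent is the same for the referral and ad groups; H1 — the distribution of time spent differs between the two groups.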
In [30]:
# Run the tests chosen by the assumption checks: Chi-squared for
# downloaded_app_coded and Mann-Whitney U for time_spent(min).
from AB_experiment import stats_test

# Helper object used to call the statistical routines.
st = stats_test()

data = 'new.csv'
column1 = 'group'
column1_value1 = 'referral'
column1_value2 = 'ad'
alpha = 0.05
reverse_experiment = False
# Defined here so this cell does not depend on a variable left over from the
# assumption-check cell; same value as used there.
paired_data = False

# For variable downloaded_app_coded
sample_size = 9806
column2 = 'downloaded_app_coded'
a = st.chi_squared_test(data, sample_size, column1, column1_value1, column1_value2, column2, alpha, reverse_experiment)

# For variable time_spent(min)
sample_size = 8985
column2 = "time_spent(min)"
b = st.mann_whitney_U_test(data, sample_size, column1, column1_value1, column1_value2, column2, alpha, paired_data)

# Left as the last expression so the notebook displays both results.
('For downloaded_app variable', a, 40*'*', 'For time_spent(min) variable', b)
Out[30]:
('For downloaded_app variable', [{'Test name': 'Chi-square test', 'Timestamp': '2023-08-11 13:45:42', 'Sample size': 9806, 'Status': 'We can reject H0 => group ad is more successful', 'P-value': 1.602395622342239e-193, 'alpha': 0.05, 'Test Statistic': 880.6203723014223, 'Confidence Interval': (-0.2217828294823734, -0.19490083358105723)}, {'proportion1': 0.4884, 'proportion2': 0.6967}], '****************************************', 'For time_spent(min) variable', {'Test name': 'Mann whitney U test', 'Timestamp': '2023-08-11 13:45:45', 'Sample size': 8985, 'Status': 'We can reject H0 => group referral performs better', 'P-value': 1.2118957304952622e-289, 'alpha': 0.05, 'Test Statistic': 52979345.0, 'Confidence Interval': (2.0, 3.0)}) In [ ]:
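Conclusion¶From downloaded_app variable: the Chi-square test rejects H0 (p-value ≈ 1.6e-193 < alpha = 0.05); the observed download proportion is 0.4884 for the referral group versus 0.6967 for the ad group, so the ad group is more successful.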
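From time_spent(min) variable: the Mann-Whitney U test rejects H0 (p-value ≈ 1.2e-289 < alpha = 0.05, reported confidence interval (2.0, 3.0)); the referral group performs better on time spent.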
In [ ]:
In [ ]:
|