# Preamble script block to identify host, user, and kernel
# NOTE: the lines starting with `!` are IPython/Jupyter shell escapes, so this
# cell is meant to run inside a notebook kernel, not as a plain Python script.
import sys
# Shell out for the machine name and for the account running the kernel.
! hostname
! whoami
# Path of the interpreter executing this code, followed by its version as a
# human-readable string and as a structured version tuple.
print(sys.executable)
print(sys.version)
print(sys.version_info)
Inspired by an example in "A (very) friendly introduction to Confidence Intervals" by Dima Shulga, available at https://towardsdatascience.com/a-very-friendly-introduction-to-confidence-intervals-9add126e714,
and "Introduction of Confidence Interval" by Irfan Rahman, available at https://medium.com/steps-towards-data-science/confidence-interval-a7fb3484d7b4
*Hint: According to UN estimates, a little over 60 million people (60,449,841) live in Italy.
# Head-counts for the simulated population of Italy.
totalpop = 60 * 10**6  # Population of Italy, rounded to 60 million (60M)
fbl_p = 0.75  # Fraction (0-1, not a percentage) of people loving soccer|football
# Round rather than truncate, and derive the complement by subtraction, so the
# two groups always add up to totalpop even when totalpop * fbl_p is not
# exactly representable in floating point (e.g. for fbl_p = 0.7).
fblpop = round(totalpop * fbl_p)  # Number of people who love football
nfblpop = totalpop - fblpop  # Number of people who do not love football
import numpy as np
# Encode the population as a 0/1 array: 1 = loves football, 0 = does not.
# The arrays only ever hold 0 or 1, so int8 (one byte per person) is enough;
# the default float64 needs eight bytes per person, about 0.96 GB across the
# three arrays for a 60M population. np.mean accumulates integer input in
# float64, so the printed mean is unchanged.
# DO NOT ATTEMPT TO PRINT these arrays - they have tens of millions of entries!
fblpop_1 = np.ones(fblpop, dtype=np.int8)  # "1"s, one per person who loves football
nfblpop_0 = np.zeros(nfblpop, dtype=np.int8)  # "0"s, one per person who does not
totpop_01 = np.hstack([fblpop_1, nfblpop_0])  # "0 & 1"s for the whole population
print(np.mean(totpop_01))  # True population proportion of football lovers
# Draw one random sample of 1000 people and take its mean. In a notebook the
# bare expression below is displayed as the cell output; re-run the cell to see
# the estimate change from sample to sample.
np.mean(np.random.choice(totpop_01, size=1000))  # Run multiple times

# Let's do it in a more sophisticated/engineery/data sciency way!
# Repeat the same experiment ten times and print each sample mean.
for i in range(10):  # Let's take 10 samples
    sample = np.random.choice(totpop_01, size=1000)
    print('Sample', i, ':', np.mean(sample))
# Build the sampling distribution of the sample mean: draw 10000 independent
# samples of 1000 people each and record the mean of every sample.
values = []  # Means of the individual samples
for i in range(10000):  # Let's take 10000 samples
    sample = np.random.choice(totpop_01, size=1000)  # Sample size stays fixed at 1000
    mean = np.mean(sample)
    values.append(mean)  # Store the mean of each sample set

print(np.mean(values))  # Printing the mean of means!
values = np.array(values)  # Convert to an array so .std() is available
print(values.std())  # Printing the standard deviation of means!
import seaborn as sns
# Histogram (density-normalised) with a KDE curve and a rug of the sample means.
# sns.distplot is deprecated and scheduled for removal from seaborn; histplot
# plus rugplot is the documented replacement (cut=3 mirrors distplot's KDE tails).
sns.histplot(values, color='purple', kde=True, stat='density', kde_kws={'cut': 3})
sns.rugplot(values, color='purple')
import matplotlib.pyplot as plt

# Notched box plot of the sample means on a 10x7 inch figure.
# The empty marker string hides the outlier (flier) points.
fig = plt.figure(figsize=(10, 7))
plt.boxplot(values, notch=True, sym='')
plt.show()
import seaborn as sns
# Histogram (density-normalised) with a KDE curve and a rug of the sample means.
# sns.distplot is deprecated and scheduled for removal from seaborn; histplot
# plus rugplot is the documented replacement (cut=3 mirrors distplot's KDE tails).
sns.histplot(values, color='purple', kde=True, stat='density', kde_kws={'cut': 3})
sns.rugplot(values, color='purple')
import math

# Step 1 - Organize the data
# Assumptions: the population is normally distributed and the samples were
# selected at random.
n, Xbar, std = 500, 461, 100  # Sample size, sample mean, standard deviation (σ)
C = 0.95  # Confidence level
z = 1.96  # The z value associated with a 95% confidence interval

# Step 2 - Calculate the margin of error: z * σ / sqrt(n)
margin = z * (std / math.sqrt(n))
print('The margin of error is equal to : ',margin)

# Step 3 - Estimate the true population mean at 95% confidence
# The interval is the sample mean minus/plus the margin of error (461 ∓ 8.765).
low, high = Xbar - margin, Xbar + margin
print('the true population mean will be captured within the confidence interval of (',low,' , ',high,') and the confidence is 95%')
Some great reads on Confidence Intervals:
Some great videos on Confidence Intervals:
Inspired by an example in "A/B Test Significance in Python" by Samuel Hinton, available at https://cosmiccoding.com.au/tutorials/ab_tests
This is an A/B test. Often this is used interchangeably with the term “split testing”, though in general A/B tests test small changes, and split testing might be when you present two entirely different websites to the user.
Good question - by having two sites active at once and randomly directing users to one or the other, you control for all other variables. If one week later puts you the week before Christmas, this will impact sales, and you might draw the wrong conclusion because of these confounding effects.
You can have as many perturbations running as you want, but you have to keep the name simple. The more perturbations you try, though, the fewer samples you’ll have for each case, and the harder it will be to draw statistically significant conclusions.
# Observed A/B results: how many visitors saw each variant and how many clicked.
num_a, click_a = 550, 48
num_b, click_b = 450, 56
# Observed click rate of each variant (clicks per visitor).
rate_a, rate_b = click_a / num_a, click_b / num_b
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import binom
# Binomial model: probability of seeing exactly x clicks out of num_* visitors
# when each visitor clicks with the observed rate.
clicks = np.arange(10, 100)
prob_a = binom.pmf(clicks, num_a, rate_a)
prob_b = binom.pmf(clicks, num_b, rate_b)

# Overlay the two distributions as semi-transparent bar plots.
for pmf, variant in ((prob_a, "A"), (prob_b, "B")):
    plt.bar(clicks, pmf, label=variant, alpha=0.7)
plt.legend()
plt.xlabel("# of Sells")
plt.ylabel("Probability");
from scipy.stats import norm
# Normal approximation of each variant's rate: an observed proportion p from n
# trials has standard error sqrt(p * (1 - p) / n).
# Background: https://en.wikipedia.org/wiki/Binomial_distribution#Normal_approximation
std_a = np.sqrt(rate_a * (1 - rate_a) / num_a)
std_b = np.sqrt(rate_b * (1 - rate_b) / num_b)

# Evaluate both normal densities on a common grid of rates from 0 to 0.2.
click_rate = np.linspace(0, 0.2, 200)
prob_a = norm.pdf(click_rate, loc=rate_a, scale=std_a)
prob_b = norm.pdf(click_rate, loc=rate_b, scale=std_b)

# Draw the two density curves as line plots.
for density, variant in ((prob_a, "A"), (prob_b, "B")):
    plt.plot(click_rate, density, label=variant)
plt.legend(frameon=False)
plt.xlabel("Purchase rate")
plt.ylabel("Probability")
# The difference of the two approximately normal rates is modelled as normal
# with mean (rate_b - rate_a) and the two variances added together.
diff_std = np.sqrt(std_a**2 + std_b**2)
z_score = (rate_b - rate_a) / diff_std
p = norm(rate_b - rate_a, diff_std)

# Density of the difference, and the probability mass above zero (B beats A).
x = np.linspace(-0.05, 0.15, 1000)
y = p.pdf(x)
area_under_curve = p.sf(0)

# Plot the density and shade the region where the difference is positive.
plt.plot(x, y, label="PDF")
plt.fill_between(x, 0, y, where=x > 0, label="Prob(b>a)", alpha=0.3)
plt.annotate(f"Area={area_under_curve:0.3f}", (0.02, 5))
plt.legend()
plt.xlabel("Difference in purchase rate")
plt.ylabel("Prob");

# Same result expressed as a z-score against the standard normal.
print(f"zscore is {z_score:0.3f}, with p-value {norm().sf(z_score):0.3f}")
def get_confidence_ab_test(click_a, num_a, click_b, num_b):
    """Return the one-sided p-value of an A/B test on two click rates.

    Each variant's rate is treated as approximately normal with standard
    error sqrt(p * (1 - p) / n). The difference (rate_b - rate_a) is divided
    by the combined standard error to give a z-score, and the upper-tail
    probability of the standard normal at that z-score is returned.

    Args:
        click_a: Number of clicks observed for variant A.
        num_a: Number of visitors shown variant A.
        click_b: Number of clicks observed for variant B.
        num_b: Number of visitors shown variant B.

    Returns:
        ``norm.sf(z_score)``: values near 0 mean B's rate is well above A's,
        0.5 means the rates are equal, values near 1 mean B's rate is well
        below A's.
    """
    rate_a = click_a / num_a
    rate_b = click_b / num_b
    # Standard error of each observed rate (normal approximation to the binomial).
    std_a = np.sqrt(rate_a * (1 - rate_a) / num_a)
    std_b = np.sqrt(rate_b * (1 - rate_b) / num_b)
    # Variances add for the difference of two independent estimates.
    z_score = (rate_b - rate_a) / np.sqrt(std_a**2 + std_b**2)
    return norm.sf(z_score)
# One-sided p-value for the observed A/B counts.
ab_p_value = get_confidence_ab_test(click_a, num_a, click_b, num_b)
print(ab_p_value)
from scipy.stats import mannwhitneyu
# Expand the aggregate counts into per-visitor outcomes: 1 = click, 0 = no click.
a_dist, b_dist = np.zeros(num_a), np.zeros(num_b)
a_dist[:click_a] = 1
b_dist[:click_b] = 1

# Non-parametric cross-check: one-sided Mann-Whitney U test with the
# alternative that A's outcomes are smaller than B's.
stat, p_value = mannwhitneyu(a_dist, b_dist, alternative="less")
print(f"Mann-Whitney U test for null hypothesis B <= A is {p_value:0.3f}")
Some great reads on A/B Testing:
Some great videos on A/B Testing: