#Load the necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Previously: load the two measured sets and draw a synthetic normal sample
# for each one, using the moments estimated from the measured data.
data = pd.read_csv("lab14_E1data.csv")
set1 = np.array(data['Set1'])
set2 = np.array(data['Set2'])

# Mean and (NumPy default, population) standard deviation of each set.
mu1, sd1 = np.mean(set1), np.std(set1)
mu2, sd2 = np.mean(set2), np.std(set2)

# 100 random draws from a normal distribution with the moments of each set.
set1_s = np.random.normal(loc=mu1, scale=sd1, size=100)
set2_s = np.random.normal(loc=mu2, scale=sd2, size=100)

# Collect the synthetic samples in a frame that mirrors `data`.
data2 = pd.DataFrame({'Set1s': set1_s, 'Set2s': set2_s})
# Previously: overlay the histograms of the synthetic samples (data2) and of
# the measured sets (data) on one shared axes, 40 bins each, raw counts.
fig, ax = plt.subplots()
data2.plot(
    kind='hist',
    density=False,
    ax=ax,
    title='Histogram: Set1 and Set1 samples vs. Set2 and Set2 samples',
    bins=40,
)
data.plot(kind='hist', density=False, ax=ax, bins=40)
# Previously: side-by-side box plots of each measured set next to its
# synthetic counterpart, drawn on a new 10x7 figure.
fig = plt.figure(figsize=(10, 7))
plt.boxplot(
    [set1, set1_s, set2, set2_s],
    notch=1,  # notched boxes
    sym='',   # empty flier symbol: outlier points are not drawn
)
We can use statistical hypothesis tests to check whether our sets are consistent with a normal distribution data model. One such test is the Shapiro-Wilk normality test:
# the Shapiro-Wilk Normality Test for set1
# Null hypothesis: the sample was drawn from a normal distribution.
from scipy.stats import shapiro
stat, p = shapiro(data['Set1'])
print('stat=%.3f, p=%.3f' % (stat, p))
# p above the 0.05 significance level: fail to reject normality.
if p > 0.05:
    print('Probably Gaussian')
else:
    print('Probably not Gaussian')
# the Shapiro-Wilk Normality Test for set2
# Null hypothesis: the sample was drawn from a normal distribution.
from scipy.stats import shapiro
stat, p = shapiro(data['Set2'])
print('stat=%.3f, p=%.3f' % (stat, p))
# p above the 0.05 significance level: fail to reject normality.
if p > 0.05:
    print('Probably Gaussian')
else:
    print('Probably not Gaussian')
# the Shapiro-Wilk Normality Test for set1s
# Null hypothesis: the sample was drawn from a normal distribution.
from scipy.stats import shapiro
stat, p = shapiro(data2['Set1s'])
print('stat=%.3f, p=%.3f' % (stat, p))
# p above the 0.05 significance level: fail to reject normality.
if p > 0.05:
    print('Probably Gaussian')
else:
    print('Probably not Gaussian')
# the Shapiro-Wilk Normality Test for set2s
# Null hypothesis: the sample was drawn from a normal distribution.
from scipy.stats import shapiro
stat, p = shapiro(data2['Set2s'])
print('stat=%.3f, p=%.3f' % (stat, p))
# p above the 0.05 significance level: fail to reject normality.
if p > 0.05:
    print('Probably Gaussian')
else:
    print('Probably not Gaussian')
Now let's confirm that set1 and set1_s are from the same distribution. We can use the Mann-Whitney U Test for this:
from scipy.stats import mannwhitneyu # import a useful non-parametric test
# Compare the measured Set1 with its synthetic sample.
# Null hypothesis: both samples come from the same underlying distribution.
# The two-sided alternative is requested explicitly so the p-value does not
# depend on the SciPy version's default.
stat, p = mannwhitneyu(data['Set1'], data2['Set1s'], alternative='two-sided')
print('statistic=%.3f, p-value at rejection =%.3f' % (stat, p))
# p above the 0.05 significance level: fail to reject the null hypothesis.
if p > 0.05:
    print('Probably the same distribution')
else:
    print('Probably different distributions')
Let's also confirm that set2 and set2_s are from the same distribution:
from scipy.stats import mannwhitneyu # import a useful non-parametric test
# Compare the measured Set2 with its synthetic sample.
# Null hypothesis: both samples come from the same underlying distribution.
# The two-sided alternative is requested explicitly so the p-value does not
# depend on the SciPy version's default.
stat, p = mannwhitneyu(data['Set2'], data2['Set2s'], alternative='two-sided')
print('statistic=%.3f, p-value at rejection =%.3f' % (stat, p))
# p above the 0.05 significance level: fail to reject the null hypothesis.
if p > 0.05:
    print('Probably the same distribution')
else:
    print('Probably different distributions')
Based on the results, we can say that set1 and set1_s probably belong to the same distribution. The same can be stated about set2 and set2_s. Now let's check whether set1 and set2 are significantly different:
from scipy.stats import mannwhitneyu # import a useful non-parametric test
# Compare the two measured sets against each other.
# Null hypothesis: both samples come from the same underlying distribution.
# The two-sided alternative is requested explicitly so the p-value does not
# depend on the SciPy version's default.
stat, p = mannwhitneyu(data['Set1'], data['Set2'], alternative='two-sided')
print('statistic=%.3f, p-value at rejection =%.3f' % (stat, p))
# p above the 0.05 significance level: fail to reject the null hypothesis.
if p > 0.05:
    print('Probably the same distribution')
else:
    print('Probably different distributions')
The test's result indicates that set1 and set2 belong to distributions with different measures of central tendency (means). We can check the same for set1_s and set2_s as well:
from scipy.stats import mannwhitneyu # import a useful non-parametric test
# Compare the two synthetic samples against each other.
# Null hypothesis: both samples come from the same underlying distribution.
# The two-sided alternative is requested explicitly so the p-value does not
# depend on the SciPy version's default.
stat, p = mannwhitneyu(data2['Set1s'], data2['Set2s'], alternative='two-sided')
print('statistic=%.3f, p-value at rejection =%.3f' % (stat, p))
# p above the 0.05 significance level: fail to reject the null hypothesis.
if p > 0.05:
    print('Probably the same distribution')
else:
    print('Probably different distributions')
Now we can state at a 95% confidence level (5% significance level) that set1 and set2 are different. The same holds for set1_s and set2_s.
#From previous lab:
#For this lab: