# Preamble script block to identify host, user, and kernel
import sys
! hostname
! whoami
print(sys.executable)
print(sys.version)
print(sys.version_info)
Plotting Position: An empirical distribution, based on a random sample from a (possibly unknown) probability distribution, obtained by plotting the exceedance (or cumulative) probability of the sample distribution against the sample value.
The exceedance probability for a particular sample value is a function of sample size and the rank of the particular sample. For exceedance probabilities, the sample values are ranked from largest to smallest. The general expression in common use for plotting position is
$$P = \frac{m - b}{N + 1 - 2b}$$
where m is the ordered rank of a sample value, N is the sample size, and b is a constant between 0 and 1, depending on the plotting method.
Let's work on an example. First, import the necessary packages:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
Read the "lab13_data.csv" file as a dataset:
# Load the lab data set into a DataFrame, then echo the frame as the cell output
data = pd.read_csv(filepath_or_buffer="lab13_data.csv")
data
The dataset contains two sets of values: "Set1" and "Set2". Use descriptive functions to learn more about the sets.
# Pull each column out of the frame, print the raw values, then summarise them
set1, set2 = data['Set1'], data['Set2']
for column in (set1, set2):
    print(column)
set1.describe()
set2.describe()
Remember the Weibull Plotting Position formula from last session. Use Weibull Plotting Position formula to plot set1 and set2 quantiles on the same graph.
Do they look different? How?
def weibull_pp(sample):
    """Return the Weibull plotting positions for *sample*.

    The value of rank m (1-based, ascending) out of N values gets the
    position m / (N + 1).

    Side effect: *sample* is sorted in place (ascending). The calling code
    plots the returned positions against the same, now sorted, object, so
    the in-place sort is part of the contract and is deliberately kept.

    Parameters
    ----------
    sample : mutable numeric sequence providing ``.sort()`` and ``len()``
        (for example a list or a NumPy array).

    Returns
    -------
    list of float
        One plotting position per value; an empty list for an empty sample.
    """
    sample.sort()  # callers rely on the argument being sorted afterwards
    size = len(sample)
    return [rank / (size + 1) for rank in range(1, size + 1)]
# Convert both columns to NumPy arrays
set1, set2 = np.array(set1), np.array(set2)
# Weibull plotting positions for each set (weibull_pp sorts its argument in place)
set1_wei, set2_wei = weibull_pp(set1), weibull_pp(set2)
# Quantile plot: Weibull plotting position (x) against the sorted sample values (y).
# The `plt` alias is used because `import matplotlib.pyplot as plt` binds only `plt`;
# the bare name `matplotlib` is not bound by any import visible in this file.
myfigure = plt.figure(figsize=(4, 8))  # figure object; figsize fixes the aspect ratio
plt.scatter(set1_wei, set1, color='blue')
plt.scatter(set2_wei, set2, color='orange')
plt.xlabel("Density or Quantile Value")
plt.ylabel("Value")
plt.title("Quantile Plot for Set1 and Set2 based on Weibull Plotting Function")
plt.show()
Do they look different? How?
Define functions for the Gringorten, Cunnane, California, and Hazen Plotting Position Formulas. Overlay and plot them all for Set1 and Set2 on two different graphs.
def gringorten_pp(sample):
    """Return the Gringorten plotting positions for *sample*.

    The value of rank m (1-based, ascending) out of N values gets the
    position (m - 0.44) / (N + 0.12).

    Side effect: *sample* is sorted in place (ascending). The calling code
    plots the returned positions against the same, now sorted, object, so
    the in-place sort is part of the contract and is deliberately kept.

    Parameters
    ----------
    sample : mutable numeric sequence providing ``.sort()`` and ``len()``
        (for example a list or a NumPy array).

    Returns
    -------
    list of float
        One plotting position per value; an empty list for an empty sample.
    """
    sample.sort()  # callers rely on the argument being sorted afterwards
    size = len(sample)
    return [(rank - 0.44) / (size + 0.12) for rank in range(1, size + 1)]
# Gringorten plotting positions for both sets
set1_grin, set2_grin = gringorten_pp(set1), gringorten_pp(set2)
def cunnane_pp(sample):
    """Return the Cunnane plotting positions for *sample*.

    The value of rank m (1-based, ascending) out of N values gets the
    position (m - 0.40) / (N + 0.2).

    Side effect: *sample* is sorted in place (ascending). The calling code
    plots the returned positions against the same, now sorted, object, so
    the in-place sort is part of the contract and is deliberately kept.

    Parameters
    ----------
    sample : mutable numeric sequence providing ``.sort()`` and ``len()``
        (for example a list or a NumPy array).

    Returns
    -------
    list of float
        One plotting position per value; an empty list for an empty sample.
    """
    sample.sort()  # callers rely on the argument being sorted afterwards
    size = len(sample)
    return [(rank - 0.40) / (size + 0.2) for rank in range(1, size + 1)]
# Cunnane plotting positions for both sets
set1_cun, set2_cun = cunnane_pp(set1), cunnane_pp(set2)
def california_pp(sample):
    """Return the California plotting positions for *sample*.

    The value of rank m (1-based, ascending) out of N values gets the
    position m / N, so the largest value is always assigned 1.0.

    Side effect: *sample* is sorted in place (ascending). The calling code
    plots the returned positions against the same, now sorted, object, so
    the in-place sort is part of the contract and is deliberately kept.

    Parameters
    ----------
    sample : mutable numeric sequence providing ``.sort()`` and ``len()``
        (for example a list or a NumPy array).

    Returns
    -------
    list of float
        One plotting position per value; an empty list for an empty sample.
    """
    sample.sort()  # callers rely on the argument being sorted afterwards
    size = len(sample)
    return [rank / size for rank in range(1, size + 1)]
# California plotting positions for both sets
set1_cal, set2_cal = california_pp(set1), california_pp(set2)
def hazen_pp(sample):
    """Return the Hazen plotting positions for *sample*.

    The value of rank m (1-based, ascending) out of N values gets the
    position (m - 0.5) / N.

    Side effect: *sample* is sorted in place (ascending). The calling code
    plots the returned positions against the same, now sorted, object, so
    the in-place sort is part of the contract and is deliberately kept.

    Parameters
    ----------
    sample : mutable numeric sequence providing ``.sort()`` and ``len()``
        (for example a list or a NumPy array).

    Returns
    -------
    list of float
        One plotting position per value; an empty list for an empty sample.
    """
    sample.sort()  # callers rely on the argument being sorted afterwards
    size = len(sample)
    return [(rank - 0.5) / size for rank in range(1, size + 1)]
# Hazen plotting positions for both sets
set1_haz, set2_haz = hazen_pp(set1), hazen_pp(set2)
# Overlay the five plotting-position estimates for Set1 on a single figure.
# The `plt` alias is used because `import matplotlib.pyplot as plt` binds only `plt`;
# the bare name `matplotlib` is not bound by any import visible in this file.
myfigure = plt.figure(figsize=(12, 8))  # figure object; figsize fixes the aspect ratio
# Weibull uses a larger marker (s=50) than the other four methods (s=20)
plt.scatter(set1_wei, set1, color='blue', marker="^", s=50)
plt.scatter(set1_grin, set1, color='red', marker="o", s=20)
plt.scatter(set1_cun, set1, color='green', marker="s", s=20)
plt.scatter(set1_cal, set1, color='yellow', marker="p", s=20)
plt.scatter(set1_haz, set1, color='black', marker="*", s=20)
plt.xlabel("Density or Quantile Value")
plt.ylabel("Value")
plt.title("Quantile Plot for Set1 based on Weibull, Gringorton, Cunnane, California, and Hazen Plotting Functions")
plt.show()
Plot a histogram of Set1 with 10 bins.
import matplotlib.pyplot as plt
# Histogram of Set1 with 10 bins.
# The figure is created through `plt`: the import above binds only that alias,
# so the bare name `matplotlib` is not available.
myfigure = plt.figure(figsize=(10, 5))  # figure object; figsize fixes the aspect ratio
set1 = data['Set1']  # rebind set1 to the DataFrame column
set1.plot.hist(grid=False, bins=10, rwidth=1, color='navy')
plt.title('Histogram of Set1')
plt.xlabel('Value')
plt.ylabel('Counts')
plt.grid(axis='y', color='yellow', alpha=1)  # horizontal grid lines only
Plot a histogram of Set2 with 10 bins.
# Histogram of the Set2 column with 10 bins
set2 = data['Set2']  # rebind set2 to the DataFrame column
set2.plot.hist(bins=10, rwidth=1, color='darkorange', grid=False)
plt.title('Histogram of Set2')
plt.ylabel('Counts')
plt.xlabel('Value')
plt.grid(axis='y', alpha=1, color='yellow')  # horizontal grid lines only
Plot a histogram of both Set1 and Set2 and discuss the differences.
# Both columns of the frame as count histograms on one shared axes, 40 bins
fig, ax = plt.subplots()
data.plot.hist(ax=ax, bins=40, density=False, title='Histogram: Set1 vs. Set2')
ax.set_ylabel('Count')
ax.grid(axis='y')  # horizontal grid lines only
The cool 'seaborn' package: Another way for plotting histograms and more!
import seaborn as sns
# Distribution plot of each set with rug marks (rug=True), Set1 in navy and
# Set2 in dark orange.
# NOTE(review): distplot is reported as deprecated in recent seaborn releases —
# confirm against the installed version before relying on it.
sns.distplot(set1,color='navy', rug=True)
sns.distplot(set2,color='darkorange', rug=True)
Kernel Density Estimation (KDE): a non-parametric way to estimate the probability density function of a random variable. Kernel density estimation is a fundamental data smoothing problem where inferences about the population are made, based on a finite data sample. This can be useful if you want to visualize just the “shape” of some data, as a kind of continuous replacement for the discrete histogram.
*From:
https://en.wikipedia.org/wiki/Kernel_density_estimation
https://mathisonian.github.io/kde/ >> A SUPERCOOL Blog!
https://www.youtube.com/watch?v=fJoR3QsfXa0 >> A Nice Intro to distplot in seaborn | Note that displot is pretty much the same thing!
# NOTE(review): these calls presumably came from separate notebook cells whose
# boundaries are not visible here — confirm the grouping before merging them.
# Set1 with the KDE curve requested explicitly and rug marks added
sns.distplot(set1,color='navy',kde=True,rug=True)
# Set1 (navy) followed by Set2 (orange), both with kde=True
sns.distplot(set1,color='navy',kde=True)
sns.distplot(set2,color='orange',kde=True)
# Set1 once more with kde=True
sns.distplot(set1,color='navy',kde=True)
Empirical Cumulative Distribution Function (ECDF): the distribution function associated with the empirical measure of a sample. This cumulative distribution function is a step function that jumps up by 1/n at each of the n data points. Its value at any specified value of the measured variable is the fraction of observations of the measured variable that are less than or equal to the specified value.
*From:
https://en.wikipedia.org/wiki/Empirical_distribution_function
Fit a Normal distribution data model to both Set1 and Set2. Plot them separately. Describe the fit.
# Start again from the DataFrame columns, as NumPy arrays
set1 = np.array(data['Set1'])
set2 = np.array(data['Set2'])
# Recompute the Weibull plotting positions (weibull_pp sorts each array in place)
set1_wei, set2_wei = weibull_pp(set1), weibull_pp(set2)
# Normal cumulative distribution function (CDF)
import math
def normdist(x,mu,sigma):
    """Return the normal cumulative probability at *x*.

    Computes (1 + erf((x - mu) / (sqrt(2) * sigma))) / 2.

    Parameters
    ----------
    x : scalar value at which the CDF is evaluated (``math.erf`` takes a
        single number, not an array).
    mu : mean of the distribution.
    sigma : standard deviation of the distribution.

    Returns
    -------
    float
        Cumulative probability between 0 and 1.

    Raises
    ------
    ZeroDivisionError
        If *sigma* is a Python zero, because the argument is divided by it.
    """
    standardized = (x - mu) / (math.sqrt(2.0) * sigma)
    return (1.0 + math.erf(standardized)) / 2.0
# For set1
mu = set1.mean()  # Fitted Model
# NOTE(review): set1 is a NumPy array at this point, and ndarray.std() defaults to
# the population form (ddof=0) — confirm that matches the "sample variance" title.
sigma = set1.std()
# Evaluation grid: howMany equal steps from xlow up to 1.2 times the largest value
xlow = 0; xhigh = 1.2*max(set1); howMany = 100
xstep = (xhigh - xlow)/howMany
x = [xlow + i*xstep for i in range(howMany + 1)]
ycdf = [normdist(value, mu, sigma) for value in x]  # fitted normal CDF on the grid
# Fitting Data to Normal Data Model
# Now plot the sample values and plotting position.
# The `plt` alias is used because `import matplotlib.pyplot as plt` binds only `plt`;
# the bare name `matplotlib` is not bound by any import visible in this file.
myfigure = plt.figure(figsize=(7, 9))  # figure object; figsize fixes the aspect ratio
plt.scatter(set1_wei, set1, color='navy')
plt.plot(ycdf, x, color='gold', linewidth=3)  # model curve: probability on x, value on y
plt.xlabel("Quantile Value")
plt.ylabel("Set1 Value")
mytitle = "Normal Distribution Data Model sample mean = : " + str(mu)+ " sample variance =:" + str(sigma**2)
plt.title(mytitle)
plt.show()
# For set2
mu = set2.mean()  # Fitted Model
# NOTE(review): set2 is a NumPy array at this point, and ndarray.std() defaults to
# the population form (ddof=0) — confirm that matches the "sample variance" title.
sigma = set2.std()
# Evaluation grid: howMany equal steps from xlow up to 1.2 times the largest value
xlow = 0; xhigh = 1.2*max(set2); howMany = 100
xstep = (xhigh - xlow)/howMany
x = [xlow + i*xstep for i in range(howMany + 1)]
ycdf = [normdist(value, mu, sigma) for value in x]  # fitted normal CDF on the grid
# Fitting Data to Normal Data Model
# Now plot the sample values and plotting position.
# The `plt` alias is used because `import matplotlib.pyplot as plt` binds only `plt`;
# the bare name `matplotlib` is not bound by any import visible in this file.
myfigure = plt.figure(figsize=(7, 9))  # figure object; figsize fixes the aspect ratio
plt.scatter(set2_wei, set2, color='orange')
plt.plot(ycdf, x, color='purple', linewidth=3)  # model curve: probability on x, value on y
plt.xlabel("Quantile Value")
plt.ylabel("Set2 Value")
mytitle = "Normal Distribution Data Model sample mean = : " + str(mu)+ " sample variance =:" + str(sigma**2)
plt.title(mytitle)
plt.show()
Since it was an appropriate fit, we can use the normal distribution to randomly generate another sample from the same population. Plot histograms of the newly generated sets and compare them visually.
# Mean and standard deviation of each observed set parameterise the synthetic draws
mu1, sd1 = set1.mean(), set1.std()
mu2, sd2 = set2.mean(), set2.std()
# 100 normally distributed values per set, Set1 drawn first
set1_s = np.random.normal(loc=mu1, scale=sd1, size=100)
set2_s = np.random.normal(loc=mu2, scale=sd2, size=100)
data_d = pd.DataFrame({'Set1s': set1_s, 'Set2s': set2_s})
# Count histogram of the two synthetic samples on shared axes
fig, ax = plt.subplots()
data_d.plot.hist(ax=ax, bins=40, density=False, title='Histogram: Set1 samples vs. Set2 samples')
ax.set_ylabel('Count')
ax.grid(axis='y')  # horizontal grid lines only
# Synthetic samples first, then the observed data, drawn on the same axes
fig, ax = plt.subplots()
data_d.plot.hist(ax=ax, bins=40, density=False, title='Histogram: Set1 and Set1 samples vs. Set2 and Set2 samples')
data.plot.hist(ax=ax, bins=40, density=False)
ax.set_ylabel('Count')
ax.grid(axis='y')  # horizontal grid lines only
Use boxplots to compare the four sets. Discuss their differences.
# One box per set, in the order Set1, Set1 sample, Set2, Set2 sample.
# The second and third arguments are spelled out by keyword: notch, then sym.
fig = plt.figure(figsize=(10, 7))
plt.boxplot([set1, set1_s, set2, set2_s], notch=1, sym='')
plt.show()
The first pair and the second pair each look similar, while the two pairs look different from each other, right? The question is: how can we KNOW whether two sets are truly (significantly) different or not?