%%html
<!--Script block to left align Markdown Tables-->
<style>
table {margin-left: 0 !important;}
</style>
# Preamble script block to identify host, user, and kernel
import sys
! hostname
! whoami
print(sys.executable)
print(sys.version)
print(sys.version_info)
Last GitHub Commit Date: 16 Mar 2021
A procedure for using additional variables to make predictions. OLS is not confined to a single explanatory variable; we can consider a collection of explanatory variables.
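In matrix form (standard OLS notation, using the same symbols as the normal equations later in this notebook), the model with several explanatory variables is $\mathbf{Y}=\mathbf{X}\mathbf{\beta}+\mathbf{\epsilon}$, where each row of $\mathbf{X}$ holds a 1 (for the intercept) followed by that observation's explanatory values, and the least-squares coefficients come from solving $\mathbf{X^T}\mathbf{Y}=\mathbf{X^T}\mathbf{X}\mathbf{\beta}$.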
The CT concepts include:
https://inferentialthinking.com/chapters/15/Prediction.html
You know, the URL that no one reads, perhaps because there is a "secret" module you need to install with no instructions on how to do so!
As an example, consider the EcommerceCustomers data and the effect, if any, of advertising.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
ecom = pd.read_csv("EcommerceCustomers.csv") # Read the .CSV file into the dataframe ecom
ecom.head() # check structure
advrt = pd.read_csv("advertisingB.csv") # Read the file as a .CSV assign to a dataframe
advrt.head() # check structure
def linearsolver(A, b):
    # Gaussian elimination with partial pivoting, followed by back substitution
    n = len(A)
    M = [row[:] for row in A]            # work on a copy so the caller's matrix is not modified
    for i in range(n):
        M[i].append(b[i])                # build the augmented matrix [A | b]
    for k in range(n):
        # partial pivoting: swap in a row with a larger leading entry if one exists
        for i in range(k, n):
            if abs(M[i][k]) > abs(M[k][k]):
                M[k], M[i] = M[i], M[k]
        # eliminate entries below the pivot
        for j in range(k + 1, n):
            q = float(M[j][k]) / M[k][k]
            for m in range(k, n + 1):
                M[j][m] -= q * M[k][m]
    # back substitution
    x = [0.0 for i in range(n)]
    x[n - 1] = float(M[n - 1][n]) / M[n - 1][n - 1]
    for i in range(n - 1, -1, -1):
        z = 0.0
        for j in range(i + 1, n):
            z = z + float(M[i][j]) * x[j]
        x[i] = float(M[i][n] - z) / M[i][i]
    return x
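A quick sanity check on a tiny, made-up system with a known answer helps confirm the solver before applying it to the normal equations:
# hypothetical 2x2 system: 2x + y = 3, x + 3y = 5, whose solution is x = 0.8, y = 1.4
A_test = [[2.0, 1.0], [1.0, 3.0]]
b_test = [3.0, 5.0]
print(linearsolver(A_test, b_test))   # expect approximately [0.8, 1.4]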
def matrixvectormult(amatrix, xvector, rowNumA, colNumA):
    # multiply a rowNumA x colNumA matrix by a vector of length colNumA
    bvector = [0.0 for i in range(rowNumA)]
    for i in range(0, rowNumA):
        for j in range(0, 1):            # the result has a single column, so this loop runs once
            for k in range(0, colNumA):
                bvector[i] = bvector[i] + amatrix[i][k] * xvector[k]
    return bvector
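A similar made-up check for the matrix-vector multiply:
# hypothetical check: [[1, 2], [3, 4]] times [1, 1] should give [3, 7]
print(matrixvectormult([[1, 2], [3, 4]], [1, 1], 2, 2))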
apptime = ecom["Time on App"]              # predictor: average time on the mobile app
seslen = ecom["Avg. Session Length"]       # predictor: average session length
memlen = ecom["Length of Membership"]      # predictor: length of membership
webtime = ecom["Time on Website"]          # predictor: average time on the website
moneyshot = ecom["Yearly Amount Spent"]    # response: yearly amount spent
colNum = 5
rowNum = len(moneyshot)
excitation_matrix = [[0 for j in range(colNum)] for i in range(rowNum)]   # design matrix X
for i in range(0, rowNum):
    excitation_matrix[i][0] = 1            # intercept column of 1s
    excitation_matrix[i][1] = apptime[i]
    excitation_matrix[i][2] = seslen[i]
    excitation_matrix[i][3] = memlen[i]
    excitation_matrix[i][4] = webtime[i]
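A quick peek at the first row confirms the layout of the design matrix (intercept first, then the four predictors):
# inspect the first row: [1, app time, session length, membership length, website time]
print(excitation_matrix[0])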
# observe the triple for-loop structure and the counting scheme
xt_matrix = [[0 for j in range(rowNum)] for i in range(colNum)]   # transpose of the design matrix
for i in range(0, rowNum):
    for j in range(0, colNum):
        xt_matrix[j][i] = excitation_matrix[i][j]
rowNumA = 5          # A == transpose(excitation_matrix)
colNumA = rowNum
rowNumB = rowNum
colNumB = 5          # B == excitation_matrix
xtx_matrix = [[0 for j in range(colNumB)] for i in range(rowNumA)]
for i in range(0, rowNumA):
    for j in range(0, colNumB):
        for k in range(0, colNumA):
            xtx_matrix[i][j] = xtx_matrix[i][j] + xt_matrix[i][k] * excitation_matrix[k][j]
# observe the triple for-loop structure and the counting scheme
xty_vector = matrixvectormult(xt_matrix, moneyshot, rowNumA, colNumA)   # X^T y
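As an optional cross-check (numpy was imported above as np; this step is illustrative, not required), the hand-assembled products should agree with numpy's:
# optional cross-check of the normal-equation pieces against numpy
X_np = np.array(excitation_matrix, dtype=float)
y_np = np.array(moneyshot, dtype=float)
print(np.allclose(np.array(xtx_matrix), X_np.T @ X_np))   # expect True
print(np.allclose(np.array(xty_vector), X_np.T @ y_np))   # expect True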
# keep untouched copies of the normal-equation matrix and right-hand side for reference
cmatrix = [[xtx_matrix[i][j] for j in range(colNumB)] for i in range(rowNumA)]
dvector = [xty_vector[i] for i in range(rowNumA)]
dvector = linearsolver(xtx_matrix, xty_vector)     # solve X^T X beta = X^T y for the coefficients
dvector   # fitted coefficients: [intercept, app time, session length, membership length, website time]
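The same coefficients can also be recovered with numpy's least-squares routine; this is only a sanity check on dvector, not part of the hand-rolled workflow:
# optional check: numpy's least-squares fit should be close to dvector
beta_np, *_ = np.linalg.lstsq(np.array(excitation_matrix, dtype=float),
                              np.array(moneyshot, dtype=float), rcond=None)
print(beta_np)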
def moneyShot(app_time, session_length, member_length, website_time):
    # prediction equation using the fitted coefficients from dvector above
    spend = (-1051.5942552997137
             + 38.709153810827736 * app_time
             + 25.734271084677594 * session_length
             + 61.57732375487672 * member_length
             + 0.4367388355965311 * website_time)
    return spend
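A single call shows how the prediction equation is used; the input values below are hypothetical, not taken from the data:
# example prediction for hypothetical inputs
print(moneyShot(12.0, 33.0, 3.5, 37.0))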
predicted_spend = []
webtime = []                                  # reuse the name for a grid of website-time values
avgtim = ecom["Time on App"].mean()
avgses = ecom["Avg. Session Length"].mean()
avgmem = ecom["Length of Membership"].mean()
avgweb = ecom["Time on Website"].mean()
stdweb = ecom["Time on Website"].std()        # standard deviation of time on website
low = avgweb - 3 * stdweb
high = avgweb + 3 * stdweb
hilo = high - low
for i in range(101):                          # sweep website time across +/- 3 standard deviations
    wtime = low + float(i) * hilo / 100.
    webtime.append(wtime)
    predicted_spend.append(moneyShot(avgtim, avgses, avgmem, wtime))
plt.scatter(webtime, predicted_spend)
plt.xlabel('Time on Website - minutes')
plt.ylabel('Customer Annual Spend')
plt.title('Spend versus Time on Website')
plt.show()
# In class - add other components; examine apparent slope
https://scipython.com/book/chapter-8-scipy/examples/weighted-and-non-weighted-least-squares-fitting/
Consider our last lesson, where we used linear algebra to construct OLS regressions:
$\mathbf{X^T}\mathbf{Y}=\mathbf{X^T}\mathbf{X}\mathbf{\beta}$
If we were to insert an identity matrix (1s on the diagonal, 0s elsewhere), we could write:
$\mathbf{X^T}\mathbf{I}\mathbf{Y}=\mathbf{X^T}\mathbf{I}\mathbf{X}\mathbf{\beta}$ and there is no change.
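Weighted least squares replaces that identity with a diagonal weight matrix $\mathbf{W}$, with each diagonal entry typically proportional to the reciprocal of that observation's error variance, giving $\mathbf{X^T}\mathbf{W}\mathbf{Y}=\mathbf{X^T}\mathbf{W}\mathbf{X}\mathbf{\beta}$. This is the idea behind the statsmodels weighted least squares example that follows.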
# copy/adapted from https://www.statsmodels.org/0.6.1/examples/notebooks/generated/wls.html
from __future__ import print_function
import numpy as np
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from statsmodels.iolib.table import (SimpleTable, default_txt_fmt)
np.random.seed(1024)
nsample = 50
x = np.linspace(0, 20, nsample)
X = np.column_stack((x, (x - 5)**2))
X = sm.add_constant(X)
beta = [5., 0.5, -0.01]
sig = 0.5
w = np.ones(nsample)
w[int(nsample * 6/10):] = 3            # the last 40% of observations get 3x the error scale
y_true = np.dot(X, beta)
e = np.random.normal(size=nsample)
y = y_true + sig * w * e               # heteroscedastic noise
X = X[:,[0,1]]                         # fit only the constant and linear terms (quadratic omitted)
mod_wls = sm.WLS(y, X, weights=1./w)
res_wls = mod_wls.fit()
print(res_wls.summary())
res_ols = sm.OLS(y, X).fit()
print(res_ols.params)
print(res_wls.params)
# manual OLS prediction-interval ingredients: prediction variance and the t multiplier
covb = res_ols.cov_params()
prediction_var = res_ols.mse_resid + (X * np.dot(covb,X.T).T).sum(1)
prediction_std = np.sqrt(prediction_var)
tppf = stats.t.ppf(0.975, res_ols.df_resid)
prstd_ols, iv_l_ols, iv_u_ols = wls_prediction_std(res_ols)
prstd, iv_l, iv_u = wls_prediction_std(res_wls)
fig, ax = plt.subplots(figsize=(8,6))
ax.plot(x, y, 'o', label="Data")
ax.plot(x, y_true, 'b-', label="True")
# OLS
ax.plot(x, res_ols.fittedvalues, 'r--')
ax.plot(x, iv_u_ols, 'r--', label="OLS")
ax.plot(x, iv_l_ols, 'r--')
# WLS
ax.plot(x, res_wls.fittedvalues, 'g--.')
ax.plot(x, iv_u, 'g--', label="WLS")
ax.plot(x, iv_l, 'g--')
ax.legend(loc="best");
# feasible WLS: estimate the relative error scale of the two groups from the OLS residuals
resid1 = res_ols.resid[w==1.]
var1 = resid1.var(ddof=int(res_ols.df_model)+1)
resid2 = res_ols.resid[w!=1.]
var2 = resid2.var(ddof=int(res_ols.df_model)+1)
w_est = w.copy()
w_est[w!=1.] = np.sqrt(var2) / np.sqrt(var1)     # estimated scale ratio for the noisier group
res_fwls = sm.WLS(y, X, 1./w_est).fit()
print(res_fwls.summary())
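As a wrap-up, the three sets of coefficient estimates can be placed side by side; this is a minimal sketch assuming the fitted results above are still in scope:
import pandas as pd
# compare coefficient estimates from OLS, WLS with known weights, and feasible WLS
comparison = pd.DataFrame({"OLS": res_ols.params,
                           "WLS (known weights)": res_wls.params,
                           "FWLS (estimated weights)": res_fwls.params})
print(comparison)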