Download (right-click, save target as ...) this page as a jupyterlab notebook from: Lab14-TH
LAST NAME, FIRST NAME
R00000000
ENGR 1330 Laboratory 14 - Homework
# Preamble script block to identify host, user, and kernel
import sys
! hostname
! whoami
# report the interpreter path, version string, and version tuple of this kernel
for detail in (sys.executable, sys.version, sys.version_info):
    print(detail)
atomickitty sensei /opt/jupyterhub/bin/python3 3.8.10 (default, Sep 28 2021, 16:10:42) [GCC 9.3.0] sys.version_info(major=3, minor=8, micro=10, releaselevel='final', serial=0)
Recall in Lab10-TH that you accessed a file of concrete strength and related mixture variables.
! pip install requests
# Get database -- use the Get Data From URL script
# Step 1: import needed modules to interact with the internet
import requests

# Step 2: fetch the remote file and save a local copy
# (it's implementing the equivalent of "bash curl -O http://fqdn/path ...")
remote_url = 'http://54.243.252.9/engr-1330-webroot/8-Labs/Lab10/concreteData.xls'  # an Excel file
response = requests.get(remote_url, timeout=30)  # timeout so a dead server cannot hang the notebook
response.raise_for_status()  # stop on 4xx/5xx instead of saving an error page as the .xls file
# the context manager closes the local file even if the write raises
with open('concreteData.xls', 'wb') as output:
    output.write(response.content)  # write the downloaded bytes to the named local file
Requirement already satisfied: requests in /usr/lib/python3/dist-packages (2.22.0)
Then you changed some column names
import pandas
# load the downloaded spreadsheet into a dataframe
concreteData = pandas.read_excel('concreteData.xls')
# replacement column names; they are assigned to the existing columns by position
req_col_names = [
    "Cement", "BlastFurnaceSlag", "FlyAsh", "Water", "Superplasticizer",
    "CoarseAggregate", "FineAggregate", "Age", "CC_Strength",
]
curr_col_names = list(concreteData.columns)
# pair each existing column name with the replacement at the same position
mapper = {
    old_name: req_col_names[position]
    for position, old_name in enumerate(curr_col_names)
}
concreteData = concreteData.rename(columns=mapper)
concreteData.head()  # show the first rows of the renamed dataframe
Cement | BlastFurnaceSlag | FlyAsh | Water | Superplasticizer | CoarseAggregate | FineAggregate | Age | CC_Strength | |
---|---|---|---|---|---|---|---|---|---|
0 | 540.0 | 0.0 | 0.0 | 162.0 | 2.5 | 1040.0 | 676.0 | 28 | 79.986111 |
1 | 540.0 | 0.0 | 0.0 | 162.0 | 2.5 | 1055.0 | 676.0 | 28 | 61.887366 |
2 | 332.5 | 142.5 | 0.0 | 228.0 | 0.0 | 932.0 | 594.0 | 270 | 40.269535 |
3 | 332.5 | 142.5 | 0.0 | 228.0 | 0.0 | 932.0 | 594.0 | 365 | 41.052780 |
4 | 198.6 | 132.4 | 0.0 | 192.0 | 0.0 | 978.4 | 825.5 | 360 | 44.296075 |
Then you did the multiple plots
# ! pip install seaborn
import matplotlib.pyplot
import seaborn
%matplotlib inline
# build a seaborn pair plot from the columns of concreteData, then display it
seaborn.pairplot(concreteData)
matplotlib.pyplot.show()
So it's a cool plot, but the meaningful data science question is which variable(s) have predictive value for estimating concrete strength?
Answer by:
Using `Cement` as the predictor variable, what is its correlation coefficient?
$Strength_{model} = \beta_0 + \beta_1 \cdot Cement$
# correlation coefficients
concreteData.corr()
Cement | BlastFurnaceSlag | FlyAsh | Water | Superplasticizer | CoarseAggregate | FineAggregate | Age | CC_Strength | |
---|---|---|---|---|---|---|---|---|---|
Cement | 1.000000 | -0.275193 | -0.397475 | -0.081544 | 0.092771 | -0.109356 | -0.222720 | 0.081947 | 0.497833 |
BlastFurnaceSlag | -0.275193 | 1.000000 | -0.323569 | 0.107286 | 0.043376 | -0.283998 | -0.281593 | -0.044246 | 0.134824 |
FlyAsh | -0.397475 | -0.323569 | 1.000000 | -0.257044 | 0.377340 | -0.009977 | 0.079076 | -0.154370 | -0.105753 |
Water | -0.081544 | 0.107286 | -0.257044 | 1.000000 | -0.657464 | -0.182312 | -0.450635 | 0.277604 | -0.289613 |
Superplasticizer | 0.092771 | 0.043376 | 0.377340 | -0.657464 | 1.000000 | -0.266303 | 0.222501 | -0.192717 | 0.366102 |
CoarseAggregate | -0.109356 | -0.283998 | -0.009977 | -0.182312 | -0.266303 | 1.000000 | -0.178506 | -0.003016 | -0.164928 |
FineAggregate | -0.222720 | -0.281593 | 0.079076 | -0.450635 | 0.222501 | -0.178506 | 1.000000 | -0.156094 | -0.167249 |
Age | 0.081947 | -0.044246 | -0.154370 | 0.277604 | -0.192717 | -0.003016 | -0.156094 | 1.000000 | 0.328877 |
CC_Strength | 0.497833 | 0.134824 | -0.105753 | -0.289613 | 0.366102 | -0.164928 | -0.167249 | 0.328877 | 1.000000 |
# plotting functions (ok to use built-in in pandas)
import matplotlib.pyplot as plt
def make2plot(listx1, listy1, listx2, listy2, strlablx, strlably, strtitle,
              legend_labels=('Observations', 'Model')):
    """Plot one series as markers and a second series as a line on the same axes.

    Parameters
    ----------
    listx1, listy1 : x and y values of the first series, drawn as red
        triangle markers with no connecting line (the observations).
    listx2, listy2 : x and y values of the second series, drawn as a blue
        line (the model).
    strlablx, strlably : text for the x-axis and y-axis labels.
    strtitle : text for the plot title.
    legend_labels : two legend entries, in the order (first series, second
        series). Defaults to the labels this function always used.

    Returns None; the figure is displayed with plt.show().
    """
    plt.figure(figsize=(10, 5))  # 10 x 5 drawing canvas (wide, not square)
    plt.plot(listx1, listy1, c='red', marker='v', linewidth=0)  # data: markers only
    plt.plot(listx2, listy2, c='blue', linewidth=1)  # model: line only
    plt.xlabel(strlablx)
    plt.ylabel(strlably)
    plt.legend(list(legend_labels))
    plt.title(strtitle)
    plt.show()
# trial-and-error linear model: strength = b0 + b1 * Cement
b0 = 0
b1 = .09
cement = concreteData['Cement']
model = b0 + b1 * cement
# show the observations (markers) against the trial model (line)
make2plot(cement, concreteData['CC_Strength'], cement, model, "st1", "st2", "st3")
# assess model - sum of squares residuals
def residue(list1, list2, list3):
    '''
    Compute element-wise residues: list3[i] = list1[i] - list2[i].

    list1, list2 : equal-length sequences of numbers (lists, pandas Series, ...)
    list3        : pre-sized mutable sequence of the same length; it is
                   overwritten in place with the residues
    Returns list3 on success. If the three lengths differ, prints a message,
    leaves list3 untouched, and returns None.
    '''
    if not (len(list1) == len(list2) == len(list3)):
        print('Lists unequal length, undefined operations')
        return
    # Walk the inputs by position. Subscripting a pandas Series with an integer
    # is a label lookup, which fails when the index is not the default 0..n-1.
    for position, (left, right) in enumerate(zip(list1, list2)):
        list3[position] = left - right
    return list3
# residuals of the current model against the observed strengths
observed = concreteData['CC_Strength']
resids = [0] * len(observed)  # zero-filled placeholder, one slot per observation
residue(observed, model, resids)
print(sum(resids))  # sum of the raw residuals
resids = [r ** 2 for r in resids]
print(sum(resids))  # sum of the squared residuals
-6177.762599102981 326399.4857919762
Repeat the exercise using Age
as the predictor variable.
# trial-and-error linear model: strength = b0 + b1 * Age
b0 = 40
b1 = .01
age = concreteData['Age']
observed = concreteData['CC_Strength']
model = b0 + b1 * age
# show the observations (markers) against the trial model (line)
make2plot(age, observed, age, model, "st1", "st2", "st3")
# assess the model: sum of residuals, then sum of squared residuals
resids = [0] * len(observed)  # zero-filled placeholder, one slot per observation
residue(observed, model, resids)
print(sum(resids))
resids = [r ** 2 for r in resids]
print(sum(resids))
-4777.949099102985 302604.9763904364
# trial-and-error linear model: strength = b0 + b1 * Water
b0 = 130
b1 = -.5
water = concreteData['Water']
observed = concreteData['CC_Strength']
model = b0 + b1 * water
# show the observations (markers) against the trial model (line)
make2plot(water, observed, water, model, "st1", "st2", "st3")
# assess the model: sum of residuals, then sum of squared residuals
resids = [0] * len(observed)  # zero-filled placeholder, one slot per observation
residue(observed, model, resids)
print(sum(resids))
resids = [r ** 2 for r in resids]
print(sum(resids))
Which is the better model of the three you examined?