Download this page as a jupyter notebook at Lesson 10
Copyright © 2021 Theodore G. Cleveland and Farhang Forghanparast
Last GitHub Commit Date: 10 September 2021
# Script block to identify host, user, and kernel
import sys
! hostname; ! whoami; ! pwd;
print(sys.executable)
atomickitty sensei /home/sensei/engr-1330-webroot/1-Lessons/Lesson09 /opt/jupyterhub/bin/python3
%%html
<!-- Script Block to set tables to left alignment -->
<style>
table {margin-left: 0 !important;}
</style>
The CT concepts expressed within Databases include:
Decomposition
: Break a problem down into smaller pieces; collections of data are decomposed into their smallest usable parts.
Abstraction
: A database is an abstraction of a collection of data.
Suppose we need to store a car as a collection of its parts (implying disassembly each time we park it); such decomposition would produce a situation like the image.
If the location of each part is recorded, then we can determine if something is missing, as in the next image.
In the image, the two missing parts are pretty large, and would be evident on a fully assembled car (missing front corner panel, and right rear tire). Smaller parts would be harder to track on the fully assembled object. However, if we had two fully assembled cars, and when we moved them heard the tink-tink-tink of a ball bearing bouncing on the floor, we would know something is missing - a query of the database to find where all the balls are supposed to be will help us figure out which car is incomplete.
In other contexts you wouldn’t want to have to take your car apart and store every piece separately whenever you park it in the garage. In that case, you would want to store your car as a single entry in the database (garage), and access its pieces through it (used car parts are usually sourced from fully assembled cars). The U.S. Air Force keeps a lot of otherwise broken aircraft for parts replacement. As a part is removed it is entered into a database ("a transaction") so they know that part is no longer in the broken aircraft lot but in service somewhere. So the database may locate a part in a bin in a hangar, or a part that is residing in an assembled aircraft. In either case, the hangar (and parts bin) as well as the broken aircraft are both within the database schema - an abstraction.
Databases are containers for data. A public library stores books, hence we could legitimately state that a library is a database of books. But strictly defined, databases are computer structures that save, organize, protect, and deliver data. A system that contains and manipulates databases is called a database management system, or DBMS.
A database can be thought of as a kind of electronic filing cabinet; it contains digitized information (“data”), which is kept in persistent storage of some kind. Users can insert new information into the database, and delete, change, or retrieve existing information in the database, by issuing requests or commands to the software that manages the database—which is to say, the database management system (DBMS).
In practice those user requests to the DBMS can be formulated in a variety of different ways (e.g., by pointing and clicking with a mouse). For our purposes, however, it’s more convenient to assume they’re expressed in the form of simple text strings in some formal language. Given a human resources database, for example, we might write an expression that represents a retrieval request (more usually known as a query) for employee information for employees whose job title is ‘Programmer’. A query submission and response is called a transaction.
The simplest form of databases is a text database. When data are organized in a text file in rows and columns, it can be used to store, organize, protect, and retrieve data. Saving a list of names in a file, starting with first name and followed by last name, would be a simple database. Each row of the file represents a record. You can update records by changing specific names, you can remove rows by deleting lines, and you can add new rows by adding new lines. The term "flat-file" database is a typical analog.
Desktop database programs are another type of database that's more complex than a flat-file text database yet still intended for a single user. A Microsoft Excel spreadsheet or Microsoft Access database are good examples of desktop database programs. These programs allow users to enter data, store it, protect it, and retrieve it when needed. The benefit of desktop database programs over text databases is the speed of changing data, and the ability to store comparatively large amounts of data while keeping performance of the system manageable.
Relational databases are the most common database systems. A relational database contains multiple tables of data with rows and columns that relate to each other through special key fields. These databases are more flexible than flat file structures, and provide functionality for reading, creating, updating, and deleting data. Relational databases use variations of Structured Query Language (SQL) - a standard user application that provides an easy programming interface for database interaction.
Some examples of Relational Database Management Systems (RDBMS) are SQL Server, Oracle Database, Sybase, Informix, and MySQL. Relational database management systems exhibit superior performance for managing large collections of data, and allow multiple users (even thousands!) to work with the data at the same time, with elaborate security to protect the data. RDBMS systems still store data in columns and rows, which in turn make up tables. A table in RDBMS is like a spreadsheet, or any other flat-file structure. A set of tables makes up a schema. A number of schemas create a database.
Emergent structures for storing data today are NoSQL and object-oriented databases. These do not follow the table/row/column approach of RDBMS. Instead, they build bookshelves of elements and allow access per bookshelf. So, instead of tracking individual words in books, NoSQL and object-oriented databases narrow down the data you are looking for by pointing you to the bookshelf, then a mechanical assistant works with the books to identify the exact word you are looking for.
The figure below shows sample values for a typical database, having to do with suppliers, parts, and shipments (of parts by suppliers).
Observe that this database contains three files, or tables. The tables are named S, P, and SP, respectively, and since they’re tables they’re made up of rows and columns (in conventional file terms, the rows correspond to records of the file in question and the columns to fields). They’re meant to be understood as follows:
Table S represents suppliers under contract. Each supplier has one supplier number (SNO), unique to that supplier; one name (SNAME), not necessarily unique (though the sample values shown in Figure 1-1 do happen to be unique); one status value (STATUS); and one location (CITY). Note: In the rest of this book I’ll abbreviate “suppliers under contract,” most of the time, to just suppliers.
Table P represents kinds of parts. Each kind of part has one part number (PNO), which is unique; one name (PNAME); one color (COLOR); one weight (WEIGHT); and one location where parts of that kind are stored (CITY). Note: In the rest of this book I’ll abbreviate “kinds of parts,” most of the time, to just parts.
Table SP represents shipments—it shows which parts are shipped, or supplied, by which suppliers. Each shipment has one supplier number (SNO); one part number (PNO); and one quantity (QTY). Also, there’s at most one shipment at any given time for a given supplier and given part, and so the combination of supplier number and part number is unique to any given shipment.
Real databases tend to be much more complicated than this “toy” example. However, we can make useful observations: these three tables are our schema (our framework, for lack of a better word), and at this point it is also our only schema, hence it is the PartsIsParts database (we have just named the database here).
First let's construct dataframe-like objects using python primitives and the PartsIsParts database schema.
parts = [['PNO','PNAME','COLOR','WEIGHT','CITY'],
['P1','Nut','Red',12.0,'London'],
['P2','Bolt','Green',17.0,'Paris'],
['P3','Screw','Blue',17.0,'Oslo'],
['P4','Screw','Red',14.0,'London'],
['P5','Cam','Blue',12.0,'Paris'],
['P6','Cog','Red',19.0,'London'],]
suppliers = [['SNO','SNAME','STATUS','CITY'],
['S1','Smith',20,'London'],
['S2','Jones',10,'Paris'],
['S3','Blake',30,'Paris'],
['S4','Clark',20,'London'],
['S5','Adams',30,'Athens'],]
shipments = [['SNO','PNO','QTY'],
['S1','P1',300],
['S1','P2',200],
['S1','P3',400],
['S1','P4',200],
['S1','P5',100],
['S1','P6',100],
['S2','P1',300],
['S2','P2',400],
['S3','P2',200],
['S4','P2',200],
['S4','P4',300],
['S4','P5',400]]
Let's examine some things:
In each table there are columns; these are called fields. There are also rows; these are called records. Hidden from view is a unique record identifier for each record in each table.
Now let's query our database: list all parts whose weight is less than 13 - how do we proceed?
For the toy problem this is not too hard:
for i in range(1,len(parts)):
if parts[i][3] < 13.0 :
print(parts[i])
['P1', 'Nut', 'Red', 12.0, 'London'] ['P5', 'Cam', 'Blue', 12.0, 'Paris']
Now let's query our database again: list all parts whose weight is less than 13 - but only list the part number, color, and city.
For the toy problem this is still not too hard, but immediately we see that if this keeps up it's going to get tricky fast! It would also be nice to be able to refer to a column by its name.
for i in range(1,len(parts)):
if parts[i][3] < 13.0 :
print(parts[i][0],parts[i][2],parts[i][4]) # slice the sublist
P1 Red London P5 Blue Paris
Now let's modify the contents of a table. Let's delete all instances of suppliers with status 10. Then, for the remaining suppliers, elevate their status by 5.
Again, for the toy problem this is manageable with list operations:
temp=[]
for i in range(0,len(suppliers)):
if suppliers[i][2] == 10 :
continue
else:
temp.append(suppliers[i]) # keep this supplier record
suppliers = temp # attempt to rewrite the original
for i in range(len(suppliers)):
print(suppliers[i])
['SNO', 'SNAME', 'STATUS', 'CITY'] ['S1', 'Smith', 20, 'London'] ['S3', 'Blake', 30, 'Paris'] ['S4', 'Clark', 20, 'London'] ['S5', 'Adams', 30, 'Athens']
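The second step described above (elevating each remaining supplier's status by 5) works the same way; a minimal sketch, assuming the same list-of-lists structure with the header row at index 0 (note that the supplier tables shown later in this lesson reflect only the deletion step):
for i in range(1, len(suppliers)):          # row 0 is the header row, so start at 1
    suppliers[i][2] = suppliers[i][2] + 5   # STATUS is the third field (index 2)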
Now suppose we want to find how many parts are shipped by suppliers located in London; our query gets more complex, but is still manageable.
However, we would ultimately like to build these queries in an easier fashion - that's what Pandas is for.
temp=[]
for i in range(0,len(suppliers)):
if suppliers[i][3] == 'London' :
temp.append(suppliers[i][0]) # get supplier code from london
else:
continue
howmany = 0 # keep count
for i in range(0,len(shipments)):
for j in range(len(temp)):
if shipments[i][0] == temp[j]:
howmany = howmany + shipments[i][2]
else:
continue
print(howmany)
2200
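For contrast, a sketch of the same London query written with pandas, using the suppliersdf and shipmentsdf dataframes that are constructed later in this lesson:
london_sno = suppliersdf[suppliersdf['CITY'] == 'London']['SNO']           # supplier numbers for London suppliers
total_qty = shipmentsdf[shipmentsdf['SNO'].isin(london_sno)]['QTY'].sum()  # sum the quantities they ship
print(total_qty)  # 2200, matching the loop-based result above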
Pandas is the core library for dataframe manipulation in Python. It provides high-performance tabular data structures and tools for working with them. The library’s name is derived from the term ‘Panel Data’. If you are curious about Pandas, this cheat sheet is recommended: https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf
The primary data structure is called a dataframe. It is an abstraction where data are represented as a 2-dimensional, mutable, heterogeneous tabular structure, much like a Worksheet in MS Excel. The structure itself is popular among statisticians, data scientists, and business executives.
According to the marketing department, "Pandas provides rich data structures and functions designed to make working with data fast, easy, and expressive. It is useful in data manipulation, cleaning, and analysis; Pandas excels in performance and productivity."
A data table is called a DataFrame
in pandas (and other programming environments too). The figure below from https://pandas.pydata.org/docs/getting_started/index.html illustrates a dataframe model:
Each column (and each extracted row) of a dataframe is a series; the header row and the index column are special.
Like MS Excel we can query the dataframe to find the contents of a particular cell
using its row name and column name, or operate on entire rows and columns
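A minimal sketch of that idea; the small frame and its row and column labels below are made up purely for illustration:
import pandas
tiny = pandas.DataFrame({'W': [1, 2], 'X': [3, 4]}, index=['A', 'B'])  # a 2x2 worksheet-like frame
print(tiny.loc['A', 'X'])   # contents of one cell: row 'A', column 'X'
print(tiny['X'])            # an entire column (a Series)
print(tiny.loc['B'])        # an entire row (also a Series)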
To use pandas, we need to import the module.
The CT concepts expressed within Pandas include:
Decomposition
: Data interpretation, manipulation, and analysis of Pandas dataframes is an act of decomposition -- although the dataframes can be quite complex.
Abstraction
: The dataframe is a data representation abstraction that allows for placeholder operations, later substituted with specific contents for a problem; this enhances reuse and readability. We leverage the principle of algebraic replacement using these abstractions.
Algorithms
: Data interpretation, manipulation, and analysis of dataframes are generally implemented as part of a supervisory algorithm.
In principle, Pandas should be available in a default Anaconda install.
How to check:
import pandas
If the notebook does not protest (i.e. produce a pink block of error text), you are good to go.
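A slightly more informative check, as a sketch, is to print the installed version string:
import pandas
print(pandas.__version__)   # any version string printed here means pandas imported cleanly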
If you do get an error, that means you will have to install using conda or pip; you are on-your-own here! On the content server the process is:
- su then enter the root password
- sudo -H /opt/jupyterhub/bin/python3 -m pip install pandas (the compthink account is not in the sudo group)
- import pandas as above.
The process above will be similar on a Macintosh, or on Windows if you did not use an Anaconda distribution. Best is to have a successful anaconda install, or go to the GoodJobUntilMyOrgansGetHarvested.
If you have to do this kind of install, you will have to do some reading; some references I find useful are:
%reset -f
Now let's repeat the example using Pandas. Here we will reuse the original lists (re-run the cells above that define parts, suppliers, and shipments if the %reset cleared them), so there is some extra work to get the structures just so.
import pandas
partsdf = pandas.DataFrame(parts)
partsdf.set_axis(parts[0][:],axis=1,inplace=True) # label the columns
partsdf.drop(0, axis=0, inplace = True) # remove the first row that held the column names
partsdf
PNO | PNAME | COLOR | WEIGHT | CITY | |
---|---|---|---|---|---|
1 | P1 | Nut | Red | 12 | London |
2 | P2 | Bolt | Green | 17 | Paris |
3 | P3 | Screw | Blue | 17 | Oslo |
4 | P4 | Screw | Red | 14 | London |
5 | P5 | Cam | Blue | 12 | Paris |
6 | P6 | Cog | Red | 19 | London |
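If your pandas version rejects the inplace argument to set_axis (it has been deprecated and removed in newer releases), an equivalent one-step construction, as a sketch (note the row index will start at 0 rather than 1):
partsdf = pandas.DataFrame(parts[1:], columns=parts[0])  # data rows only; the header row supplies the column labels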
suppliersdf = pandas.DataFrame(suppliers)
suppliersdf.set_axis(suppliers[0][:],axis=1,inplace=True) # label the columns
suppliersdf.drop(0, axis=0, inplace = True) # remove the first row that held the column names
suppliersdf
SNO | SNAME | STATUS | CITY | |
---|---|---|---|---|
1 | S1 | Smith | 20 | London |
2 | S3 | Blake | 30 | Paris |
3 | S4 | Clark | 20 | London |
4 | S5 | Adams | 30 | Athens |
shipmentsdf = pandas.DataFrame(shipments)
shipmentsdf.set_axis(shipments[0][:],axis=1,inplace=True) # label the columns
shipmentsdf.drop(0, axis=0, inplace = True) # remove the first row that held the column names
shipmentsdf
SNO | PNO | QTY | |
---|---|---|---|
1 | S1 | P1 | 300 |
2 | S1 | P2 | 200 |
3 | S1 | P3 | 400 |
4 | S1 | P4 | 200 |
5 | S1 | P5 | 100 |
6 | S1 | P6 | 100 |
7 | S2 | P1 | 300 |
8 | S2 | P2 | 400 |
9 | S3 | P2 | 200 |
10 | S4 | P2 | 200 |
11 | S4 | P4 | 300 |
12 | S4 | P5 | 400 |
Now let's learn about our three dataframes.
partsdf.shape # shape is an attribute (not a method); notice there is no argument list i.e. no ()
(6, 5)
suppliersdf.shape
(4, 4)
shipmentsdf.shape
(12, 3)
partsdf['COLOR'] #Selecting a single column
1 Red 2 Green 3 Blue 4 Red 5 Blue 6 Red Name: COLOR, dtype: object
partsdf[['COLOR','CITY']] #Selecting multiple columns - note the names are supplied as a list
COLOR | CITY | |
---|---|---|
1 | Red | London |
2 | Green | Paris |
3 | Blue | Oslo |
4 | Red | London |
5 | Blue | Paris |
6 | Red | London |
partsdf.loc[[5,6]] #Selecting rows by label via the loc[ ] indexer - note the row labels are supplied as a list
PNO | PNAME | COLOR | WEIGHT | CITY | |
---|---|---|---|---|---|
5 | P5 | Cam | Blue | 12 | Paris |
6 | P6 | Cog | Red | 19 | London |
Now let's query our dataframes: list all parts whose weight is less than 13.
Recall the loop-based version from before:
We have to do these same activities, but the pandas syntax is far more readable:
partsdf[partsdf['WEIGHT'] < 13] # from the dataframe partsdf, find all rows where column "WEIGHT" is less than 13, and return these rows
PNO | PNAME | COLOR | WEIGHT | CITY | |
---|---|---|---|---|---|
1 | P1 | Nut | Red | 12 | London |
5 | P5 | Cam | Blue | 12 | Paris |
Now let's query our dataframe: list all parts whose weight is less than 13 - but only list the part number, color, and city.
Again, a more readable syntax:
partsdf[partsdf['WEIGHT'] < 13][['PNO','COLOR','CITY']] # from the dataframe partsdf, find all rows where column "WEIGHT" is less than 13, and return the part number, color, and city from these rows
PNO | COLOR | CITY | |
---|---|---|---|
1 | P1 | Red | London |
5 | P5 | Blue | Paris |
shipmentsdf.head() # if you supply an argument you control how many rows are shown i.e. shipmentsdf.head(3) returns first 3 rows
SNO | PNO | QTY | |
---|---|---|---|
1 | S1 | P1 | 300 |
2 | S1 | P2 | 200 |
3 | S1 | P3 | 400 |
4 | S1 | P4 | 200 |
5 | S1 | P5 | 100 |
shipmentsdf.tail()
SNO | PNO | QTY | |
---|---|---|---|
8 | S2 | P2 | 400 |
9 | S3 | P2 | 200 |
10 | S4 | P2 | 200 |
11 | S4 | P4 | 300 |
12 | S4 | P5 | 400 |
#Info about the dataframe
suppliersdf.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 5 entries, 1 to 5 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 SNO 5 non-null object 1 SNAME 5 non-null object 2 STATUS 5 non-null object 3 CITY 5 non-null object dtypes: object(4) memory usage: 200.0+ bytes
The describe method returns summary statistics of each numeric column.
It also reports the minimum and maximum values and the quartiles (from which the IQR, the Interquartile Range, follows).
Again, this is useful for understanding the structure of the columns.
Our toy example contains limited numeric data, so describe is not too useful here - but in general it's super useful for engineering databases.
#Statistics of the dataframe
partsdf.describe()
PNO | PNAME | COLOR | WEIGHT | CITY | |
---|---|---|---|---|---|
count | 6 | 6 | 6 | 6.0 | 6 |
unique | 6 | 5 | 3 | 4.0 | 3 |
top | P3 | Screw | Red | 12.0 | London |
freq | 1 | 2 | 3 | 2.0 | 3 |
%reset -f
import numpy # we just reset the worksheet, so reimport the packages
import pandas
We will now build a dataframe directly using pandas (with numpy supplying some random content):
mydf = pandas.DataFrame(numpy.random.randint(1,100,(5,4)), ['A','B','C','D','E'], ['W','X','Y','Z'])
mydf
W | X | Y | Z | |
---|---|---|---|---|
A | 73 | 43 | 57 | 63 |
B | 31 | 74 | 47 | 53 |
C | 68 | 63 | 1 | 33 |
D | 77 | 22 | 93 | 45 |
E | 73 | 58 | 89 | 59 |
The shape attribute, which is available once the dataframe is constructed, returns the row and column counts of a dataframe.
mydf.shape
(5, 4)
To append a column, simply assign a value to a new column name in the dataframe:
mydf['new']= 'NA'
mydf
W | X | Y | Z | new | |
---|---|---|---|---|---|
A | 73 | 43 | 57 | 63 | NA |
B | 31 | 74 | 47 | 53 | NA |
C | 68 | 63 | 1 | 33 | NA |
D | 77 | 22 | 93 | 45 | NA |
E | 73 | 58 | 89 | 59 | NA |
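A new column can also be computed from existing columns rather than set to a constant; a sketch using the same frame (the column name rowsum is arbitrary and is not carried forward below):
mydf['rowsum'] = mydf['W'] + mydf['X'] + mydf['Y'] + mydf['Z']   # element-wise sum of the four numeric columns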
Appending a row is sometimes a bit trickier, but here is one way:
newrow = mydf.loc[['E']].rename(index={"E": "X"}) # create a single row, rename the index
newtable = pandas.concat([mydf,newrow]) # concatenate the row to bottom of df - note the syntax
newtable
W | X | Y | Z | new | |
---|---|---|---|---|---|
A | 73 | 43 | 57 | 63 | NA |
B | 31 | 74 | 47 | 53 | NA |
C | 68 | 63 | 1 | 33 | NA |
D | 77 | 22 | 93 | 45 | NA |
E | 73 | 58 | 89 | 59 | NA |
X | 73 | 58 | 89 | 59 | NA |
newtable.drop('new', axis=1, inplace = True)
newtable
W | X | Y | Z | |
---|---|---|---|---|
A | 73 | 43 | 57 | 63 |
B | 31 | 74 | 47 | 53 |
C | 68 | 63 | 1 | 33 |
D | 77 | 22 | 93 | 45 |
E | 73 | 58 | 89 | 59 |
X | 73 | 58 | 89 | 59 |
To remove a row, you really have to want to; the easiest way is probably to create a new dataframe with the row removed.
newtable = newtable.loc[['A','B','D','E','X']] # select all rows except C
newtable
W | X | Y | Z | |
---|---|---|---|---|
A | 73 | 43 | 57 | 63 |
B | 31 | 74 | 47 | 53 |
D | 77 | 22 | 93 | 45 |
E | 73 | 58 | 89 | 59 |
X | 73 | 58 | 89 | 59 |
# or just use drop with axis=0 specified
newtable.drop('X', axis=0, inplace = True)
newtable
W | X | Y | Z | |
---|---|---|---|---|
A | 73 | 43 | 57 | 63 |
B | 31 | 74 | 47 | 53 |
D | 77 | 22 | 93 | 45 |
E | 73 | 58 | 89 | 59 |
newtable['X'] #Selecting a single column
A 43 B 74 D 22 E 58 Name: X, dtype: int64
newtable[['X','W']] #Selecting multiple columns
X | W | |
---|---|---|
A | 43 | 73 |
B | 74 | 31 |
D | 22 | 77 |
E | 58 | 73 |
newtable.loc['E'] #Selecting a row by label via the loc[ ] indexer
W 73 X 58 Y 89 Z 59 Name: E, dtype: int64
newtable
W | X | Y | Z | |
---|---|---|---|---|
A | 73 | 43 | 57 | 63 |
B | 31 | 74 | 47 | 53 |
D | 77 | 22 | 93 | 45 |
E | 73 | 58 | 89 | 59 |
newtable.loc[['E','D','B']] #Selecting multiple rows by label via the loc[ ] indexer
W | X | Y | Z | |
---|---|---|---|---|
E | 73 | 58 | 89 | 59 |
D | 77 | 22 | 93 | 45 |
B | 31 | 74 | 47 | 53 |
newtable.loc[['B','E','D'],['X','Y']] #Selecting elements by both row and column labels via the loc[ ] indexer
X | Y | |
---|---|---|
B | 74 | 47 |
E | 58 | 89 |
D | 22 | 93 |
mydf = pandas.DataFrame({'col1':[1,2,3,4,5,6,7,8],
'col2':[444,555,666,444,666,111,222,222],
'col3':['orange','apple','grape','mango','jackfruit','watermelon','banana','peach']})
mydf
col1 | col2 | col3 | |
---|---|---|---|
0 | 1 | 444 | orange |
1 | 2 | 555 | apple |
2 | 3 | 666 | grape |
3 | 4 | 444 | mango |
4 | 5 | 666 | jackfruit |
5 | 6 | 111 | watermelon |
6 | 7 | 222 | banana |
7 | 8 | 222 | peach |
#What fruit corresponds to the number 555 in ‘col2’?
mydf[mydf['col2']==555]['col3']
1 apple Name: col3, dtype: object
#What fruit corresponds to the minimum number in ‘col2’?
mydf[mydf['col2']==mydf['col2'].min()]['col3']
5 watermelon Name: col3, dtype: object
#Creating a dataframe from a dictionary
mydf = pandas.DataFrame({'col1':[1,2,3,4,5,6,7,8],
'col2':[444,555,666,444,666,111,222,222],
'col3':['orange','apple','grape','mango','jackfruit','watermelon','banana','peach']})
mydf
col1 | col2 | col3 | |
---|---|---|---|
0 | 1 | 444 | orange |
1 | 2 | 555 | apple |
2 | 3 | 666 | grape |
3 | 4 | 444 | mango |
4 | 5 | 666 | jackfruit |
5 | 6 | 111 | watermelon |
6 | 7 | 222 | banana |
7 | 8 | 222 | peach |
#Returns only the first five rows
mydf.head()
col1 | col2 | col3 | |
---|---|---|---|
0 | 1 | 444 | orange |
1 | 2 | 555 | apple |
2 | 3 | 666 | grape |
3 | 4 | 444 | mango |
4 | 5 | 666 | jackfruit |
#Info about the dataframe
mydf.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8 entries, 0 to 7 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 col1 8 non-null int64 1 col2 8 non-null int64 2 col3 8 non-null object dtypes: int64(2), object(1) memory usage: 320.0+ bytes
The describe method returns summary statistics of each numeric column.
It also reports the minimum and maximum values and the quartiles (from which the IQR, the Interquartile Range, follows).
Again, this is useful for understanding the structure of the columns.
#Statistics of the dataframe
mydf.describe()
col1 | col2 | |
---|---|---|
count | 8.00000 | 8.0000 |
mean | 4.50000 | 416.2500 |
std | 2.44949 | 211.8576 |
min | 1.00000 | 111.0000 |
25% | 2.75000 | 222.0000 |
50% | 4.50000 | 444.0000 |
75% | 6.25000 | 582.7500 |
max | 8.00000 | 666.0000 |
mydf['col2'].sum() #Sum of a specified column
3330
The unique
method returns an array of the unique values (duplicates are filtered out; the underlying dataframe is preserved).
mydf['col2'].unique() #Returns the list of unique values along the indexed column
array([444, 555, 666, 111, 222])
The nunique
method returns a count of unique values
mydf['col2'].nunique() #Returns the total number of unique values along the indexed column
5
The value_counts()
method returns the count of each unique value (kind of like a histogram, but each value is the bin)
mydf['col2'].value_counts() #Returns the number of occurences of each unique value
222 2 444 2 666 2 111 1 555 1 Name: col2, dtype: int64
The power of Pandas includes the ability to apply a function to each element of a dataframe series (or a whole frame) by a technique called symbolic (or synthetic programming) application of the function.
This employs principles of pattern matching, abstraction, and algorithm development; a holy trinity of Computational Thinking.
It's somewhat complicated but quite handy, and is best shown by an example:
def times2(x): # A prototype function to scalar multiply an object x by 2
return(x*2)
print(mydf)
print('Apply the times2 function to col2')
mydf['reallynew'] = mydf['col2'].apply(times2) #Symbolic apply the function to each element of column col2, result is another dataframe
col1 col2 col3 0 1 444 orange 1 2 555 apple 2 3 666 grape 3 4 444 mango 4 5 666 jackfruit 5 6 111 watermelon 6 7 222 banana 7 8 222 peach Apply the times2 function to col2
mydf
col1 | col2 | col3 | reallynew | |
---|---|---|---|---|
0 | 1 | 444 | orange | 888 |
1 | 2 | 555 | apple | 1110 |
2 | 3 | 666 | grape | 1332 |
3 | 4 | 444 | mango | 888 |
4 | 5 | 666 | jackfruit | 1332 |
5 | 6 | 111 | watermelon | 222 |
6 | 7 | 222 | banana | 444 |
7 | 8 | 222 | peach | 444 |
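The same transformation can be written with an anonymous (lambda) function instead of a named one; a sketch:
mydf['reallynew'] = mydf['col2'].apply(lambda x: x*2)   # identical result to applying times2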
mydf.sort_values('col2', ascending = True) #Sorting based on columns
col1 | col2 | col3 | reallynew | |
---|---|---|---|---|
5 | 6 | 111 | watermelon | 222 |
6 | 7 | 222 | banana | 444 |
7 | 8 | 222 | peach | 444 |
0 | 1 | 444 | orange | 888 |
3 | 4 | 444 | mango | 888 |
1 | 2 | 555 | apple | 1110 |
2 | 3 | 666 | grape | 1332 |
4 | 5 | 666 | jackfruit | 1332 |
mydf.sort_values('col3', ascending = True) #Lexicographic sort
col1 | col2 | col3 | reallynew | |
---|---|---|---|---|
1 | 2 | 555 | apple | 1110 |
6 | 7 | 222 | banana | 444 |
2 | 3 | 666 | grape | 1332 |
4 | 5 | 666 | jackfruit | 1332 |
3 | 4 | 444 | mango | 888 |
0 | 1 | 444 | orange | 888 |
7 | 8 | 222 | peach | 444 |
5 | 6 | 111 | watermelon | 222 |
#Creating a dataframe from a dictionary
data = {
'key' : ['A', 'B', 'C', 'A', 'B', 'C'],
'data1' : [1, 2, 3, 4, 5, 6],
'data2' : [10, 11, 12, 13, 14, 15],
'data3' : [20, 21, 22, 13, 24, 25]
}
mydf1 = pandas.DataFrame(data)
mydf1
key | data1 | data2 | data3 | |
---|---|---|---|---|
0 | A | 1 | 10 | 20 |
1 | B | 2 | 11 | 21 |
2 | C | 3 | 12 | 22 |
3 | A | 4 | 13 | 13 |
4 | B | 5 | 14 | 24 |
5 | C | 6 | 15 | 25 |
# Grouping and summing values in all the columns based on the column 'key'
mydf1.groupby('key').sum()
data1 | data2 | data3 | |
---|---|---|---|
key | |||
A | 5 | 23 | 33 |
B | 7 | 25 | 45 |
C | 9 | 27 | 47 |
# Grouping and summing values in the selected columns based on the column 'key'
mydf1.groupby('key')[['data1', 'data2']].sum()
data1 | data2 | |
---|---|---|
key | ||
A | 5 | 23 |
B | 7 | 25 |
C | 9 | 27 |
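Other aggregations follow the same pattern; a sketch using the same mydf1:
print(mydf1.groupby('key').mean())              # per-key average of each numeric column
print(mydf1.groupby('key')['data1'].count())    # number of rows in each key group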
Filtering and cleaning are often used to describe the process where data that do not support a narrative are removed (typically for maintenance-of-profit applications); if the data are actually missing, that is a common situation where cleaning is justified.
#Creating a dataframe from a dictionary
df = pandas.DataFrame({'col1':[1,2,3,4,None,6,7,None],
'col2':[444,555,None,444,666,111,None,222],
'col3':['orange','apple','grape','mango','jackfruit','watermelon','banana','peach']})
df
col1 | col2 | col3 | |
---|---|---|---|
0 | 1.0 | 444.0 | orange |
1 | 2.0 | 555.0 | apple |
2 | 3.0 | NaN | grape |
3 | 4.0 | 444.0 | mango |
4 | NaN | 666.0 | jackfruit |
5 | 6.0 | 111.0 | watermelon |
6 | 7.0 | NaN | banana |
7 | NaN | 222.0 | peach |
Below we drop any row that contains a NaN
code.
df_dropped = df.dropna()
df_dropped
col1 | col2 | col3 | |
---|---|---|---|
0 | 1.0 | 444.0 | orange |
1 | 2.0 | 555.0 | apple |
3 | 4.0 | 444.0 | mango |
5 | 6.0 | 111.0 | watermelon |
Below we replace NaN
codes with some value, in this case 0
df_filled1 = df.fillna(0)
df_filled1
col1 | col2 | col3 | |
---|---|---|---|
0 | 1.0 | 444.0 | orange |
1 | 2.0 | 555.0 | apple |
2 | 3.0 | 0.0 | grape |
3 | 4.0 | 444.0 | mango |
4 | 0.0 | 666.0 | jackfruit |
5 | 6.0 | 111.0 | watermelon |
6 | 7.0 | 0.0 | banana |
7 | 0.0 | 222.0 | peach |
Below we replace NaN
codes with some value, in this case the mean value of the column in which the missing value code resides.
df_filled2 = df.fillna(df.mean())
df_filled2
col1 | col2 | col3 | |
---|---|---|---|
0 | 1.000000 | 444.0 | orange |
1 | 2.000000 | 555.0 | apple |
2 | 3.000000 | 407.0 | grape |
3 | 4.000000 | 444.0 | mango |
4 | 3.833333 | 666.0 | jackfruit |
5 | 6.000000 | 111.0 | watermelon |
6 | 7.000000 | 407.0 | banana |
7 | 3.833333 | 222.0 | peach |
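In recent pandas releases, df.mean() on a frame that still contains a text column may raise an error rather than silently skipping it; a hedged variant that restricts the means to the numeric columns:
df_filled2 = df.fillna(df.mean(numeric_only=True))   # column means computed over numeric columns only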
Pandas has methods to read common file types, such as csv
,xls
, and json
.
Ordinary text files are also quite manageable.
Specifying
engine='openpyxl'
in the read/write statement is required for the xml versions of Excel (.xlsx); the default handling assumes .xls regardless of file name. If you still encounter read errors, try opening the file in Excel and saving as .xls (Excel 97-2004 Workbook), or as a CSV if the structure is appropriate.
You may have to install the packages using something like
sudo -H /opt/jupyterhub/bin/python3 -m pip install xlwt openpyxl xlsxwriter xlrd
from the Anaconda Prompt interface (adjust the path to your system), or something like
sudo -H /opt/conda/envs/python/bin/python -m pip install xlwt openpyxl xlsxwriter xlrd
The files in the following examples are CSV_ReadingFile.csv and Excel_ReadingFile.xlsx.
readfilecsv = pandas.read_csv('CSV_ReadingFile.csv') #Reading a .csv file
print(readfilecsv)
a b c d 0 0 1 2 3 1 4 5 6 7 2 8 9 10 11 3 12 13 14 15
Similar to reading and writing .csv files, you can also read and write .xlsx files as below (useful to know this).
readfileexcel = pandas.read_excel('Excel_ReadingFile.xlsx', sheet_name='Sheet1', engine='openpyxl') #Reading a .xlsx file
print(readfileexcel)
Unnamed: 0 a b c d 0 0 0 1 2 3 1 1 4 5 6 7 2 2 8 9 10 11 3 3 12 13 14 15
#Creating and writing to a .csv file
readfilecsv = pandas.read_csv('CSV_ReadingFile.csv')
readfilecsv.to_csv('CSV_WritingFile1.csv') # write to local directory
readfilecsv = pandas.read_csv('CSV_WritingFile1.csv') # read the file back
print(readfilecsv)
Unnamed: 0 a b c d 0 0 0 1 2 3 1 1 4 5 6 7 2 2 8 9 10 11 3 3 12 13 14 15
#Creating and writing to a .csv file by excluding row labels
readfilecsv = pandas.read_csv('CSV_ReadingFile.csv')
readfilecsv.to_csv('CSV_WritingFile2.csv', index = False)
readfilecsv = pandas.read_csv('CSV_WritingFile2.csv')
print(readfilecsv)
a b c d 0 0 1 2 3 1 4 5 6 7 2 8 9 10 11 3 12 13 14 15
#Creating and writing to a .xls file
readfileexcel = pandas.read_excel('Excel_ReadingFile.xlsx', sheet_name='Sheet1', engine='openpyxl')
readfileexcel.to_excel('Excel_WritingFile.xlsx', sheet_name='Sheet1' , index = False, engine='openpyxl')
readfileexcel = pandas.read_excel('Excel_WritingFile.xlsx', sheet_name='Sheet1', engine='openpyxl')
print(readfileexcel)
Unnamed: 0 a b c d 0 0 0 1 2 3 1 1 4 5 6 7 2 2 8 9 10 11 3 3 12 13 14 15
This section shows how to get files from a remote computer. There are several ways to get the files; most importantly, you need the FQDN (the full URL) to the file. Here we assume:
- You know the FQDN to the file; it will be in the structure "http://server-name/.../filename.ext"
- The server is running ordinary (unencrypted) web services, i.e.
http://...
We will need a module to interface with the remote server. Here we will use requests
, so first we load the module
You may need to install the module into your anaconda environment using the anaconda power shell; on my computer the commands are:
- sudo -H /opt/jupyterhub/bin/python3 -m pip install requests
Or:
- sudo -H /opt/conda/envs/python/bin/python -m pip install requests
You will have to do some reading, but with any luck something similar will work for you.
import requests # Module to process http/https requests
Now we will generate a GET
request to the remote http server. I chose to do so using a variable to store the remote URL so I can reuse code in future projects. The GET
request (an http/https method) is generated with the requests method get
and assigned to an object named rget
-- the name is arbitrary. Next we extract the file from the rget
object and write it to a local file with the name of the remote file - essentially automating the download process. Then we import the pandas
module.
remote_url="http://54.243.252.9/engr-1330-webroot/4-Databases/all_quads_gross_evaporation.csv" # set the url
rget = requests.get(remote_url, allow_redirects=True) # get the remote resource, following redirects if any
open('all_quads_gross_evaporation.csv','wb').write(rget.content) # extract from the remote the contents, assign to a local file same name
import pandas as pd # Module to process dataframes (not absolutely needed but somewhat easier than using primitives, and gives graphing tools)
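A quick sanity check on the download, as a sketch, is to inspect the HTTP status code before trusting the saved file:
print(rget.status_code)   # 200 indicates the GET request succeeded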
Now we can read the file contents and check its structure, before proceeding.
#evapdf = pd.read_csv("all_quads_gross_evaporation.csv",parse_dates=["YYYY-MM"]) # Read the file as a .CSV assign to a dataframe evapdf
evapdf = pandas.read_csv("all_quads_gross_evaporation.csv")
evapdf.head() # check structure
YYYY-MM | 104 | 105 | 106 | 107 | 108 | 204 | 205 | 206 | 207 | ... | 911 | 912 | 1008 | 1009 | 1010 | 1011 | 1108 | 1109 | 1110 | 1210 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1954-01 | 1.80 | 1.80 | 2.02 | 2.24 | 2.24 | 2.34 | 1.89 | 1.80 | 1.99 | ... | 1.42 | 1.30 | 2.50 | 2.42 | 1.94 | 1.29 | 2.59 | 2.49 | 2.22 | 2.27 |
1 | 1954-02 | 4.27 | 4.27 | 4.13 | 3.98 | 3.90 | 4.18 | 4.26 | 4.27 | 4.26 | ... | 2.59 | 2.51 | 4.71 | 4.30 | 3.84 | 2.50 | 5.07 | 4.62 | 4.05 | 4.18 |
2 | 1954-03 | 4.98 | 4.98 | 4.62 | 4.25 | 4.20 | 5.01 | 4.98 | 4.98 | 4.68 | ... | 3.21 | 3.21 | 6.21 | 6.06 | 5.02 | 3.21 | 6.32 | 6.20 | 5.68 | 5.70 |
3 | 1954-04 | 6.09 | 5.94 | 5.94 | 6.07 | 5.27 | 6.31 | 5.98 | 5.89 | 5.72 | ... | 3.83 | 3.54 | 6.45 | 6.25 | 4.92 | 3.54 | 6.59 | 6.44 | 5.88 | 5.95 |
4 | 1954-05 | 5.41 | 5.09 | 5.14 | 4.40 | 3.61 | 5.57 | 4.56 | 4.47 | 4.18 | ... | 3.48 | 3.97 | 7.92 | 8.13 | 6.31 | 3.99 | 7.75 | 7.98 | 7.40 | 7.40 |
5 rows × 93 columns
Structure looks like a spreadsheet as expected; let's plot the time series for quadrant '911'.
evapdf.plot.line(x='YYYY-MM',y='911') # Plot quadrant 911 evaporation time series
<AxesSubplot:xlabel='YYYY-MM'>
evapdf[['911','912']] # pull out columns
911 | 912 | |
---|---|---|
0 | 1.42 | 1.30 |
1 | 2.59 | 2.51 |
2 | 3.21 | 3.21 |
3 | 3.83 | 3.54 |
4 | 3.48 | 3.97 |
... | ... | ... |
787 | 5.96 | 6.06 |
788 | 5.17 | 5.39 |
789 | 4.47 | 4.39 |
790 | 2.49 | 2.40 |
791 | 2.39 | 2.31 |
792 rows × 2 columns
evapdf[evapdf['YYYY-MM'] == "1993-01"][['911','912']] # get 2 columns from 1993-01 date in YYYY-MM
911 | 912 | |
---|---|---|
468 | 1.79 | 1.81 |
Overland, B. (2018). Python Without Fear. Addison-Wesley ISBN 978-0-13-468747-6.
Grus, Joel (2015). Data Science from Scratch: First Principles with Python O’Reilly Media. Kindle Edition.
Precord, C. (2010) wxPython 2.8 Application Development Cookbook Packt Publishing Ltd. Birmingham , B27 6PA, UK ISBN 978-1-849511-78-0.
Johnson, J. (2020). Python Numpy Tutorial (with Jupyter and Colab). Retrieved September 15, 2020, from https://cs231n.github.io/python-numpy-tutorial/
Willems, K. (2019). (Tutorial) Python NUMPY Array TUTORIAL. Retrieved September 15, 2020, from https://www.datacamp.com/community/tutorials/python-numpy-tutorial?utm_source=adwords_ppc
Willems, K. (2017). NumPy Cheat Sheet: Data Analysis in Python. Retrieved September 15, 2020, from https://www.datacamp.com/community/blog/python-numpy-cheat-sheet
W3resource. (2020). NumPy: Compare two given arrays. Retrieved September 15, 2020, from https://www.w3resource.com/python-exercises/numpy/python-numpy-exercise-28.php
Sorting https://www.programiz.com/python-programming/methods/list/sort
https://www.oreilly.com/library/view/relational-theory-for/9781449365431/ch01.html
https://realpython.com/pandas-read-write-files/#using-pandas-to-write-and-read-excel-files