%%html
<!--Script block to left align Markdown Tables-->
<style>
table {margin-left: 0 !important;}
</style>
Pandas is the core library for dataframe manipulation in Python. It provides high-performance, easy-to-use data structures (dataframes and series) and tools for working with them. The library's name is derived from the term 'Panel Data'. If you are curious about Pandas, this cheat sheet is recommended: https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf
The primary data structure is called a dataframe. It is an abstraction where data are represented as a 2-dimensional, mutable, heterogeneous tabular data structure, much like a worksheet in MS Excel. The structure itself is popular among statisticians, data scientists, and business executives.
According to the marketing department, "Pandas provides rich data structures and functions designed to make working with data fast, easy, and expressive. It is useful in data manipulation, cleaning, and analysis; Pandas excels in performance and productivity."
A data table is called a DataFrame in pandas (and in other programming environments too). The figure below from https://pandas.pydata.org/docs/getting_started/index.html illustrates the dataframe model. Each column and each row in a dataframe is a series; the header row and the index column are special. Like MS Excel, we can query the dataframe to find the contents of a particular cell using its row name and column name, or operate on entire rows and columns.
To use pandas, we need to import the module.
The CT concepts expressed within Pandas include:

Decomposition
: Data interpretation, manipulation, and analysis of Pandas dataframes is an act of decomposition, although the dataframes themselves can be quite complex.

Abstraction
: The dataframe is a data representation abstraction that allows for placeholder operations, later substituted with specific contents for a problem; this enhances reuse and readability. We leverage the principle of algebraic replacement using these abstractions.

Algorithms
: Data interpretation, manipulation, and analysis of dataframes are generally implemented as part of a supervisory algorithm.

In principle, Pandas should be available in a default Anaconda install.
How to check:

import pandas

If the notebook does not protest (i.e. no pink block of error), you are good to go. If you do get an error, you will have to install using conda or pip; you are on-your-own here! On the content server the process is:

su

then enter the root password, followed by

sudo -H /opt/jupyterhub/bin/python3 -m pip install pandas

(the sudo is needed because compthink is not in the sudo group), then

import pandas

as above. The process will be similar on a Macintosh, or on Windows if you did not use an Anaconda distribution. Best is to have a successful Anaconda install, or go to the GoodJobUntilMyOrgansGetHarvested.
If you have to do this kind of install, you will have to do some reading; some references I find useful are listed at the end of this notebook.
First, let's construct a dataframe-like object using Python primitives. We will construct three objects: a list of row names, a list of column names, and an array for the content.
import numpy
mytabular = numpy.random.randint(1,100,(5,4))
myrowname = ['A','B','C','D','E']
mycolname = ['W','X','Y','Z']
mytable = [['' for jcol in range(len(mycolname)+1)] for irow in range(len(myrowname)+1)] # empty destination matrix, built with a nested list comprehension (implied loops)
The above builds a placeholder named mytable for the pseudo-dataframe. Next we populate the table, using for loops to write the column names in the first row and the row names in the first column, and to fill in the rest of the table.
for irow in range(1,len(myrowname)+1): # write the row names
    mytable[irow][0]=myrowname[irow-1]
for jcol in range(1,len(mycolname)+1): # write the column names
    mytable[0][jcol]=mycolname[jcol-1]
for irow in range(1,len(myrowname)+1): # fill the table (note the nested loop)
    for jcol in range(1,len(mycolname)+1):
        mytable[irow][jcol]=mytabular[irow-1][jcol-1]
Now let's print the table out by row, and we see we have a very dataframe-like structure:
for irow in range(0,len(myrowname)+1):
    print(mytable[irow][0:len(mycolname)+1])
['', 'W', 'X', 'Y', 'Z']
['A', 3, 78, 15, 62]
['B', 82, 5, 27, 80]
['C', 36, 35, 63, 30]
['D', 88, 77, 55, 74]
['E', 9, 86, 44, 87]
We can also query by row
print(mytable[3][0:len(mycolname)+1])
['C', 36, 35, 63, 30]
Or by column
for irow in range(0,len(myrowname)+1): #cannot use implied loop in a column slice
    print(mytable[irow][2])
X
78
5
35
77
86
Or by row+column index; sort of looks like a spreadsheet syntax.
print(' ',mytable[0][3])
print(mytable[3][0],mytable[3][3])
  Y
C 63
We will now do the same using pandas
mydf = pandas.DataFrame(numpy.random.randint(1,100,(5,4)), ['A','B','C','D','E'], ['W','X','Y','Z'])
mydf
|   | W  | X  | Y  | Z  |
|---|----|----|----|----|
| A | 33 | 46 | 69 | 49 |
| B | 65 | 66 | 90 | 24 |
| C | 91 | 63 | 19 | 69 |
| D | 83 | 24 | 96 | 95 |
| E | 75 | 23 | 71 | 74 |
We can also turn our table into a dataframe; notice how the constructor adds a default integer header row and index column, so the labels embedded in mytable become ordinary data:
mydf1 = pandas.DataFrame(mytable)
mydf1
|   | 0 | 1  | 2  | 3  | 4  |
|---|---|----|----|----|----|
| 0 |   | W  | X  | Y  | Z  |
| 1 | A | 3  | 78 | 15 | 62 |
| 2 | B | 82 | 5  | 27 | 80 |
| 3 | C | 36 | 35 | 63 | 30 |
| 4 | D | 88 | 77 | 55 | 74 |
| 5 | E | 9  | 86 | 44 | 87 |
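If the original lists were no longer available, the labels embedded in mytable could be promoted directly; a minimal sketch under that assumption:

# Promote the first row to column labels and the first column to the index
body    = [row[1:] for row in mytable[1:]]   # the numeric cells
labels  = [row[0]  for row in mytable[1:]]   # 'A' ... 'E'
headers = mytable[0][1:]                     # 'W' ... 'Z'
mydf1_fixed = pandas.DataFrame(body, index=labels, columns=headers)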
To get proper behavior, we can just reuse our original objects
mydf2 = pandas.DataFrame(mytabular,myrowname,mycolname)
mydf2
|   | W  | X  | Y  | Z  |
|---|----|----|----|----|
| A | 3  | 78 | 15 | 62 |
| B | 82 | 5  | 27 | 80 |
| C | 36 | 35 | 63 | 30 |
| D | 88 | 77 | 55 | 74 |
| E | 9  | 86 | 44 | 87 |
Why are mydf and mydf2 different? (Hint: mydf was filled from a separate call to numpy.random.randint, so it holds a different random draw.)
The shape attribute, available once the dataframe is constructed, returns the row and column count of a dataframe.
mydf.shape
(5, 4)
mydf1.shape
(6, 5)
mydf2.shape
(5, 4)
To append a column, simply assign values to a new column name on the dataframe:
mydf['new']= 'NA'
mydf
|   | W  | X  | Y  | Z  | new |
|---|----|----|----|----|-----|
| A | 33 | 46 | 69 | 49 | NA  |
| B | 65 | 66 | 90 | 24 | NA  |
| C | 91 | 63 | 19 | 69 | NA  |
| D | 83 | 24 | 96 | 95 | NA  |
| E | 75 | 23 | 71 | 74 | NA  |
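A new column can also be computed from existing ones; a quick sketch (not executed here; the column name is hypothetical):

# Hypothetical derived column: elementwise sum of columns W and X
mydf['WplusX'] = mydf['W'] + mydf['X']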
Appending a row is sometimes a bit trickier, but here is one way:
newrow = mydf.loc[['E']].rename(index={"E": "X"}) # create a single row, rename the index
newtable = pandas.concat([mydf,newrow]) # concatenate the row to bottom of df - note the syntax
newtable
|   | W  | X  | Y  | Z  | new |
|---|----|----|----|----|-----|
| A | 33 | 46 | 69 | 49 | NA  |
| B | 65 | 66 | 90 | 24 | NA  |
| C | 91 | 63 | 19 | 69 | NA  |
| D | 83 | 24 | 96 | 95 | NA  |
| E | 75 | 23 | 71 | 74 | NA  |
| X | 75 | 23 | 71 | 74 | NA  |
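For completeness, assigning through the loc indexer with a new label also appends a row in place; a minimal sketch (not executed here, values hypothetical):

# Hypothetical in-place row append via the loc indexer
newtable.loc['F'] = [10, 20, 30, 40, 'NA']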
Removing a column is straightforward; we use the drop method:
newtable.drop('new', axis=1, inplace = True)
newtable
|   | W  | X  | Y  | Z  |
|---|----|----|----|----|
| A | 33 | 46 | 69 | 49 |
| B | 65 | 66 | 90 | 24 |
| C | 91 | 63 | 19 | 69 |
| D | 83 | 24 | 96 | 95 |
| E | 75 | 23 | 71 | 74 |
| X | 75 | 23 | 71 | 74 |
To remove a row, you really have to want to; easiest is probably to create a new dataframe with the row omitted:
newtable = newtable.loc[['A','B','D','E','X']] # select all rows except C
newtable
|   | W  | X  | Y  | Z  |
|---|----|----|----|----|
| A | 33 | 46 | 69 | 49 |
| B | 65 | 66 | 90 | 24 |
| D | 83 | 24 | 96 | 95 |
| E | 75 | 23 | 71 | 74 |
| X | 75 | 23 | 71 | 74 |
# or just use drop with axis=0 (rows) specified
newtable.drop('X', axis=0, inplace = True)
newtable
|   | W  | X  | Y  | Z  |
|---|----|----|----|----|
| A | 33 | 46 | 69 | 49 |
| B | 65 | 66 | 90 | 24 |
| D | 83 | 24 | 96 | 95 |
| E | 75 | 23 | 71 | 74 |
We have already been indexing, but a few examples follow:
newtable['X'] #Selecting a single column
A    46
B    66
D    24
E    23
Name: X, dtype: int64
newtable[['X','W']] #Selecting multiple columns
|   | X  | W  |
|---|----|----|
| A | 46 | 33 |
| B | 66 | 65 |
| D | 24 | 83 |
| E | 23 | 75 |
newtable.loc['E'] #Selecting a row based on its label via the loc[ ] indexer
W    75
X    23
Y    71
Z    74
Name: E, dtype: int64
newtable
|   | W  | X  | Y  | Z  |
|---|----|----|----|----|
| A | 33 | 46 | 69 | 49 |
| B | 65 | 66 | 90 | 24 |
| D | 83 | 24 | 96 | 95 |
| E | 75 | 23 | 71 | 74 |
newtable.loc[['E','D','B']] #Selecting multiple rows based on label via the loc[ ] indexer
|   | W  | X  | Y  | Z  |
|---|----|----|----|----|
| E | 75 | 23 | 71 | 74 |
| D | 83 | 24 | 96 | 95 |
| B | 65 | 66 | 90 | 24 |
newtable.loc[['B','E','D'],['X','Y']] #Selecting elements by both row and column via the loc[ ] indexer
|   | X  | Y  |
|---|----|----|
| B | 66 | 90 |
| E | 23 | 71 |
| D | 24 | 96 |
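The label-based loc indexer has a positional counterpart, iloc; a short sketch:

newtable.iloc[0]          # first row by position, whatever its label
newtable.iloc[0:2, 1:3]   # rows 0-1 and columns 1-2, by position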
mydf = pandas.DataFrame({'col1':[1,2,3,4,5,6,7,8],
'col2':[444,555,666,444,666,111,222,222],
'col3':['orange','apple','grape','mango','jackfruit','watermelon','banana','peach']})
mydf
|   | col1 | col2 | col3       |
|---|------|------|------------|
| 0 | 1    | 444  | orange     |
| 1 | 2    | 555  | apple      |
| 2 | 3    | 666  | grape      |
| 3 | 4    | 444  | mango      |
| 4 | 5    | 666  | jackfruit  |
| 5 | 6    | 111  | watermelon |
| 6 | 7    | 222  | banana     |
| 7 | 8    | 222  | peach      |
#What fruit corresponds to the number 555 in ‘col2’?
mydf[mydf['col2']==555]['col3']
1    apple
Name: col3, dtype: object
#What fruit corresponds to the minimum number in ‘col2’?
mydf[mydf['col2']==mydf['col2'].min()]['col3']
5    watermelon
Name: col3, dtype: object
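Conditions can be combined with & (and) and | (or); each clause must be parenthesized. A sketch:

# Fruits where col2 equals 444 AND col1 exceeds 2
mydf[(mydf['col2'] == 444) & (mydf['col1'] > 2)]['col3']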
#Creating a dataframe from a dictionary
mydf = pandas.DataFrame({'col1':[1,2,3,4,5,6,7,8],
'col2':[444,555,666,444,666,111,222,222],
'col3':['orange','apple','grape','mango','jackfruit','watermelon','banana','peach']})
mydf
|   | col1 | col2 | col3       |
|---|------|------|------------|
| 0 | 1    | 444  | orange     |
| 1 | 2    | 555  | apple      |
| 2 | 3    | 666  | grape      |
| 3 | 4    | 444  | mango      |
| 4 | 5    | 666  | jackfruit  |
| 5 | 6    | 111  | watermelon |
| 6 | 7    | 222  | banana     |
| 7 | 8    | 222  | peach      |
The head method returns the first few rows; useful to infer structure.
#Returns only the first five rows
mydf.head()
|   | col1 | col2 | col3      |
|---|------|------|-----------|
| 0 | 1    | 444  | orange    |
| 1 | 2    | 555  | apple     |
| 2 | 3    | 666  | grape     |
| 3 | 4    | 444  | mango     |
| 4 | 5    | 666  | jackfruit |
The info method returns the data model (data column count, names, non-null counts, and data types).
#Info about the dataframe
mydf.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   col1    8 non-null      int64
 1   col2    8 non-null      int64
 2   col3    8 non-null      object
dtypes: int64(2), object(1)
memory usage: 320.0+ bytes
The describe method returns summary statistics of each numeric column: the count, mean, and standard deviation, the minimum and maximum values, and the quartiles (from which the IQR, the Interquartile Range, can be computed). Again useful to understand the structure of the columns.
#Statistics of the dataframe
mydf.describe()
|       | col1    | col2     |
|-------|---------|----------|
| count | 8.00000 | 8.0000   |
| mean  | 4.50000 | 416.2500 |
| std   | 2.44949 | 211.8576 |
| min   | 1.00000 | 111.0000 |
| 25%   | 2.75000 | 222.0000 |
| 50%   | 4.50000 | 444.0000 |
| 75%   | 6.25000 | 582.7500 |
| max   | 8.00000 | 666.0000 |
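describe summarizes numeric columns by default; text columns can be summarized too. A minimal sketch (not executed here):

# Summary of object (text) columns: count, unique, top, freq
mydf.describe(include='object')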
There are also methods for counts and sums by specific columns
mydf['col2'].sum() #Sum of a specified column
3330
The unique method returns an array of the unique values (duplicates are filtered out; the underlying dataframe is preserved).
mydf['col2'].unique() #Returns the list of unique values along the indexed column
array([444, 555, 666, 111, 222])
The nunique method returns a count of unique values.
mydf['col2'].nunique() #Returns the total number of unique values along the indexed column
5
The value_counts() method returns the count of each unique value (kind of like a histogram, but with each value as its own bin).
mydf['col2'].value_counts() #Returns the number of occurences of each unique value
666    2
444    2
222    2
555    1
111    1
Name: col2, dtype: int64
The power of Pandas is the ability to apply a function to each element of a dataframe series (or a whole frame) by a technique called symbolic (or synthetic) programming application of the function. This employs principles of pattern matching, abstraction, and algorithm development; a holy trinity of Computational Thinking. It's somewhat complicated but quite handy, and best shown by an example:
def times2(x): # A prototype function to scalar multiply an object x by 2
return(x*2)
print(mydf)
print('Apply the times2 function to col2')
mydf['col2'].apply(times2) #Symbolically apply the function to each element of column col2; the result is a new series
   col1  col2        col3
0     1   444      orange
1     2   555       apple
2     3   666       grape
3     4   444       mango
4     5   666   jackfruit
5     6   111  watermelon
6     7   222      banana
7     8   222       peach
Apply the times2 function to col2

0     888
1    1110
2    1332
3     888
4    1332
5     222
6     444
7     444
Name: col2, dtype: int64
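A named helper is not required; an anonymous lambda gives the same symbolic application. A quick sketch:

# Equivalent application with a lambda in place of times2
mydf['col2'].apply(lambda x: x * 2)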
mydf.sort_values('col2', ascending = True) #Sorting based on columns
|   | col1 | col2 | col3       |
|---|------|------|------------|
| 5 | 6    | 111  | watermelon |
| 6 | 7    | 222  | banana     |
| 7 | 8    | 222  | peach      |
| 0 | 1    | 444  | orange     |
| 3 | 4    | 444  | mango      |
| 1 | 2    | 555  | apple      |
| 2 | 3    | 666  | grape      |
| 4 | 5    | 666  | jackfruit  |
mydf.sort_values('col3', ascending = True) #Lexicographic sort
|   | col1 | col2 | col3       |
|---|------|------|------------|
| 1 | 2    | 555  | apple      |
| 6 | 7    | 222  | banana     |
| 2 | 3    | 666  | grape      |
| 4 | 5    | 666  | jackfruit  |
| 3 | 4    | 444  | mango      |
| 0 | 1    | 444  | orange     |
| 7 | 8    | 222  | peach      |
| 5 | 6    | 111  | watermelon |
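sort_values also accepts a list of columns, with a matching list of directions, for tie-breaking; a sketch (not executed here):

# Sort by col2 ascending, then break ties by col3 descending
mydf.sort_values(['col2', 'col3'], ascending=[True, False])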
#Creating a dataframe from a dictionary
data = {
'key' : ['A', 'B', 'C', 'A', 'B', 'C'],
'data1' : [1, 2, 3, 4, 5, 6],
'data2' : [10, 11, 12, 13, 14, 15],
'data3' : [20, 21, 22, 13, 24, 25]
}
mydf1 = pandas.DataFrame(data)
mydf1
|   | key | data1 | data2 | data3 |
|---|-----|-------|-------|-------|
| 0 | A   | 1     | 10    | 20    |
| 1 | B   | 2     | 11    | 21    |
| 2 | C   | 3     | 12    | 22    |
| 3 | A   | 4     | 13    | 13    |
| 4 | B   | 5     | 14    | 24    |
| 5 | C   | 6     | 15    | 25    |
# Grouping and summing values in all the columns based on the column 'key'
mydf1.groupby('key').sum()
| key | data1 | data2 | data3 |
|-----|-------|-------|-------|
| A   | 5     | 23    | 33    |
| B   | 7     | 25    | 45    |
| C   | 9     | 27    | 47    |
# Grouping and summing values in the selected columns based on the column 'key'
mydf1.groupby('key')[['data1', 'data2']].sum()
| key | data1 | data2 |
|-----|-------|-------|
| A   | 5     | 23    |
| B   | 7     | 25    |
| C   | 9     | 27    |
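Other reductions work the same way, and agg lets each column get its own; a minimal sketch (not executed here):

# Mixed aggregation: total of data1, average of data2
mydf1.groupby('key').agg({'data1': 'sum', 'data2': 'mean'})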
Filtering and cleaning are often used to describe the process where data that do not support a narrative are removed (typically for maintenance-of-profit applications); if the data are actually missing, that is a common situation where cleaning is justified.
#Creating a dataframe from a dictionary
df = pandas.DataFrame({'col1':[1,2,3,4,None,6,7,None],
'col2':[444,555,None,444,666,111,None,222],
'col3':['orange','apple','grape','mango','jackfruit','watermelon','banana','peach']})
df
|   | col1 | col2  | col3       |
|---|------|-------|------------|
| 0 | 1.0  | 444.0 | orange     |
| 1 | 2.0  | 555.0 | apple      |
| 2 | 3.0  | NaN   | grape      |
| 3 | 4.0  | 444.0 | mango      |
| 4 | NaN  | 666.0 | jackfruit  |
| 5 | 6.0  | 111.0 | watermelon |
| 6 | 7.0  | NaN   | banana     |
| 7 | NaN  | 222.0 | peach      |
Below we drop any row that contains a NaN code.
df_dropped = df.dropna()
df_dropped
|   | col1 | col2  | col3       |
|---|------|-------|------------|
| 0 | 1.0  | 444.0 | orange     |
| 1 | 2.0  | 555.0 | apple      |
| 3 | 4.0  | 444.0 | mango      |
| 5 | 6.0  | 111.0 | watermelon |
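dropna can also operate column-wise, or tolerate a few missing values; a short sketch (not executed here):

df.dropna(axis=1)    # drop any column that contains a NaN
df.dropna(thresh=2)  # keep only rows with at least 2 non-NaN values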
Below we replace NaN codes with some value, in this case 0:
df_filled1 = df.fillna(0)
df_filled1
|   | col1 | col2  | col3       |
|---|------|-------|------------|
| 0 | 1.0  | 444.0 | orange     |
| 1 | 2.0  | 555.0 | apple      |
| 2 | 3.0  | 0.0   | grape      |
| 3 | 4.0  | 444.0 | mango      |
| 4 | 0.0  | 666.0 | jackfruit  |
| 5 | 6.0  | 111.0 | watermelon |
| 6 | 7.0  | 0.0   | banana     |
| 7 | 0.0  | 222.0 | peach      |
Below we replace NaN codes with some value, in this case the mean value of the column in which the missing-value code resides:
df_filled2 = df.fillna(df.mean())
df_filled2
|   | col1     | col2  | col3       |
|---|----------|-------|------------|
| 0 | 1.000000 | 444.0 | orange     |
| 1 | 2.000000 | 555.0 | apple      |
| 2 | 3.000000 | 407.0 | grape      |
| 3 | 4.000000 | 444.0 | mango      |
| 4 | 3.833333 | 666.0 | jackfruit  |
| 5 | 6.000000 | 111.0 | watermelon |
| 6 | 7.000000 | 407.0 | banana     |
| 7 | 3.833333 | 222.0 | peach      |
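One caution: on recent pandas releases, df.mean() raises an error when the frame holds non-numeric columns such as col3. Restricting the mean to the numeric columns is the version-robust spelling; a sketch:

# Fill NaN with column means, computed over numeric columns only
df_filled2 = df.fillna(df.mean(numeric_only=True))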
Pandas has methods to read common file types, such as csv, xlsx, and json. Ordinary text files are also quite manageable. On a machine you control, you can write a script to retrieve files from the internet and process them.
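As one illustration, a minimal sketch of fetching a CSV over HTTP with the requests library; the URL and filename here are hypothetical:

import requests  # assumed installed; otherwise pip install requests

url = "http://www.example.com/somefile.csv"   # hypothetical URL
response = requests.get(url)                  # fetch the raw bytes
with open('somefile.csv', 'wb') as f:         # save a local copy
    f.write(response.content)
somedata = pandas.read_csv('somefile.csv')    # then read as usual

The examples below instead read files already stored beside the notebook.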
readfilecsv = pandas.read_csv('CSV_ReadingFile.csv') #Reading a .csv file
print(readfilecsv)
    a   b   c   d
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
Similar to reading and writing .csv files, you can also read and write .xlsx files as below (useful to know this):
readfileexcel = pandas.read_excel('Excel_ReadingFile.xlsx', sheet_name='Sheet1') #Reading a .xlsx file
print(readfileexcel)
   Unnamed: 0   a   b   c   d
0           0   0   1   2   3
1           1   4   5   6   7
2           2   8   9  10  11
3           3  12  13  14  15
#Creating and writing to a .csv file
readfilecsv = pandas.read_csv('CSV_ReadingFile.csv')
readfilecsv.to_csv('CSV_WritingFile1.csv')
readfilecsv = pandas.read_csv('CSV_WritingFile1.csv')
print(readfilecsv)
   Unnamed: 0   a   b   c   d
0           0   0   1   2   3
1           1   4   5   6   7
2           2   8   9  10  11
3           3  12  13  14  15
#Creating and writing to a .csv file by excluding row labels
readfilecsv = pandas.read_csv('CSV_ReadingFile.csv')
readfilecsv.to_csv('CSV_WritingFile2.csv', index = False)
readfilecsv = pandas.read_csv('CSV_WritingFile2.csv')
print(readfilecsv)
    a   b   c   d
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
#Creating and writing to a .xlsx file
readfileexcel = pandas.read_excel('Excel_ReadingFile.xlsx', sheet_name='Sheet1')
readfileexcel.to_excel('Excel_WritingFile.xlsx', sheet_name='MySheet', index = False)
readfileexcel = pandas.read_excel('Excel_WritingFile.xlsx', sheet_name='MySheet')
print(readfileexcel)
   Unnamed: 0   a   b   c   d
0           0   0   1   2   3
1           1   4   5   6   7
2           2   8   9  10  11
3           3  12  13  14  15
Overland, B. (2018). Python Without Fear. Addison-Wesley. ISBN 978-0-13-468747-6.
Grus, J. (2015). Data Science from Scratch: First Principles with Python. O'Reilly Media. Kindle Edition.
Precord, C. (2010). wxPython 2.8 Application Development Cookbook. Packt Publishing Ltd., Birmingham B27 6PA, UK. ISBN 978-1-849511-78-0.
# Preamble script block to identify host, user, and kernel
import sys
! hostname
! whoami
print(sys.executable)
print(sys.version)
print(sys.version_info)
ip-172-26-4-2
compthink
/opt/jupyterhub/bin/python3
3.8.5 (default, Jul 28 2020, 12:59:40)
[GCC 9.3.0]
sys.version_info(major=3, minor=8, micro=5, releaselevel='final', serial=0)