# Preamble script block to identify host, user, and kernel.
# NOTE: the "!" lines below are IPython/Jupyter shell escapes, not plain
# Python — this cell only runs inside a notebook kernel.
import sys
! hostname
! whoami
print(sys.executable)    # path of the interpreter backing this kernel
print(sys.version)       # human-readable Python version string
print(sys.version_info)  # structured (major, minor, micro, ...) tuple
import numpy as np
import pandas as pd
import statistics
import scipy.stats
from matplotlib import pyplot as plt
# Examples: evaluate the normal (Gaussian) probability density function.
# In a notebook only the last expression's value is displayed; in a plain
# script these results are computed and discarded.
scipy.stats.norm.pdf(3, 3, 1)  # density of N(mean=3, sd=1) at x=3 — the peak, 1/sqrt(2*pi)
scipy.stats.norm.pdf(3, 7, 2)  # density of N(mean=7, sd=2) at x=3
scipy.stats.norm.pdf(2, 4, 1) * scipy.stats.norm.pdf(6, 4, 1)  # product of two independent N(4,1) densities
def find_line(xs, ys):
    """Fit a least-squares line ``y = slope * x + intercept`` to the points.

    Parameters
    ----------
    xs, ys : sequences of numbers, equal length
        The x- and y-coordinates of the observed points.

    Returns
    -------
    (slope, intercept) : tuple of floats

    Raises
    ------
    ValueError
        If the inputs are empty, have mismatched lengths, or all x values
        are identical (vertical line: the slope is undefined).
    """
    n = len(xs)
    if n == 0:
        raise ValueError("need at least one data point")
    if len(ys) != n:
        raise ValueError("xs and ys must have the same length")
    # means of each coordinate
    x_bar = sum(xs) / n
    y_bar = sum(ys) / n
    # slope = covariance(x, y) / variance(x), accumulated over paired points
    num = sum((x - x_bar) * (y - y_bar) for x, y in zip(xs, ys))
    denom = sum((x - x_bar) ** 2 for x in xs)
    if denom == 0:
        raise ValueError("slope undefined: all x values are identical")
    slope = num / denom
    # the fitted line always passes through the centroid (x_bar, y_bar)
    intercept = y_bar - slope * x_bar
    return slope, intercept
Elapsed Time (s) | Speed (m/s) |
---|---|
0 | 0 |
1.0 | 3 |
2.0 | 7 |
3.0 | 12 |
4.0 | 20 |
5.0 | 30 |
6.0 | 45.6 |
7.0 | 60.3 |
8.0 | 77.7 |
9.0 | 97.3 |
10.0 | 121.2 |
# Observed data from the table above (elapsed time in s, speed in m/s).
# NOTE: `time` shadows the stdlib `time` module name — harmless here because
# that module is never imported, but worth renaming in a longer script.
time = [0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
speed = [0, 3, 7, 12, 20, 30, 45.6, 60.3, 77.7, 97.3, 121.2]
find_line(time, speed) # fit a least-squares line — is this similar to our past results?!
# Predict speeds with the fitted straight line y = alpha + beta * t.
X = np.array(time)
# Coefficients copied from the find_line(time, speed) fit above.
alpha = -16.78636363636364   # intercept
beta = 11.977272727272727    # slope
ypred = beta * X + alpha
# Plot the fitted regression line on top of the observed data points.
plt.figure(figsize=(12, 6))
plt.plot(X, speed, 'o') # scatter plot showing actual data
plt.plot(X, ypred, 'r', linewidth=2) # regression line
plt.xlabel('Time (s)')
plt.ylabel('Speed (m/s)')
plt.title('model vs observed')
plt.show()
The Mean Absolute Error (or MAE) is the average of the absolute differences between predictions and actual values. It gives an idea of how wrong the predictions were. The measure gives an idea of the magnitude of the error, but no idea of the direction (e.g. over- or under-predicting). Here is the formula: $\mathrm{MAE} = \frac{1}{n}\sum_{i=1}^{n} |y_i - x_i|$
It is thus an arithmetic average of the absolute errors $|e_i| = |y_i - x_i|$, where $y_i$ is the prediction and $x_i$ the true value. This is known as a scale-dependent accuracy measure and therefore cannot be used to make comparisons between series that use different scales.
# --- Mean Absolute Error, computed by hand ---
d = speed - ypred               # residual vector: observed minus predicted
mae_m = np.abs(d).mean()        # average magnitude of the residuals
print("Results by manual calculation:")
print("MAE:", mae_m)
# Cross-check against scikit-learn's implementation.
import sklearn.metrics as metrics
mae = metrics.mean_absolute_error(speed, ypred)
print(mae)
The Mean Squared Error (or MSE) is much like the mean absolute error in that it provides a gross idea of the magnitude of error. It measures the average of the squares of the errors — that is, the average squared difference between the estimated values and the actual values. The MSE is a measure of the quality of an estimator — it is always non-negative, and values closer to zero are better. An MSE of zero, meaning that the estimator predicts observations of the parameter with perfect accuracy, is ideal (but typically not possible). Taking the square root of the mean squared error converts the units back to the original units of the output variable and can be meaningful for description and presentation. This is called the Root Mean Squared Error (or RMSE). RMSE is the most widely used metric for regression tasks. Here is the formula: $\mathrm{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n} (y_i - x_i)^2}$
# --- Mean Squared Error and its root, computed by hand ---
squared_err = np.square(d)      # elementwise squared residuals
mse_m = squared_err.mean()
rmse_m = np.sqrt(mse_m)         # back in the original units (m/s)
print("MSE:", mse_m)
print("RMSE:", rmse_m)
# Cross-check against scikit-learn's implementation.
mse = metrics.mean_squared_error(speed, ypred)
rmse = np.sqrt(mse)             # equivalently mse ** 0.5
print(mse)
print(rmse)
The R^2 (or R Squared) metric provides an indication of the goodness of fit of a set of predictions to the actual values. In statistical literature, this measure is called the coefficient of determination. It equals 1 for a perfect fit and 0 for a model no better than predicting the mean (and can even be negative for a model that fits worse than that). It provides a measure of how well observed outcomes are replicated by the model, based on the proportion of the total variation of outcomes explained by the model. Here is the formula: $R^2 = 1 - \frac{\sum_i (x_i - y_i)^2}{\sum_i (x_i - \bar{x})^2}$
# --- R-squared, computed by hand: 1 - SS_res / SS_tot ---
ss_res = sum(d ** 2)                           # residual sum of squares
ss_tot = sum((speed - np.mean(speed)) ** 2)    # total sum of squares about the mean
r2_m = 1 - ss_res / ss_tot
print("R-Squared:", r2_m)
# Cross-check against scikit-learn's implementation.
r2 = metrics.r2_score(speed, ypred)
print(r2)
This notebook was inspired by several blog posts, including:
Here are some great reads on these topics:
Here are some great videos on these topics: