import pandas as pd
# Load the dataset from ckd.csv.
df = pd.read_csv("ckd.csv")
df  # notebook-style preview; has no effect when run as a plain script

# Keep only the three measurements used below plus the label column.
df = df[["Hemoglobin", "Blood Glucose Random", "White Blood Cell Count", "Class"]].copy()
df

# Convert each measurement to standard units: subtract the column mean and
# divide by the population standard deviation (ddof=0).
for raw_name, su_name in (
    ("Hemoglobin", "Hemoglobin_su"),
    ("Blood Glucose Random", "Glucose_su"),
    ("White Blood Cell Count", "WhiteBCC_su"),
):
    raw_values = df[raw_name]
    df[su_name] = (raw_values - raw_values.mean()) / raw_values.std(ddof=0)

# From here on only the standardized columns and the label are kept.
df = df[["Hemoglobin_su", "Glucose_su", "WhiteBCC_su", "Class"]].copy()
df
import matplotlib.pyplot as plt
# Split the rows by label and scatter both groups on the
# (Hemoglobin_su, Glucose_su) plane.
has_ckd = df["Class"] == 1
lacks_ckd = df["Class"] == 0
yes_ckd_df = df[has_ckd]
no_ckd_df = df[lacks_ckd]

for subset, legend_text in ((yes_ckd_df, "YES CKD"), (no_ckd_df, "NO CKD")):
    plt.scatter(x=subset["Hemoglobin_su"], y=subset["Glucose_su"], label=legend_text)

plt.xlabel("Hemoglobin")
plt.ylabel("Glucose")
plt.legend()
plt.show()
# Unlabeled point to classify, given as [Hemoglobin_su, Glucose_su].
new_patient = [0, 1.5]

has_ckd = df["Class"] == 1
lacks_ckd = df["Class"] == 0
yes_ckd_df = df[has_ckd]
no_ckd_df = df[lacks_ckd]

# Draw the unknown point first, then the two labeled groups.
patient_x, patient_y = new_patient
plt.scatter(x=patient_x, y=patient_y, color="red", label="Unknown")
for subset, legend_text in ((yes_ckd_df, "YES CKD"), (no_ckd_df, "NO CKD")):
    plt.scatter(x=subset["Hemoglobin_su"], y=subset["Glucose_su"], label=legend_text)

plt.xlabel("Hemoglobin")
plt.ylabel("Glucose")
plt.legend()
plt.show()

# Same coordinates again, under the name used by the distance code below.
new_point = [0, 1.5]
# Distance between new_point and each labeled point
import math as m
def euclide_distance(point1_x, point1_y, point2_x, point2_y):
    """Return the Euclidean distance between two points in the plane.

    The first point is (point1_x, point1_y) and the second is
    (point2_x, point2_y).
    """
    delta_x = point1_x - point2_x
    delta_y = point1_y - point2_y
    squared_length = delta_x**2 + delta_y**2
    return m.sqrt(squared_length)
# Distance from every labeled row to new_point, in the frame's row order,
# stored alongside the data as a "Distance" column.
distances_to_new_patient = [
    euclide_distance(row["Hemoglobin_su"], row["Glucose_su"], new_point[0], new_point[1])
    for _, row in df.iterrows()
]
df["Distance"] = distances_to_new_patient
# Order the rows from nearest to farthest and keep the 10 closest points.
df = df.sort_values(by="Distance")
closest_points = df.head(10)
closest_points

# Find the most common "Class" among the closest points; mode() can return
# several values, so take the first one.
neighbor_labels = closest_points["Class"]
neighbor_labels.mode().values[0]
# Plot the unknown point together with only its closest labeled points.
new_patient = [0, 1.5]

neighbor_has_ckd = closest_points["Class"] == 1
neighbor_lacks_ckd = closest_points["Class"] == 0
yes_ckd_df = closest_points[neighbor_has_ckd]
no_ckd_df = closest_points[neighbor_lacks_ckd]

patient_x, patient_y = new_patient
plt.scatter(x=patient_x, y=patient_y, color="red", label="Unknown")
for subset, legend_text in ((yes_ckd_df, "YES CKD"), (no_ckd_df, "NO CKD")):
    plt.scatter(x=subset["Hemoglobin_su"], y=subset["Glucose_su"], label=legend_text)

plt.xlabel("Hemoglobin")
plt.ylabel("Glucose")
plt.legend()
plt.show()
def classify(reference_points, new_point, k_neighbors=1):
    """Predict the class of ``new_point`` by vote among its nearest neighbours.

    Args:
        reference_points: DataFrame with "Hemoglobin_su", "Glucose_su" and
            "Class" columns. It is read only; no column is added to it.
        new_point: Sequence whose first two items are the coordinates to
            compare against "Hemoglobin_su" and "Glucose_su" respectively.
        k_neighbors: Number of nearest rows that take part in the vote.

    Returns:
        The most common "Class" value among the ``k_neighbors`` nearest rows.
        When several classes are equally common, the first value returned
        by ``Series.mode()`` is used.

    Raises:
        ValueError: If ``k_neighbors`` is less than 1 or ``reference_points``
            has no rows.
    """
    if k_neighbors < 1:
        raise ValueError("k_neighbors must be at least 1")
    if len(reference_points) == 0:
        raise ValueError("reference_points must contain at least one row")

    # Measure against the rows that were actually passed in (not a module
    # level frame), using column arithmetic instead of a per-row loop.
    delta_x = reference_points["Hemoglobin_su"] - new_point[0]
    delta_y = reference_points["Glucose_su"] - new_point[1]
    distances = (delta_x**2 + delta_y**2) ** 0.5

    # Pick neighbours by position so duplicate index labels cannot pull in
    # extra rows; a stable sort keeps the original row order among ties.
    nearest_positions = distances.to_numpy().argsort(kind="stable")[:k_neighbors]
    nearest_labels = reference_points["Class"].iloc[nearest_positions]
    return nearest_labels.mode().values[0]
import numpy as np
import matplotlib.pyplot as plt
# Shade a grid of candidate points by the class that the 1-nearest-neighbour
# classifier predicts, then overlay the labeled data on top.
has_ckd = df["Class"] == 1
lacks_ckd = df["Class"] == 0
yes_ckd_df = df[has_ckd]
no_ckd_df = df[lacks_ckd]

# Grid coordinates from -2 up to (but not including) 2, in steps of 0.1.
hemoglobins = np.arange(-2, 2, 0.1)
glucoses = np.arange(-2, 2, 0.1)

for h in hemoglobins:
    for g in glucoses:
        new_point = [h, g]
        predicted_val = classify(df, new_point, k_neighbors=1)
        # Blue marks a predicted class of 1; anything else is drawn orange.
        shade = "blue" if predicted_val == 1 else "orange"
        plt.scatter(x=h, y=g, color=shade, alpha=0.2)

for subset, shade, legend_text in (
    (yes_ckd_df, "blue", "YES CKD"),
    (no_ckd_df, "orange", "NO CKD"),
):
    plt.scatter(
        x=subset["Hemoglobin_su"],
        y=subset["Glucose_su"],
        color=shade,
        label=legend_text,
    )

plt.xlabel("Hemoglobin")
plt.ylabel("Glucose")
plt.legend()
plt.show()