from sklearn.datasets import load_digits, load_wine
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sklearn.datasets
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import  accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
%matplotlib inline

mnist = sklearn.datasets.fetch_openml('mnist_784')
x = mnist.data.to_numpy(dtype= np.int64)
y = mnist.target.to_numpy(dtype= np.int64)

/opt/anaconda3/lib/python3.11/site-packages/sklearn/datasets/_openml.py:968: FutureWarning: The default value of `parser` will change from `'liac-arff'` to `'auto'` in 1.4. You can set `parser='auto'` to silence this warning. Therefore, an `ImportError` will be raised from 1.4 if the dataset is dense and pandas is not installed. Note that the pandas parser may return different data types. See the Notes Section in fetch_openml's API doc for details.
  warn(

X_train, X_test, Y_train, Y_test = train_test_split(
    x[:2500], y[:2500], test_size=0.2, random_state=123, stratify=y[:2500])
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(2000, 784)
(500, 784)
(2000,)
(500,)

class knn:
	def __init__(self, k):
		self.k = k
		self.X_train = None
		self.Y_train = None

	def fit(self, X, Y):
		self.X_train = X
		self.Y_train = Y

	def distance(self, x1, x2):

		# returns the distance between two vectors
		# the distance is the L2 norm
		return np.sqrt(np.sum((x1 - x2) ** 2))

	def predict(self, x_test):
		# returns the predictions for multiple examples X
		# your solution here
		predictions = []
		for x in x_test:
			distances = [self.distance(x, x_train) for x_train in self.X_train]
			k_indices = np.argsort(distances)[:self.k]
			k_nearest_labels = [self.Y_train[i] for i in k_indices]
            
            # Find the most common label without using Counter
			unique_labels, counts = np.unique(k_nearest_labels, return_counts=True)
			most_common_label = unique_labels[np.argmax(counts)]
            
			predictions.append(most_common_label)
		return np.array(predictions)
                        
	def score(self, x_test, y_test):
		# Return the accuracy of your model on the test data.
		y_pred = self.predict(x_test)
		accuracy = np.mean(y_pred == y_test)
		return accuracy
    
	def training_score(self):
		y_pred = self.predict(self.X_train)
		accuracy = np.mean(y_pred == self.Y_train)
		return accuracy

from sklearn.neighbors import KNeighborsClassifier
sklearn_knn = KNeighborsClassifier(n_neighbors = 5)
sklearn_knn.fit(X_train, Y_train)
sklearn_knn_score = sklearn_knn.score(X_test, Y_test)
print("sklearn_knn_score:", sklearn_knn_score)

our_knn = knn(k = 5)
our_knn.fit(X_train, Y_train)
our_knn_score = our_knn.score(X_test, Y_test)
print("our_knn_score:", our_knn_score)

sklearn_knn_score: 0.912
our_knn_score: 0.912

Y_pred = our_knn.predict(X_test)

incorrect = [i for i, x in enumerate(Y_pred!=Y_test) if x]
correct = [i for i, x in enumerate(Y_pred==Y_test) if x]

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[5], line 1
----> 1 Y_pred = our_knn.predict(X_test)
      3 incorrect = [i for i, x in enumerate(Y_pred!=Y_test) if x]
      4 correct = [i for i, x in enumerate(Y_pred==Y_test) if x]

NameError: name 'our_knn' is not defined

NUM_OF_IMAGES = 8 # number of images to display
_, axes = plt.subplots(nrows=1, ncols=NUM_OF_IMAGES, figsize=(20, 20))
START = 0 # start index of the images to display
for ax, ind in zip(axes, list(range(START,START+NUM_OF_IMAGES))):
    ax.set_axis_off()
    image = X_test[incorrect[ind]].reshape(28, 28)
    ax.imshow(image, cmap=plt.cm.gray_r, interpolation="nearest")
    ax.set_title(f"Prediction: {Y_pred[incorrect[ind]]}")

_, axes = plt.subplots(nrows=1, ncols=NUM_OF_IMAGES, figsize=(20, 20))
START = 120 # start index of the images to display
for ax, ind in zip(axes, list(range(START,START+NUM_OF_IMAGES))):
    ax.set_axis_off()
    image = X_test[correct[ind]].reshape(28, 28)
    ax.imshow(image, cmap=plt.cm.gray_r, interpolation="nearest")
    ax.set_title(f"Prediction: {Y_pred[correct[ind]]}")

# Code for plots (2 points as mentioned above)
k_values = [1, 3, 5, 7, 9]
training_accuracies = []
test_accuracies = []

for k in k_values:
    knn_model = knn(k=k)
    knn_model.fit(X_train, Y_train)
    
    training_accuracy = knn_model.training_score()
    test_accuracy = knn_model.score(X_test, Y_test)
    
    training_accuracies.append(training_accuracy)
    test_accuracies.append(test_accuracy)
    
    print(f"k={k}, Training Accuracy: {training_accuracy}, Test Accuracy: {test_accuracy}")

plt.figure(figsize=(12, 6))
plt.plot(k_values, training_accuracies, marker='o', label='Training Accuracy')
plt.plot(k_values, test_accuracies, marker='s', label='Test Accuracy')
plt.xlabel('k value')
plt.ylabel('Accuracy')
plt.title('Training Accuracy and Test Accuracy for different k values')
plt.legend()
plt.grid(True)
plt.show()

k=1, Training Accuracy: 1.0, Test Accuracy: 0.912
k=3, Training Accuracy: 0.959, Test Accuracy: 0.918
k=5, Training Accuracy: 0.9375, Test Accuracy: 0.912
k=7, Training Accuracy: 0.9305, Test Accuracy: 0.916
k=9, Training Accuracy: 0.921, Test Accuracy: 0.898

"""
You can use built-in Numpy methods to implement your distance forumla but be sure to understand what equations those methods are using. 
"""
# Write the Python implementation here (1.5 points)
class knn_manhattan:
    def __init__(self, k):
        self.k = k
        self.X_train = None
        self.Y_train = None
    def fit(self, X, Y):
        self.X_train = X
        self.Y_train = Y
    def distance(self,x1,x2):
        return np.sum(np.abs(x1 - x2))
    def predict(self, x_test):
        num_test = x_test.shape[0]
        y_pred = np.zeros(num_test)
        for i in range(num_test):
            distances = np.array([self.distance(x_test[i], x_train) for x_train in self.X_train])
            knn_indices = np.argsort(distances)[:self.k]
            knn_labels = self.Y_train[knn_indices]
            y_pred[i] = np.argmax(np.bincount(knn_labels))
        return y_pred
    def score(self, x_test, y_test):
        y_pred = self.predict(x_test)
        accuracy = np.mean(y_pred == y_test)
        return accuracy
    def training_score(self):
        y_pred = self.predict(self.X_train)
        accuracy = np.mean(y_pred == self.Y_train)
        return accuracya

wine = load_wine()
X = wine.data
print(X.shape)
Y = wine.target
print(Y.shape)
print("feature names:", wine.feature_names)
print("target names:", wine.target_names)

(178, 13)
(178,)
feature names: ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']
target names: ['class_0' 'class_1' 'class_2']

kf = KFold(n_splits=5, shuffle=True, random_state=123) # Do not change the arguments. 

# Your implementation to finding the optimal k value below
k_values = range(1, 21)
average_accuracies = []
for k in k_values:
    fold_accuracies = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = y[train_index], y[test_index]
        knn_model = knn(k=k)
        knn_model.fit(X_train, Y_train)
        
        fold_accuracy = knn_model.score(X_test, Y_test)
        fold_accuracies.append(fold_accuracy)
    
    average_accuracy = np.mean(fold_accuracies)
    average_accuracies.append(average_accuracy)
    
    print(f"k={k}, Average Accuracy: {average_accuracy}")
    

    
optimal_k = k_values[np.argmax(average_accuracies)]
print("The optimal k value is {}.".format(optimal_k))

k=1, Average Accuracy: 0.04492063492063492
k=2, Average Accuracy: 0.05603174603174603
k=3, Average Accuracy: 0.07857142857142857
k=4, Average Accuracy: 0.07317460317460317
k=5, Average Accuracy: 0.073015873015873
k=6, Average Accuracy: 0.062063492063492064
k=7, Average Accuracy: 0.05063492063492063
k=8, Average Accuracy: 0.05047619047619047
k=9, Average Accuracy: 0.0334920634920635
k=10, Average Accuracy: 0.05587301587301587
k=11, Average Accuracy: 0.05603174603174603
k=12, Average Accuracy: 0.04492063492063492
k=13, Average Accuracy: 0.06158730158730159
k=14, Average Accuracy: 0.05063492063492063
k=15, Average Accuracy: 0.07285714285714286
k=16, Average Accuracy: 0.06158730158730159
k=17, Average Accuracy: 0.050317460317460316
k=18, Average Accuracy: 0.05587301587301587
k=19, Average Accuracy: 0.0673015873015873
k=20, Average Accuracy: 0.06174603174603175
The optimal k value is 3.

# Python implementation (1.5 points)
# You can keep the code you wrote for testing the new distance function. 
kf = KFold(n_splits=5, shuffle=True, random_state=123)

optimal_k = 3
fold_accuracies = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    knn_model = knn_manhattan(k=optimal_k)#using the manhattan distance model we built earlier
    knn_model.fit(X_train, y_train)
    
    fold_accuracy = knn_model.score(X_test, y_test)
    fold_accuracies.append(fold_accuracy)

average_accuracy = np.mean(fold_accuracies)
print(f"Average Accuracy with Manhattan Distance (k={optimal_k}): {average_accuracy}")

Average Accuracy with Manhattan Distance (k=3): 0.06746031746031746

# YOUR CODE HERE:

def normalize(X):
    means = np.mean(X, axis=0)
    stds = np.std(X, axis=0)
    X_normalized = (X - means) / stds
    return X_normalized

kf = KFold(n_splits=5, shuffle=True, random_state=123)
X_normalized = normalize(X)
fav_k = 3
fold_accuracies = []

for train_index, test_index in kf.split(X_normalized):
    X_train, X_test = X_normalized[train_index], X_normalized[test_index]
    y_train, y_test = y[train_index], y[test_index]
    knn_model = knn(k=fav_k)
    knn_model.fit(X_train, y_train)
    
    fold_accuracy = knn_model.score(X_test, y_test)
    fold_accuracies.append(fold_accuracy)

average_accuracy = np.mean(fold_accuracies)
print(f"Average Accuracy using L2 norm (k={fav_k})after normalization: {average_accuracy}")

Average Accuracy using L2 norm (k=3)after normalization: 0.08984126984126985

Part 1.¶

1.1¶

1.2 Testing and Traning¶

¶

Problem 2¶

2.1 (3 points)¶

2.2 (4 points)¶

2.3 (3 points)¶

2.4 (1 Point)¶