# Notes on *Statistical Learning Methods*, Chapter 3: k-Nearest Neighbors

• I'm a beginner. The code in this article is reprinted (the source is credited at the end of the article); most of the comments are my own. If you spot any problems, please let me know.
1. The first program in this chapter computes the Minkowski (Lp) distance between two points
``````import math
# combinations Functions are used to combine , From n Remove from m individual
#  As a contrast ,permutations Function is used to sort , From n Take out in turn m individual
# from itertools import combinations

def L(x, y, p):
    """Minkowski (Lp) distance between points x and y.

    Args:
        x, y: coordinate sequences of equal length (at least 2 coordinates).
        p: order of the distance (p=1 Manhattan, p=2 Euclidean, ...).

    Returns:
        The Lp distance as a float, or 0 when the inputs are not
        comparable (kept for backward compatibility with the original).
    """
    # Precondition for the computation: same dimensionality, more than one coordinate.
    if len(x) == len(y) and len(x) > 1:
        total = 0.0  # renamed from `sum` to avoid shadowing the builtin
        for xi, yi in zip(x, y):
            # pow raises to a power, abs takes the absolute value
            total += math.pow(abs(xi - yi), p)
        # p-th root of the accumulated sum
        return math.pow(total, 1 / p)
    else:
        return 0

# Sample points from the book's Example 3.1.
point_a = [1, 1]
point_b = [5, 1]
point_c = [4, 4]

# For p = 1..4, compute the Lp distance from point_a to each of the
# other two points, then report which one is nearest.
for p_order in range(1, 5):
    distances = {}
    for other in (point_b, point_c):
        distances["1-{}".format(other)] = L(point_a, other, p=p_order)
    print(" The current number of times ：{}, At present r：{}".format(p_order, distances))
    print(" The current minimum ：")
    # Pair each distance with its key so min() picks the smallest distance.
    print(min(zip(distances.values(), distances.keys())))

``````

Output:

`````` The current number of times ：1, At present r：{'1-[5, 1]': 4.0, '1-[4, 4]': 6.0}
The current minimum ：
(4.0, '1-[5, 1]')
The current number of times ：2, At present r：{'1-[5, 1]': 4.0, '1-[4, 4]': 4.242640687119285}
The current minimum ：
(4.0, '1-[5, 1]')
The current number of times ：3, At present r：{'1-[5, 1]': 3.9999999999999996, '1-[4, 4]': 3.7797631496846193}
The current minimum ：
(3.7797631496846193, '1-[4, 4]')
The current number of times ：4, At present r：{'1-[5, 1]': 4.0, '1-[4, 4]': 3.5676213450081633}
The current minimum ：
(3.5676213450081633, '1-[4, 4]')
``````
2. Second program: simulate the k-nearest-neighbors idea — scan all the points, find the nearest k of them, and decide the test point's class by majority vote
``````import numpy as np
import pandas as dp
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split  #  Used to divide data
from collections import Counter  #  Counter

class KNN:
    """Brute-force k-nearest-neighbors classifier.

    Scans every training point, keeps the k closest ones, and predicts
    by majority vote among their labels.
    """

    def __init__(self, X_train, y_train, n_neighbors=3, p=2):
        # n_neighbors: k, how many nearest points vote (default 3).
        # p: order of the Minkowski norm used as the distance
        #    (default 2, i.e. the Euclidean norm).
        self.n = n_neighbors
        self.p = p
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X):
        """Return the predicted label for a single sample X."""
        # Seed the candidate list with the first k training points.
        knn_list = []
        for i in range(self.n):
            dist = np.linalg.norm(X - self.X_train[i], ord=self.p)
            knn_list.append((dist, self.y_train[i]))
        # Scan the remaining points: whenever one is closer than the
        # current farthest candidate, replace that candidate.  When the
        # loop ends, knn_list holds the k nearest training points.
        # Bug fixes vs. the transcription: iterate over self.X_train
        # (not the global X_train), and compare distances via x[0]
        # instead of comparing (dist, label) tuples against a float.
        for i in range(self.n, len(self.X_train)):
            max_index = knn_list.index(max(knn_list, key=lambda x: x[0]))
            dist = np.linalg.norm(X - self.X_train[i], ord=self.p)
            if knn_list[max_index][0] > dist:
                knn_list[max_index] = (dist, self.y_train[i])
        # Collect the labels of the k nearest points.
        knn = [k[-1] for k in knn_list]
        # Counter gives {label: occurrence count}.
        count_pairs = Counter(knn)
        # Sort the (label, count) pairs by count (ascending) and take the
        # label of the most frequent one; this also works when the data
        # has more than two classes.  Bug fix: return the label itself,
        # not the whole (label, count) pair, so score() can compare it.
        max_possible = sorted(count_pairs.items(), key=lambda x: x[1])[-1][0]
        return max_possible

    def score(self, X_test, y_test):
        """Return the fraction of test samples classified correctly."""
        right_count = 0
        for X, y in zip(X_test, y_test):
            label = self.predict(X)
            if label == y:
                right_count += 1
        return right_count / len(X_test)

from sklearn.datasets import load_iris  # bug fix: `iris` was used but never loaded

# Build a DataFrame from the iris data set and give the columns short names.
iris = load_iris()
df = dp.DataFrame(iris.data, columns=iris.feature_names)
df['label'] = iris.target
df.columns = ["sepal length", "sepal width", "petal length", "petal width", "label"]

# Use the first 100 rows (classes 0 and 1 only) and two features:
# sepal length (column 0) and sepal width (column 1); the label is last.
data = np.array(df.iloc[:100, [0, 1, -1]])
X, y = data[:, :-1], data[:, -1]
# Split into training and test sets; the test set holds 25% of the samples.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

clf = KNN(X_train, y_train)
print(" score ：")
print(clf.score(X_test, y_test))

# Classify a single hand-picked point; the output is its class, 1 or 0.
test_point = [5.0, 4.0]
print('test point score:{}'.format(clf.predict(test_point)))

# Scatter plot of the two classes over the two chosen features.
plt.scatter(df[:50]['sepal length'], df[:50]['sepal width'], label="0")
plt.scatter(df[50:100]['sepal length'], df[50:100]['sepal width'], label="1")

plt.xlabel('sepal length')
plt.ylabel('sepal width')
plt.legend()
plt.show()
``````

Result: the score is `1.0` most of the time, occasionally `0.96`.

Scroll to Top