KNN和K-Means的实践–以Iris为例

KNN

Code

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#在iris上跑KNN算法
#代码来源于https://scikit-learn.org/stable/auto_examples/neighbors/plot_classification.html#sphx-glr-auto-examples-neighbors-plot-classification-py

#引入绘图相关的库
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap

#引入sklearn算法库,数据集,可视化库
from sklearn import datasets, neighbors
from sklearn.inspection import DecisionBoundaryDisplay

# Number of neighbours consulted for each prediction (the K in KNN).
n_neighbors = 15

# Load the Iris dataset that ships with scikit-learn.
iris = datasets.load_iris()

# Keep only the first two feature columns (the sepal measurements) so the
# decision regions can be drawn on a plane; y holds the matching targets.
X, y = iris.data[:, :2], iris.target

# Colours used by the plots:
#   cmap_bold  - solid colour of the scatter points of each class
#   cmap_light - background colour of each class's decision region
cmap_bold = ["darkorange", "c", "darkblue"]
cmap_light = ListedColormap(["orange", "cyan", "cornflowerblue"])


# `weights` is a KNeighborsClassifier option:
#   "uniform"  - every neighbour's vote counts the same, whatever its distance
#   "distance" - closer neighbours get a larger say in the vote
# The distance itself is a Minkowski metric whose power is chosen with `p`
# (p=1 is Manhattan, p=2 is Euclidean), for example:
#   neighbors.KNeighborsClassifier(n_neighbors, weights="distance", p=2)
for weights in ("uniform", "distance"):
    # Build a KNN classifier for this weighting scheme and train it.
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    clf.fit(X, y)

    _, ax = plt.subplots()

    # Paint the background: one colour per predicted class region.
    region_options = {
        "cmap": cmap_light,
        "ax": ax,
        "response_method": "predict",
        "plot_method": "pcolormesh",
        "xlabel": iris.feature_names[0],
        "ylabel": iris.feature_names[1],
        "shading": "auto",
    }
    DecisionBoundaryDisplay.from_estimator(clf, X, **region_options)

    # Overlay the training samples, coloured by their class name.
    class_names = iris.target_names[y]
    sns.scatterplot(
        x=X[:, 0],
        y=X[:, 1],
        hue=class_names,
        palette=cmap_bold,
        alpha=1.0,
        edgecolor="black",
    )

    # Title records the K and the weighting scheme of this figure.
    title = "3-Class classification (k = %i, weights = '%s')" % (n_neighbors, weights)
    plt.title(title)

    # Display the figure.
    plt.show()

结果图

输出结果:

K-Means

Code

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
import numpy as np
# Select a font that can render the Chinese axis labels used by the plot.
plt.rcParams['font.sans-serif'] = ['SimHei']

iris = load_iris()
X = iris.data        # feature matrix, in the dataset's own row order
lable = iris.target  # class labels (spelling kept: later code reads `lable`)

# Pre-processing: reduce each sample to two derived features. Per the
# original notes, column 0 * column 1 is the sepal area (x axis) and
# column 2 * column 3 is the petal area (y axis).
arr = np.array(X)
columns = arr.T
hua_e = columns[0] * columns[1]
hua_ban = columns[2] * columns[3]
############################################
#定义需要的函数
def shuju(k, n=150):
    """Draw ``k`` distinct random indices used to seed the initial centroids.

    Args:
        k: How many distinct indices to draw.
        n: Exclusive upper bound of the index range ``[0, n)``. Defaults to
            150, the value that used to be hard-coded.

    Returns:
        A set of ``k`` distinct integers in ``[0, n)``; an empty set when
        ``k <= 0``.

    Raises:
        ValueError: If ``k`` exceeds ``n``. The rejection loop below could
            never collect that many distinct values and would spin forever.
    """
    if k > n:
        raise ValueError(
            "cannot draw %d distinct indices from a range of size %d" % (k, n)
        )

    chosen = set()
    # Rejection sampling: duplicates are absorbed by the set, so keep
    # drawing until k different indices have been collected.
    while len(chosen) < k:
        chosen.add(np.random.randint(0, n))
    return chosen
# Distance from every sample to every centroid.
def getDistance(point_x, point_y, cent_x, cent_y, k):
    """Build the table of sample-to-centroid Euclidean distances.

    Args:
        point_x: x coordinates of the samples.
        point_y: y coordinates of the samples (indexed like ``point_x``).
        cent_x: x coordinates of the centroids.
        cent_y: y coordinates of the centroids.
        k: Number of centroids to measure against.

    Returns:
        A list with one row per sample; row ``i`` holds ``k`` distances,
        entry ``j`` being the distance from sample ``i`` to centroid ``j``,
        rounded to one decimal place.
    """
    table = []
    for p in range(len(point_x)):
        row = []
        for c in range(k):
            dx = point_x[p] - cent_x[c]
            dy = point_y[p] - cent_y[c]
            # Distances are deliberately kept at one-decimal precision.
            row.append(round(np.sqrt(dx * dx + dy * dy), 1))
        table.append(row)
    return table

# Squared-distance error used as the convergence measure.
def EDistance(point_x, point_y, cent_x, cent_y, k):
    """Sum the squared distances between all samples and all centroids.

    Every (centroid, sample) pair contributes, not only the pair formed by
    a sample and the centroid it is assigned to. Each squared distance is
    rounded to one decimal place before it is added.

    Args:
        point_x: x coordinates of the samples.
        point_y: y coordinates of the samples (indexed like ``point_x``).
        cent_x: x coordinates of the centroids.
        cent_y: y coordinates of the centroids.
        k: Number of centroids to include.

    Returns:
        The accumulated error; ``0`` when there are no pairs to add.
    """
    total = 0
    # Centroid-major order, accumulated term by term.
    for c in range(k):
        for p in range(len(point_x)):
            dx = point_x[p] - cent_x[c]
            dy = point_y[p] - cent_y[c]
            total += round(dx * dx + dy * dy, 1)
    return total

# Recompute the centroid of every cluster.
def cent(lable, point_x=None, point_y=None, k=3):
    """Return the mean position of the samples assigned to each cluster.

    Args:
        lable: Cluster index of every sample, in sample order.
        point_x: x coordinates of the samples. When omitted, the
            module-level ``x`` is used, which is what the one-argument
            call ``cent(labels)`` has always relied on.
        point_y: y coordinates of the samples. When omitted, the
            module-level ``y`` is used.
        k: Number of clusters. Defaults to 3, the value that used to be
            hard-coded.

    Returns:
        ``[mean_x, mean_y]``: two lists of length ``k`` holding the x and
        y coordinate of each cluster's centroid.

    Raises:
        ValueError: If a cluster is empty and there are no samples to
            re-seed it from.
    """
    if point_x is None:
        point_x = x
    if point_y is None:
        point_y = y

    sums_x = [0] * k
    sums_y = [0] * k
    counts = [0] * k
    # Single pass over the samples; labels outside [0, k) are ignored,
    # exactly as they never matched any cluster before.
    for label, px, py in zip(lable, point_x, point_y):
        if 0 <= label < k:
            sums_x[label] = sums_x[label] + px
            sums_y[label] = sums_y[label] + py
            counts[label] += 1

    mean_x = []
    mean_y = []
    n_points = len(point_x)
    for c in range(k):
        if counts[c]:
            mean_x.append(sums_x[c] / counts[c])
            mean_y.append(sums_y[c] / counts[c])
            continue
        # An empty cluster has no mean (this used to divide by zero).
        # Re-seed it at a randomly chosen sample so clustering can go on.
        if n_points == 0:
            raise ValueError("cannot compute centroids without any samples")
        pick = np.random.randint(0, n_points)
        mean_x.append(point_x[pick])
        mean_y.append(point_y[pick])
    return [mean_x, mean_y]

# Assign every sample to its nearest centroid.
def julei(ds, x):
    """Pick, for each sample, the index of the closest centroid.

    Args:
        ds: Distance table; ``ds[i]`` lists the distances from sample
            ``i`` to each centroid.
        x: Sequence of samples; only its length is used, to decide how
            many rows of ``ds`` to process.

    Returns:
        A list of cluster indices, one per sample. When several centroids
        tie for the minimum, the lowest index wins.
    """
    n_samples = len(x)
    return [ds[row].index(min(ds[row])) for row in range(n_samples)]
##############################################
# Main program.
# Cluster into three groups, so k = 3.
k = 3
# Safety cap: the loop below stops on exact equality of two successive
# error values, so bound the number of passes in case that never happens.
max_iter = 300

# Seed the centroids with k distinct random samples, in ascending index order.
b = shuju(k)
ceshi_hua_e = [value for i, value in enumerate(hua_e) if i in b]
ceshi_hua_ban = [value for i, value in enumerate(hua_ban) if i in b]
# True labels of the seed samples; not used by the clustering itself.
ceshi_lable = [value for i, value in enumerate(lable) if i in b]

# cent() reads the module-level x and y, so these names must stay.
x = hua_e
y = hua_ban
x0 = ceshi_hua_e
y0 = ceshi_hua_ban

# Pass 1: assign every sample to the nearest random seed.
n = 0
ds = getDistance(x, y, x0, y0, k)
temp = julei(ds, x)
temp1 = EDistance(x, y, x0, y0, k)
n = n + 1

# Pass 2: move the centroids to the cluster means and re-assign.
center = cent(temp)
x0, y0 = center[0], center[1]
ds = getDistance(x, y, x0, y0, k)
temp = julei(ds, x)
temp2 = EDistance(x, y, x0, y0, k)
n = n + 1

# Keep iterating while the error still changes between two passes.
while temp2 != temp1 and n < max_iter:
    temp1 = temp2
    center = cent(temp)
    x0, y0 = center[0], center[1]
    ds = getDistance(x, y, x0, y0, k)
    temp = julei(ds, x)
    temp2 = EDistance(x, y, x0, y0, k)
    n = n + 1
    print(n, temp2)

if temp2 != temp1:
    # Only reachable when the iteration cap stopped the loop.
    print("警告: 已达到最大迭代次数, 聚类可能尚未收敛")

# Report and visualise the result.
print("迭代次数: ", n)  # number of passes performed
print('质心位置:', x0, y0)

# Centroids as red squares, samples coloured by their assigned cluster.
plt.scatter(x0, y0, color='r', s=50, marker='s')
plt.scatter(x, y, c=temp, s=25, marker='o')
plt.xlabel('花萼面积')
plt.ylabel('花瓣面积')
plt.title("聚3类")
plt.show()

结果图