KNN 和 K-Means 的实践——以 Iris 为例

KNN

Code

#在iris上跑KNN算法
#代码来源于https://scikit-learn.org/stable/auto_examples/neighbors/plot_classification.html#sphx-glr-auto-examples-neighbors-plot-classification-py

#引入绘图相关的库
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap

#引入sklearn算法库，数据集，可视化库
from sklearn import datasets, neighbors
from sklearn.inspection import DecisionBoundaryDisplay

# Number of neighbours (the K in KNN) consulted for every prediction
n_neighbors = 15

# Load the iris dataset bundled with scikit-learn
iris = datasets.load_iris()

# Keep only the first two feature columns (the sepal measurements), turning
# the data into a (150, 2) matrix X; y holds the matching class targets
X = iris.data[:, :2]
y = iris.target

# Colours used in the figures:
#   cmap_light - background fill of the region predicted for each class
#   cmap_bold  - solid colour of the scattered samples of each class
cmap_light = ListedColormap(["orange", "cyan", "cornflowerblue"])
cmap_bold = ["darkorange", "c", "darkblue"]


# `weights` is a KNeighborsClassifier parameter:
#   "uniform"  - every neighbour votes equally, distance is ignored
#   "distance" - closer neighbours have more influence on the vote
#
# The Minkowski exponent can additionally be selected with `p`
# (p=1 Manhattan distance, p=2 Euclidean distance), for example
#   neighbors.KNeighborsClassifier(n_neighbors, weights="distance", p=2)
for weights in ("uniform", "distance"):
    # Build a KNN classifier and fit it on the samples and their targets
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    clf.fit(X, y)

    _, ax = plt.subplots()

    # Background: colour each mesh cell by the class the classifier predicts
    boundary_options = dict(
        cmap=cmap_light,
        ax=ax,
        response_method="predict",
        plot_method="pcolormesh",
        xlabel=iris.feature_names[0],
        ylabel=iris.feature_names[1],
        shading="auto",
    )
    DecisionBoundaryDisplay.from_estimator(clf, X, **boundary_options)

    # Foreground: the samples themselves, coloured by their species name
    species = iris.target_names[y]
    sns.scatterplot(
        x=X[:, 0],
        y=X[:, 1],
        hue=species,
        palette=cmap_bold,
        alpha=1.0,
        edgecolor="black",
    )

    # Figure title records the settings used for this plot
    plt.title(
        f"3-Class classification (k = {n_neighbors:d}, weights = '{weights}')"
    )

# Display every figure created above
plt.show()

结果图

输出结果：

K-Means

Code

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
import numpy as np
# Select the SimHei font so Chinese text on the axes can be rendered
plt.rcParams["font.sans-serif"] = ["SimHei"]

iris = load_iris()
X = iris.data  # feature vectors, kept in the dataset's original order
lable = iris.target  # class label of every sample

# Pre-processing: reduce each sample to two derived features.
#   hua_e   - sepal area (column 0 * column 1), used as the x coordinate
#   hua_ban - petal area (column 2 * column 3), used as the y coordinate
arr = np.array(X)
hua_e = np.multiply(arr[:, 0], arr[:, 1])
hua_ban = np.multiply(arr[:, 2], arr[:, 3])
############################################
#定义需要的函数
def shuju(k, n=150):
    """Pick ``k`` distinct random indices used to seed the cluster centres.

    Args:
        k: How many distinct indices to draw.
        n: Size of the index range; indices come from ``0 .. n - 1``.
            Defaults to 150, the value that was previously hard-coded.

    Returns:
        A set of ``k`` distinct ints in ``[0, n)``; an empty set when
        ``k <= 0``.

    Raises:
        ValueError: If ``k > n``. The former draw-until-unique loop could
            never collect that many distinct values and spun forever.
    """
    if k <= 0:
        return set()
    if k > n:
        raise ValueError(
            f"cannot draw {k} distinct indices from a range of size {n}"
        )
    # Sampling without replacement yields k unique indices in one call,
    # instead of redrawing until the set happens to reach size k.
    return set(np.random.choice(n, size=k, replace=False).tolist())
# Distance from every point to every cluster centre
def getDistance(point_x, point_y, cent_x, cent_y, k):
    """Build the table of point-to-centre Euclidean distances.

    Args:
        point_x: x coordinates of the points.
        point_y: y coordinates of the points, indexed like ``point_x``.
        cent_x: x coordinates of the centres.
        cent_y: y coordinates of the centres.
        k: Number of centres to measure against (the first ``k`` entries
            of ``cent_x`` / ``cent_y``).

    Returns:
        A list with one row per point; ``row[c]`` is the distance from
        that point to centre ``c``, rounded to one decimal place.
    """
    table = []
    for idx, px in enumerate(point_x):
        row = []
        for c in range(k):
            dx = px - cent_x[c]
            dy = point_y[idx] - cent_y[c]
            # Distances are deliberately kept at one-decimal precision
            row.append(round(np.sqrt(dx * dx + dy * dy), 1))
        table.append(row)
    return table

# Squared-distance error used to detect convergence
def EDistance(point_x, point_y, cent_x, cent_y, k):
    """Sum the squared distances between the points and the centres.

    Each squared distance is rounded to one decimal place before being
    added to the running total.

    NOTE(review): the total covers every (centre, point) pair, not only
    the pairs where the point belongs to that centre's cluster.

    Args:
        point_x: x coordinates of the points.
        point_y: y coordinates of the points, indexed like ``point_x``.
        cent_x: x coordinates of the centres.
        cent_y: y coordinates of the centres.
        k: Number of centres to include.

    Returns:
        The accumulated sum (``0`` when there are no pairs).
    """
    total = 0
    # Centres in the outer loop, points in the inner loop: the summation
    # order is kept fixed so the floating-point result is reproducible.
    for c in range(k):
        for idx, px in enumerate(point_x):
            dx = px - cent_x[c]
            dy = point_y[idx] - cent_y[c]
            total += round(dx * dx + dy * dy, 1)
    return total

# Recompute the centre of every cluster
def cent(lable, k=3, point_x=None, point_y=None):
    """Return the mean position of the points assigned to each cluster.

    Args:
        lable: Cluster index of every point; ``lable[j]`` is the cluster
            that point ``j`` currently belongs to.
        k: Number of clusters; a centre is computed for each label in
            ``0 .. k - 1``. Defaults to 3, the value that was previously
            hard-coded.
        point_x: x coordinates of the points. When omitted, the
            module-level ``x`` is used, as the function always did.
        point_y: y coordinates of the points. When omitted, the
            module-level ``y`` is used.

    Returns:
        ``[mean_x, mean_y]`` - two lists of length ``k`` holding the x and
        y coordinate of each cluster centre.

    Raises:
        ZeroDivisionError: If a cluster has no points. The exception type
            is unchanged; only the message now names the empty cluster.
    """
    # Fall back to the script-level coordinates for existing callers
    if point_x is None:
        point_x = x
    if point_y is None:
        point_y = y

    mean_x = []
    mean_y = []
    for cluster in range(k):
        sum_x = 0
        sum_y = 0
        count = 0
        for j in range(len(point_x)):
            if lable[j] == cluster:
                count += 1
                sum_x = sum_x + point_x[j]
                sum_y = sum_y + point_y[j]
        if count == 0:
            # A mean over zero points is undefined; say which cluster it is
            raise ZeroDivisionError(
                f"cluster {cluster} has no points assigned; "
                "its centre is undefined"
            )
        mean_x.append(sum_x / count)
        mean_y.append(sum_y / count)
    return [mean_x, mean_y]

# Assign every point to a cluster
def julei(ds, x):
    """Label each point with the index of its smallest distance.

    Args:
        ds: Distance table; ``ds[i]`` lists the distances from point ``i``
            to each centre.
        x: Sequence whose length gives the number of points to label.

    Returns:
        A list of ``len(x)`` cluster indices. When a row contains its
        minimum more than once, the first (lowest) index is chosen.
    """
    n_points = len(x)
    return [ds[i].index(min(ds[i])) for i in range(n_points)]
##############################################
# Main program
# Three clusters are wanted here, so k = 3
k = 3

# Draw k random sample indices and use those samples as the initial centres
b = shuju(k)
ceshi_hua_e = [area for i, area in enumerate(hua_e) if i in b]
ceshi_hua_ban = [area for i, area in enumerate(hua_ban) if i in b]
ceshi_lable = [tag for i, tag in enumerate(lable) if i in b]

# Points to cluster (x, y) and the current centres (x0, y0)
x, y = hua_e, hua_ban
x0, y0 = ceshi_hua_e, ceshi_hua_ban

# Pass 1: assign the points using the random seeds as centres
ds = getDistance(x, y, x0, y0, k)
temp = julei(ds, x)
temp1 = EDistance(x, y, x0, y0, k)
n = 1

# Pass 2: move the centres to the cluster means and assign again
center = cent(temp)
x0, y0 = center
ds = getDistance(x, y, x0, y0, k)
temp = julei(ds, x)
temp2 = EDistance(x, y, x0, y0, k)
n += 1

# Keep iterating until two consecutive passes report the same squared error
while np.abs(temp2 - temp1) != 0:
    temp1 = temp2
    center = cent(temp)
    x0, y0 = center
    ds = getDistance(x, y, x0, y0, k)
    temp = julei(ds, x)
    temp2 = EDistance(x, y, x0, y0, k)
    n += 1
    print(n, temp2)

# Report and visualise the result
print("迭代次数: ", n)  # total number of passes performed
print('质心位置：', x0, y0)

# Centres as red squares, samples as dots coloured by their cluster
plt.scatter(x0, y0, color='r', s=50, marker='s')
plt.scatter(x, y, c=temp, s=25, marker='o')
plt.xlabel('花萼面积')
plt.ylabel('花瓣面积')
plt.title("聚3类")
plt.show()