定义
序列最小最优化(Sequential Minimal Optimization,SMO)
输入:训练数据集 T = { ( x 1 , y 1 ) , ( x 2 , y 2 ) , ⋯ , ( x N , y N ) } T=\{ (x_1,y_1),(x_2,y_2),\cdots,(x_N,y_N)\} T={(x1,y1),(x2,y2),⋯,(xN,yN)},其中, x i ∈ χ = R n , y i ∈ y = { − 1 , + 1 } , i = 1 , 2 , ⋯ , N x_i \in \chi=R^n, y_i \in {\tt y}=\{ -1,+1 \},i=1,2,\cdots,N xi∈χ=Rn,yi∈y={−1,+1},i=1,2,⋯,N,精度 ε \varepsilon ε
输出:近似值 α ^ \hat \alpha α^
(1)取初值 α ( 0 ) = 0 \alpha^{(0)}=0 α(0)=0,令 k = 0 k=0 k=0
(2)选取优化变量 α 1 ( k ) , α 2 ( k ) \alpha_1^{(k)},\alpha_2^{(k)} α1(k),α2(k),解析求解两个变量的最优化问题,求得最优解 α 1 ( k + 1 ) , α 2 ( k + 2 ) \alpha_1^{(k+1)},\alpha_2^{(k+2)} α1(k+1),α2(k+2),更新 α \alpha α为 α ( k + 1 ) \alpha^{(k+1)} α(k+1)
(3)若在精度 ε \varepsilon ε范围内满足停机条件:
∑ i = 1 N α i y i = 0 , 0 ≤ α i ≤ C , i = 1 , 2 , ⋯ , N \sum_{i=1}^N\alpha_iy_i=0,0\leq\alpha_i\leq C,i=1,2,\cdots,N ∑i=1Nαiyi=0,0≤αi≤C,i=1,2,⋯,N
y i ⋅ g ( x i ) : { ≥ 1 , { x i ∣ α i = 0 } = 1 , { x i ∣ 0 < α i < C } ≤ 1 , { x i ∣ α i = C y_i \cdot g(x_i):\begin{cases} \geq 1,\{x_i|\alpha_i=0\}\\ = 1, \{x_i|0 < \alpha_i < C\} \\ \leq 1,\{x_i|\alpha_i=C\ \end{cases} yi⋅g(xi):⎩
⎨
⎧≥1,{xi∣αi=0}=1,{xi∣0<αi<C}≤1,{xi∣αi=C ,其中 g ( x i ) = ∑ j = 1 N α j y j K ( x j , x i ) + b , g(x_i)=\sum_{j=1}^N\alpha_jy_jK(x_j,x_i)+b, g(xi)=∑j=1NαjyjK(xj,xi)+b,
则转(4);否则令 k = k + 1 k=k+1 k=k+1,转(2);
(4)取 α ^ = α ( k + 1 ) \hat \alpha = \alpha^{(k+1)} α^=α(k+1)
输入空间
T= { ( x 1 , y 1 ) , ( x 2 , y 2 ) , … , ( x N , y N ) } \left\{(x_1,y_1),(x_2,y_2),\dots,(x_N,y_N)\right\} {(x1,y1),(x2,y2),…,(xN,yN)}
import time
import numpy as np
import math
import random
def loadData(fileName):
'''
加载Mnist数据集 下载地址:https://download.csdn.net/download/nanxiaotao/89720991)
:param fileName:要加载的文件路径
:return: 数据集和标签集
'''
#存放数据及标记
dataArr = []; labelArr = []
#读取文件
fr = open(fileName)
#遍历文件中的每一行
for line in fr.readlines():
curLine = line.strip().split(',')
dataArr.append([int(num) / 255 for num in curLine[1:]])
if int(curLine[0]) == 0:
labelArr.append(1)
else:
labelArr.append(-1)
#返回数据集和标记
return dataArr, labelArr
trainDataList, trainLabelList = loadData('../Mnist/mnist_train.csv')
trainDataMat = np.mat(trainDataList) #训练数据集
trainLabelMat = np.mat(trainLabelList).T #训练标签集,为了方便后续运算提前做了转置,变为列向量
np.shape(trainDataMat)
特征空间(Feature Space)
trainDataMat[0][0:784]
统计学习方法
模型
f ( x ) = s i g n ( ∑ i = 1 N α i y i K ( x j , x i ) + b ) f(x)=sign\bigg( \sum_{i=1}^N \alpha_iy_iK(x_j,x_i)+b \bigg) f(x)=sign(∑i=1NαiyiK(xj,xi)+b)
策略
目标函数:
m i n α 1 , α 2 \mathop{min}\limits_{\alpha_1,\alpha_2} α1,α2min W ( α 1 , α 2 ) = 1 2 K 11 α 1 2 + 1 2 K 22 α 2 2 + y 1 y 2 K 12 α 1 α 2 − ( α 1 + α 2 ) + y 1 α 1 ∑ i = 3 N y i α i K i 1 + y 2 α 2 ∑ i = 3 N y i α i K i 2 W(\alpha_1,\alpha_2)=\frac{1}{2}K_{11}\alpha_1^2+\frac{1}{2}K_{22}\alpha_2^2+y_1y_2K_{12}\alpha_1\alpha_2-(\alpha_1+\alpha_2) +y_1\alpha_1\sum_{i=3}^Ny_i\alpha_iK_{i1}+y_2\alpha_2\sum_{i=3}^Ny_i\alpha_iK_{i2} W(α1,α2)=21K11α12+21K22α22+y1y2K12α1α2−(α1+α2)+y1α1∑i=3NyiαiKi1+y2α2∑i=3NyiαiKi2其中,目标函数式中省略了不含 α 1 , α 2 \alpha_1,\alpha_2 α1,α2的常数项
s.t. α 1 y 1 + α 2 y 2 = − ∑ i = 3 N y i α i = ζ \alpha_1y_1+\alpha_2y_2=-\sum_{i=3}^Ny_i\alpha_i=\zeta α1y1+α2y2=−∑i=3Nyiαi=ζ,其中, 0 ≤ α i ≤ C , ζ 0\leq\alpha_i\leq C,\zeta 0≤αi≤C,ζ:常数
停机条件:
∑ i = 1 N α i y i = 0 , 0 ≤ α i ≤ C , i = 1 , 2 , ⋯ , N \sum_{i=1}^N\alpha_iy_i=0,0\leq\alpha_i\leq C,i=1,2,\cdots,N ∑i=1Nαiyi=0,0≤αi≤C,i=1,2,⋯,N
y i ⋅ g ( x i ) : { ≥ 1 , { x i ∣ α i = 0 } = 1 , { x i ∣ 0 < α i < C } ≤ 1 , { x i ∣ α i = C y_i \cdot g(x_i):\begin{cases} \geq 1,\{x_i|\alpha_i=0\}\\ = 1, \{x_i|0 < \alpha_i < C\} \\ \leq 1,\{x_i|\alpha_i=C\ \end{cases} yi⋅g(xi):⎩
⎨
⎧≥1,{xi∣αi=0}=1,{xi∣0<αi<C}≤1,{xi∣αi=C ,其中 g ( x i ) = ∑ j = 1 N α j y j K ( x j , x i ) + b g(x_i)=\sum_{j=1}^N\alpha_jy_jK(x_j,x_i)+b g(xi)=∑j=1NαjyjK(xj,xi)+b
算法
m, n = np.shape(trainDataMat) #m:训练集数量 n:样本特征数目
sigma = 10 #高斯核分母中的σ
C = 200 #惩罚参数
toler = 0.001 #松弛变量
b = 0 #SVM中的偏置b
alpha = [0] * trainDataMat.shape[0] # α 长度为训练集数目
E = [0 * trainLabelMat[i, 0] for i in range(trainLabelMat.shape[0])] #SMO运算过程中的Ei
supportVecIndex = []
高斯核函数(Gaussian Kernel Function)
K ( x , z ) = e x p ( − ∣ ∣ x − z ∣ ∣ 2 2 σ 2 ) K(x,z)=exp\bigg( -\dfrac{\bigg|\bigg| x-z \bigg|\bigg|^2}{2\sigma^2} \bigg) K(x,z)=exp(−2σ2
x−z
2)
def calcKernel():
'''
计算核函数
:return: 高斯核矩阵
'''
k = [[0 for i in range(m)] for j in range(m)]
for i in range(m):
if i % 100 == 0:
print('kernel:', i, m)
X = trainDataMat[i, :]
for j in range(i, m):
#获得Z
Z = trainDataMat[j, :]
#先计算||X - Z||^2
result = (X - Z) * (X - Z).T
#分子除以分母后去指数,得到的即为高斯核结果
result = np.exp(-1 * result / (2 * sigma**2))
#将Xi*Xj的结果存放入k[i][j]和k[j][i]中
k[i][j] = result
k[j][i] = result
#返回高斯核矩阵
return k
k = calcKernel() #核函数(初始化时提前计算)
是否满足KKT条件
α i = 0 ⟺ y i g ( x i ) ⩾ 1 \alpha_i=0 \iff y_ig(x_i)\geqslant 1 αi=0⟺yig(xi)⩾1
0 < α i < C ⟺ y i g ( x i ) = 1 0<\alpha_i<C \iff y_ig(x_i)=1 0<αi<C⟺yig(xi)=1
α i = C ⟺ y i g ( x i ) ⩽ 1 \alpha_i = C \iff y_ig(x_i) \leqslant 1 αi=C⟺yig(xi)⩽1
其中, g ( x i ) = ∑ j = 1 N α j y j K ( x i , x j ) + b g(x_i)=\sum_{j=1}^N\alpha_jy_jK(x_i,x_j)+b g(xi)=∑j=1NαjyjK(xi,xj)+b
def calc_gxi(i):
'''
:param i:x的下标
:return: g(xi)的值
'''
#初始化g(xi)
gxi = 0
index = [i for i, alpha in enumerate(alpha) if alpha != 0]
#遍历每一个非零α,i为非零α的下标
for j in index:
#计算g(xi)
gxi += alpha[j] * trainLabelMat[j] * k[j][i]
#求和结束后再单独加上偏置b
gxi += b
#返回
return gxi
def isSatisfyKKT(i):
'''
查看第i个α是否满足KKT条件
:param i:α的下标
:return:
True:满足
False:不满足
'''
gxi =calc_gxi(i)
yi = trainLabelMat[i]
if (math.fabs(alpha[i]) < toler) and (yi * gxi >= 1):
return True
elif (math.fabs(alpha[i] - C) < toler) and (yi * gxi <= 1):
return True
elif (alpha[i] > -toler) and (alpha[i] < (C + toler)) \
and (math.fabs(yi * gxi - 1) < toler):
return True
return False
E i = g ( x i ) = ( ∑ j = 1 N α j y j K ( x j , x i ) + b ) − y i , i = 1 , 2 E_i=g(x_i)=\bigg( \sum_{j=1}^N \alpha_jy_jK(x_j,x_i)+b \bigg)-y_i,i=1,2 Ei=g(xi)=(∑j=1NαjyjK(xj,xi)+b)−yi,i=1,2
当 i = 1 , 2 i=1,2 i=1,2时, E i E_i Ei为函数 g ( x ) g(x) g(x)对输入 x i x_i xi的预测值与真实输出 y i y_i yi之差
def calcEi(i):
'''
计算Ei
:param i: E的下标
:return:
'''
#计算g(xi)
gxi = calc_gxi(i)
#Ei = g(xi) - yi,直接将结果作为Ei返回
return gxi - trainLabelMat[i]
α 2 ( n e w , u n c ) = α 2 o l d + y 2 ( E 1 − E 2 ) η \alpha_2^{(new,unc)}=\alpha_2^{old}+\dfrac{y_2(E_1-E_2)}{\eta} α2(new,unc)=α2old+ηy2(E1−E2)
α 2 n e w : { H , α 2 ( n e w , u n c ) > H α 2 ( n e w , u n c ) , L ≤ α 2 ( n e w , u n c ) ≤ H L , α 2 ( n e w , u n c ) < L \alpha_2^{new}:\begin{cases} H,\alpha_2^{(new,unc)}>H\\ \alpha_2^{(new,unc)},L \leq\alpha_2^{(new,unc)}\leq H \\ L,\alpha_2^{(new,unc)}<L \end{cases} α2new:⎩
⎨
⎧H,α2(new,unc)>Hα2(new,unc),L≤α2(new,unc)≤HL,α2(new,unc)<L,其中 L = m a x ( 0 , α 2 o l d + α 1 o l d − C ) , H = m i n ( C , α 2 o l d + α 1 o l d ) L=max\big( 0,\alpha_2^{old} + \alpha_1^{old} - C \big),H=min\big( C, \alpha_2^{old} + \alpha_1^{old} \big) L=max(0,α2old+α1old−C),H=min(C,α2old+α1old)
def getAlphaJ( E1, i):
'''
SMO中选择第二个变量
:param E1: 第一个变量的E1
:param i: 第一个变量α的下标
:return: E2,α2的下标
'''
#初始化E2
E2 = 0
#初始化|E1-E2|为-1
maxE1_E2 = -1
#初始化第二个变量的下标
maxIndex = -1
#获得Ei非0的对应索引组成的列表,列表内容为非0Ei的下标i
nozeroE = [i for i, Ei in enumerate(E) if Ei != 0]
#对每个非零Ei的下标i进行遍历
for j in nozeroE:
#计算E2
E2_tmp = calcEi(j)
#如果|E1-E2|大于目前最大值
if math.fabs(E1 - E2_tmp) > maxE1_E2:
#更新最大值
maxE1_E2 = math.fabs(E1 - E2_tmp)
#更新最大值E2
E2 = E2_tmp
#更新最大值E2的索引j
maxIndex = j
#如果列表中没有非0元素了(对应程序最开始运行时的情况)
if maxIndex == -1:
maxIndex = i
while maxIndex == i:
#获得随机数,如果随机数与第一个变量的下标i一致则重新随机
maxIndex = int(random.uniform(0, m))
#获得E2
E2 = calcEi(maxIndex)
#返回第二个变量的E2值以及其索引
return E2, maxIndex
模型训练
def train(iter = 100):
iterStep = 0; #iterStep:迭代次数,超过设置次数还未收敛则强制停止
parameterChanged = 1 #parameterChanged:单次迭代中有参数改变则增加1
global supportVecIndex
global b
global alpha
while (iterStep < iter) and (parameterChanged > 0):
#打印当前迭代轮数
print('iter:%d:%d'%( iterStep, iter))
#迭代步数加1
iterStep += 1
#新的一轮将参数改变标志位重新置0
parameterChanged = 0
#大循环遍历所有样本,用于找SMO中第一个变量
for i in range(m):
#查看第一个遍历是否满足KKT条件,如果不满足则作为SMO中第一个变量从而进行优化
if isSatisfyKKT(i) == False:
#如果下标为i的α不满足KKT条件,则进行优化
E1 = calcEi(i)
#选择第2个变量
E2, j = getAlphaJ(E1, i)
#获得两个变量的标签
y1 = trainLabelMat[i]
y2 = trainLabelMat[j]
#复制α值作为old值
alphaOld_1 = alpha[i]
alphaOld_2 = alpha[j]
#依据标签是否一致来生成不同的L和H
if y1 != y2:
L = max(0, alphaOld_2 - alphaOld_1)
H = min(C, C + alphaOld_2 - alphaOld_1)
else:
L = max(0, alphaOld_2 + alphaOld_1 - C)
H = min(C, alphaOld_2 + alphaOld_1)
#如果两者相等,说明该变量无法再优化,直接跳到下一次循环
if L == H: continue
#计算α的新值
k11 = k[i][i]
k22 = k[j][j]
k21 = k[j][i]
k12 = k[i][j]
#更新α2,该α2还未经剪切
alphaNew_2 = alphaOld_2 + y2 * (E1 - E2) / (k11 + k22 - 2 * k12)
#剪切α2
if alphaNew_2 < L: alphaNew_2 = L
elif alphaNew_2 > H: alphaNew_2 = H
#更新α1
alphaNew_1 = alphaOld_1 + y1 * y2 * (alphaOld_2 - alphaNew_2)
#计算b1和b2
b1New = -1 * E1 - y1 * k11 * (alphaNew_1 - alphaOld_1) \
- y2 * k21 * (alphaNew_2 - alphaOld_2) + b
b2New = -1 * E2 - y1 * k12 * (alphaNew_1 - alphaOld_1) \
- y2 * k22 * (alphaNew_2 - alphaOld_2) + b
#依据α1和α2的值范围确定新b
if (alphaNew_1 > 0) and (alphaNew_1 < C):
bNew = b1New
elif (alphaNew_2 > 0) and (alphaNew_2 < C):
bNew = b2New
else:
bNew = (b1New + b2New) / 2
#将更新后的各类值写入,进行更新
alpha[i] = alphaNew_1
alpha[j] = alphaNew_2
b = bNew
E[i] = calcEi(i)
E[j] = calcEi(j)
#如果α2的改变量过于小,就认为该参数未改变,不增加parameterChanged值
#反之则自增1
if math.fabs(alphaNew_2 - alphaOld_2) >= 0.00001:
parameterChanged += 1
#打印迭代轮数,i值,该迭代轮数修改α数目
print("iter: %d i:%d, pairs changed %d" % (iterStep, i, parameterChanged))
#全部计算结束后,重新遍历一遍α,查找里面的支持向量
for i in range(m):
#如果α>0,说明是支持向量
if alpha[i] > 0:
#将支持向量的索引保存起来
supportVecIndex.append(i)
train()
假设空间(Hypothesis Space)
{ f ∣ f ( x ) = s i g n ( ∑ i = 1 N α i y i K ( x j , x i ) + b ) } \left\{f|f(x) = sign\bigg( \sum_{i=1}^N \alpha_iy_iK(x_j,x_i)+b \bigg) \right\} {f∣f(x)=sign(∑i=1NαiyiK(xj,xi)+b)}
输出空间
Y ∈ { − 1 , 1 } Y \in \{ -1,1 \} Y∈{−1,1}
模型评估
训练误差
testDataList, testLabelList = loadData('../Mnist/mnist_test.csv')
testDataList = testDataList[:100]
testLabelList = testLabelList[:100]
def calcSinglKernel(x1, x2):
'''
单独计算核函数
:param x1:向量1
:param x2: 向量2
:return: 核函数结果
'''
result = (x1 - x2) * (x1 - x2).T
result = np.exp(-1 * result / (2 * sigma ** 2))
#返回结果
return np.exp(result)
def predict(x):
'''
对样本的标签进行预测
:param x: 要预测的样本x
:return: 预测结果
'''
global supportVecIndex
global alpha
result = 0
for i in supportVecIndex:
tmp = calcSinglKernel(trainDataMat[i, :], np.mat(x))
#对每一项子式进行求和,最终计算得到求和项的值
result += alpha[i] * trainLabelMat[i] * tmp
#求和项计算结束后加上偏置b
result += b
#使用sign函数返回预测结果
return np.sign(result)
def model_test(testDataList, testLabelList):
'''
测试
:param testDataList:测试数据集
:param testLabelList: 测试标签集
:return: 正确率
'''
#错误计数值
errorCnt = 0
#遍历测试集所有样本
for i in range(len(testDataList)):
#打印目前进度
print('test:%d:%d'%(i, len(testDataList)))
#获取预测结果
result = predict(testDataList[i])
#如果预测与标签不一致,错误计数值加一
if result != testLabelList[i]:
errorCnt += 1
#返回正确率
return 1 - errorCnt / len(testDataList)
accuracy = model_test(testDataList, testLabelList)
print('accuracy:%d'%(accuracy * 100), '%')