解:将概率分布代入对数似然函数,
$$
\begin{aligned}
l(\psi,\mu_0,\mu_1,\Sigma)
&=\sum_{i=1}^{m}\log p_{X|Y}(x^{(i)}|y^{(i)};\mu_0,\mu_1,\Sigma)+\sum_{i=1}^{m}\log p_Y(y^{(i)};\psi)\\
&=\sum_{i=1}^{m}(1-y^{(i)})\log\left[\frac{1}{(2\pi)^{n/2}|\Sigma|^{1/2}}\exp\left(-\frac{1}{2}(x^{(i)}-\mu_0)^T\Sigma^{-1}(x^{(i)}-\mu_0)\right)\right]\\
&\quad+\sum_{i=1}^{m}y^{(i)}\log\left[\frac{1}{(2\pi)^{n/2}|\Sigma|^{1/2}}\exp\left(-\frac{1}{2}(x^{(i)}-\mu_1)^T\Sigma^{-1}(x^{(i)}-\mu_1)\right)\right]\\
&\quad+\sum_{i=1}^{m}\log\left[\psi^{y^{(i)}}(1-\psi)^{1-y^{(i)}}\right]
\end{aligned}
$$
求取$l(\psi,\mu_0,\mu_1,\Sigma)$的最大值,令
$$\frac{\partial}{\partial\psi}l(\psi,\mu_0,\mu_1,\Sigma)=0 \tag{1}$$
$$\nabla_{\mu_0}l(\psi,\mu_0,\mu_1,\Sigma)=0 \tag{2}$$
$$\nabla_{\mu_1}l(\psi,\mu_0,\mu_1,\Sigma)=0 \tag{3}$$
$$\nabla_{\Sigma}l(\psi,\mu_0,\mu_1,\Sigma)=0 \tag{4}$$
对于(1)式:
$$\frac{\partial}{\partial\psi}\sum_{i=1}^{m}\left[y^{(i)}\log\psi+(1-y^{(i)})\log(1-\psi)\right]=0$$
$$\sum_{i=1}^{m}\left[\frac{y^{(i)}}{\psi}-\frac{1-y^{(i)}}{1-\psi}\right]=0$$
$$\sum_{i=1}^{m}\left[y^{(i)}(1-\psi)-(1-y^{(i)})\psi\right]=0$$
$$\sum_{i=1}^{m}y^{(i)}=m\psi$$
$$\psi=\frac{\sum_{i=1}^{m}1\{y^{(i)}=1\}}{m}$$
对于(2)式:
$$\nabla_{\mu_0}\sum_{i=1}^{m}(1-y^{(i)})(x^{(i)}-\mu_0)^T\Sigma^{-1}(x^{(i)}-\mu_0)=0$$
对$\mu_0$取微分:
$$\sum_{i=1}^{m}(1-y^{(i)})\,d\left[(x^{(i)}-\mu_0)^T\Sigma^{-1}(x^{(i)}-\mu_0)\right]=0$$
$$\sum_{i=1}^{m}(1-y^{(i)})\left[d(x^{(i)}-\mu_0)^T\,\Sigma^{-1}(x^{(i)}-\mu_0)+(x^{(i)}-\mu_0)^T\Sigma^{-1}\,d(x^{(i)}-\mu_0)\right]=0$$
由于$\Sigma^{-1}$对称,括号内两项相等,均为$-(x^{(i)}-\mu_0)^T\Sigma^{-1}d\mu_0$,故
$$\sum_{i=1}^{m}(1-y^{(i)})\Sigma^{-1}(x^{(i)}-\mu_0)=0$$
$$\sum_{i=1}^{m}(1-y^{(i)})(x^{(i)}-\mu_0)=0$$
$$\sum_{i=1}^{m}(1-y^{(i)})x^{(i)}=\sum_{i=1}^{m}(1-y^{(i)})\mu_0$$
$$\mu_0=\frac{\sum_{i=1}^{m}1\{y^{(i)}=0\}x^{(i)}}{\sum_{i=1}^{m}1\{y^{(i)}=0\}}$$
对于(3)式,类同(2)式:
$$\mu_1=\frac{\sum_{i=1}^{m}1\{y^{(i)}=1\}x^{(i)}}{\sum_{i=1}^{m}1\{y^{(i)}=1\}}$$
对于(4)式:
$$\nabla_{\Sigma}\left[-\frac{m}{2}\log|\Sigma|-\frac{1}{2}\sum_{i=1}^{m}(1-y^{(i)})(x^{(i)}-\mu_0)^T\Sigma^{-1}(x^{(i)}-\mu_0)-\frac{1}{2}\sum_{i=1}^{m}y^{(i)}(x^{(i)}-\mu_1)^T\Sigma^{-1}(x^{(i)}-\mu_1)\right]=0$$
两边同乘$-2$,得
$$\nabla_{\Sigma}(m\log|\Sigma|)+\nabla_{\Sigma}\sum_{i=1}^{m}(1-y^{(i)})(x^{(i)}-\mu_0)^T\Sigma^{-1}(x^{(i)}-\mu_0)+\nabla_{\Sigma}\sum_{i=1}^{m}y^{(i)}(x^{(i)}-\mu_1)^T\Sigma^{-1}(x^{(i)}-\mu_1)=0$$
记第$i$类($i\in\{0,1\}$)样本的协方差矩阵为$S_i=\frac{1}{m_i}\sum_{k:\,y^{(k)}=i}(x^{(k)}-\mu_i)(x^{(k)}-\mu_i)^T$(其中$m_i$为属于第$i$类的样本数),下面通过$S_i$简化表达上式
$$
\begin{aligned}
&\nabla_{\Sigma}\sum_{k:\,y^{(k)}=i}(x^{(k)}-\mu_i)^T\Sigma^{-1}(x^{(k)}-\mu_i)\\
&=\nabla_{\Sigma}\,tr\left(\sum_{k:\,y^{(k)}=i}(x^{(k)}-\mu_i)^T\Sigma^{-1}(x^{(k)}-\mu_i)\right)\\
&=\nabla_{\Sigma}\,tr\left(\sum_{k:\,y^{(k)}=i}(x^{(k)}-\mu_i)(x^{(k)}-\mu_i)^T\Sigma^{-1}\right)\\
&=\nabla_{\Sigma}\,tr(m_iS_i\Sigma^{-1})
\end{aligned}
$$
其中$m_i=\sum_{k=1}^{m}1\{y^{(k)}=i\}$,
$\nabla_{\Sigma}tr(m_iS_i\Sigma^{-1})=-m_i(\Sigma^{-1}S_i\Sigma^{-1})^T=-m_i\Sigma^{-1}S_i\Sigma^{-1}$($S_i$与$\Sigma$均为对称矩阵),
而$\nabla_{\Sigma}(m\log|\Sigma|)=m\frac{1}{|\Sigma|}|\Sigma|\Sigma^{-1}=m\Sigma^{-1}$,
因此,(4)式可简化为
$$m\Sigma^{-1}-\sum_{i=0}^{1}m_i\Sigma^{-1}S_i\Sigma^{-1}=0$$
上式左右各乘$\Sigma$,得
$$\Sigma=\frac{1}{m}\sum_{i=0}^{1}m_iS_i$$
$$\Sigma=\frac{1}{m}\sum_{i=1}^{m}(x^{(i)}-\mu_{y^{(i)}})(x^{(i)}-\mu_{y^{(i)}})^T$$