Let's go straight to the code and understand it from there; no fluff, no second-hand hand-waving.
Gradient Descent
Batch gradient descent uses the entire dataset for every update, which gives an accurate gradient but makes each iteration expensive. Stochastic gradient descent updates on one sample at a time, so each step is cheap but noisy and less accurate. Mini-batch gradient descent combines the advantages of both and is also easy to parallelize.
times = 1000;
alpha = 0.01; % learning rate (not defined in the original snippet; an example value)
[m, n] = size(x);
w = zeros(n, 1); % initialize the weights to zero
%batch gradient descent: use all m samples for every update
for i=1:times
delta = 1.0/m * x' * (x*w - y);
w = w - alpha * delta;
end
%SGD
for i=1:times
for j=1:m
delta = x(j, :)' * (x(j, :) * w - y(j)); % gradient of a single sample (no 1/m averaging here)
w = w - alpha * delta;
end
end
%minibatch
batch = 10;
for i=1:times
for j = 1 : batch : m - batch + 1
delta = 1.0/batch * x(j:j+batch-1, :)' * (x(j:j+batch-1, :) * w - y(j:j+batch-1)); % average the gradient over one mini-batch
w = w - alpha * delta;
end
end
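To try out the three loops above, synthetic linear-regression data is enough; the sketch below is illustrative only (the sizes, noise level, alpha and times are arbitrary example values).
m = 1000; n = 5;
x = [ones(m, 1) randn(m, n - 1)]; % first column acts as an intercept
w_true = randn(n, 1);
y = x * w_true + 0.01 * randn(m, 1); % noisy linear targets
alpha = 0.01; times = 1000; batch = 10;
% after running any of the loops above, norm(w - w_true) should be close to zero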
Neural Network
Training Loop
m = size(train_x, 1); % number of training samples
numbatches = m / batchsize; % assumes m is a multiple of batchsize
nn.learningRate = 2; %typically needs to be lower when using 'sigm' activation function and non-normalized inputs.
nn.scaling_learningRate = 1; %Scaling factor for the learning rate (each epoch)
L = zeros(numepochs*numbatches,1); %L is the sum squared error for each training batch
n = 1;
for i = 1 : numepochs
%shuffle the sample order every epoch
kk = randperm(m);
for l = 1 : numbatches
batch_x = train_x(kk((l - 1) * batchsize + 1 : l * batchsize), :);
batch_y = train_y(kk((l - 1) * batchsize + 1 : l * batchsize), :);
%forward pass
nn = nnff(nn, batch_x, batch_y);
%backward pass: compute the gradients
nn = nnbp(nn);
%apply the gradients to update the weights
nn = nnapplygrads(nn);
L(n) = nn.L;
n = n + 1;
end
nn.learningRate = nn.learningRate * nn.scaling_learningRate;
end
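These snippets follow the structure of nntrain in DeepLearnToolbox; assuming that interface, a typical call looks roughly like this (layer sizes and opts values are example choices, not from the original):
nn = nnsetup([784 100 10]); % input, hidden and output layer sizes
opts.numepochs = 10;
opts.batchsize = 100; % the number of samples must be divisible by batchsize
[nn, L] = nntrain(nn, train_x, train_y, opts);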
Forward Pass
%example with sigmoid hidden units and a softmax output layer
function nn = nnff(nn, x, y)
n = nn.n; % number of layers
m = size(x, 1);
x = [ones(m,1) x]; % prepend the bias column
nn.a{1} = x;
%compute the activations of all hidden layers
for i = 2 : n-1
nn.a{i} = sigm(nn.a{i - 1} * nn.W{i - 1}');
%Add the bias term
nn.a{i} = [ones(m,1) nn.a{i}];
end
%softmax output and loss (omitted here; see the sketch after this function)
end
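The omitted step computes the softmax output and the cross-entropy loss; a sketch of what it looks like, reusing the variable names from nnff above (a numerically stable softmax subtracts the row maximum before exponentiating):
%softmax output layer
nn.a{n} = nn.a{n - 1} * nn.W{n - 1}';
nn.a{n} = exp(bsxfun(@minus, nn.a{n}, max(nn.a{n}, [], 2)));
nn.a{n} = bsxfun(@rdivide, nn.a{n}, sum(nn.a{n}, 2));
%error term used by nnbp, and the cross-entropy loss
nn.e = y - nn.a{n};
nn.L = -sum(sum(y .* log(nn.a{n}))) / m;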
Backpropagation
function nn = nnbp(nn)
n = nn.n; % number of layers
d{n} = - nn.e; % output-layer delta; with a softmax output and cross-entropy loss this equals nn.a{n} - y
%propagate the deltas back down to layer 2
for i = (n - 1) : -1 : 2
d_act = nn.a{i} .* (1 - nn.a{i}); % derivative of the sigmoid
if i+1==n
d{i} = (d{i + 1} * nn.W{i}) .* d_act;
else % the delta of a hidden layer carries a bias column that must be dropped
d{i} = (d{i + 1}(:,2:end) * nn.W{i}) .* d_act;
end
end
for i = 1 : (n - 1)
if i+1==n
nn.dW{i} = (d{i + 1}' * nn.a{i}) / size(d{i + 1}, 1);
else
nn.dW{i} = (d{i + 1}(:,2:end)' * nn.a{i}) / size(d{i + 1}, 1);
end
end
end
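The d_act term above is the derivative of the sigmoid, sigm(z).*(1-sigm(z)); a quick numerical check of that identity (sigm is defined inline here just for the check):
sigm = @(z) 1 ./ (1 + exp(-z));
z = randn(1, 5);
a = sigm(z);
analytic = a .* (1 - a); % the d_act term used in nnbp
h = 1e-6;
numeric = (sigm(z + h) - sigm(z - h)) / (2 * h); % central finite difference
max(abs(analytic - numeric)) % should be on the order of 1e-10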
Gradient Descent Update
function nn = nnapplygrads(nn)
for i = 1 : (nn.n - 1)
dW = nn.dW{i};
nn.W{i} = nn.W{i} - nn.learningRate * dW;
end
end
Dropout
%at parameter initialization
nn.dropoutFraction = 0.5; % Dropout fraction
%in nnff (forward pass)
if(nn.dropoutFraction > 0)
if(nn.testing)
%at test time no units are dropped; scale the activations by (1 - dropoutFraction) so their expected value matches training
nn.a{i} = nn.a{i}.*(1 - nn.dropoutFraction);
else
%randomly mask the activations: only the surviving units are used to compute the next layer
nn.dropOutMask{i} = (rand(size(nn.a{i}))>nn.dropoutFraction);
nn.a{i} = nn.a{i}.*nn.dropOutMask{i};
end
end
%in nnbp (backward pass)
%reuse the same mask when computing gradients, so dropped units pass no gradient back to the previous layer
if(nn.dropoutFraction>0)
d{i} = d{i} .* [ones(size(d{i},1),1) nn.dropOutMask{i}];
end
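Why the test-time scaling works: averaged over many random masks, the expected value of a masked activation is exactly the activation scaled by (1 - dropoutFraction). A small check:
p = 0.5; % dropoutFraction
a = rand(1, 5); % some activations
masks = rand(10000, 5) > p; % many independent dropout masks
mean(bsxfun(@times, masks, a), 1) % approximately a * (1 - p), matching nn.a{i}.*(1 - nn.dropoutFraction)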
Stacked Auto Encoder
Used for pre-training: the network is initialized layer by layer, giving the weights w a good starting point before supervised training.
function sae = saesetup(size)
for u = 2 : numel(size)
sae.ae{u-1} = nnsetup([size(u-1) size(u) size(u-1)]);
end
end
function sae = saetrain(sae, x, opts)
for i = 1 : numel(sae.ae)
%train one AE per layer: t.a{1} is its input x, t.a{2} is the hidden-layer output
disp(['Training AE ' num2str(i) '/' num2str(numel(sae.ae))]);
%each AE is a three-layer network trained to reconstruct its input; keep the weights between its first two layers (the encoder)
sae.ae{i} = nntrain(sae.ae{i}, x, x, opts);
t = nnff(sae.ae{i}, x, x);
x = t.a{2};
%remove bias term
x = x(:,2:end);
end
end
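A typical way to use the pretrained stack (a sketch assuming the DeepLearnToolbox-style interface; layer sizes and opts values are examples): train the autoencoders on the unlabeled inputs, copy each encoder's weights into a supervised network, then fine-tune with nntrain.
sae = saesetup([784 100]); % one autoencoder with 100 hidden units
opts.numepochs = 1;
opts.batchsize = 100;
sae = saetrain(sae, train_x, opts);
%initialize a supervised network with the pretrained encoder weights, then fine-tune
nn = nnsetup([784 100 10]);
nn.W{1} = sae.ae{1}.W{1};
nn = nntrain(nn, train_x, train_y, opts);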
Momentum
%at initialization
nn.momentum = 0.5;
nn.vW{i - 1} = zeros(size(nn.W{i - 1}));
%in nnapplygrads (weight update)
if(nn.momentum>0)
nn.vW{i} = nn.momentum*nn.vW{i} + dW;
dW = nn.vW{i};
end
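Putting this together with nnapplygrads above, the momentum branch sits between reading the gradient and applying the update; a sketch of the combined loop:
for i = 1 : (nn.n - 1)
dW = nn.dW{i};
if(nn.momentum > 0)
nn.vW{i} = nn.momentum * nn.vW{i} + dW; % accumulate a velocity across updates
dW = nn.vW{i};
end
nn.W{i} = nn.W{i} - nn.learningRate * dW;
end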
Sparsity
%init
nn.nonSparsityPenalty = 0; % Non sparsity penalty
nn.sparsityTarget = 0.05; % Sparsity target
nn.inputZeroMaskedFraction = 0; % Used for Denoising AutoEncoders
nn.p{i} = zeros(1, nn.size(i));
%nnff: maintain a running estimate of each unit's mean activation
if(nn.nonSparsityPenalty>0)
nn.p{i} = 0.99 * nn.p{i} + 0.01 * mean(nn.a{i}, 1);
end
%nnbp
if(nn.nonSparsityPenalty>0)
pi = repmat(nn.p{i}, size(nn.a{i}, 1), 1);
sparsityError = [zeros(size(nn.a{i},1),1) nn.nonSparsityPenalty ...
* (-nn.sparsityTarget ./ pi + (1 - nn.sparsityTarget) ./ (1 - pi))];
end
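The sparsityError term is the derivative of a KL-divergence penalty that pushes each unit's running mean activation nn.p{i} toward sparsityTarget; in nnbp it is added to the backpropagated signal before multiplying by the activation derivative (the bias-column handling mirrors the loop shown earlier):
d{i} = (d{i + 1} * nn.W{i} + sparsityError) .* d_act;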