
Commit 1fef8e0

concat word/char vecs
1 parent 20e0ae2 commit 1fef8e0

8 files changed: +177 -42 lines changed

.#train.lua

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+yoon@yoon-VirtualBox.4277:1434833625

model/LSTMCNN.lua

Lines changed: 11 additions & 7 deletions
@@ -24,7 +24,7 @@ function LSTMCNN.lstmcnn(word_vocab_size, rnn_size, n, dropout, word_vec_size, c
     table.insert(inputs, nn.Identity()()) -- prev_h[L]
   end
 
-  local x, input_size_L, word_vec, char_vec, cnn_output
+  local x, input_size_L, word_vec, char_vec, cnn_output, pool_layer
   local outputs = {}
   for L = 1,n do
     -- c,h from previous timesteps
@@ -38,13 +38,17 @@ function LSTMCNN.lstmcnn(word_vocab_size, rnn_size, n, dropout, word_vec_size, c
       for i = 1, #kernels do
         local reduced_l = length - kernels[i] + 1
         local conv_layer = nn.TemporalConvolution(char_vec_size, feature_maps[i], kernels[i])(char_vec)
-        local pool_layer = nn.TemporalMaxPooling(reduced_l)(nn.Tanh()(conv_layer))
+        pool_layer = nn.TemporalMaxPooling(reduced_l)(nn.Tanh()(conv_layer))
         table.insert(layer1, pool_layer)
       end
-      local layer1_concat = nn.JoinTable(3)(layer1)
-      cnn_output = nn.Squeeze()(layer1_concat)
+      if #kernels > 1 then
+        local layer1_concat = nn.JoinTable(3)(layer1)
+        cnn_output = nn.Squeeze()(layer1_concat)
+      else
+        cnn_output = nn.Squeeze()(pool_layer)
+      end
       x = nn.Identity()(cnn_output)
-      input_size_L = torch.Tensor(feature_maps):sum()
+      input_size_L = torch.Tensor(feature_maps):sum()
     else
       x = outputs[(L-1)*2]
       if dropout > 0 then x = nn.Dropout(dropout)(x) end -- apply dropout, if any
@@ -58,8 +62,8 @@ function LSTMCNN.lstmcnn(word_vocab_size, rnn_size, n, dropout, word_vec_size, c
     local sigmoid_chunk = nn.Narrow(2, 1, 3 * rnn_size)(all_input_sums)
     sigmoid_chunk = nn.Sigmoid()(sigmoid_chunk)
     local in_gate = nn.Narrow(2, 1, rnn_size)(sigmoid_chunk)
-    local forget_gate = nn.Narrow(2, rnn_size + 1, rnn_size)(sigmoid_chunk)
-    local out_gate = nn.Narrow(2, 2 * rnn_size + 1, rnn_size)(sigmoid_chunk)
+    local out_gate = nn.Narrow(2, rnn_size + 1, rnn_size)(sigmoid_chunk)
+    local forget_gate = nn.Narrow(2, 2 * rnn_size + 1, rnn_size)(sigmoid_chunk)
     -- decode the write inputs
     local in_transform = nn.Narrow(2, 3 * rnn_size + 1, rnn_size)(all_input_sums)
     in_transform = nn.Tanh()(in_transform)

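For reference, a minimal standalone sketch (not part of the commit; batch size and word length are assumed, other sizes follow the new train.lua defaults) of the character-CNN path that LSTMCNN.lua builds above, including the new single-kernel branch that avoids nn.JoinTable when only one kernel width is given:

require 'nn'
require 'nngraph'

local char_vec_size, length = 30, 20              -- assumed char embedding size and padded word length
local feature_maps, kernels = {50, 50, 50}, {2, 3, 4}

local char_vec = nn.Identity()()                  -- batch_size x length x char_vec_size
local layer1, pool_layer = {}, nil
for i = 1, #kernels do
  local reduced_l = length - kernels[i] + 1
  local conv = nn.TemporalConvolution(char_vec_size, feature_maps[i], kernels[i])(char_vec)
  pool_layer = nn.TemporalMaxPooling(reduced_l)(nn.Tanh()(conv))  -- batch_size x 1 x feature_maps[i]
  table.insert(layer1, pool_layer)
end
local cnn_output
if #kernels > 1 then
  cnn_output = nn.Squeeze()(nn.JoinTable(3)(layer1))  -- batch_size x sum(feature_maps)
else
  cnn_output = nn.Squeeze()(pool_layer)               -- JoinTable needs at least two inputs, hence the branch
end
local charcnn = nn.gModule({char_vec}, {cnn_output})
print(charcnn:forward(torch.randn(8, length, char_vec_size)):size())  -- 8 x 150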
model/LSTMTDNN.lua renamed to model/LSTMCNN2.lua

Lines changed: 22 additions & 24 deletions
@@ -1,6 +1,4 @@
-
-local LSTMTDNN = {}
-
+local LSTMCNN = {}
 
 local ok, cunn = pcall(require, 'fbcunn')
 if not ok then
@@ -9,9 +7,8 @@ else
   LookupTable = fbcunn.LookupTableGPU
 end
 
-function LSTMTDNN.lstmtdnn(word_vocab_size, rnn_size, n, dropout, word_vec_size, char_vec_size, char_vocab_size,
-                           num_feature_maps, kernels, word2char2idx)
-  -- input_size = vocab size
+function LSTMCNN.lstmcnn(word_vocab_size, rnn_size, n, dropout, word_vec_size, char_vec_size, char_vocab_size,
+                         feature_maps, kernels, word2char2idx)
   -- rnn_size = dimensionality of hidden layers
   -- n = number of layers
   -- k = word embedding size
@@ -20,38 +17,39 @@ function LSTMTDNN.lstmtdnn(word_vocab_size, rnn_size, n, dropout, word_vec_size,
 
   -- there will be 2*n+1 inputs
   local length = word2char2idx:size(2)
-  local word_vec_size = word_vec_size or rnn_size
   local inputs = {}
-  --table.insert(inputs, nn.Identity()()) -- batch_size x 1 (word indices)
+  table.insert(inputs, nn.Identity()()) -- batch_size x 1 (word indices)
   table.insert(inputs, nn.Identity()()) -- batch_size x word length (char indices)
   for L = 1,n do
     table.insert(inputs, nn.Identity()()) -- prev_c[L]
     table.insert(inputs, nn.Identity()()) -- prev_h[L]
   end
 
-  local x, input_size_L, word_vec, char_vec, tdnn_output
+  local x, input_size_L, word_vec, char_vec, cnn_output, pool_layer
   local outputs = {}
   for L = 1,n do
     -- c,h from previous timesteps
-    local prev_h = inputs[L*2+2-1]
-    local prev_c = inputs[L*2+1-1]
+    local prev_h = inputs[L*2+2]
+    local prev_c = inputs[L*2+1]
     -- the input to this layer
     if L == 1 then
-      char_vec = nn.LookupTable(char_vocab_size, char_vec_size)(inputs[1]) --batch_size * word length * char_vec_size
+      word_vec = nn.LookupTable(word_vocab_size, word_vec_size)(inputs[1])
+      char_vec = nn.LookupTable(char_vocab_size, char_vec_size)(inputs[2]) --batch_size * word length * char_vec_size
       local layer1 = {}
       for i = 1, #kernels do
         local reduced_l = length - kernels[i] + 1
-        local conv_layer = nn.TemporalConvolution(char_vec_size, num_feature_maps, kernels[i])(char_vec)
-        local pool_layer = nn.TemporalMaxPooling(reduced_l)(nn.Tanh()(conv_layer))
+        local conv_layer = nn.TemporalConvolution(char_vec_size, feature_maps[i], kernels[i])(char_vec)
+        pool_layer = nn.TemporalMaxPooling(reduced_l)(nn.Tanh()(conv_layer))
         table.insert(layer1, pool_layer)
       end
-      local layer1_concat = nn.JoinTable(3)(layer1)
-      tdnn_output = nn.Squeeze()(layer1_concat)
-      --tdnn_output = TDNN.tdnn(length, char_vec_size, tdnn_output_size, kernels) -- batch_size * tdnn_output_size
-      --word_vec = LookupTable(word_vocab_size, word_vec_size)(inputs[1])
-      --x = nn.Identity()(word_vec)
-      x = nn.Identity()(tdnn_output)
-      input_size_L = word_vec_size
+      if #kernels > 1 then
+        local layer1_concat = nn.JoinTable(3)(layer1)
+        cnn_output = nn.Squeeze()(layer1_concat)
+      else
+        cnn_output = nn.Squeeze()(pool_layer)
+      end
+      x = nn.JoinTable(2)({cnn_output, word_vec})
+      input_size_L = torch.Tensor(feature_maps):sum() + word_vec_size
     else
       x = outputs[(L-1)*2]
       if dropout > 0 then x = nn.Dropout(dropout)(x) end -- apply dropout, if any
@@ -65,8 +63,8 @@ function LSTMTDNN.lstmtdnn(word_vocab_size, rnn_size, n, dropout, word_vec_size,
     local sigmoid_chunk = nn.Narrow(2, 1, 3 * rnn_size)(all_input_sums)
     sigmoid_chunk = nn.Sigmoid()(sigmoid_chunk)
     local in_gate = nn.Narrow(2, 1, rnn_size)(sigmoid_chunk)
-    local forget_gate = nn.Narrow(2, rnn_size + 1, rnn_size)(sigmoid_chunk)
-    local out_gate = nn.Narrow(2, 2 * rnn_size + 1, rnn_size)(sigmoid_chunk)
+    local out_gate = nn.Narrow(2, rnn_size + 1, rnn_size)(sigmoid_chunk)
+    local forget_gate = nn.Narrow(2, 2 * rnn_size + 1, rnn_size)(sigmoid_chunk)
     -- decode the write inputs
     local in_transform = nn.Narrow(2, 3 * rnn_size + 1, rnn_size)(all_input_sums)
     in_transform = nn.Tanh()(in_transform)
@@ -92,5 +90,5 @@ function LSTMTDNN.lstmtdnn(word_vocab_size, rnn_size, n, dropout, word_vec_size,
   return nn.gModule(inputs, outputs)
 end
 
-return LSTMTDNN
+return LSTMCNN
 

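The renamed module now feeds the first LSTM layer the concatenation of the word embedding and the char-CNN output (the "concat word/char vecs" of the commit message), so the layer-1 input width becomes sum(feature_maps) + word_vec_size. A tensor-level sketch of that join, with an assumed batch size and sizes matching the new train.lua defaults:

require 'nn'

local batch_size, word_vec_size = 8, 150          -- batch size assumed; word_vec_size as in the new default
local feature_maps = {50, 50, 50}
local cnn_size = torch.Tensor(feature_maps):sum() -- 150, same value used for input_size_L

local word_vec = torch.randn(batch_size, word_vec_size)  -- stand-in for the word LookupTable output
local cnn_output = torch.randn(batch_size, cnn_size)     -- stand-in for the char-CNN output

local x = nn.JoinTable(2):forward({cnn_output, word_vec})
print(x:size(2) == cnn_size + word_vec_size)              -- true: 150 + 150 = 300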
model/LSTMCNN2.lua~

Lines changed: 94 additions & 0 deletions
@@ -0,0 +1,94 @@
+local LSTMCNN = {}
+
+local ok, cunn = pcall(require, 'fbcunn')
+if not ok then
+  LookupTable = nn.LookupTable
+else
+  LookupTable = fbcunn.LookupTableGPU
+end
+
+function LSTMCNN.lstmcnn(word_vocab_size, rnn_size, n, dropout, word_vec_size, char_vec_size, char_vocab_size,
+                         feature_maps, kernels, word2char2idx)
+  -- rnn_size = dimensionality of hidden layers
+  -- n = number of layers
+  -- k = word embedding size
+
+  dropout = dropout or 0
+
+  -- there will be 2*n+1 inputs
+  local length = word2char2idx:size(2)
+  local inputs = {}
+  table.insert(inputs, nn.Identity()()) -- batch_size x 1 (word indices)
+  table.insert(inputs, nn.Identity()()) -- batch_size x word length (char indices)
+  for L = 1,n do
+    table.insert(inputs, nn.Identity()()) -- prev_c[L]
+    table.insert(inputs, nn.Identity()()) -- prev_h[L]
+  end
+
+  local x, input_size_L, word_vec, char_vec, cnn_output, pool_layer
+  local outputs = {}
+  for L = 1,n do
+    -- c,h from previous timesteps
+    local prev_h = inputs[L*2+2]
+    local prev_c = inputs[L*2+1]
+    -- the input to this layer
+    if L == 1 then
+      word_vec = nn.LookupTable(word_vocab_size, word_vec_size)(inputs[1])
+      char_vec = nn.LookupTable(char_vocab_size, char_vec_size)(inputs[2]) --batch_size * word length * char_vec_size
+      local layer1 = {}
+      for i = 1, #kernels do
+        local reduced_l = length - kernels[i] + 1
+        local conv_layer = nn.TemporalConvolution(char_vec_size, feature_maps[i], kernels[i])(char_vec)
+        pool_layer = nn.TemporalMaxPooling(reduced_l)(nn.Tanh()(conv_layer))
+        table.insert(layer1, pool_layer)
+      end
+      if #kernels > 1 then
+        local layer1_concat = nn.JoinTable(3)(layer1)
+        cnn_output = nn.Squeeze()(layer1_concat)
+      else
+        cnn_output = nn.Squeeze()(pool_layer)
+      end
+      x = nn.CAddTable()({cnn_output, word_vec})
+      input_size_L = torch.Tensor(feature_maps):sum()
+    else
+      x = outputs[(L-1)*2]
+      if dropout > 0 then x = nn.Dropout(dropout)(x) end -- apply dropout, if any
+      input_size_L = rnn_size
+    end
+    -- evaluate the input sums at once for efficiency
+    local i2h = nn.Linear(input_size_L, 4 * rnn_size)(x)
+    local h2h = nn.Linear(rnn_size, 4 * rnn_size)(prev_h)
+    local all_input_sums = nn.CAddTable()({i2h, h2h})
+    -- decode the gates
+    local sigmoid_chunk = nn.Narrow(2, 1, 3 * rnn_size)(all_input_sums)
+    sigmoid_chunk = nn.Sigmoid()(sigmoid_chunk)
+    local in_gate = nn.Narrow(2, 1, rnn_size)(sigmoid_chunk)
+    local out_gate = nn.Narrow(2, rnn_size + 1, rnn_size)(sigmoid_chunk)
+    local forget_gate = nn.Narrow(2, 2 * rnn_size + 1, rnn_size)(sigmoid_chunk)
+    -- decode the write inputs
+    local in_transform = nn.Narrow(2, 3 * rnn_size + 1, rnn_size)(all_input_sums)
+    in_transform = nn.Tanh()(in_transform)
+    -- perform the LSTM update
+    local next_c = nn.CAddTable()({
+      nn.CMulTable()({forget_gate, prev_c}),
+      nn.CMulTable()({in_gate, in_transform})
+    })
+    -- gated cells form the output
+    local next_h = nn.CMulTable()({out_gate, nn.Tanh()(next_c)})
+
+    table.insert(outputs, next_c)
+    table.insert(outputs, next_h)
+  end
+
+  -- set up the decoder
+  local top_h = outputs[#outputs]
+  if dropout > 0 then top_h = nn.Dropout(dropout)(top_h) end
+  local proj = nn.Linear(rnn_size, word_vocab_size)(top_h)
+  local logsoft = nn.LogSoftMax()(proj)
+  table.insert(outputs, logsoft)
+
+  return nn.gModule(inputs, outputs)
+end
+
+return LSTMCNN
+

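Both LSTM variants above decode the gates by slicing the 4 * rnn_size pre-activation with nn.Narrow: the first 3 * rnn_size entries pass through a sigmoid and are split into in/out/forget gates (in the reordered arrangement of this commit), and the last rnn_size entries form the tanh write candidate. A tensor-level sketch with assumed batch and hidden sizes, not part of the commit:

require 'nn'

local rnn_size, batch_size = 200, 8
local all_input_sums = torch.randn(batch_size, 4 * rnn_size)   -- stand-in for i2h + h2h pre-activations

local sigmoid_chunk = nn.Sigmoid():forward(nn.Narrow(2, 1, 3 * rnn_size):forward(all_input_sums))
local in_gate      = nn.Narrow(2, 1, rnn_size):forward(sigmoid_chunk)
local out_gate     = nn.Narrow(2, rnn_size + 1, rnn_size):forward(sigmoid_chunk)
local forget_gate  = nn.Narrow(2, 2 * rnn_size + 1, rnn_size):forward(sigmoid_chunk)
local in_transform = nn.Tanh():forward(nn.Narrow(2, 3 * rnn_size + 1, rnn_size):forward(all_input_sums))

print(in_gate:size(2), out_gate:size(2), forget_gate:size(2), in_transform:size(2))  -- 200 200 200 200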
train.lua

Lines changed: 10 additions & 10 deletions
@@ -19,7 +19,7 @@ require 'util.misc'
 local BatchLoader = require 'util.BatchLoader'
 local model_utils = require 'util.model_utils'
 local LSTM = require 'model.LSTM'
-local LSTMCNN = require 'model.LSTMCNN'
+local LSTMCNN = require 'model.LSTMCNN2'
 
 local stringx = require('pl.stringx')
 
@@ -32,10 +32,10 @@ cmd:text('Options')
 cmd:option('-data_dir','data/ptb','data directory. Should contain the file input.txt with input data')
 -- model params
 cmd:option('-rnn_size', 200, 'size of LSTM internal state')
-cmd:option('-word_vec_size', 200, 'dimensionality of word embeddings')
+cmd:option('-word_vec_size', 150, 'dimensionality of word embeddings')
 cmd:option('-char_vec_size', 30, 'dimensionality of character embeddings')
-cmd:option('-feature_maps', '{25,50,75,100,125}', 'number of feature maps in the CNN')
-cmd:option('-kernels', '{1,2,3,4,5}', 'conv net kernel widths')
+cmd:option('-feature_maps', '{50,50,50}', 'number of feature maps in the CNN')
+cmd:option('-kernels', '{2,3,4}', 'conv net kernel widths')
 cmd:option('-num_layers', 2, 'number of layers in the LSTM')
 cmd:option('-model', 'lstm', 'for now only lstm is supported. keep fixed')
 -- optimization
@@ -147,12 +147,12 @@ function eval_split(split_idx, max_batches)
             -- have to convert to float because integers can't be cuda()'d
             x = x:float():cuda()
             y = y:float():cuda()
-            x_char = x:float():cuda()
+            x_char = x_char:float():cuda()
         end
         -- forward pass
         for t=1,opt.seq_length do
             clones.rnn[t]:evaluate() -- for dropout proper functioning
-            local lst = clones.rnn[t]:forward{x[{{}, t}], unpack(rnn_state[t-1])}
+            local lst = clones.rnn[t]:forward{x[{{}, t}], x_char[{{},t}], unpack(rnn_state[t-1])}
             rnn_state[t] = {}
             for i=1,#init_state do table.insert(rnn_state[t], lst[i]) end
             prediction = lst[#lst]
@@ -190,7 +190,7 @@ function feval(x)
    local loss = 0
    for t=1,opt.seq_length do
        clones.rnn[t]:training() -- make sure we are in correct mode (this is cheap, sets flag)
-        local lst = clones.rnn[t]:forward{x[{{}, t}], unpack(rnn_state[t-1])}
+        local lst = clones.rnn[t]:forward{x[{{}, t}], x_char[{{},t}], unpack(rnn_state[t-1])}
        rnn_state[t] = {}
        for i=1,#init_state do table.insert(rnn_state[t], lst[i]) end -- extract the state, without output
        predictions[t] = lst[#lst] -- last element is the prediction
@@ -204,13 +204,13 @@ function feval(x)
        -- backprop through loss, and softmax/linear
        local doutput_t = clones.criterion[t]:backward(predictions[t], y[{{}, t}])
        table.insert(drnn_state[t], doutput_t)
-        local dlst = clones.rnn[t]:backward({x[{{}, t}], unpack(rnn_state[t-1])}, drnn_state[t])
+        local dlst = clones.rnn[t]:backward({x[{{}, t}], x_char[{{},t}], unpack(rnn_state[t-1])}, drnn_state[t])
        drnn_state[t-1] = {}
        for k,v in pairs(dlst) do
-            if k > 1 then -- k == 1 is gradient on x, which we dont need
+            if k > 2 then -- k == 1 is gradient on x, which we dont need
                -- note we do k-1 because first item is dembeddings, and then follow the
                -- derivatives of the state, starting at index 2. I know...
-                drnn_state[t-1][k-1] = v
+                drnn_state[t-1][k-2] = v
            end
        end
    end

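train.lua now passes x_char alongside x at every timestep, so backward() on each clone returns two input gradients (for the word indices and the char indices) ahead of the recurrent-state gradients; that is why the bookkeeping moves from k > 1 / k-1 to k > 2 / k-2. A toy nngraph module with the same two-data-inputs-plus-state layout (hypothetical sizes, not the commit's model) shows the indexing:

require 'nn'
require 'nngraph'

local word_in, char_in, prev_h = nn.Identity()(), nn.Identity()(), nn.Identity()()
local out = nn.CAddTable()({nn.Linear(3, 5)(word_in), nn.Linear(4, 5)(char_in), nn.Linear(5, 5)(prev_h)})
local net = nn.gModule({word_in, char_in, prev_h}, {out})

local inputs = {torch.randn(2, 3), torch.randn(2, 4), torch.randn(2, 5)}
net:forward(inputs)
local dinputs = net:backward(inputs, torch.randn(2, 5))
-- dinputs[1] and dinputs[2] are the gradients w.r.t. the two data inputs and are discarded;
-- the recurrent-state gradients start at index 3, hence "if k > 2 then drnn_state[t-1][k-2] = v".
print(#dinputs)  -- 3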
util/BatchLoader.lua

Lines changed: 1 addition & 1 deletion
@@ -113,7 +113,7 @@ function BatchLoader:next_batch(split_idx)
     end
     -- pull out the correct next batch
     local idx = self.batch_idx[split_idx]
-    return self.all_batches[split_idx][3][idx], self.all_batches[split_idx][2][idx], self.all_batches[split_idx][3][idx]
+    return self.all_batches[split_idx][1][idx], self.all_batches[split_idx][2][idx], self.all_batches[split_idx][3][idx]
 end
 
 function BatchLoader.text_to_tensor(input_files, out_vocabfile, out_tensorfile)

util/LSTMDTNN.lua

Whitespace-only changes.

util/OuterProduct.lua

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+local OuterProduct, parent = torch.class('nn.OuterProduct', 'nn.Module')
+
+function OuterProduct:__init()
+  parent.__init(self)
+  self.gradInput = {torch.Tensor(), torch.Tensor()}
+end
+
+function OuterProduct:updateOutput(input)
+  assert(#input==2, 'only supports outer products of 2 vectors')
+  local a, b = table.unpack(input)
+  assert(a:nDimension() == 1 or a:nDimension() == 2, 'input tensors must be 1D or 2D')
+  if a:nDimension()==1 then
+    self.output:resize(a:size(1), b:size(1))
+    self.output:ger(a, b)
+  else -- mini batch processing
+    self.output:resize(a:size(1), a:size(2), b:size(2))
+    for i = 1, a:size(1) do
+      self.output[i]:ger(a[i], b[i])
+    end
+  end
+  return self.output
+end
+
+function OuterProduct:updateGradInput(input, gradOutput)
+  local a, b = table.unpack(input)
+  self.gradInput[1]:resizeAs(a)
+  self.gradInput[2]:resizeAs(b)
+  if a:nDimension()==1 then
+    self.gradInput[1]:mv(gradOutput, b)
+    self.gradInput[2]:mv(gradOutput:t(), a)
+  else -- mini batch processing
+    for i = 1, gradOutput:size(1) do
+      self.gradInput[1][i]:mv(gradOutput[i], b[i])
+      self.gradInput[2][i]:mv(gradOutput[i]:t(), a[i])
+    end
+  end
+  return self.gradInput
+end

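A usage sketch for the new nn.OuterProduct module (not part of the commit; it assumes the file is reachable on package.path as util/OuterProduct.lua): for a pair of 1D tensors it returns their outer product via ger, and for 2D inputs the first dimension is treated as the mini-batch.

require 'nn'
require 'util.OuterProduct'                      -- defines nn.OuterProduct (path assumed, as in this repo)

local op = nn.OuterProduct()

local a, b = torch.randn(4), torch.randn(3)
print(op:forward({a, b}):size())                 -- 4 x 3

local A, B = torch.randn(2, 4), torch.randn(2, 3)
print(op:forward({A, B}):size())                 -- 2 x 4 x 3

-- gradients come back as a table {dA, dB} with the same shapes as the inputs
local grads = op:backward({A, B}, torch.randn(2, 4, 3))
print(grads[1]:size(), grads[2]:size())          -- 2 x 4, 2 x 3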