
Commit 1fef8e0

concat word/char vecs
1 parent 20e0ae2 commit 1fef8e0

8 files changed: +177 -42 lines changed

.#train.lua

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+yoon@yoon-VirtualBox.4277:1434833625

model/LSTMCNN.lua

Lines changed: 11 additions & 7 deletions
@@ -24,7 +24,7 @@ function LSTMCNN.lstmcnn(word_vocab_size, rnn_size, n, dropout, word_vec_size, c
     table.insert(inputs, nn.Identity()()) -- prev_h[L]
   end
 
-  local x, input_size_L, word_vec, char_vec, cnn_output
+  local x, input_size_L, word_vec, char_vec, cnn_output, pool_layer
   local outputs = {}
   for L = 1,n do
     -- c,h from previous timesteps
@@ -38,13 +38,17 @@ function LSTMCNN.lstmcnn(word_vocab_size, rnn_size, n, dropout, word_vec_size, c
       for i = 1, #kernels do
         local reduced_l = length - kernels[i] + 1
         local conv_layer = nn.TemporalConvolution(char_vec_size, feature_maps[i], kernels[i])(char_vec)
-        local pool_layer = nn.TemporalMaxPooling(reduced_l)(nn.Tanh()(conv_layer))
+        pool_layer = nn.TemporalMaxPooling(reduced_l)(nn.Tanh()(conv_layer))
         table.insert(layer1, pool_layer)
       end
-      local layer1_concat = nn.JoinTable(3)(layer1)
-      cnn_output = nn.Squeeze()(layer1_concat)
+      if #kernels > 1 then
+        local layer1_concat = nn.JoinTable(3)(layer1)
+        cnn_output = nn.Squeeze()(layer1_concat)
+      else
+        cnn_output = nn.Squeeze()(pool_layer)
+      end
       x = nn.Identity()(cnn_output)
-      input_size_L = torch.Tensor(feature_maps):sum()
+      input_size_L = torch.Tensor(feature_maps):sum()
     else
       x = outputs[(L-1)*2]
       if dropout > 0 then x = nn.Dropout(dropout)(x) end -- apply dropout, if any
@@ -58,8 +62,8 @@ function LSTMCNN.lstmcnn(word_vocab_size, rnn_size, n, dropout, word_vec_size, c
     local sigmoid_chunk = nn.Narrow(2, 1, 3 * rnn_size)(all_input_sums)
     sigmoid_chunk = nn.Sigmoid()(sigmoid_chunk)
     local in_gate = nn.Narrow(2, 1, rnn_size)(sigmoid_chunk)
-    local forget_gate = nn.Narrow(2, rnn_size + 1, rnn_size)(sigmoid_chunk)
-    local out_gate = nn.Narrow(2, 2 * rnn_size + 1, rnn_size)(sigmoid_chunk)
+    local out_gate = nn.Narrow(2, rnn_size + 1, rnn_size)(sigmoid_chunk)
+    local forget_gate = nn.Narrow(2, 2 * rnn_size + 1, rnn_size)(sigmoid_chunk)
     -- decode the write inputs
     local in_transform = nn.Narrow(2, 3 * rnn_size + 1, rnn_size)(all_input_sums)
     in_transform = nn.Tanh()(in_transform)

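For reference, a minimal standalone sketch (not part of the commit; batch size and word length are assumed, other sizes follow the new train.lua defaults) of the character-CNN path that LSTMCNN.lua builds above, including the new single-kernel branch that avoids nn.JoinTable when only one kernel width is given:

require 'nn'
require 'nngraph'

local char_vec_size, length = 30, 20              -- assumed char embedding size and padded word length
local feature_maps, kernels = {50, 50, 50}, {2, 3, 4}

local char_vec = nn.Identity()()                  -- batch_size x length x char_vec_size
local layer1, pool_layer = {}, nil
for i = 1, #kernels do
  local reduced_l = length - kernels[i] + 1
  local conv = nn.TemporalConvolution(char_vec_size, feature_maps[i], kernels[i])(char_vec)
  pool_layer = nn.TemporalMaxPooling(reduced_l)(nn.Tanh()(conv))  -- batch_size x 1 x feature_maps[i]
  table.insert(layer1, pool_layer)
end
local cnn_output
if #kernels > 1 then
  cnn_output = nn.Squeeze()(nn.JoinTable(3)(layer1))  -- batch_size x sum(feature_maps)
else
  cnn_output = nn.Squeeze()(pool_layer)               -- JoinTable needs at least two inputs, hence the branch
end
local charcnn = nn.gModule({char_vec}, {cnn_output})
print(charcnn:forward(torch.randn(8, length, char_vec_size)):size())  -- 8 x 150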
model/LSTMTDNN.lua renamed to model/LSTMCNN2.lua

Lines changed: 22 additions & 24 deletions
@@ -1,6 +1,4 @@
-
-local LSTMTDNN = {}
-
+local LSTMCNN = {}
 
 local ok, cunn = pcall(require, 'fbcunn')
 if not ok then
@@ -9,9 +7,8 @@ else
   LookupTable = fbcunn.LookupTableGPU
 end
 
-function LSTMTDNN.lstmtdnn(word_vocab_size, rnn_size, n, dropout, word_vec_size, char_vec_size, char_vocab_size,
-                           num_feature_maps, kernels, word2char2idx)
-  -- input_size = vocab size
+function LSTMCNN.lstmcnn(word_vocab_size, rnn_size, n, dropout, word_vec_size, char_vec_size, char_vocab_size,
+                         feature_maps, kernels, word2char2idx)
   -- rnn_size = dimensionality of hidden layers
   -- n = number of layers
   -- k = word embedding size
@@ -20,38 +17,39 @@ function LSTMTDNN.lstmtdnn(word_vocab_size, rnn_size, n, dropout, word_vec_size,
 
   -- there will be 2*n+1 inputs
   local length = word2char2idx:size(2)
-  local word_vec_size = word_vec_size or rnn_size
   local inputs = {}
-  --table.insert(inputs, nn.Identity()()) -- batch_size x 1 (word indices)
+  table.insert(inputs, nn.Identity()()) -- batch_size x 1 (word indices)
   table.insert(inputs, nn.Identity()()) -- batch_size x word length (char indices)
   for L = 1,n do
     table.insert(inputs, nn.Identity()()) -- prev_c[L]
     table.insert(inputs, nn.Identity()()) -- prev_h[L]
   end
 
-  local x, input_size_L, word_vec, char_vec, tdnn_output
+  local x, input_size_L, word_vec, char_vec, cnn_output, pool_layer
   local outputs = {}
   for L = 1,n do
     -- c,h from previous timesteps
-    local prev_h = inputs[L*2+2-1]
-    local prev_c = inputs[L*2+1-1]
+    local prev_h = inputs[L*2+2]
+    local prev_c = inputs[L*2+1]
     -- the input to this layer
     if L == 1 then
-      char_vec = nn.LookupTable(char_vocab_size, char_vec_size)(inputs[1]) --batch_size * word length * char_vec_size
+      word_vec = nn.LookupTable(word_vocab_size, word_vec_size)(inputs[1])
+      char_vec = nn.LookupTable(char_vocab_size, char_vec_size)(inputs[2]) --batch_size * word length * char_vec_size
       local layer1 = {}
       for i = 1, #kernels do
         local reduced_l = length - kernels[i] + 1
-        local conv_layer = nn.TemporalConvolution(char_vec_size, num_feature_maps, kernels[i])(char_vec)
-        local pool_layer = nn.TemporalMaxPooling(reduced_l)(nn.Tanh()(conv_layer))
+        local conv_layer = nn.TemporalConvolution(char_vec_size, feature_maps[i], kernels[i])(char_vec)
+        pool_layer = nn.TemporalMaxPooling(reduced_l)(nn.Tanh()(conv_layer))
         table.insert(layer1, pool_layer)
       end
-      local layer1_concat = nn.JoinTable(3)(layer1)
-      tdnn_output = nn.Squeeze()(layer1_concat)
-      --tdnn_output = TDNN.tdnn(length, char_vec_size, tdnn_output_size, kernels) -- batch_size * tdnn_output_size
-      --word_vec = LookupTable(word_vocab_size, word_vec_size)(inputs[1])
-      --x = nn.Identity()(word_vec)
-      x = nn.Identity()(tdnn_output)
-      input_size_L = word_vec_size
+      if #kernels > 1 then
+        local layer1_concat = nn.JoinTable(3)(layer1)
+        cnn_output = nn.Squeeze()(layer1_concat)
+      else
+        cnn_output = nn.Squeeze()(pool_layer)
+      end
+      x = nn.JoinTable(2)({cnn_output, word_vec})
+      input_size_L = torch.Tensor(feature_maps):sum() + word_vec_size
     else
       x = outputs[(L-1)*2]
       if dropout > 0 then x = nn.Dropout(dropout)(x) end -- apply dropout, if any
@@ -65,8 +63,8 @@ function LSTMTDNN.lstmtdnn(word_vocab_size, rnn_size, n, dropout, word_vec_size,
     local sigmoid_chunk = nn.Narrow(2, 1, 3 * rnn_size)(all_input_sums)
     sigmoid_chunk = nn.Sigmoid()(sigmoid_chunk)
     local in_gate = nn.Narrow(2, 1, rnn_size)(sigmoid_chunk)
-    local forget_gate = nn.Narrow(2, rnn_size + 1, rnn_size)(sigmoid_chunk)
-    local out_gate = nn.Narrow(2, 2 * rnn_size + 1, rnn_size)(sigmoid_chunk)
+    local out_gate = nn.Narrow(2, rnn_size + 1, rnn_size)(sigmoid_chunk)
+    local forget_gate = nn.Narrow(2, 2 * rnn_size + 1, rnn_size)(sigmoid_chunk)
     -- decode the write inputs
     local in_transform = nn.Narrow(2, 3 * rnn_size + 1, rnn_size)(all_input_sums)
     in_transform = nn.Tanh()(in_transform)
@@ -92,5 +90,5 @@ function LSTMTDNN.lstmtdnn(word_vocab_size, rnn_size, n, dropout, word_vec_size,
   return nn.gModule(inputs, outputs)
 end
 
-return LSTMTDNN
+return LSTMCNN
 

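The renamed module now feeds the first LSTM layer the concatenation of the word embedding and the char-CNN output (the "concat word/char vecs" of the commit message), so the layer-1 input width becomes sum(feature_maps) + word_vec_size. A tensor-level sketch of that join, with an assumed batch size and sizes matching the new train.lua defaults:

require 'nn'

local batch_size, word_vec_size = 8, 150          -- batch size assumed; word_vec_size as in the new default
local feature_maps = {50, 50, 50}
local cnn_size = torch.Tensor(feature_maps):sum() -- 150, same value used for input_size_L

local word_vec = torch.randn(batch_size, word_vec_size)  -- stand-in for the word LookupTable output
local cnn_output = torch.randn(batch_size, cnn_size)     -- stand-in for the char-CNN output

local x = nn.JoinTable(2):forward({cnn_output, word_vec})
print(x:size(2) == cnn_size + word_vec_size)              -- true: 150 + 150 = 300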
model/LSTMCNN2.lua~

Lines changed: 94 additions & 0 deletions
@@ -0,0 +1,94 @@
+local LSTMCNN = {}
+
+local ok, cunn = pcall(require, 'fbcunn')
+if not ok then
+  LookupTable = nn.LookupTable
+else
+  LookupTable = fbcunn.LookupTableGPU
+end
+
+function LSTMCNN.lstmcnn(word_vocab_size, rnn_size, n, dropout, word_vec_size, char_vec_size, char_vocab_size,
+                         feature_maps, kernels, word2char2idx)
+  -- rnn_size = dimensionality of hidden layers
+  -- n = number of layers
+  -- k = word embedding size
+
+  dropout = dropout or 0
+
+  -- there will be 2*n+1 inputs
+  local length = word2char2idx:size(2)
+  local inputs = {}
+  table.insert(inputs, nn.Identity()()) -- batch_size x 1 (word indices)
+  table.insert(inputs, nn.Identity()()) -- batch_size x word length (char indices)
+  for L = 1,n do
+    table.insert(inputs, nn.Identity()()) -- prev_c[L]
+    table.insert(inputs, nn.Identity()()) -- prev_h[L]
+  end
+
+  local x, input_size_L, word_vec, char_vec, cnn_output, pool_layer
+  local outputs = {}
+  for L = 1,n do
+    -- c,h from previous timesteps
+    local prev_h = inputs[L*2+2]
+    local prev_c = inputs[L*2+1]
+    -- the input to this layer
+    if L == 1 then
+      word_vec = nn.LookupTable(word_vocab_size, word_vec_size)(inputs[1])
+      char_vec = nn.LookupTable(char_vocab_size, char_vec_size)(inputs[2]) --batch_size * word length * char_vec_size
+      local layer1 = {}
+      for i = 1, #kernels do
+        local reduced_l = length - kernels[i] + 1
+        local conv_layer = nn.TemporalConvolution(char_vec_size, feature_maps[i], kernels[i])(char_vec)
+        pool_layer = nn.TemporalMaxPooling(reduced_l)(nn.Tanh()(conv_layer))
+        table.insert(layer1, pool_layer)
+      end
+      if #kernels > 1 then
+        local layer1_concat = nn.JoinTable(3)(layer1)
+        cnn_output = nn.Squeeze()(layer1_concat)
+      else
+        cnn_output = nn.Squeeze()(pool_layer)
+      end
+      x = nn.CAddTable()({cnn_output, word_vec})
+      input_size_L = torch.Tensor(feature_maps):sum()
+    else
+      x = outputs[(L-1)*2]
+      if dropout > 0 then x = nn.Dropout(dropout)(x) end -- apply dropout, if any
+      input_size_L = rnn_size
+    end
+    -- evaluate the input sums at once for efficiency
+    local i2h = nn.Linear(input_size_L, 4 * rnn_size)(x)
+    local h2h = nn.Linear(rnn_size, 4 * rnn_size)(prev_h)
+    local all_input_sums = nn.CAddTable()({i2h, h2h})
+    -- decode the gates
+    local sigmoid_chunk = nn.Narrow(2, 1, 3 * rnn_size)(all_input_sums)
+    sigmoid_chunk = nn.Sigmoid()(sigmoid_chunk)
+    local in_gate = nn.Narrow(2, 1, rnn_size)(sigmoid_chunk)
+    local out_gate = nn.Narrow(2, rnn_size + 1, rnn_size)(sigmoid_chunk)
+    local forget_gate = nn.Narrow(2, 2 * rnn_size + 1, rnn_size)(sigmoid_chunk)
+    -- decode the write inputs
+    local in_transform = nn.Narrow(2, 3 * rnn_size + 1, rnn_size)(all_input_sums)
+    in_transform = nn.Tanh()(in_transform)
+    -- perform the LSTM update
+    local next_c = nn.CAddTable()({
+      nn.CMulTable()({forget_gate, prev_c}),
+      nn.CMulTable()({in_gate, in_transform})
+    })
+    -- gated cells form the output
+    local next_h = nn.CMulTable()({out_gate, nn.Tanh()(next_c)})
+
+    table.insert(outputs, next_c)
+    table.insert(outputs, next_h)
+  end
+
+  -- set up the decoder
+  local top_h = outputs[#outputs]
+  if dropout > 0 then top_h = nn.Dropout(dropout)(top_h) end
+  local proj = nn.Linear(rnn_size, word_vocab_size)(top_h)
+  local logsoft = nn.LogSoftMax()(proj)
+  table.insert(outputs, logsoft)
+
+  return nn.gModule(inputs, outputs)
+end
+
+return LSTMCNN
+

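Both LSTM variants above decode the gates by slicing the 4 * rnn_size pre-activation with nn.Narrow: the first 3 * rnn_size entries pass through a sigmoid and are split into in/out/forget gates (in the reordered arrangement of this commit), and the last rnn_size entries form the tanh write candidate. A tensor-level sketch with assumed batch and hidden sizes, not part of the commit:

require 'nn'

local rnn_size, batch_size = 200, 8
local all_input_sums = torch.randn(batch_size, 4 * rnn_size)   -- stand-in for i2h + h2h pre-activations

local sigmoid_chunk = nn.Sigmoid():forward(nn.Narrow(2, 1, 3 * rnn_size):forward(all_input_sums))
local in_gate      = nn.Narrow(2, 1, rnn_size):forward(sigmoid_chunk)
local out_gate     = nn.Narrow(2, rnn_size + 1, rnn_size):forward(sigmoid_chunk)
local forget_gate  = nn.Narrow(2, 2 * rnn_size + 1, rnn_size):forward(sigmoid_chunk)
local in_transform = nn.Tanh():forward(nn.Narrow(2, 3 * rnn_size + 1, rnn_size):forward(all_input_sums))

print(in_gate:size(2), out_gate:size(2), forget_gate:size(2), in_transform:size(2))  -- 200 200 200 200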
train.lua

Lines changed: 10 additions & 10 deletions
@@ -19,7 +19,7 @@ require 'util.misc'
 local BatchLoader = require 'util.BatchLoader'
 local model_utils = require 'util.model_utils'
 local LSTM = require 'model.LSTM'
-local LSTMCNN = require 'model.LSTMCNN'
+local LSTMCNN = require 'model.LSTMCNN2'
 
 local stringx = require('pl.stringx')
 
@@ -32,10 +32,10 @@ cmd:text('Options')
 cmd:option('-data_dir','data/ptb','data directory. Should contain the file input.txt with input data')
 -- model params
 cmd:option('-rnn_size', 200, 'size of LSTM internal state')
-cmd:option('-word_vec_size', 200, 'dimensionality of word embeddings')
+cmd:option('-word_vec_size', 150, 'dimensionality of word embeddings')
 cmd:option('-char_vec_size', 30, 'dimensionality of character embeddings')
-cmd:option('-feature_maps', '{25,50,75,100,125}', 'number of feature maps in the CNN')
-cmd:option('-kernels', '{1,2,3,4,5}', 'conv net kernel widths')
+cmd:option('-feature_maps', '{50,50,50}', 'number of feature maps in the CNN')
+cmd:option('-kernels', '{2,3,4}', 'conv net kernel widths')
 cmd:option('-num_layers', 2, 'number of layers in the LSTM')
 cmd:option('-model', 'lstm', 'for now only lstm is supported. keep fixed')
 -- optimization
@@ -147,12 +147,12 @@ function eval_split(split_idx, max_batches)
             -- have to convert to float because integers can't be cuda()'d
             x = x:float():cuda()
             y = y:float():cuda()
-            x_char = x:float():cuda()
+            x_char = x_char:float():cuda()
         end
         -- forward pass
         for t=1,opt.seq_length do
             clones.rnn[t]:evaluate() -- for dropout proper functioning
-            local lst = clones.rnn[t]:forward{x[{{}, t}], unpack(rnn_state[t-1])}
+            local lst = clones.rnn[t]:forward{x[{{}, t}], x_char[{{},t}], unpack(rnn_state[t-1])}
             rnn_state[t] = {}
             for i=1,#init_state do table.insert(rnn_state[t], lst[i]) end
             prediction = lst[#lst]
@@ -190,7 +190,7 @@ function feval(x)
    local loss = 0
    for t=1,opt.seq_length do
        clones.rnn[t]:training() -- make sure we are in correct mode (this is cheap, sets flag)
-        local lst = clones.rnn[t]:forward{x[{{}, t}], unpack(rnn_state[t-1])}
+        local lst = clones.rnn[t]:forward{x[{{}, t}], x_char[{{},t}], unpack(rnn_state[t-1])}
        rnn_state[t] = {}
        for i=1,#init_state do table.insert(rnn_state[t], lst[i]) end -- extract the state, without output
        predictions[t] = lst[#lst] -- last element is the prediction
@@ -204,13 +204,13 @@ function feval(x)
        -- backprop through loss, and softmax/linear
        local doutput_t = clones.criterion[t]:backward(predictions[t], y[{{}, t}])
        table.insert(drnn_state[t], doutput_t)
-        local dlst = clones.rnn[t]:backward({x[{{}, t}], unpack(rnn_state[t-1])}, drnn_state[t])
+        local dlst = clones.rnn[t]:backward({x[{{}, t}], x_char[{{},t}], unpack(rnn_state[t-1])}, drnn_state[t])
        drnn_state[t-1] = {}
        for k,v in pairs(dlst) do
-            if k > 1 then -- k == 1 is gradient on x, which we dont need
+            if k > 2 then -- k == 1 is gradient on x, which we dont need
                -- note we do k-1 because first item is dembeddings, and then follow the
                -- derivatives of the state, starting at index 2. I know...
-                drnn_state[t-1][k-1] = v
+                drnn_state[t-1][k-2] = v
            end
        end
    end

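train.lua now passes x_char alongside x at every timestep, so backward() on each clone returns two input gradients (for the word indices and the char indices) ahead of the recurrent-state gradients; that is why the bookkeeping moves from k > 1 / k-1 to k > 2 / k-2. A toy nngraph module with the same two-data-inputs-plus-state layout (hypothetical sizes, not the commit's model) shows the indexing:

require 'nn'
require 'nngraph'

local word_in, char_in, prev_h = nn.Identity()(), nn.Identity()(), nn.Identity()()
local out = nn.CAddTable()({nn.Linear(3, 5)(word_in), nn.Linear(4, 5)(char_in), nn.Linear(5, 5)(prev_h)})
local net = nn.gModule({word_in, char_in, prev_h}, {out})

local inputs = {torch.randn(2, 3), torch.randn(2, 4), torch.randn(2, 5)}
net:forward(inputs)
local dinputs = net:backward(inputs, torch.randn(2, 5))
-- dinputs[1] and dinputs[2] are the gradients w.r.t. the two data inputs and are discarded;
-- the recurrent-state gradients start at index 3, hence "if k > 2 then drnn_state[t-1][k-2] = v".
print(#dinputs)  -- 3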
util/BatchLoader.lua

Lines changed: 1 addition & 1 deletion
@@ -113,7 +113,7 @@ function BatchLoader:next_batch(split_idx)
     end
     -- pull out the correct next batch
     local idx = self.batch_idx[split_idx]
-    return self.all_batches[split_idx][3][idx], self.all_batches[split_idx][2][idx], self.all_batches[split_idx][3][idx]
+    return self.all_batches[split_idx][1][idx], self.all_batches[split_idx][2][idx], self.all_batches[split_idx][3][idx]
 end
 
 function BatchLoader.text_to_tensor(input_files, out_vocabfile, out_tensorfile)

util/LSTMDTNN.lua

Whitespace-only changes.

util/OuterProduct.lua

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+local OuterProduct, parent = torch.class('nn.OuterProduct', 'nn.Module')
+
+function OuterProduct:__init()
+  parent.__init(self)
+  self.gradInput = {torch.Tensor(), torch.Tensor()}
+end
+
+function OuterProduct:updateOutput(input)
+  assert(#input==2, 'only supports outer products of 2 vectors')
+  local a, b = table.unpack(input)
+  assert(a:nDimension() == 1 or a:nDimension() == 2, 'input tensors must be 1D or 2D')
+  if a:nDimension()==1 then
+    self.output:resize(a:size(1), b:size(1))
+    self.output:ger(a, b)
+  else -- mini batch processing
+    self.output:resize(a:size(1), a:size(2), b:size(2))
+    for i = 1, a:size(1) do
+      self.output[i]:ger(a[i], b[i])
+    end
+  end
+  return self.output
+end
+
+function OuterProduct:updateGradInput(input, gradOutput)
+  local a, b = table.unpack(input)
+  self.gradInput[1]:resizeAs(a)
+  self.gradInput[2]:resizeAs(b)
+  if a:nDimension()==1 then
+    self.gradInput[1]:mv(gradOutput, b)
+    self.gradInput[2]:mv(gradOutput:t(), a)
+  else -- mini batch processing
+    for i = 1, gradOutput:size(1) do
+      self.gradInput[1][i]:mv(gradOutput[i], b[i])
+      self.gradInput[2][i]:mv(gradOutput[i]:t(), a[i])
+    end
+  end
+  return self.gradInput
+end

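A usage sketch for the new nn.OuterProduct module (not part of the commit; it assumes the file is reachable on package.path as util/OuterProduct.lua): for a pair of 1D tensors it returns their outer product via ger, and for 2D inputs the first dimension is treated as the mini-batch.

require 'nn'
require 'util.OuterProduct'                      -- defines nn.OuterProduct (path assumed, as in this repo)

local op = nn.OuterProduct()

local a, b = torch.randn(4), torch.randn(3)
print(op:forward({a, b}):size())                 -- 4 x 3

local A, B = torch.randn(2, 4), torch.randn(2, 3)
print(op:forward({A, B}):size())                 -- 2 x 4 x 3

-- gradients come back as a table {dA, dB} with the same shapes as the inputs
local grads = op:backward({A, B}, torch.randn(2, 4, 3))
print(grads[1]:size(), grads[2]:size())          -- 2 x 4, 2 x 3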