diff --git a/trained_models/copy.pt b/models/copy.pt
similarity index 100%
rename from trained_models/copy.pt
rename to models/copy.pt
diff --git a/trained_models/repeat.pt b/models/repeat.pt
similarity index 100%
rename from trained_models/repeat.pt
rename to models/repeat.pt
diff --git a/ntm/controller.py b/ntm/controller.py
index c85b5ca..b0263c6 100644
--- a/ntm/controller.py
+++ b/ntm/controller.py
@@ -37,10 +37,16 @@ def __init__(self, vector_length, hidden_size):
                 nn.init.uniform_(p, -stdev, stdev)
 
     def forward(self, x, state):
+        # The LSTM expects input of shape (sequence_length, batch_size, input_size);
+        # here that is (1, 1, input_size) because the controller is fed one time step at a time.
         output, state = self.layer(x.unsqueeze(0), state)
+        # output has shape (sequence_length, batch_size, hidden_size); squeeze the length-1
+        # sequence dimension so the caller receives a (batch_size, hidden_size) tensor.
         return output.squeeze(0), state
 
     def get_initial_state(self, batch_size):
+        # Repeat the learnt initial hidden and cell states along the batch dimension so every
+        # sequence in the batch starts from the same state (the training script uses batch_size = 1).
         lstm_h = self.lstm_h_state.clone().repeat(1, batch_size, 1)
         lstm_c = self.lstm_c_state.clone().repeat(1, batch_size, 1)
         return lstm_h, lstm_c
diff --git a/ntm/head.py b/ntm/head.py
index 4cbafcb..36e5f4b 100644
--- a/ntm/head.py
+++ b/ntm/head.py
@@ -27,6 +27,10 @@ def get_initial_state(self, batch_size):
         return F.softmax(self._initial_state, dim=1).repeat(batch_size, 1)
 
     def get_head_weight(self, x, previous_state, memory_read):
+        '''
+        Computes the addressing weight over memory rows shared by the read and write heads:
+        content addressing, interpolation with the previous weight, shift and sharpening.
+        '''
         k = self.k_layer(x)
         beta = F.softplus(self.beta_layer(x))
         g = F.sigmoid(self.g_layer(x))
@@ -50,8 +54,11 @@ def shift(self, w_g, s):
 
 
 class ReadHead(Head):
 
     def forward(self, x, previous_state):
+        # NOTE: the memory has shape batch_size x 128 x 20 (rows x row width)
         memory_read = self.memory.read()
+        # w is an attention weight over the 128 memory rows, shape batch_size x 128
         w = self.get_head_weight(x, previous_state, memory_read)
+        # unsqueeze w to (batch_size, 1, 128) so the batched matmul returns one read vector per batch element
         return torch.matmul(w.unsqueeze(1), memory_read).squeeze(1), w
diff --git a/ntm/ntm.py b/ntm/ntm.py
index c0abaf8..ee24e4d 100644
--- a/ntm/ntm.py
+++ b/ntm/ntm.py
@@ -26,13 +26,24 @@ def get_initial_state(self, batch_size=1):
         return (read, read_head_state, write_head_state, controller_state)
 
     def forward(self, x, previous_state):
+
+        ## The controller (an LSTM here, though any network would do) consumes one time step of input per call.
+        ## Its output is the feature that drives the read and write heads, which retrieve from
+        ## and update the memory matrix at every time step.
+        ## The retrieved read vector and the controller output are then concatenated and passed
+        ## through a fully connected layer to produce the output for this time step.
+
         previous_read, previous_read_head_state, previous_write_head_state, previous_controller_state = previous_state
         controller_input = torch.cat([x, previous_read], dim=1)
+        ## The LSTM controller receives a (1, 1, 29) tensor: (seq_length, batch_size, input_size).
+        ## Feeding a single time step per call is a design choice of this implementation.
+        ## It returns (seq_length, batch_size, 100) features, with the length-1 sequence dimension squeezed away.
         controller_output, controller_state = self.controller(controller_input, previous_controller_state)
         # Read
         read_head_output, read_head_state = self.read_head(controller_output, previous_read_head_state)
         # Write
         write_head_state = self.write_head(controller_output, previous_write_head_state)
+
         fc_input = torch.cat((controller_output, read_head_output), dim=1)
         state = (read_head_output, read_head_state, write_head_state, controller_state)
         return F.sigmoid(self.fc(fc_input)), state
diff --git a/requirements.txt b/requirements.txt
index db1f0a1..65b1b27 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 torch
 numpy
+matplotlib
 tensorboard
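The shape bookkeeping described in the ReadHead and ntm.py comments can be checked in isolation. The sketch below is not taken from the repository; it is a minimal standalone illustration using the sizes quoted in the diff (batch size 1, a 128 x 20 memory, a 100-dimensional controller feature), with a random weight vector standing in for the one produced by `get_head_weight`.

```python
import torch

# Sizes quoted in the diff comments, not read from the repo's config.
batch_size, rows, row_width = 1, 128, 20
hidden_size = 100

memory = torch.ones(batch_size, rows, row_width) * 1e-6   # batch_size x 128 x 20
controller_output = torch.randn(batch_size, hidden_size)  # one time step of controller features

# Stand-in addressing weight over the memory rows (the real head derives it
# from content addressing, gating, shifting and sharpening).
w = torch.softmax(torch.randn(batch_size, rows), dim=1)    # batch_size x 128

# Weighted read: (batch, 1, 128) @ (batch, 128, 20) -> (batch, 1, 20) -> (batch, 20)
read_vector = torch.matmul(w.unsqueeze(1), memory).squeeze(1)

# Mirrors fc_input = torch.cat((controller_output, read_head_output), dim=1) in ntm.py.
fc_input = torch.cat((controller_output, read_vector), dim=1)
print(read_vector.shape, fc_input.shape)  # torch.Size([1, 20]) torch.Size([1, 120])
```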
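For completeness, the addressing steps named in the `get_head_weight` docstring can be sketched with stand-in tensors. This is an illustration only: the head's learned projections (`k_layer`, `beta_layer`, and friends) are replaced by random values, and the circular-shift step is omitted for brevity.

```python
import torch
import torch.nn.functional as F

batch_size, rows, row_width = 1, 128, 20
memory = torch.randn(batch_size, rows, row_width)
previous_w = torch.softmax(torch.randn(batch_size, rows), dim=1)

# Stand-ins for the head's projections of the controller features.
k = torch.randn(batch_size, row_width)               # content key
beta = F.softplus(torch.randn(batch_size, 1))        # key strength
g = torch.sigmoid(torch.randn(batch_size, 1))        # interpolation gate
gamma = 1 + F.softplus(torch.randn(batch_size, 1))   # sharpening exponent

# Content addressing: cosine similarity of the key against every memory row, scaled by beta.
w_c = torch.softmax(beta * F.cosine_similarity(memory, k.unsqueeze(1), dim=-1), dim=1)
# Interpolation with the previous weight, then sharpening and renormalisation.
w = g * w_c + (1 - g) * previous_w
w = w ** gamma
w = w / (w.sum(dim=1, keepdim=True) + 1e-16)
print(w.shape, w.sum())  # torch.Size([1, 128]), sums to ~1
```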