From 8c40e9374ccb1e2e4ed5ecb8e8e4507173fac209 Mon Sep 17 00:00:00 2001
From: vseledkin
Date: Tue, 27 Oct 2015 01:21:43 +0300
Subject: [PATCH 1/2] UTF-8 support

---
 get_data.sh             |  0
 util/BatchLoaderUnk.lua | 44 ++++++++++++++++++------------------
 util/misc.lua           | 49 ++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 70 insertions(+), 23 deletions(-)
 mode change 100644 => 100755 get_data.sh

diff --git a/get_data.sh b/get_data.sh
old mode 100644
new mode 100755
diff --git a/util/BatchLoaderUnk.lua b/util/BatchLoaderUnk.lua
index e8788ee..5a5a1cd 100644
--- a/util/BatchLoaderUnk.lua
+++ b/util/BatchLoaderUnk.lua
@@ -1,7 +1,7 @@
 -- Modified from https://github.com/karpathy/char-rnn
 -- This version is for cases where one has already segmented train/val/test splits
-
+require './misc.lua'
 local BatchLoaderUnk = {}
 local stringx = require('pl.stringx')
 BatchLoaderUnk.__index = BatchLoaderUnk
 
@@ -40,7 +40,7 @@ function BatchLoaderUnk.create(data_dir, batch_size, seq_length, padding, max_wo
     self.seq_length = seq_length
     self.split_sizes = {}
     self.all_batches = {}
-    print('reshaping tensors...')
+    print('reshaping tensors...')
     local x_batches, y_batches, nbatches
     for split, data in ipairs(all_data) do
         local len = data:size(1)
@@ -58,7 +58,7 @@ function BatchLoaderUnk.create(data_dir, batch_size, seq_length, padding, max_wo
             x_batches = data:view(batch_size, -1):split(seq_length, 2)
             y_batches = ydata:view(batch_size, -1):split(seq_length, 2)
             x_char_batches = data_char:view(batch_size, -1, self.max_word_l):split(seq_length,2)
-            nbatches = #x_batches
+            nbatches = #x_batches
             self.split_sizes[split] = nbatches
             assert(#x_batches == #y_batches)
             assert(#x_batches == #x_char_batches)
@@ -67,7 +67,7 @@ function BatchLoaderUnk.create(data_dir, batch_size, seq_length, padding, max_wo
             y_batches = {ydata:resize(1, ydata:size(1)):expand(batch_size, ydata:size(2))}
             data_char = data_char:resize(1, data_char:size(1), data_char:size(2))
             x_char_batches = {data_char:expand(batch_size, data_char:size(2), data_char:size(3))}
-            self.split_sizes[split] = 1
+            self.split_sizes[split] = 1
         end
         self.all_batches[split] = {x_batches, y_batches, x_char_batches}
     end
@@ -77,7 +77,7 @@ function BatchLoaderUnk.create(data_dir, batch_size, seq_length, padding, max_wo
     return self
 end
 
-function BatchLoaderUnk:expand(t)
+function BatchLoaderUnk:expand(t)
     for i = 1, self.padding do
         table.insert(t, 1, 1) -- 1 is always char idx for zero pad
     end
@@ -109,7 +109,7 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfil
     local f, rawdata
     local output_tensors = {} -- output tensors for train/val/test
     local output_chars = {} -- output character tensors for train/val/test sets
-    local vocab_count = {} -- vocab count
+    local vocab_count = {} -- vocab count
     local max_word_l_tmp = 0 -- max word length of the corpus
     local idx2word = {tokens.UNK} -- unknown word token
     local word2idx = {}; word2idx[tokens.UNK] = 1
@@ -122,32 +122,32 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfil
     -- we use that instead. this is inefficient, but only a one-off thing so should be fine
     -- also counts the number of tokens
     for split = 1,3 do -- split = 1 (train), 2 (val), or 3 (test)
-        f = io.open(input_files[split], 'r')
+        f = io.open(input_files[split], 'r')
         local counts = 0
         for line in f:lines() do
             line = stringx.replace(line, '<unk>', tokens.UNK) -- replace unk with a single character
-            line = stringx.replace(line, tokens.START, '') --start-of-word token is reserved
-            line = stringx.replace(line, tokens.END, '') --end-of-word token is reserved
+            line = stringx.replace(line, tokens.START, '') --start-of-word token is reserved
+            line = stringx.replace(line, tokens.END, '') --end-of-word token is reserved
             for word in line:gmatch'([^%s]+)' do
-                max_word_l_tmp = math.max(max_word_l_tmp, word:len())
-                counts = counts + 1
+                max_word_l_tmp = math.max(max_word_l_tmp, RuneCount(word))
+                counts = counts + 1
             end
-            if tokens.EOS ~= '' then
-                counts = counts + 1 --PTB uses \n for <eos>, so need to add one more token at the end
-            end
+            if tokens.EOS ~= '' then
+                counts = counts + 1 --PTB uses \n for <eos>, so need to add one more token at the end
+            end
         end
         f:close()
         split_counts[split] = counts
     end
 
-
+
     print('After first pass of data, max word length is: ' .. max_word_l_tmp)
-    print(string.format('Token count: train %d, val %d, test %d',
+    print(string.format('Token count: train %d, val %d, test %d',
         split_counts[1], split_counts[2], split_counts[3]))
     -- if actual max word length is less than the limit, use that
     max_word_l = math.min(max_word_l_tmp, max_word_l)
-
-    for split = 1,3 do -- split = 1 (train), 2 (val), or 3 (test)
+
+    for split = 1,3 do -- split = 1 (train), 2 (val), or 3 (test)
         -- Preallocate the tensors we will need.
         -- Watch out the second one needs a lot of RAM.
         output_tensors[split] = torch.LongTensor(split_counts[split])
@@ -157,8 +157,8 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfil
         local word_num = 0
         for line in f:lines() do
             line = stringx.replace(line, '<unk>', tokens.UNK)
-            line = stringx.replace(line, tokens.START, '') -- start and end of word tokens are reserved
-            line = stringx.replace(line, tokens.END, '')
+            line = stringx.replace(line, tokens.START, '') -- start and end of word tokens are reserved
+            line = stringx.replace(line, tokens.END, '')
             for rword in line:gmatch'([^%s]+)' do
                 function append(word)
                     word_num = word_num + 1
@@ -177,7 +177,8 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfil
                         end
                         output_tensors[split][word_num] = word2idx[word]
                     end
-                    for char in word:gmatch'.' do
+
+                    for char_code, char in pairs(UTF8ToCharArray(word)) do
                         if char2idx[char]==nil then
                             idx2char[#idx2char + 1] = char -- create char-idx/idx-char mappings
                             char2idx[char] = #idx2char
@@ -207,4 +208,3 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfil
 end
 
 return BatchLoaderUnk
-
diff --git a/util/misc.lua b/util/misc.lua
index 45cb09f..952484e 100644
--- a/util/misc.lua
+++ b/util/misc.lua
@@ -9,4 +9,51 @@ function clone_list(tensor_list, zero_too)
         if zero_too then out[k]:zero() end
     end
     return out
-end
\ No newline at end of file
+end
+
+-- Multi byte characters start with a byte with bits 7 and 8 set, trailing bytes have bit 7 not set and bit 8 set.
+-- https://forums.coronalabs.com/topic/42019-split-utf-8-string-word-with-foreign-characters-to-letters/ by ingemar
+function UTF8ToCharArray(str)
+    local charArray = {};
+    local iStart = 0;
+    local strLen = str:len();
+
+    local function bit(b)
+        return 2 ^ (b - 1);
+    end
+
+    local function hasbit(w, b)
+        return w % (b + b) >= b;
+    end
+
+    local checkMultiByte = function(i)
+        if (iStart ~= 0) then
+            charArray[#charArray + 1] = str:sub(iStart, i - 1);
+            iStart = 0;
+        end
+    end
+
+    for i = 1, strLen do
+        local b = str:byte(i);
+        local multiStart = hasbit(b, bit(7)) and hasbit(b, bit(8));
+        local multiTrail = not hasbit(b, bit(7)) and hasbit(b, bit(8));
+
+        if (multiStart) then
+            checkMultiByte(i);
+            iStart = i;
+
+        elseif (not multiTrail) then
+            checkMultiByte(i);
+            charArray[#charArray + 1] = str:sub(i, i);
+        end
+    end
+
+    -- process if last character is multi-byte
+    checkMultiByte(strLen + 1);
+
+    return charArray;
+end
+
+function RuneCount(text)
+    return #UTF8ToCharArray(text) -- TODO: can be heavily optimized
+end

From a837c84e23b56f739e0682f70bcc86e3a961bb15 Mon Sep 17 00:00:00 2001
From: vseledkin
Date: Tue, 27 Oct 2015 11:28:17 +0300
Subject: [PATCH 2/2] another utf8-fix

---
 util/BatchLoaderUnk.lua | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/util/BatchLoaderUnk.lua b/util/BatchLoaderUnk.lua
index 5a5a1cd..07618d4 100644
--- a/util/BatchLoaderUnk.lua
+++ b/util/BatchLoaderUnk.lua
@@ -167,7 +167,7 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfil
                         collectgarbage()
                     end
                     local chars = {char2idx[tokens.START]} -- start-of-word symbol
-                    if string.sub(word,1,1) == tokens.UNK and word:len() > 1 then -- unk token with character info available
+                    if string.sub(word,1,1) == tokens.UNK and RuneCount(word) > 1 then -- unk token with character info available
                         word = string.sub(word, 3)
                         output_tensors[split][word_num] = word2idx[tokens.UNK]
                     else
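
A minimal usage sketch of the two helpers this series adds to util/misc.lua (not part of the patches themselves; the dofile path is an assumption, adjust it to your checkout):

dofile('util/misc.lua')   -- load UTF8ToCharArray/RuneCount (path assumed)

local word = "привет"     -- 6 Cyrillic characters, 12 bytes in UTF-8
print(word:len())         -- 12: string.len counts bytes
print(RuneCount(word))    -- 6: counts UTF-8 characters, as now used for max_word_l

for i, ch in ipairs(UTF8ToCharArray(word)) do
    print(i, ch)          -- each entry is one complete UTF-8 character
end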