From 8c40e9374ccb1e2e4ed5ecb8e8e4507173fac209 Mon Sep 17 00:00:00 2001
From: vseledkin
Date: Tue, 27 Oct 2015 01:21:43 +0300
Subject: [PATCH 1/2] UTF-8 support

---
 get_data.sh             |  0
 util/BatchLoaderUnk.lua | 44 ++++++++++++++++++------------------
 util/misc.lua           | 49 ++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 70 insertions(+), 23 deletions(-)
 mode change 100644 => 100755 get_data.sh

diff --git a/get_data.sh b/get_data.sh
old mode 100644
new mode 100755
diff --git a/util/BatchLoaderUnk.lua b/util/BatchLoaderUnk.lua
index e8788ee..5a5a1cd 100644
--- a/util/BatchLoaderUnk.lua
+++ b/util/BatchLoaderUnk.lua
@@ -1,7 +1,7 @@
 -- Modified from https://github.com/karpathy/char-rnn
 -- This version is for cases where one has already segmented train/val/test splits
-
+require './misc.lua'
 local BatchLoaderUnk = {}
 local stringx = require('pl.stringx')
 BatchLoaderUnk.__index = BatchLoaderUnk
 
@@ -40,7 +40,7 @@ function BatchLoaderUnk.create(data_dir, batch_size, seq_length, padding, max_wo
     self.seq_length = seq_length
     self.split_sizes = {}
     self.all_batches = {}
-    print('reshaping tensors...')
+    print('reshaping tensors...')
     local x_batches, y_batches, nbatches
     for split, data in ipairs(all_data) do
         local len = data:size(1)
@@ -58,7 +58,7 @@ function BatchLoaderUnk.create(data_dir, batch_size, seq_length, padding, max_wo
             x_batches = data:view(batch_size, -1):split(seq_length, 2)
             y_batches = ydata:view(batch_size, -1):split(seq_length, 2)
             x_char_batches = data_char:view(batch_size, -1, self.max_word_l):split(seq_length,2)
-            nbatches = #x_batches
+            nbatches = #x_batches
             self.split_sizes[split] = nbatches
             assert(#x_batches == #y_batches)
             assert(#x_batches == #x_char_batches)
@@ -67,7 +67,7 @@ function BatchLoaderUnk.create(data_dir, batch_size, seq_length, padding, max_wo
             y_batches = {ydata:resize(1, ydata:size(1)):expand(batch_size, ydata:size(2))}
             data_char = data_char:resize(1, data_char:size(1), data_char:size(2))
             x_char_batches = {data_char:expand(batch_size, data_char:size(2), data_char:size(3))}
-            self.split_sizes[split] = 1
+            self.split_sizes[split] = 1
         end
         self.all_batches[split] = {x_batches, y_batches, x_char_batches}
     end
@@ -77,7 +77,7 @@ function BatchLoaderUnk.create(data_dir, batch_size, seq_length, padding, max_wo
     return self
 end
 
-function BatchLoaderUnk:expand(t)
+function BatchLoaderUnk:expand(t)
     for i = 1, self.padding do
         table.insert(t, 1, 1) -- 1 is always char idx for zero pad
     end
@@ -109,7 +109,7 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfil
     local f, rawdata
     local output_tensors = {} -- output tensors for train/val/test
     local output_chars = {} -- output character tensors for train/val/test sets
-    local vocab_count = {} -- vocab count
+    local vocab_count = {} -- vocab count
     local max_word_l_tmp = 0 -- max word length of the corpus
     local idx2word = {tokens.UNK} -- unknown word token
     local word2idx = {}; word2idx[tokens.UNK] = 1
@@ -122,32 +122,32 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfil
     -- we use that instead. this is inefficient, but only a one-off thing so should be fine
     -- also counts the number of tokens
     for split = 1,3 do -- split = 1 (train), 2 (val), or 3 (test)
-        f = io.open(input_files[split], 'r')
+        f = io.open(input_files[split], 'r')
         local counts = 0
         for line in f:lines() do
             line = stringx.replace(line, '<unk>', tokens.UNK) -- replace unk with a single character
-            line = stringx.replace(line, tokens.START, '') --start-of-word token is reserved
-            line = stringx.replace(line, tokens.END, '') --end-of-word token is reserved
+            line = stringx.replace(line, tokens.START, '') --start-of-word token is reserved
+            line = stringx.replace(line, tokens.END, '') --end-of-word token is reserved
             for word in line:gmatch'([^%s]+)' do
-                max_word_l_tmp = math.max(max_word_l_tmp, word:len())
-                counts = counts + 1
+                max_word_l_tmp = math.max(max_word_l_tmp, RuneCount(word))
+                counts = counts + 1
             end
-            if tokens.EOS ~= '' then
-                counts = counts + 1 --PTB uses \n for <eos>, so need to add one more token at the end
-            end
+            if tokens.EOS ~= '' then
+                counts = counts + 1 --PTB uses \n for <eos>, so need to add one more token at the end
+            end
         end
         f:close()
         split_counts[split] = counts
     end
 
-
+
     print('After first pass of data, max word length is: ' .. max_word_l_tmp)
-    print(string.format('Token count: train %d, val %d, test %d',
+    print(string.format('Token count: train %d, val %d, test %d',
         split_counts[1], split_counts[2], split_counts[3]))
     -- if actual max word length is less than the limit, use that
     max_word_l = math.min(max_word_l_tmp, max_word_l)
-
-    for split = 1,3 do -- split = 1 (train), 2 (val), or 3 (test)
+
+    for split = 1,3 do -- split = 1 (train), 2 (val), or 3 (test)
         -- Preallocate the tensors we will need.
         -- Watch out the second one needs a lot of RAM.
         output_tensors[split] = torch.LongTensor(split_counts[split])
@@ -157,8 +157,8 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfil
         local word_num = 0
         for line in f:lines() do
             line = stringx.replace(line, '<unk>', tokens.UNK)
-            line = stringx.replace(line, tokens.START, '') -- start and end of word tokens are reserved
-            line = stringx.replace(line, tokens.END, '')
+            line = stringx.replace(line, tokens.START, '') -- start and end of word tokens are reserved
+            line = stringx.replace(line, tokens.END, '')
             for rword in line:gmatch'([^%s]+)' do
                 function append(word)
                     word_num = word_num + 1
@@ -177,7 +177,8 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfil
                         end
                         output_tensors[split][word_num] = word2idx[word]
                     end
-                    for char in word:gmatch'.' do
+
+                    for char_code, char in pairs(UTF8ToCharArray(word)) do
                         if char2idx[char]==nil then
                             idx2char[#idx2char + 1] = char -- create char-idx/idx-char mappings
                             char2idx[char] = #idx2char
@@ -207,4 +208,3 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfil
 end
 
 return BatchLoaderUnk
-
diff --git a/util/misc.lua b/util/misc.lua
index 45cb09f..952484e 100644
--- a/util/misc.lua
+++ b/util/misc.lua
@@ -9,4 +9,51 @@ function clone_list(tensor_list, zero_too)
         if zero_too then out[k]:zero() end
     end
     return out
-end
\ No newline at end of file
+end
+
+-- Multi byte characters start with a byte with bits 7 and 8 set, trailing bytes have bit 7 not set and bit 8 set.
+-- https://forums.coronalabs.com/topic/42019-split-utf-8-string-word-with-foreign-characters-to-letters/ by ingemar
+function UTF8ToCharArray(str)
+    local charArray = {};
+    local iStart = 0;
+    local strLen = str:len();
+
+    local function bit(b)
+        return 2 ^ (b - 1);
+    end
+
+    local function hasbit(w, b)
+        return w % (b + b) >= b;
+    end
+
+    local checkMultiByte = function(i)
+        if (iStart ~= 0) then
+            charArray[#charArray + 1] = str:sub(iStart, i - 1);
+            iStart = 0;
+        end
+    end
+
+    for i = 1, strLen do
+        local b = str:byte(i);
+        local multiStart = hasbit(b, bit(7)) and hasbit(b, bit(8));
+        local multiTrail = not hasbit(b, bit(7)) and hasbit(b, bit(8));
+
+        if (multiStart) then
+            checkMultiByte(i);
+            iStart = i;
+
+        elseif (not multiTrail) then
+            checkMultiByte(i);
+            charArray[#charArray + 1] = str:sub(i, i);
+        end
+    end
+
+    -- process if last character is multi-byte
+    checkMultiByte(strLen + 1);
+
+    return charArray;
+end
+
+function RuneCount(text)
+    return #UTF8ToCharArray(text) -- TODO: can be heavily optimized
+end

From a837c84e23b56f739e0682f70bcc86e3a961bb15 Mon Sep 17 00:00:00 2001
From: vseledkin
Date: Tue, 27 Oct 2015 11:28:17 +0300
Subject: [PATCH 2/2] another utf8-fix

---
 util/BatchLoaderUnk.lua | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/util/BatchLoaderUnk.lua b/util/BatchLoaderUnk.lua
index 5a5a1cd..07618d4 100644
--- a/util/BatchLoaderUnk.lua
+++ b/util/BatchLoaderUnk.lua
@@ -167,7 +167,7 @@ function BatchLoaderUnk.text_to_tensor(input_files, out_vocabfile, out_tensorfil
                         collectgarbage()
                     end
                     local chars = {char2idx[tokens.START]} -- start-of-word symbol
-                    if string.sub(word,1,1) == tokens.UNK and word:len() > 1 then -- unk token with character info available
+                    if string.sub(word,1,1) == tokens.UNK and RuneCount(word) > 1 then -- unk token with character info available
                         word = string.sub(word, 3)
                         output_tensors[split][word_num] = word2idx[tokens.UNK]
                     else
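
A minimal usage sketch of the two helpers this series adds to util/misc.lua (not part of the patches themselves; the dofile path is an assumption, adjust it to your checkout):

dofile('util/misc.lua')   -- load UTF8ToCharArray/RuneCount (path assumed)

local word = "привет"     -- 6 Cyrillic characters, 12 bytes in UTF-8
print(word:len())         -- 12: string.len counts bytes
print(RuneCount(word))    -- 6: counts UTF-8 characters, as now used for max_word_l

for i, ch in ipairs(UTF8ToCharArray(word)) do
    print(i, ch)          -- each entry is one complete UTF-8 character
end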